author    Paul Kirth <paulkirth@google.com>    2024-04-29 18:27:20 +0000
committer Paul Kirth <paulkirth@google.com>    2024-04-29 18:27:20 +0000
commit    f53ab85214a1facd092ce97b03f83c43a959f979 (patch)
tree      7d38bbfc50bc9fdd6d488c1d719b280627b65fb5
parent    4bdaf60f8fb08b0088b565c6818f967ac75a5400 (diff)
parent    9c3f5fe88f19820360981d0798392799e1924cb7 (diff)
Created using spr 1.3.4 [skip ci]
-rwxr-xr-x .ci/generate-buildkite-pipeline-premerge | 2
-rwxr-xr-x .ci/monolithic-linux.sh | 1
-rw-r--r-- .github/CODEOWNERS | 4
-rw-r--r-- bolt/include/bolt/Rewrite/RewriteInstance.h | 4
-rw-r--r-- bolt/include/bolt/Utils/NameResolver.h | 17
-rw-r--r-- bolt/lib/Rewrite/RewriteInstance.cpp | 251
-rw-r--r-- bolt/test/RISCV/fake-label-no-entry.c (renamed from bolt/test/RISCV/unnamed-sym-no-entry.c) | 4
-rw-r--r-- bolt/test/X86/fragment-lite.s | 54
-rw-r--r-- clang-tools-extra/clang-tidy/bugprone/EasilySwappableParametersCheck.cpp | 17
-rw-r--r-- clang-tools-extra/clang-tidy/modernize/UseNullptrCheck.h | 2
-rw-r--r-- clang-tools-extra/clang-tidy/readability/AvoidReturnWithVoidValueCheck.cpp | 7
-rw-r--r-- clang-tools-extra/clangd/CodeCompletionStrings.cpp | 2
-rw-r--r-- clang-tools-extra/clangd/unittests/FindTargetTests.cpp | 8
-rw-r--r-- clang-tools-extra/clangd/unittests/SemanticHighlightingTests.cpp | 2
-rw-r--r-- clang-tools-extra/docs/ReleaseNotes.rst | 24
-rw-r--r-- clang-tools-extra/docs/clang-tidy/checks/cert/env33-c.rst | 2
-rw-r--r-- clang-tools-extra/docs/clang-tidy/checks/modernize/use-nullptr.rst | 2
-rw-r--r-- clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/owning-memory.cpp | 2
-rw-r--r-- clang-tools-extra/test/clang-tidy/checkers/modernize/use-equals-default-copy.cpp | 12
-rw-r--r-- clang-tools-extra/test/clang-tidy/checkers/modernize/use-nullptr-c23.c | 139
-rw-r--r-- clang-tools-extra/test/clang-tidy/checkers/modernize/use-nullptr.c | 2
-rw-r--r-- clang-tools-extra/test/pp-trace/pp-trace-pragma-general.cpp | 6
-rw-r--r-- clang-tools-extra/test/pp-trace/pp-trace-pragma-ms.cpp | 8
-rw-r--r-- clang-tools-extra/test/pp-trace/pp-trace-pragma-opencl.cpp | 6
-rw-r--r-- clang/cmake/caches/Release.cmake | 1
-rw-r--r-- clang/docs/LanguageExtensions.rst | 33
-rw-r--r-- clang/docs/LibTooling.rst | 22
-rw-r--r-- clang/docs/OpenMPSupport.rst | 4
-rw-r--r-- clang/docs/ReleaseNotes.rst | 40
-rw-r--r-- clang/docs/UsersManual.rst | 71
-rw-r--r-- clang/include/clang/APINotes/Types.h | 22
-rw-r--r-- clang/include/clang/AST/ASTContext.h | 10
-rw-r--r-- clang/include/clang/AST/ASTNodeTraverser.h | 6
-rw-r--r-- clang/include/clang/AST/AbstractBasicReader.h | 4
-rw-r--r-- clang/include/clang/AST/AbstractBasicWriter.h | 4
-rw-r--r-- clang/include/clang/AST/DeclContextInternals.h | 39
-rw-r--r-- clang/include/clang/AST/Type.h | 262
-rw-r--r-- clang/include/clang/Basic/Attr.td | 2
-rw-r--r-- clang/include/clang/Basic/DiagnosticParseKinds.td | 9
-rw-r--r-- clang/include/clang/Basic/DiagnosticSemaKinds.td | 9
-rw-r--r-- clang/include/clang/Basic/LangOptions.h | 7
-rw-r--r-- clang/include/clang/Basic/PointerAuthOptions.h | 23
-rw-r--r-- clang/include/clang/Basic/TargetInfo.h | 10
-rw-r--r-- clang/include/clang/Basic/arm_neon.td | 2
-rw-r--r-- clang/include/clang/Basic/arm_sve.td | 27
-rw-r--r-- clang/include/clang/Driver/Options.td | 8
-rw-r--r-- clang/include/clang/Sema/DeclSpec.h | 10
-rw-r--r-- clang/include/clang/Sema/Lookup.h | 4
-rw-r--r-- clang/include/clang/Sema/ParsedAttr.h | 1
-rw-r--r-- clang/include/clang/Sema/Sema.h | 14
-rw-r--r-- clang/include/clang/StaticAnalyzer/Core/PathSensitive/Store.h | 14
-rw-r--r-- clang/lib/APINotes/APINotesFormat.h | 5
-rw-r--r-- clang/lib/APINotes/APINotesReader.cpp | 7
-rw-r--r-- clang/lib/APINotes/APINotesWriter.cpp | 7
-rw-r--r-- clang/lib/APINotes/APINotesYAMLCompiler.cpp | 5
-rw-r--r-- clang/lib/AST/DeclBase.cpp | 4
-rw-r--r-- clang/lib/AST/Expr.cpp | 2
-rw-r--r-- clang/lib/AST/ExprConstant.cpp | 134
-rw-r--r-- clang/lib/AST/Interp/ByteCodeExprGen.cpp | 426
-rw-r--r-- clang/lib/AST/Interp/ByteCodeExprGen.h | 8
-rw-r--r-- clang/lib/AST/Interp/ByteCodeStmtGen.cpp | 24
-rw-r--r-- clang/lib/AST/Interp/Context.cpp | 33
-rw-r--r-- clang/lib/AST/Interp/Context.h | 3
-rw-r--r-- clang/lib/AST/Interp/Descriptor.cpp | 74
-rw-r--r-- clang/lib/AST/Interp/Descriptor.h | 3
-rw-r--r-- clang/lib/AST/Interp/Disasm.cpp | 31
-rw-r--r-- clang/lib/AST/Interp/Interp.h | 12
-rw-r--r-- clang/lib/AST/Interp/Opcodes.td | 2
-rw-r--r-- clang/lib/AST/Interp/Program.cpp | 3
-rw-r--r-- clang/lib/AST/Type.cpp | 12
-rw-r--r-- clang/lib/Analysis/FlowSensitive/ASTOps.cpp | 16
-rw-r--r-- clang/lib/Analysis/FlowSensitive/Transfer.cpp | 6
-rw-r--r-- clang/lib/Basic/Targets/ARM.h | 4
-rw-r--r-- clang/lib/Basic/Targets/AVR.h | 4
-rw-r--r-- clang/lib/Basic/Targets/BPF.h | 4
-rw-r--r-- clang/lib/Basic/Targets/M68k.h | 4
-rw-r--r-- clang/lib/Basic/Targets/Mips.h | 4
-rw-r--r-- clang/lib/Basic/Targets/PPC.h | 8
-rw-r--r-- clang/lib/Basic/Targets/RISCV.h | 4
-rw-r--r-- clang/lib/Basic/Targets/Sparc.h | 4
-rw-r--r-- clang/lib/Basic/Targets/SystemZ.h | 4
-rw-r--r-- clang/lib/Basic/Targets/WebAssembly.cpp | 11
-rw-r--r-- clang/lib/Basic/Targets/WebAssembly.h | 1
-rw-r--r-- clang/lib/CodeGen/CGBuiltin.cpp | 10
-rw-r--r-- clang/lib/CodeGen/CGCall.cpp | 13
-rw-r--r-- clang/lib/CodeGen/CGCleanup.cpp | 76
-rw-r--r-- clang/lib/CodeGen/CGCleanup.h | 57
-rw-r--r-- clang/lib/CodeGen/CGDecl.cpp | 87
-rw-r--r-- clang/lib/CodeGen/CGExpr.cpp | 12
-rw-r--r-- clang/lib/CodeGen/CGExprAgg.cpp | 87
-rw-r--r-- clang/lib/CodeGen/CGExprCXX.cpp | 38
-rw-r--r-- clang/lib/CodeGen/CGExprScalar.cpp | 2
-rw-r--r-- clang/lib/CodeGen/CGLoopInfo.cpp | 2
-rw-r--r-- clang/lib/CodeGen/CodeGenFunction.cpp | 6
-rw-r--r-- clang/lib/CodeGen/CodeGenFunction.h | 99
-rw-r--r-- clang/lib/Driver/ToolChains/CommonArgs.cpp | 110
-rw-r--r-- clang/lib/Driver/ToolChains/Flang.cpp | 9
-rw-r--r-- clang/lib/Driver/ToolChains/Gnu.cpp | 4
-rw-r--r-- clang/lib/Format/Format.cpp | 3
-rw-r--r-- clang/lib/Format/FormatToken.h | 8
-rw-r--r-- clang/lib/Format/TokenAnnotator.cpp | 39
-rw-r--r-- clang/lib/Format/WhitespaceManager.cpp | 42
-rw-r--r-- clang/lib/Frontend/InitPreprocessor.cpp | 12
-rw-r--r-- clang/lib/Headers/cpuid.h | 6
-rw-r--r-- clang/lib/Parse/ParseDecl.cpp | 43
-rw-r--r-- clang/lib/Sema/DeclSpec.cpp | 4
-rw-r--r-- clang/lib/Sema/HLSLExternalSemaSource.cpp | 7
-rw-r--r-- clang/lib/Sema/SemaAPINotes.cpp | 5
-rw-r--r-- clang/lib/Sema/SemaAttr.cpp | 2
-rw-r--r-- clang/lib/Sema/SemaChecking.cpp | 34
-rw-r--r-- clang/lib/Sema/SemaDecl.cpp | 9
-rw-r--r-- clang/lib/Sema/SemaDeclCXX.cpp | 24
-rw-r--r-- clang/lib/Sema/SemaExpr.cpp | 20
-rw-r--r-- clang/lib/Sema/SemaExprCXX.cpp | 2
-rw-r--r-- clang/lib/Sema/SemaExprMember.cpp | 183
-rw-r--r-- clang/lib/Sema/SemaInit.cpp | 15
-rw-r--r-- clang/lib/Sema/SemaLookup.cpp | 114
-rw-r--r-- clang/lib/Sema/SemaOpenMP.cpp | 17
-rw-r--r-- clang/lib/Sema/SemaStmtAttr.cpp | 18
-rw-r--r-- clang/lib/Sema/SemaTemplate.cpp | 32
-rw-r--r-- clang/lib/Sema/SemaTemplateInstantiate.cpp | 18
-rw-r--r-- clang/lib/Sema/TreeTransform.h | 20
-rw-r--r-- clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.h | 3
-rw-r--r-- clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedCallArgsChecker.cpp | 7
-rw-r--r-- clang/test/APINotes/Inputs/Headers/SwiftImportAs.apinotes | 4
-rw-r--r-- clang/test/APINotes/Inputs/Headers/SwiftImportAs.h | 3
-rw-r--r-- clang/test/APINotes/swift-import-as.cpp | 10
-rw-r--r-- clang/test/AST/HLSL/this-reference-template.hlsl | 2
-rw-r--r-- clang/test/AST/Interp/c.c | 7
-rw-r--r-- clang/test/AST/Interp/cxx23.cpp | 16
-rw-r--r-- clang/test/AST/Interp/functions.cpp | 16
-rw-r--r-- clang/test/AST/Interp/opencl.cl | 32
-rw-r--r-- clang/test/AST/Interp/records.cpp | 114
-rw-r--r-- clang/test/AST/ast-dump-macro-json.c | 4
-rw-r--r-- clang/test/AST/ast-dump-pragma-unroll.cpp | 31
-rw-r--r-- clang/test/AST/ast-dump-template-json-win32-mangler-crash.cpp | 88
-rw-r--r-- clang/test/AST/ast-dump-templates.cpp | 14
-rw-r--r-- clang/test/Analysis/Checkers/WebKit/call-args-regression-traverse-decl-crash.cpp | 7
-rw-r--r-- clang/test/Analysis/Checkers/WebKit/call-args.cpp | 2
-rw-r--r-- clang/test/Analysis/Checkers/WebKit/mock-types.h | 67
-rw-r--r-- clang/test/Analysis/html_diagnostics/relevant_lines/multifile.c | 2
-rw-r--r-- clang/test/CXX/drs/cwg2149.cpp | 77
-rw-r--r-- clang/test/CXX/drs/cwg650.cpp | 2
-rw-r--r-- clang/test/CXX/drs/dr20xx.cpp | 2
-rw-r--r-- clang/test/CXX/drs/dr21xx.cpp | 2
-rw-r--r-- clang/test/CXX/drs/dr24xx.cpp | 4
-rw-r--r-- clang/test/CXX/drs/dr25xx.cpp | 4
-rw-r--r-- clang/test/CXX/drs/dr28xx.cpp | 12
-rw-r--r-- clang/test/CXX/drs/dr2xx.cpp | 10
-rw-r--r-- clang/test/CXX/drs/dr3xx.cpp | 16
-rw-r--r-- clang/test/CXX/expr/expr.const/p5-26.cpp | 7
-rw-r--r-- clang/test/CXX/stmt.stmt/stmt.return/p6.cpp | 25
-rw-r--r-- clang/test/CXX/temp/temp.res/temp.dep/temp.dep.type/p4.cpp | 456
-rw-r--r-- clang/test/CXX/temp/temp.res/temp.local/p3.cpp | 3
-rw-r--r-- clang/test/CodeGen/X86/ms-x86-intrinsics.c | 8
-rw-r--r-- clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_fp_reduce.c | 12
-rw-r--r-- clang/test/CodeGen/arm-bf16-convert-intrinsics.c | 9
-rw-r--r-- clang/test/CodeGen/attr-counted-by.c | 72
-rw-r--r-- clang/test/CodeGen/builtins-reduction-math.c | 53
-rw-r--r-- clang/test/CodeGen/ms-mixed-ptr-sizes.c | 8
-rw-r--r-- clang/test/CodeGenCXX/blocks.cpp | 4
-rw-r--r-- clang/test/CodeGenCXX/control-flow-in-stmt-expr.cpp | 522
-rw-r--r-- clang/test/CodeGenCXX/mangle.cpp | 8
-rw-r--r-- clang/test/CodeGenCXX/pragma-gcc-unroll.cpp | 30
-rw-r--r-- clang/test/CodeGenCXX/pragma-unroll.cpp | 52
-rw-r--r-- clang/test/CodeGenCoroutines/coro-elide-thinlto.cpp | 77
-rw-r--r-- clang/test/CodeGenCoroutines/coro-suspend-cleanups.cpp | 93
-rw-r--r-- clang/test/CodeGenObjC/arc-blocks-exceptions.m | 15
-rw-r--r-- clang/test/CodeGenObjC/arc-blocks.m | 4
-rw-r--r-- clang/test/Driver/aarch64-mcpu.c | 6
-rw-r--r-- clang/test/Driver/amdgpu-toolchain.c | 2
-rw-r--r-- clang/test/Driver/claim-unused.c | 3
-rw-r--r-- clang/test/Driver/fp-model.c | 5
-rw-r--r-- clang/test/Driver/gcc-param.c | 2
-rw-r--r-- clang/test/Driver/hlsl-lang-targets-spirv.hlsl | 1
-rw-r--r-- clang/test/Driver/linux-ld.c | 45
-rw-r--r-- clang/test/Driver/riscv-arch.c | 6
-rw-r--r-- clang/test/Driver/wasm-features.c | 6
-rw-r--r-- clang/test/Index/annotate-nested-name-specifier.cpp | 4
-rw-r--r-- clang/test/Lexer/cxx-features.cpp | 2
-rw-r--r-- clang/test/Lexer/update_consecutive_macro_address_space.c | 13
-rw-r--r-- clang/test/Misc/cc1as-relax-all.s | 13
-rw-r--r-- clang/test/Misc/target-invalid-cpu-note.c | 4
-rw-r--r-- clang/test/Modules/pr88400.cppm | 61
-rw-r--r-- clang/test/Parser/cxx1z-decomposition.cpp | 63
-rw-r--r-- clang/test/Parser/pragma-unroll.cpp | 29
-rw-r--r-- clang/test/ParserOpenACC/parse-cache-construct.cpp | 2
-rw-r--r-- clang/test/Preprocessor/hardware_interference.cpp | 17
-rw-r--r-- clang/test/Preprocessor/init-aarch64.c | 12
-rw-r--r-- clang/test/Preprocessor/init.c | 49
-rw-r--r-- clang/test/Preprocessor/predefined-win-macros.c | 6
-rw-r--r-- clang/test/Sema/constant_builtins_vector.cpp | 723
-rw-r--r-- clang/test/Sema/convertvector.c | 3
-rw-r--r-- clang/test/SemaCXX/cxx1z-class-template-argument-deduction.cpp | 11
-rw-r--r-- clang/test/SemaCXX/member-expr.cpp | 4
-rw-r--r-- clang/test/SemaCXX/type-traits.cpp | 28
-rw-r--r-- clang/test/SemaCXX/unused.cpp | 12
-rw-r--r-- clang/test/SemaObjC/format-strings-oslog.m | 5
-rw-r--r-- clang/test/SemaOpenCL/vec_step.cl | 2
-rw-r--r-- clang/test/SemaTemplate/instantiate-function-1.cpp | 14
-rw-r--r-- clang/test/Unit/lit.cfg.py | 10
-rw-r--r-- clang/tools/driver/cc1as_main.cpp | 1
-rw-r--r-- clang/unittests/ASTMatchers/ASTMatchersNarrowingTest.cpp | 5
-rw-r--r-- clang/unittests/Analysis/FlowSensitive/TransferTest.cpp | 32
-rw-r--r-- clang/unittests/Format/FormatTest.cpp | 64
-rw-r--r-- clang/unittests/Format/SortIncludesTest.cpp | 122
-rwxr-xr-x clang/www/cxx_dr_status.html | 563
-rwxr-xr-x clang/www/cxx_status.html | 4
-rwxr-xr-x clang/www/make_cxx_dr_status | 8
-rw-r--r-- compiler-rt/lib/scudo/standalone/allocator_config.def | 2
-rw-r--r-- compiler-rt/lib/scudo/standalone/combined.h | 3
-rw-r--r-- compiler-rt/lib/scudo/standalone/flags.inc | 2
-rw-r--r-- compiler-rt/lib/scudo/standalone/primary32.h | 4
-rw-r--r-- compiler-rt/lib/scudo/standalone/primary64.h | 7
-rw-r--r-- compiler-rt/lib/scudo/standalone/secondary.h | 3
-rw-r--r-- compiler-rt/lib/scudo/standalone/wrappers_c.inc | 10
-rw-r--r-- compiler-rt/test/asan/TestCases/Darwin/odr-lto.cpp | 2
-rw-r--r-- flang/cmake/modules/AddFlangOffloadRuntime.cmake | 9
-rw-r--r-- flang/docs/FlangDriver.md | 32
-rw-r--r-- flang/include/flang/Common/visit.h | 2
-rw-r--r-- flang/include/flang/Lower/Mangler.h | 2
-rw-r--r-- flang/include/flang/Optimizer/Analysis/TBAAForest.h | 2
-rw-r--r-- flang/include/flang/Optimizer/Builder/BoxValue.h | 20
-rw-r--r-- flang/include/flang/Optimizer/Builder/Factory.h | 12
-rw-r--r-- flang/include/flang/Optimizer/Builder/HLFIRTools.h | 14
-rw-r--r-- flang/include/flang/Optimizer/Builder/IntrinsicCall.h | 4
-rw-r--r-- flang/include/flang/Optimizer/Builder/PPCIntrinsicCall.h | 6
-rwxr-xr-x flang/include/flang/Optimizer/Builder/Runtime/EnvironmentDefaults.h | 3
-rw-r--r-- flang/include/flang/Optimizer/Builder/Runtime/Main.h | 28
-rw-r--r-- flang/include/flang/Optimizer/Dialect/CanonicalizationPatterns.td | 23
-rw-r--r-- flang/include/flang/Optimizer/Dialect/FIROps.td | 12
-rw-r--r-- flang/include/flang/Optimizer/Dialect/FIRType.h | 74
-rw-r--r-- flang/include/flang/Optimizer/Dialect/FIRTypes.td | 2
-rw-r--r-- flang/include/flang/Optimizer/Dialect/FortranVariableInterface.td | 10
-rw-r--r-- flang/include/flang/Optimizer/HLFIR/HLFIRDialect.h | 20
-rw-r--r-- flang/include/flang/Optimizer/Support/Utils.h | 4
-rw-r--r-- flang/include/flang/Optimizer/Transforms/Passes.h | 7
-rw-r--r-- flang/include/flang/Optimizer/Transforms/Passes.td | 3
-rw-r--r-- flang/include/flang/Runtime/descriptor.h | 1
-rw-r--r-- flang/include/flang/Tools/CLOptions.inc | 10
-rw-r--r-- flang/include/flang/Tools/PointerModels.h | 4
-rw-r--r-- flang/lib/Lower/Allocatable.cpp | 12
-rw-r--r-- flang/lib/Lower/Bridge.cpp | 74
-rw-r--r-- flang/lib/Lower/CallInterface.cpp | 6
-rw-r--r-- flang/lib/Lower/ConvertArrayConstructor.cpp | 11
-rw-r--r-- flang/lib/Lower/ConvertCall.cpp | 56
-rw-r--r-- flang/lib/Lower/ConvertConstant.cpp | 23
-rw-r--r-- flang/lib/Lower/ConvertExpr.cpp | 188
-rw-r--r-- flang/lib/Lower/ConvertExprToHLFIR.cpp | 28
-rw-r--r-- flang/lib/Lower/ConvertProcedureDesignator.cpp | 6
-rw-r--r-- flang/lib/Lower/ConvertVariable.cpp | 26
-rw-r--r-- flang/lib/Lower/CustomIntrinsicCall.cpp | 35
-rw-r--r-- flang/lib/Lower/DirectivesCommon.h | 24
-rw-r--r-- flang/lib/Lower/HlfirIntrinsics.cpp | 4
-rw-r--r-- flang/lib/Lower/HostAssociations.cpp | 14
-rw-r--r-- flang/lib/Lower/IO.cpp | 55
-rw-r--r-- flang/lib/Lower/OpenACC.cpp | 4
-rw-r--r-- flang/lib/Lower/OpenMP/ClauseProcessor.cpp | 8
-rw-r--r-- flang/lib/Lower/OpenMP/DataSharingProcessor.cpp | 1
-rw-r--r-- flang/lib/Lower/OpenMP/OpenMP.cpp | 12
-rw-r--r-- flang/lib/Lower/OpenMP/ReductionProcessor.cpp | 18
-rw-r--r-- flang/lib/Lower/VectorSubscripts.cpp | 2
-rw-r--r-- flang/lib/Optimizer/Analysis/AliasAnalysis.cpp | 4
-rw-r--r-- flang/lib/Optimizer/Builder/BoxValue.cpp | 4
-rw-r--r-- flang/lib/Optimizer/Builder/CMakeLists.txt | 1
-rw-r--r-- flang/lib/Optimizer/Builder/Character.cpp | 41
-rw-r--r-- flang/lib/Optimizer/Builder/Complex.cpp | 3
-rw-r--r-- flang/lib/Optimizer/Builder/FIRBuilder.cpp | 107
-rw-r--r-- flang/lib/Optimizer/Builder/HLFIRTools.cpp | 78
-rw-r--r-- flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 107
-rw-r--r-- flang/lib/Optimizer/Builder/MutableBox.cpp | 35
-rw-r--r-- flang/lib/Optimizer/Builder/PPCIntrinsicCall.cpp | 44
-rw-r--r-- flang/lib/Optimizer/Builder/Runtime/Allocatable.cpp | 2
-rw-r--r-- flang/lib/Optimizer/Builder/Runtime/Character.cpp | 8
-rwxr-xr-x flang/lib/Optimizer/Builder/Runtime/EnvironmentDefaults.cpp | 7
-rw-r--r-- flang/lib/Optimizer/Builder/Runtime/Intrinsics.cpp | 6
-rw-r--r-- flang/lib/Optimizer/Builder/Runtime/Main.cpp | 62
-rw-r--r-- flang/lib/Optimizer/Builder/Runtime/Ragged.cpp | 3
-rw-r--r-- flang/lib/Optimizer/Builder/Runtime/Reduction.cpp | 20
-rw-r--r-- flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp | 16
-rw-r--r-- flang/lib/Optimizer/CodeGen/CGOps.cpp | 14
-rw-r--r-- flang/lib/Optimizer/CodeGen/CodeGen.cpp | 211
-rw-r--r-- flang/lib/Optimizer/CodeGen/FIROpPatterns.cpp | 21
-rw-r--r-- flang/lib/Optimizer/CodeGen/PreCGRewrite.cpp | 13
-rw-r--r-- flang/lib/Optimizer/CodeGen/TBAABuilder.cpp | 2
-rw-r--r-- flang/lib/Optimizer/CodeGen/Target.cpp | 4
-rw-r--r-- flang/lib/Optimizer/CodeGen/TargetRewrite.cpp | 21
-rw-r--r-- flang/lib/Optimizer/CodeGen/TypeConverter.cpp | 18
-rw-r--r-- flang/lib/Optimizer/Dialect/FIRAttr.cpp | 14
-rw-r--r-- flang/lib/Optimizer/Dialect/FIROps.cpp | 310
-rw-r--r-- flang/lib/Optimizer/Dialect/FIRType.cpp | 141
-rw-r--r-- flang/lib/Optimizer/Dialect/FortranVariableInterface.cpp | 11
-rw-r--r-- flang/lib/Optimizer/HLFIR/IR/HLFIRDialect.cpp | 30
-rw-r--r-- flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp | 192
-rw-r--r-- flang/lib/Optimizer/HLFIR/Transforms/BufferizeHLFIR.cpp | 29
-rw-r--r-- flang/lib/Optimizer/HLFIR/Transforms/ConvertToFIR.cpp | 47
-rw-r--r-- flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIRIntrinsics.cpp | 2
-rw-r--r-- flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIROrderedAssignments.cpp | 2
-rw-r--r-- flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp | 12
-rw-r--r-- flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp | 3
-rw-r--r-- flang/lib/Optimizer/Transforms/AbstractResult.cpp | 16
-rw-r--r-- flang/lib/Optimizer/Transforms/AddDebugInfo.cpp | 6
-rw-r--r-- flang/lib/Optimizer/Transforms/AffineDemotion.cpp | 9
-rw-r--r-- flang/lib/Optimizer/Transforms/AffinePromotion.cpp | 11
-rw-r--r-- flang/lib/Optimizer/Transforms/ArrayValueCopy.cpp | 30
-rw-r--r-- flang/lib/Optimizer/Transforms/CharacterConversion.cpp | 4
-rw-r--r-- flang/lib/Optimizer/Transforms/LoopVersioning.cpp | 6
-rw-r--r-- flang/lib/Optimizer/Transforms/MemoryAllocation.cpp | 32
-rw-r--r-- flang/lib/Optimizer/Transforms/PolymorphicOpConversion.cpp | 38
-rw-r--r-- flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp | 58
-rw-r--r-- flang/lib/Optimizer/Transforms/StackArrays.cpp | 6
-rw-r--r-- flang/runtime/CMakeLists.txt | 2
-rw-r--r-- flang/runtime/FortranMain/CMakeLists.txt | 23
-rw-r--r-- flang/runtime/FortranMain/Fortran_main.c | 23
-rw-r--r-- flang/runtime/environment.cpp | 2
-rw-r--r-- flang/runtime/file.cpp | 12
-rw-r--r-- flang/runtime/namelist.cpp | 2
-rw-r--r-- flang/runtime/unit.cpp | 2
-rw-r--r-- flang/runtime/utf.cpp | 2
-rw-r--r-- flang/test/CMakeLists.txt | 1
-rw-r--r-- flang/test/Driver/bbc-mlir-pass-pipeline.f90 | 8
-rw-r--r-- flang/test/Driver/driver-help-hidden.f90 | 172
-rw-r--r-- flang/test/Driver/dynamic-linker.f90 | 2
-rw-r--r-- flang/test/Driver/emit-mlir.f90 | 10
-rw-r--r-- flang/test/Driver/linker-flags.f90 | 8
-rw-r--r-- flang/test/Driver/mlir-debug-pass-pipeline.f90 | 12
-rw-r--r-- flang/test/Driver/mlir-pass-pipeline.f90 | 12
-rw-r--r-- flang/test/Driver/msvc-dependent-lib-flags.f90 | 4
-rw-r--r-- flang/test/Driver/no-duplicate-main.f90 | 2
-rw-r--r-- flang/test/Fir/basic-program.fir | 14
-rw-r--r-- flang/test/Lower/CUDA/cuda-data-transfer.cuf | 18
-rw-r--r-- flang/test/Lower/OpenMP/FIR/array-bounds.f90 | 121
-rw-r--r-- flang/test/Lower/OpenMP/FIR/atomic-capture.f90 | 119
-rw-r--r-- flang/test/Lower/OpenMP/FIR/atomic-read.f90 | 80
-rw-r--r-- flang/test/Lower/OpenMP/FIR/atomic-update.f90 | 141
-rw-r--r-- flang/test/Lower/OpenMP/FIR/atomic-write.f90 | 77
-rw-r--r-- flang/test/Lower/OpenMP/FIR/copyin.f90 | 358
-rw-r--r-- flang/test/Lower/OpenMP/FIR/critical.f90 | 38
-rw-r--r-- flang/test/Lower/OpenMP/FIR/declare-target-data.f90 | 88
-rw-r--r-- flang/test/Lower/OpenMP/FIR/declare-target-func-and-subr.f90 | 178
-rw-r--r-- flang/test/Lower/OpenMP/FIR/declare-target-implicit-func-and-subr-cap-enter.f90 | 192
-rw-r--r-- flang/test/Lower/OpenMP/FIR/declare-target-implicit-func-and-subr-cap.f90 | 218
-rw-r--r-- flang/test/Lower/OpenMP/FIR/default-clause.f90 | 281
-rw-r--r-- flang/test/Lower/OpenMP/FIR/delayed-privatization-firstprivate.f90 | 32
-rw-r--r-- flang/test/Lower/OpenMP/FIR/delayed-privatization-private.f90 | 41
-rw-r--r-- flang/test/Lower/OpenMP/FIR/firstprivate-commonblock.f90 | 30
-rw-r--r-- flang/test/Lower/OpenMP/FIR/flush.f90 | 45
-rw-r--r-- flang/test/Lower/OpenMP/FIR/if-clause.f90 | 498
-rw-r--r-- flang/test/Lower/OpenMP/FIR/is-device.f90 | 14
-rw-r--r-- flang/test/Lower/OpenMP/FIR/lastprivate-commonblock.f90 | 49
-rw-r--r-- flang/test/Lower/OpenMP/FIR/location.f90 | 71
-rw-r--r-- flang/test/Lower/OpenMP/FIR/loop-combined.f90 | 83
-rw-r--r-- flang/test/Lower/OpenMP/FIR/map-component-ref.f90 | 33
-rw-r--r-- flang/test/Lower/OpenMP/FIR/master.f90 | 100
-rw-r--r-- flang/test/Lower/OpenMP/FIR/omp-declare-target-program-var.f90 | 12
-rw-r--r-- flang/test/Lower/OpenMP/FIR/omp-is-gpu.f90 | 16
-rw-r--r-- flang/test/Lower/OpenMP/FIR/ordered-threads.f90 | 40
-rw-r--r-- flang/test/Lower/OpenMP/FIR/parallel-firstprivate-clause-scalar.f90 | 159
-rw-r--r-- flang/test/Lower/OpenMP/FIR/parallel-lastprivate-clause-scalar.f90 | 261
-rw-r--r-- flang/test/Lower/OpenMP/FIR/parallel-private-clause-fixes.f90 | 84
-rw-r--r-- flang/test/Lower/OpenMP/FIR/parallel-private-clause.f90 | 387
-rw-r--r-- flang/test/Lower/OpenMP/FIR/parallel-reduction-add-byref.f90 | 117
-rw-r--r-- flang/test/Lower/OpenMP/FIR/parallel-reduction-add.f90 | 105
-rw-r--r-- flang/test/Lower/OpenMP/FIR/parallel-sections.f90 | 65
-rw-r--r-- flang/test/Lower/OpenMP/FIR/parallel-wsloop-firstpriv.f90 | 69
-rw-r--r-- flang/test/Lower/OpenMP/FIR/parallel-wsloop.f90 | 297
-rw-r--r-- flang/test/Lower/OpenMP/FIR/parallel.f90 | 211
-rw-r--r-- flang/test/Lower/OpenMP/FIR/pre-fir-tree-loop.f90 | 70
-rw-r--r-- flang/test/Lower/OpenMP/FIR/pre-fir-tree01.f90 | 19
-rw-r--r-- flang/test/Lower/OpenMP/FIR/private-commonblock.f90 | 109
-rw-r--r-- flang/test/Lower/OpenMP/FIR/requires-common.f90 | 19
-rw-r--r-- flang/test/Lower/OpenMP/FIR/requires-notarget.f90 | 14
-rw-r--r-- flang/test/Lower/OpenMP/FIR/requires.f90 | 14
-rw-r--r-- flang/test/Lower/OpenMP/FIR/rtl-flags.f90 | 39
-rw-r--r-- flang/test/Lower/OpenMP/FIR/sections-pft.f90 | 91
-rw-r--r-- flang/test/Lower/OpenMP/FIR/sections.f90 | 288
-rw-r--r-- flang/test/Lower/OpenMP/FIR/simd.f90 | 176
-rw-r--r-- flang/test/Lower/OpenMP/FIR/simple-barrier.f90 | 6
-rw-r--r-- flang/test/Lower/OpenMP/FIR/single.f90 | 123
-rw-r--r-- flang/test/Lower/OpenMP/FIR/stop-stmt-in-region.f90 | 154
-rw-r--r-- flang/test/Lower/OpenMP/FIR/target.f90 | 554
-rw-r--r-- flang/test/Lower/OpenMP/FIR/target_cpu_features.f90 | 19
-rw-r--r-- flang/test/Lower/OpenMP/FIR/task.f90 | 237
-rw-r--r-- flang/test/Lower/OpenMP/FIR/taskgroup.f90 | 21
-rw-r--r-- flang/test/Lower/OpenMP/FIR/taskwait.f90 | 12
-rw-r--r-- flang/test/Lower/OpenMP/FIR/taskyield.f90 | 12
-rw-r--r-- flang/test/Lower/OpenMP/FIR/teams.f90 | 117
-rw-r--r-- flang/test/Lower/OpenMP/FIR/threadprivate-char-array-chararray.f90 | 46
-rw-r--r-- flang/test/Lower/OpenMP/FIR/threadprivate-commonblock.f90 | 91
-rw-r--r-- flang/test/Lower/OpenMP/FIR/threadprivate-integer-different-kinds.f90 | 67
-rw-r--r-- flang/test/Lower/OpenMP/FIR/threadprivate-non-global.f90 | 91
-rw-r--r-- flang/test/Lower/OpenMP/FIR/threadprivate-pointer-allocatable.f90 | 51
-rw-r--r-- flang/test/Lower/OpenMP/FIR/threadprivate-real-logical-complex-derivedtype.f90 | 58
-rw-r--r-- flang/test/Lower/OpenMP/FIR/threadprivate-use-association-2.f90 | 39
-rw-r--r-- flang/test/Lower/OpenMP/FIR/threadprivate-use-association.f90 | 74
-rw-r--r-- flang/test/Lower/OpenMP/FIR/unstructured.f90 | 365
-rw-r--r-- flang/test/Lower/OpenMP/FIR/wsloop-chunks.f90 | 84
-rw-r--r-- flang/test/Lower/OpenMP/FIR/wsloop-collapse.f90 | 66
-rw-r--r-- flang/test/Lower/OpenMP/FIR/wsloop-monotonic.f90 | 38
-rw-r--r-- flang/test/Lower/OpenMP/FIR/wsloop-nonmonotonic.f90 | 39
-rw-r--r-- flang/test/Lower/OpenMP/FIR/wsloop-ordered.f90 | 46
-rw-r--r-- flang/test/Lower/OpenMP/FIR/wsloop-reduction-add-byref.f90 | 413
-rw-r--r-- flang/test/Lower/OpenMP/FIR/wsloop-reduction-add.f90 | 388
-rw-r--r-- flang/test/Lower/OpenMP/FIR/wsloop-reduction-iand-byref.f90 | 48
-rw-r--r-- flang/test/Lower/OpenMP/FIR/wsloop-reduction-iand.f90 | 38
-rw-r--r-- flang/test/Lower/OpenMP/FIR/wsloop-reduction-ieor-byref.f90 | 47
-rw-r--r-- flang/test/Lower/OpenMP/FIR/wsloop-reduction-ieor.f90 | 38
-rw-r--r-- flang/test/Lower/OpenMP/FIR/wsloop-reduction-ior-byref.f90 | 47
-rw-r--r-- flang/test/Lower/OpenMP/FIR/wsloop-reduction-ior.f90 | 38
-rw-r--r-- flang/test/Lower/OpenMP/FIR/wsloop-reduction-logical-eqv-byref.f90 | 193
-rw-r--r-- flang/test/Lower/OpenMP/FIR/wsloop-reduction-logical-eqv.f90 | 187
-rw-r--r-- flang/test/Lower/OpenMP/FIR/wsloop-reduction-logical-neqv-byref.f90 | 195
-rw-r--r-- flang/test/Lower/OpenMP/FIR/wsloop-reduction-logical-neqv.f90 | 189
-rw-r--r-- flang/test/Lower/OpenMP/FIR/wsloop-reduction-max-byref.f90 | 95
-rw-r--r-- flang/test/Lower/OpenMP/FIR/wsloop-reduction-max.f90 | 84
-rw-r--r-- flang/test/Lower/OpenMP/FIR/wsloop-reduction-min-byref.f90 | 95
-rw-r--r-- flang/test/Lower/OpenMP/FIR/wsloop-reduction-min.f90 | 84
-rw-r--r-- flang/test/Lower/OpenMP/FIR/wsloop-simd.f90 | 37
-rw-r--r-- flang/test/Lower/OpenMP/FIR/wsloop-variable.f90 | 187
-rw-r--r-- flang/test/Lower/OpenMP/FIR/wsloop.f90 | 78
-rw-r--r-- flang/test/Lower/OpenMP/cfg-conversion-omp.private.f90 | 54
-rw-r--r-- flang/test/Lower/OpenMP/default-clause-implied-do-fix.f90 | 11
-rw-r--r-- flang/test/Lower/OpenMP/threadprivate-real-logical-complex-derivedtype.f90 | 1
-rw-r--r-- flang/tools/flang-driver/CMakeLists.txt | 1
-rw-r--r-- flang/unittests/Optimizer/Builder/ComplexTest.cpp | 2
-rw-r--r-- flang/unittests/Optimizer/Builder/DoLoopHelperTest.cpp | 2
-rw-r--r-- flang/unittests/Optimizer/Builder/FIRBuilderTest.cpp | 48
-rw-r--r-- flang/unittests/Optimizer/RTBuilder.cpp | 2
-rw-r--r-- libcxx/CMakeLists.txt | 2
-rw-r--r-- libcxx/cmake/caches/Generic-cxx20.cmake | 1
-rw-r--r-- libcxx/cmake/caches/Generic-cxx23.cmake | 1
-rw-r--r-- libcxx/cmake/caches/Generic-cxx26.cmake | 1
-rw-r--r-- libcxx/cmake/caches/Generic-hardening-mode-extensive.cmake | 1
-rw-r--r-- libcxx/cmake/caches/Generic-no-exceptions.cmake | 1
-rw-r--r-- libcxx/cmake/caches/Generic-no-experimental.cmake | 1
-rw-r--r-- libcxx/cmake/caches/Generic-no-filesystem.cmake | 1
-rw-r--r-- libcxx/cmake/caches/Generic-no-localization.cmake | 1
-rw-r--r-- libcxx/cmake/caches/Generic-no-random_device.cmake | 1
-rw-r--r-- libcxx/cmake/caches/Generic-no-threads.cmake | 1
-rw-r--r-- libcxx/cmake/caches/Generic-no-unicode.cmake | 1
-rw-r--r-- libcxx/cmake/caches/Generic-no-wide-characters.cmake | 1
-rw-r--r-- libcxx/docs/BuildingLibcxx.rst | 2
-rw-r--r-- libcxx/docs/ReleaseNotes/19.rst | 2
-rw-r--r-- libcxx/docs/Status/Cxx2cIssues.csv | 2
-rw-r--r-- libcxx/include/__algorithm/find.h | 20
-rw-r--r-- libcxx/include/__algorithm/ranges_find.h | 4
-rw-r--r-- libcxx/include/__format/escaped_output_table.h | 1821
-rw-r--r-- libcxx/include/__ranges/to.h | 3
-rw-r--r-- libcxx/include/__string/char_traits.h | 4
-rw-r--r-- libcxx/include/__string/constexpr_c_functions.h | 2
-rw-r--r-- libcxx/include/__type_traits/datasizeof.h | 47
-rw-r--r-- libcxx/include/__utility/no_destroy.h | 29
-rw-r--r-- libcxx/include/format | 2
-rw-r--r-- libcxx/modules/std/ranges.inc | 11
-rw-r--r-- libcxx/test/libcxx/transitive_includes/cxx20.csv | 1
-rw-r--r-- libcxx/test/libcxx/type_traits/datasizeof.compile.pass.cpp | 14
-rw-r--r-- libcxx/test/libcxx/utilities/expected/expected.expected/no_unique_address.compile.pass.cpp | 22
-rw-r--r-- libcxx/test/libcxx/utilities/expected/expected.void/no_unique_address.compile.pass.cpp | 18
-rw-r--r-- libcxx/test/libcxx/utilities/format/format.string/format.string.std/escaped_output.pass.cpp | 102
-rw-r--r-- libcxx/test/libcxx/utilities/no_destroy.pass.cpp | 31
-rw-r--r-- libcxx/test/std/containers/sequences/array/size_and_alignment.compile.pass.cpp | 2
-rw-r--r-- libcxx/test/std/language.support/support.dynamic/hardware_inference_size.compile.pass.cpp | 2
-rw-r--r-- libcxx/test/std/ranges/range.utility/range.utility.conv/to.pass.cpp | 5
-rwxr-xr-x libcxx/utils/generate_escaped_output_table.py | 86
-rw-r--r-- lld/COFF/MinGW.cpp | 1
-rw-r--r-- lld/ELF/Arch/RISCV.cpp | 4
-rw-r--r-- lld/test/ELF/riscv-branch.s | 16
-rw-r--r-- lld/test/ELF/riscv-call.s | 16
-rw-r--r-- lld/test/ELF/riscv-hi20-lo12.s | 24
-rw-r--r-- lld/test/ELF/riscv-jal.s | 16
-rw-r--r-- lld/test/wasm/init-fini.ll | 2
-rw-r--r-- lldb/bindings/interface/SBValueDocstrings.i | 20
-rw-r--r-- lldb/docs/resources/lldbgdbremote.md | 732
-rw-r--r-- lldb/docs/resources/lldbplatformpackets.md | 513
-rw-r--r-- lldb/include/lldb/API/SBDebugger.h | 7
-rw-r--r-- lldb/include/lldb/API/SBValue.h | 2
-rw-r--r-- lldb/include/lldb/Utility/ProcessInfo.h | 2
-rw-r--r-- lldb/source/API/SBValue.cpp | 19
-rw-r--r-- lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCRuntimeV2.cpp | 6
-rw-r--r-- lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp | 1
-rw-r--r-- lldb/source/Plugins/SymbolFile/DWARF/DWARFCompileUnit.h | 2
-rw-r--r-- lldb/source/Plugins/SymbolFile/DWARF/DWARFTypeUnit.h | 6
-rw-r--r-- lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.cpp | 137
-rw-r--r-- lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.h | 68
-rw-r--r-- lldb/test/API/clear-sbvalue-nonaddressable-bits/Makefile | 3
-rw-r--r-- lldb/test/API/clear-sbvalue-nonaddressable-bits/TestClearSBValueNonAddressableBits.py | 59
-rw-r--r-- lldb/test/API/clear-sbvalue-nonaddressable-bits/main.c | 27
-rw-r--r-- lldb/test/API/functionalities/diagnostic_reporting/TestDiagnosticReporting.py | 2
-rw-r--r-- lldb/test/API/functionalities/fork/concurrent_vfork/TestConcurrentVFork.py | 16
-rw-r--r-- lldb/test/API/functionalities/progress_reporting/TestProgressReporting.py | 2
-rw-r--r-- lldb/test/API/functionalities/progress_reporting/clang_modules/TestClangModuleBuildProgress.py | 2
-rw-r--r-- lldb/test/API/macosx/rosetta/TestRosetta.py | 2
-rw-r--r-- lldb/test/Shell/Unwind/eh-frame-dwarf-unwind-abort.test | 8
-rw-r--r-- lldb/tools/lldb-dap/lldb-dap.cpp | 4
-rw-r--r-- lldb/unittests/Host/HostTest.cpp | 21
-rw-r--r-- llvm/cmake/modules/llvm-driver-template.cpp.in | 2
-rw-r--r-- llvm/docs/CommandGuide/llvm-objcopy.rst | 6
-rw-r--r-- llvm/docs/ConvergenceAndUniformity.rst | 122
-rw-r--r-- llvm/docs/ConvergentOperations.rst | 3
-rw-r--r-- llvm/docs/GlobalISel/MIRPatterns.rst | 37
-rw-r--r-- llvm/docs/LangRef.rst | 53
-rw-r--r-- llvm/docs/ReleaseNotes.rst | 11
-rw-r--r-- llvm/include/llvm-c/DebugInfo.h | 13
-rw-r--r-- llvm/include/llvm/ADT/StringRef.h | 6
-rw-r--r-- llvm/include/llvm/Analysis/ValueTracking.h | 3
-rw-r--r-- llvm/include/llvm/BinaryFormat/Dwarf.def | 101
-rw-r--r-- llvm/include/llvm/BinaryFormat/Dwarf.h | 314
-rw-r--r-- llvm/include/llvm/CodeGen/BasicTTIImpl.h | 4
-rw-r--r-- llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h | 3
-rw-r--r-- llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h | 4
-rw-r--r-- llvm/include/llvm/CodeGen/GlobalISel/Utils.h | 26
-rw-r--r-- llvm/include/llvm/CodeGen/ISDOpcodes.h | 1
-rw-r--r-- llvm/include/llvm/CodeGen/TargetLowering.h | 7
-rw-r--r-- llvm/include/llvm/Frontend/OpenMP/OMPKinds.def | 3
-rw-r--r-- llvm/include/llvm/IR/Function.h | 4
-rw-r--r-- llvm/include/llvm/IR/GenericConvergenceVerifierImpl.h | 2
-rw-r--r-- llvm/include/llvm/IR/IntrinsicInst.h | 3
-rw-r--r-- llvm/include/llvm/IR/Intrinsics.td | 32
-rw-r--r-- llvm/include/llvm/IR/IntrinsicsAArch64.td | 10
-rw-r--r-- llvm/include/llvm/IR/PatternMatch.h | 2
-rw-r--r-- llvm/include/llvm/Object/ELFObjectFile.h | 8
-rw-r--r-- llvm/include/llvm/ProfileData/MemProf.h | 97
-rw-r--r-- llvm/include/llvm/ProfileData/MemProfReader.h | 14
-rw-r--r-- llvm/include/llvm/Support/RISCVISAUtils.h | 7
-rw-r--r-- llvm/include/llvm/Support/YAMLTraits.h | 21
-rw-r--r-- llvm/include/llvm/Target/GlobalISel/Combine.td | 33
-rw-r--r-- llvm/include/llvm/TargetParser/AArch64TargetParser.h | 25
-rw-r--r-- llvm/include/llvm/TargetParser/RISCVISAInfo.h | 14
-rw-r--r-- llvm/include/llvm/Transforms/InstCombine/InstCombiner.h | 7
-rw-r--r-- llvm/include/llvm/Transforms/Scalar/GVN.h | 111
-rw-r--r-- llvm/include/llvm/Transforms/Scalar/JumpThreading.h | 10
-rw-r--r-- llvm/include/llvm/Transforms/Utils/GlobalStatus.h | 6
-rw-r--r-- llvm/include/llvm/Transforms/Utils/Local.h | 5
-rw-r--r-- llvm/include/llvm/Transforms/Utils/MemoryTaggingSupport.h | 2
-rw-r--r-- llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h | 13
-rw-r--r-- llvm/lib/Analysis/InstructionSimplify.cpp | 6
-rw-r--r-- llvm/lib/Analysis/ValueTracking.cpp | 8
-rw-r--r-- llvm/lib/BinaryFormat/Dwarf.cpp | 10
-rw-r--r-- llvm/lib/CodeGen/CodeGenPrepare.cpp | 2
-rw-r--r-- llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp | 30
-rw-r--r-- llvm/lib/CodeGen/GlobalISel/CombinerHelperVectorOps.cpp | 25
-rw-r--r-- llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp | 14
-rw-r--r-- llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp | 12
-rw-r--r-- llvm/lib/CodeGen/GlobalISel/Utils.cpp | 83
-rw-r--r-- llvm/lib/CodeGen/InterleavedAccessPass.cpp | 4
-rw-r--r-- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 67
-rw-r--r-- llvm/lib/CodeGen/SelectionDAG/FastISel.cpp | 2
-rw-r--r-- llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 23
-rw-r--r-- llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp | 39
-rw-r--r-- llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h | 2
-rw-r--r-- llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp | 7
-rw-r--r-- llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 17
-rw-r--r-- llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp | 8
-rw-r--r-- llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp | 58
-rw-r--r-- llvm/lib/IR/AutoUpgrade.cpp | 31
-rw-r--r-- llvm/lib/IR/Function.cpp | 4
-rw-r--r-- llvm/lib/IR/IRBuilder.cpp | 6
-rw-r--r-- llvm/lib/IR/Instructions.cpp | 2
-rw-r--r-- llvm/lib/IR/IntrinsicInst.cpp | 5
-rw-r--r-- llvm/lib/IR/Verifier.cpp | 2
-rw-r--r-- llvm/lib/LTO/LTO.cpp | 4
-rw-r--r-- llvm/lib/MCA/InstrBuilder.cpp | 3
-rw-r--r-- llvm/lib/Passes/PassBuilderPipelines.cpp | 11
-rw-r--r-- llvm/lib/ProfileData/InstrProfReader.cpp | 62
-rw-r--r-- llvm/lib/ProfileData/InstrProfWriter.cpp | 4
-rw-r--r-- llvm/lib/Support/SuffixTree.cpp | 4
-rw-r--r-- llvm/lib/Support/YAMLTraits.cpp | 80
-rw-r--r-- llvm/lib/Target/AArch64/AArch64.td | 2
-rw-r--r-- llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp | 7
-rw-r--r-- llvm/lib/Target/AArch64/AArch64Features.td | 6
-rw-r--r-- llvm/lib/Target/AArch64/AArch64FrameLowering.cpp | 2
-rw-r--r-- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 72
-rw-r--r-- llvm/lib/Target/AArch64/AArch64ISelLowering.h | 6
-rw-r--r-- llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 15
-rw-r--r-- llvm/lib/Target/AArch64/AArch64InstrInfo.td | 8
-rw-r--r-- llvm/lib/Target/AArch64/AArch64Processors.td | 45
-rw-r--r-- llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td | 4
-rw-r--r-- llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td | 4
-rw-r--r-- llvm/lib/Target/AArch64/AArch64Subtarget.cpp | 2
-rw-r--r-- llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp | 5
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 18
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h | 6
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 2
-rw-r--r-- llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 58
-rw-r--r-- llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp | 8
-rw-r--r-- llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp | 5
-rw-r--r-- llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 10
-rw-r--r-- llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h | 2
-rw-r--r-- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 6
-rw-r--r-- llvm/lib/Target/AMDGPU/SIInstrInfo.td | 55
-rw-r--r-- llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp | 2
-rw-r--r-- llvm/lib/Target/AMDGPU/VOP3Instructions.td | 46
-rw-r--r-- llvm/lib/Target/AMDGPU/VOPInstructions.td | 23
-rw-r--r-- llvm/lib/Target/ARM/ARMISelLowering.cpp | 14
-rw-r--r-- llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp | 2
-rw-r--r-- llvm/lib/Target/BPF/BTFDebug.cpp | 18
-rw-r--r-- llvm/lib/Target/BPF/BTFDebug.h | 4
-rw-r--r-- llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp | 4
-rw-r--r-- llvm/lib/Target/Hexagon/CMakeLists.txt | 1
-rw-r--r-- llvm/lib/Target/Hexagon/HexagonCopyHoisting.cpp | 272
-rw-r--r-- llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp | 187
-rw-r--r-- llvm/lib/Target/LoongArch/LoongArchSubtarget.h | 38
-rw-r--r-- llvm/lib/Target/M68k/M68kExpandPseudo.cpp | 7
-rw-r--r-- llvm/lib/Target/M68k/M68kInstrData.td | 41
-rw-r--r-- llvm/lib/Target/M68k/M68kInstrInfo.cpp | 34
-rw-r--r-- llvm/lib/Target/M68k/M68kInstrInfo.h | 3
-rw-r--r-- llvm/lib/Target/Mips/Mips32r6InstrInfo.td | 16
-rw-r--r-- llvm/lib/Target/Mips/MipsISelLowering.cpp | 9
-rw-r--r-- llvm/lib/Target/PowerPC/PPCMIPeephole.cpp | 14
-rw-r--r-- llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp | 266
-rw-r--r-- llvm/lib/Target/RISCV/RISCV.td | 6
-rw-r--r-- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 61
-rw-r--r-- llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp | 138
-rw-r--r-- llvm/lib/Target/RISCV/RISCVInstrInfo.cpp | 271
-rw-r--r-- llvm/lib/Target/RISCV/RISCVInstrInfo.h | 14
-rw-r--r-- llvm/lib/Target/RISCV/RISCVInstrInfoC.td | 2
-rw-r--r-- llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td | 29
-rw-r--r-- llvm/lib/Target/RISCV/RISCVProfiles.td | 204
-rw-r--r-- llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp | 13
-rw-r--r-- llvm/lib/Target/SPIRV/SPIRVBuiltins.td | 4
-rw-r--r-- llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp | 97
-rw-r--r-- llvm/lib/Target/SPIRV/SPIRVISelLowering.cpp | 10
-rw-r--r-- llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp | 40
-rw-r--r-- llvm/lib/Target/SystemZ/SystemZISelLowering.cpp | 2
-rw-r--r-- llvm/lib/Target/WebAssembly/CMakeLists.txt | 1
-rw-r--r-- llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTypeUtilities.cpp | 28
-rw-r--r-- llvm/lib/Target/WebAssembly/WebAssembly.h | 2
-rw-r--r-- llvm/lib/Target/WebAssembly/WebAssembly.td | 3
-rw-r--r-- llvm/lib/Target/WebAssembly/WebAssemblyCleanCodeAfterTrap.cpp | 80
-rw-r--r-- llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td | 4
-rw-r--r-- llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h | 2
-rw-r--r-- llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp | 4
-rw-r--r-- llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h | 21
-rw-r--r-- llvm/lib/Target/X86/X86ISelLowering.cpp | 33
-rw-r--r-- llvm/lib/Target/X86/X86LowerTileCopy.cpp | 73
-rw-r--r-- llvm/lib/Target/X86/X86RegisterInfo.cpp | 9
-rw-r--r-- llvm/lib/Target/X86/X86Subtarget.h | 3
-rw-r--r-- llvm/lib/TargetParser/Host.cpp | 3
-rw-r--r-- llvm/lib/TargetParser/RISCVISAInfo.cpp | 123
-rw-r--r-- llvm/lib/Transforms/IPO/GlobalOpt.cpp | 7
-rw-r--r-- llvm/lib/Transforms/IPO/SCCP.cpp | 27
-rw-r--r-- llvm/lib/Transforms/IPO/SampleProfile.cpp | 25
-rw-r--r-- llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp | 35
-rw-r--r-- llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp | 20
-rw-r--r-- llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp | 2
-rw-r--r-- llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp | 4
-rw-r--r-- llvm/lib/Transforms/InstCombine/InstCombineInternal.h | 5
-rw-r--r-- llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp | 2
-rw-r--r-- llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp | 4
-rw-r--r-- llvm/lib/Transforms/InstCombine/InstructionCombining.cpp | 38
-rw-r--r-- llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp | 31
-rw-r--r-- llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp | 21
-rw-r--r-- llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp | 44
-rw-r--r-- llvm/lib/Transforms/Scalar/GVN.cpp | 135
-rw-r--r-- llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp | 3
-rw-r--r-- llvm/lib/Transforms/Scalar/JumpThreading.cpp | 41
-rw-r--r-- llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp | 119
-rw-r--r-- llvm/lib/Transforms/Utils/BasicBlockUtils.cpp | 10
-rw-r--r-- llvm/lib/Transforms/Utils/CloneFunction.cpp | 3
-rw-r--r-- llvm/lib/Transforms/Utils/GlobalStatus.cpp | 11
-rw-r--r-- llvm/lib/Transforms/Utils/Local.cpp | 24
-rw-r--r-- llvm/lib/Transforms/Utils/LoopUtils.cpp | 6
-rw-r--r-- llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp | 33
-rw-r--r-- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 59
-rw-r--r-- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 697
-rw-r--r-- llvm/lib/Transforms/Vectorize/VPlan.h | 7
-rw-r--r-- llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 30
-rw-r--r-- llvm/test/Analysis/CostModel/AArch64/getIntrinsicInstrCost-vector-reverse.ll | 96
-rw-r--r-- llvm/test/Analysis/CostModel/AArch64/splice.ll | 172
-rw-r--r-- llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll | 656
-rw-r--r-- llvm/test/Analysis/CostModel/RISCV/arith-fp.ll | 62
-rw-r--r-- llvm/test/Analysis/CostModel/RISCV/arith-int.ll | 52
-rw-r--r-- llvm/test/Analysis/CostModel/RISCV/rvv-shuffle.ll | 250
-rw-r--r-- llvm/test/Analysis/CostModel/RISCV/splice.ll | 392
-rw-r--r-- llvm/test/Bitcode/upgrade-vector-interleave2-deinterleave2-intrinsics.ll | 46
-rw-r--r-- llvm/test/Bitcode/upgrade-vector-reverse-intrinsic.ll | 24
-rw-r--r-- llvm/test/Bitcode/upgrade-vector-splice-intrinsic.ll | 24
-rw-r--r-- llvm/test/CodeGen/AArch64/GlobalISel/combine-insert-vec-elt.mir | 110
-rw-r--r-- llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-vector-deinterleave2.ll | 4
-rw-r--r-- llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-vector-interleave2.ll | 4
-rw-r--r-- llvm/test/CodeGen/AArch64/GlobalISel/legalize-bitcast.mir | 50
-rw-r--r-- llvm/test/CodeGen/AArch64/arm64ec-entry-thunks.ll | 6
-rw-r--r-- llvm/test/CodeGen/AArch64/arm64ec-exit-thunks.ll | 10
-rw-r--r-- llvm/test/CodeGen/AArch64/combine-mul.ll | 7
-rw-r--r-- llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-contract.ll | 52
-rw-r--r-- llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-fast.ll | 52
-rw-r--r-- llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add-scalable.ll | 40
-rw-r--r-- llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add.ll | 30
-rw-r--r-- llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul-scalable.ll | 40
-rw-r--r-- llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-add-scalable.ll | 30
-rw-r--r-- llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-mul-scalable.ll | 30
-rw-r--r-- llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-add-scalable.ll | 30
-rw-r--r-- llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-mul-scalable.ll | 30
-rw-r--r-- llvm/test/CodeGen/AArch64/complex-deinterleaving-i16-add-scalable.ll | 40
-rw-r--r-- llvm/test/CodeGen/AArch64/complex-deinterleaving-i16-mul-scalable.ll | 40
-rw-r--r-- llvm/test/CodeGen/AArch64/complex-deinterleaving-i32-add-scalable.ll | 30
-rw-r--r-- llvm/test/CodeGen/AArch64/complex-deinterleaving-i32-mul-scalable.ll | 30
-rw-r--r-- llvm/test/CodeGen/AArch64/complex-deinterleaving-i64-add-scalable.ll | 30
-rw-r--r-- llvm/test/CodeGen/AArch64/complex-deinterleaving-i64-mul-scalable.ll | 36
-rw-r--r-- llvm/test/CodeGen/AArch64/complex-deinterleaving-i8-add-scalable.ll | 30
-rw-r--r-- llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll | 26
-rw-r--r-- llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll | 20
-rw-r--r-- llvm/test/CodeGen/AArch64/complex-deinterleaving-splat-scalable.ll | 16
-rw-r--r-- llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll | 40
-rw-r--r-- llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll | 40
-rw-r--r-- llvm/test/CodeGen/AArch64/fpmode.ll | 43
-rw-r--r-- llvm/test/CodeGen/AArch64/machine-combiner-subadd2.mir | 27
-rw-r--r-- llvm/test/CodeGen/AArch64/named-vector-shuffle-reverse-neon.ll | 52
-rw-r--r-- llvm/test/CodeGen/AArch64/named-vector-shuffle-reverse-sve.ll | 84
-rw-r--r-- llvm/test/CodeGen/AArch64/named-vector-shuffles-neon.ll | 30
-rw-r--r-- llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll | 164
-rw-r--r-- llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll | 76
-rw-r--r-- llvm/test/CodeGen/AArch64/sve-vector-interleave.ll | 76
-rw-r--r-- llvm/test/CodeGen/AArch64/sve2-bsl.ll | 52
-rw-r--r-- llvm/test/CodeGen/AArch64/sve2-intrinsics-while-reversed.ll | 128
-rw-r--r-- llvm/test/CodeGen/AMDGPU/amdpal-callable.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/amdpal-cs.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/amdpal-es.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/amdpal-gs.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/amdpal-hs.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/amdpal-ls.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/amdpal-msgpack-cs.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/amdpal-msgpack-default.ll | 44
-rw-r--r-- llvm/test/CodeGen/AMDGPU/amdpal-msgpack-denormal.ll | 44
-rw-r--r-- llvm/test/CodeGen/AMDGPU/amdpal-msgpack-dx10-clamp.ll | 44
-rw-r--r-- llvm/test/CodeGen/AMDGPU/amdpal-msgpack-es.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/amdpal-msgpack-gs.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/amdpal-msgpack-hs.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/amdpal-msgpack-ieee.ll | 58
-rw-r--r-- llvm/test/CodeGen/AMDGPU/amdpal-msgpack-ls.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/amdpal-msgpack-ps.ll | 8
-rw-r--r-- llvm/test/CodeGen/AMDGPU/amdpal-msgpack-psenable.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/amdpal-msgpack-vs.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/amdpal-psenable.ll | 8
-rw-r--r-- llvm/test/CodeGen/AMDGPU/amdpal-usersgpr-init.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/amdpal-vs.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/div_i128.ll | 96
-rw-r--r-- llvm/test/CodeGen/AMDGPU/extra-lds-size.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/fp_trunc_store_bf16.ll | 426
-rw-r--r-- llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx10.mir | 34
-rw-r--r-- llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx11.mir | 40
-rw-r--r-- llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx12.mir | 40
-rw-r--r-- llvm/test/CodeGen/AMDGPU/hard-clauses.mir | 235
-rw-r--r-- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll | 8
-rw-r--r-- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll | 12
-rw-r--r-- llvm/test/CodeGen/AMDGPU/pal-userdata-regs.ll | 12
-rw-r--r-- llvm/test/CodeGen/AMDGPU/rem_i128.ll | 475
-rw-r--r-- llvm/test/CodeGen/AMDGPU/wave_dispatch_regs.ll | 8
-rw-r--r-- llvm/test/CodeGen/ARM/minnum-maxnum-intrinsics.ll | 28
-rw-r--r-- llvm/test/CodeGen/BPF/CORE/offset-reloc-basic.ll | 4
-rw-r--r-- llvm/test/CodeGen/Hexagon/hexagon-copy-hoisting.mir | 53
-rw-r--r-- llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insertelement.ll | 42
-rw-r--r-- llvm/test/CodeGen/LoongArch/lsx/ir-instruction/insertelement.ll | 42
-rw-r--r-- llvm/test/CodeGen/LoongArch/sextw-removal.ll | 921
-rw-r--r-- llvm/test/CodeGen/M68k/Arith/add-with-overflow.ll | 2
-rw-r--r-- llvm/test/CodeGen/M68k/Arith/add.ll | 4
-rw-r--r-- llvm/test/CodeGen/M68k/Arith/bitwise.ll | 6
-rw-r--r-- llvm/test/CodeGen/M68k/Arith/divide-by-constant.ll | 14
-rw-r--r-- llvm/test/CodeGen/M68k/Arith/imul.ll | 16
-rw-r--r-- llvm/test/CodeGen/M68k/Arith/smul-with-overflow.ll | 12
-rw-r--r-- llvm/test/CodeGen/M68k/Arith/sub-with-overflow.ll | 10
-rw-r--r-- llvm/test/CodeGen/M68k/Arith/sub.ll | 2
-rw-r--r-- llvm/test/CodeGen/M68k/Arith/umul-with-overflow.ll | 6
-rw-r--r-- llvm/test/CodeGen/M68k/CConv/c-call.ll | 6
-rw-r--r-- llvm/test/CodeGen/M68k/CConv/fastcc-call.ll | 12
-rw-r--r-- llvm/test/CodeGen/M68k/CodeModel/medium-pie-global-access.ll | 4
-rw-r--r-- llvm/test/CodeGen/M68k/CodeModel/small-pie-global-access.ll | 4
-rw-r--r-- llvm/test/CodeGen/M68k/Control/cmp.ll | 26
-rw-r--r-- llvm/test/CodeGen/M68k/Control/long-setcc.ll | 4
-rw-r--r-- llvm/test/CodeGen/M68k/Control/setcc.ll | 6
-rw-r--r-- llvm/test/CodeGen/M68k/PR57660.ll | 2
-rw-r--r-- llvm/test/CodeGen/M68k/gcc_except_table.ll | 2
-rw-r--r-- llvm/test/CodeGen/M68k/link-unlnk.ll | 4
-rw-r--r-- llvm/test/CodeGen/Mips/mipsr6-minmaxnum.ll | 69
-rw-r--r-- llvm/test/CodeGen/Mips/msa/f16-llvm-ir.ll | 487
-rw-r--r-- llvm/test/CodeGen/PowerPC/fminimum-fmaximum-f128.ll | 97
-rw-r--r-- llvm/test/CodeGen/PowerPC/fminimum-fmaximum.ll | 847
-rw-r--r-- llvm/test/CodeGen/PowerPC/peephole-counter-XToI.mir | 85
-rw-r--r-- llvm/test/CodeGen/PowerPC/peephole-counter-perOp.mir | 74
-rw-r--r-- llvm/test/CodeGen/RISCV/alu64.ll | 2
-rw-r--r-- llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll | 16
-rw-r--r-- llvm/test/CodeGen/RISCV/attributes.ll | 25
-rw-r--r-- llvm/test/CodeGen/RISCV/bfloat-convert.ll | 108
-rw-r--r-- llvm/test/CodeGen/RISCV/double-convert.ll | 111
-rw-r--r-- llvm/test/CodeGen/RISCV/double-round-conv-sat.ll | 138
-rw-r--r-- llvm/test/CodeGen/RISCV/float-convert.ll | 158
-rw-r--r-- llvm/test/CodeGen/RISCV/float-round-conv-sat.ll | 288
-rw-r--r-- llvm/test/CodeGen/RISCV/forced-atomics.ll | 4
-rw-r--r-- llvm/test/CodeGen/RISCV/fpclamptosat.ll | 4
-rw-r--r-- llvm/test/CodeGen/RISCV/half-convert.ll | 268
-rw-r--r-- llvm/test/CodeGen/RISCV/half-round-conv-sat.ll | 624
-rw-r--r-- llvm/test/CodeGen/RISCV/iabs.ll | 4
-rw-r--r-- llvm/test/CodeGen/RISCV/pr84653_pr85190.ll | 95
-rw-r--r-- llvm/test/CodeGen/RISCV/rvv/commutable.ll | 173
-rw-r--r-- llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll | 46
-rw-r--r-- llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleave-store.ll | 42
-rw-r--r-- llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse-bitrotate.ll | 4
-rw-r--r-- llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll | 188
-rw-r--r-- llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll | 521
-rw-r--r-- llvm/test/CodeGen/RISCV/rvv/named-vector-shuffle-reverse.ll | 184
-rw-r--r-- llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll | 44
-rw-r--r-- llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll | 54
-rw-r--r-- llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll | 76
-rw-r--r-- llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll | 40
-rw-r--r-- llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll | 50
-rw-r--r-- llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll | 80
-rw-r--r-- llvm/test/CodeGen/RISCV/rvv/vector-reassociations.ll | 27
-rw-r--r-- llvm/test/CodeGen/RISCV/rvv/vector-splice.ll | 412
-rw-r--r-- llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.ll | 120
-rw-r--r-- llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.mir | 13
-rw-r--r-- llvm/test/CodeGen/RISCV/select.ll | 110
-rw-r--r-- llvm/test/CodeGen/SPIRV/printf.ll | 40
-rw-r--r-- llvm/test/CodeGen/SPIRV/transcoding/spirv-event-null.ll | 33
-rw-r--r-- llvm/test/CodeGen/SystemZ/atomic-load-08.ll | 48
-rw-r--r-- llvm/test/CodeGen/SystemZ/atomic-store-08.ll | 51
-rw-r--r-- llvm/test/CodeGen/SystemZ/atomicrmw-xchg-07.ll | 42
-rw-r--r-- llvm/test/CodeGen/WebAssembly/unreachable.ll | 11
-rw-r--r-- llvm/test/CodeGen/X86/AMX/amx-lower-tile-copy.ll | 10
-rw-r--r-- llvm/test/CodeGen/X86/abdu-vector-128.ll | 54
-rw-r--r-- llvm/test/CodeGen/X86/apx/kmov-postrapseudos.ll | 10
-rw-r--r-- llvm/test/CodeGen/X86/avgceils.ll | 238
-rw-r--r-- llvm/test/CodeGen/X86/avgceilu.ll | 138
-rw-r--r-- llvm/test/CodeGen/X86/avgfloors.ll | 238
-rw-r--r-- llvm/test/CodeGen/X86/avgflooru.ll | 138
-rw-r--r-- llvm/test/CodeGen/X86/avx512-broadcast-arith.ll | 10
-rw-r--r-- llvm/test/CodeGen/X86/avx512bwvl-arith.ll | 35
-rw-r--r-- llvm/test/CodeGen/X86/cmp.ll | 570
-rw-r--r-- llvm/test/CodeGen/X86/combine-mul.ll | 8
-rw-r--r-- llvm/test/CodeGen/X86/combine-or-shuffle.ll | 862
-rw-r--r-- llvm/test/CodeGen/X86/combine-or.ll | 606
-rw-r--r-- llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll | 377
-rw-r--r-- llvm/test/CodeGen/X86/fold-masked-merge.ll | 4
-rw-r--r-- llvm/test/CodeGen/X86/freeze-binary.ll | 50
-rw-r--r-- llvm/test/CodeGen/X86/freeze-combine.ll | 16
-rw-r--r-- llvm/test/CodeGen/X86/freeze-vector.ll | 20
-rw-r--r-- llvm/test/CodeGen/X86/gfni-funnel-shifts.ll | 4
-rw-r--r-- llvm/test/CodeGen/X86/gfni-rotates.ll | 14
-rw-r--r-- llvm/test/CodeGen/X86/known-never-zero.ll | 30
-rw-r--r-- llvm/test/CodeGen/X86/memcmp-minsize-x32.ll | 4
-rw-r--r-- llvm/test/CodeGen/X86/memcmp-minsize.ll | 6
-rw-r--r-- llvm/test/CodeGen/X86/memcmp-optsize-x32.ll | 4
-rw-r--r-- llvm/test/CodeGen/X86/memcmp-optsize.ll | 6
-rw-r--r-- llvm/test/CodeGen/X86/memcmp-pgso-x32.ll | 4
-rw-r--r-- llvm/test/CodeGen/X86/memcmp-pgso.ll | 6
-rw-r--r-- llvm/test/CodeGen/X86/midpoint-int-vec-128.ll | 94
-rw-r--r-- llvm/test/CodeGen/X86/midpoint-int-vec-256.ll | 32
-rw-r--r-- llvm/test/CodeGen/X86/midpoint-int-vec-512.ll | 80
-rw-r--r-- llvm/test/CodeGen/X86/named-vector-shuffle-reverse.ll | 38
-rw-r--r-- llvm/test/CodeGen/X86/pr38539.ll | 202
-rw-r--r-- llvm/test/CodeGen/X86/pr62286.ll | 31
-rw-r--r-- llvm/test/CodeGen/X86/scheduler-backtracking.ll | 2
-rw-r--r-- llvm/test/CodeGen/X86/sdiv_fix_sat.ll | 250
-rw-r--r-- llvm/test/CodeGen/X86/setcc-non-simple-type.ll | 78
-rw-r--r-- llvm/test/CodeGen/X86/subreg-to-reg-1.ll | 9
-rw-r--r-- llvm/test/CodeGen/X86/subreg-to-reg-3.ll | 8
-rw-r--r-- llvm/test/CodeGen/X86/subreg-to-reg-6.ll | 22
-rw-r--r-- llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll | 56
-rw-r--r-- llvm/test/CodeGen/X86/vec_saddo.ll | 14
-rw-r--r-- llvm/test/CodeGen/X86/vec_ssubo.ll | 14
-rw-r--r-- llvm/test/CodeGen/X86/vec_uaddo.ll | 14
-rw-r--r-- llvm/test/CodeGen/X86/vec_usubo.ll | 14
-rw-r--r-- llvm/test/CodeGen/X86/vector-bo-select.ll | 10
-rw-r--r-- llvm/test/CodeGen/X86/vector-fshr-128.ll | 86
-rw-r--r-- llvm/test/CodeGen/X86/vector-fshr-256.ll | 28
-rw-r--r-- llvm/test/CodeGen/X86/vector-fshr-sub128.ll | 12
-rw-r--r-- llvm/test/CodeGen/X86/vector-shift-shl-128.ll | 12
-rw-r--r-- llvm/test/Instrumentation/MemorySanitizer/vscale.ll | 188
-rw-r--r-- llvm/test/MC/AArch64/SME2p1/fadd-diagnostics.s | 2
-rw-r--r-- llvm/test/MC/AArch64/SME2p1/fadd.s | 110
-rw-r--r-- llvm/test/MC/AArch64/SME2p1/fcvt.s | 20
-rw-r--r-- llvm/test/MC/AArch64/SME2p1/fcvtl-diagnostics.s | 2
-rw-r--r-- llvm/test/MC/AArch64/SME2p1/fcvtl.s | 20
-rw-r--r-- llvm/test/MC/AArch64/SME2p1/fmla-diagnostics.s | 2
-rw-r--r-- llvm/test/MC/AArch64/SME2p1/fmla.s | 300
-rw-r--r-- llvm/test/MC/AArch64/SME2p1/fmls-diagnostics.s | 2
-rw-r--r-- llvm/test/MC/AArch64/SME2p1/fmls.s | 300
-rw-r--r-- llvm/test/MC/AArch64/SME2p1/fmopa-diagnostics.s | 2
-rw-r--r-- llvm/test/MC/AArch64/SME2p1/fmopa.s | 36
-rw-r--r-- llvm/test/MC/AArch64/SME2p1/fmops-diagnostics.s | 2
-rw-r--r-- llvm/test/MC/AArch64/SME2p1/fmops.s | 36
-rw-r--r-- llvm/test/MC/AArch64/SME2p1/fsub-diagnostics.s | 3
-rw-r--r-- llvm/test/MC/AArch64/SME2p1/fsub.s | 108
-rw-r--r-- llvm/test/MC/AMDGPU/ds-err.s | 8
-rw-r--r-- llvm/test/MC/AMDGPU/gfx11_asm_err.s | 4
-rw-r--r-- llvm/test/MC/AMDGPU/gfx12_asm_vop3.s | 24
-rw-r--r-- llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s | 24
-rw-r--r-- llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s | 24
-rw-r--r-- llvm/test/MC/AMDGPU/gfx12_asm_vop3_err.s | 5
-rw-r--r-- llvm/test/MC/AMDGPU/pal-msgpack.s | 16
-rw-r--r-- llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt | 18
-rw-r--r-- llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt | 18
-rw-r--r-- llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt | 18
-rw-r--r-- llvm/test/MC/Disassembler/M68k/data.txt | 6
-rw-r--r-- llvm/test/MC/M68k/Data/Classes/MxMove_RI.s | 3
-rw-r--r-- llvm/test/MC/RISCV/XTHeadVdot-valid.s | 28
-rw-r--r-- llvm/test/MC/RISCV/align.s | 8
-rw-r--r-- llvm/test/MC/RISCV/compress-cjal.s | 2
-rw-r--r-- llvm/test/MC/RISCV/compress-rv32d.s | 8
-rw-r--r-- llvm/test/MC/RISCV/compress-rv32f.s | 8
-rw-r--r-- llvm/test/MC/RISCV/compress-rv32i.s | 70
-rw-r--r-- llvm/test/MC/RISCV/compress-rv64i.s | 16
-rw-r--r-- llvm/test/MC/RISCV/large-instructions.s | 29
-rw-r--r-- llvm/test/MC/RISCV/nop-slide.s | 22
-rw-r--r-- llvm/test/MC/RISCV/option-pushpop.s | 8
-rw-r--r-- llvm/test/MC/RISCV/option-rvc.s | 20
-rw-r--r-- llvm/test/MC/RISCV/rv32e-invalid.s | 64
-rw-r--r-- llvm/test/MC/RISCV/rvv/add.s | 118
-rw-r--r-- llvm/test/MC/RISCV/rvv/and.s | 12
-rw-r--r-- llvm/test/MC/RISCV/rvv/clip.s | 24
-rw-r--r-- llvm/test/MC/RISCV/rvv/compare.s | 166
-rw-r--r-- llvm/test/MC/RISCV/rvv/convert.s | 86
-rw-r--r-- llvm/test/MC/RISCV/rvv/div.s | 32
-rw-r--r-- llvm/test/MC/RISCV/rvv/ext.s | 24
-rw-r--r-- llvm/test/MC/RISCV/rvv/fadd.s | 24
-rw-r--r-- llvm/test/MC/RISCV/rvv/fcompare.s | 50
-rw-r--r-- llvm/test/MC/RISCV/rvv/fdiv.s | 12
-rw-r--r-- llvm/test/MC/RISCV/rvv/fmacc.s | 96
-rw-r--r-- llvm/test/MC/RISCV/rvv/fminmax.s | 16
-rw-r--r-- llvm/test/MC/RISCV/rvv/fmul.s | 16
-rw-r--r-- llvm/test/MC/RISCV/rvv/fmv.s | 6
-rw-r--r-- llvm/test/MC/RISCV/rvv/fothers.s | 26
-rw-r--r-- llvm/test/MC/RISCV/rvv/freduction.s | 26
-rw-r--r-- llvm/test/MC/RISCV/rvv/fsub.s | 28
-rw-r--r-- llvm/test/MC/RISCV/rvv/load.s | 128
-rw-r--r-- llvm/test/MC/RISCV/rvv/macc.s | 60
-rw-r--r-- llvm/test/MC/RISCV/rvv/mask.s | 52
-rw-r--r-- llvm/test/MC/RISCV/rvv/minmax.s | 32
-rw-r--r-- llvm/test/MC/RISCV/rvv/mul.s | 64
-rw-r--r-- llvm/test/MC/RISCV/rvv/mv.s | 18
-rw-r--r-- llvm/test/MC/RISCV/rvv/or.s | 12
-rw-r--r-- llvm/test/MC/RISCV/rvv/others.s | 48
-rw-r--r-- llvm/test/MC/RISCV/rvv/reduction.s | 42
-rw-r--r-- llvm/test/MC/RISCV/rvv/shift.s | 86
-rw-r--r-- llvm/test/MC/RISCV/rvv/sign-injection.s | 24
-rw-r--r-- llvm/test/MC/RISCV/rvv/snippet.s | 24
-rw-r--r-- llvm/test/MC/RISCV/rvv/store.s | 84
-rw-r--r-- llvm/test/MC/RISCV/rvv/sub.s | 100
-rw-r--r-- llvm/test/MC/RISCV/rvv/vsetvl.s | 50
-rw-r--r-- llvm/test/MC/RISCV/rvv/xor.s | 16
-rw-r--r-- llvm/test/MC/RISCV/rvv/xsfvcp.s | 56
-rw-r--r-- llvm/test/MC/RISCV/rvv/xsfvfnrclip.s | 8
-rw-r--r-- llvm/test/MC/RISCV/rvv/xsfvfwmacc.s | 2
-rw-r--r-- llvm/test/MC/RISCV/rvv/xsfvqmacc.s | 16
-rw-r--r-- llvm/test/MC/RISCV/rvv/zvbb.s | 14
-rw-r--r-- llvm/test/MC/RISCV/rvv/zvbc.s | 8
-rw-r--r-- llvm/test/MC/RISCV/rvv/zvfbfmin.s | 8
-rw-r--r-- llvm/test/MC/RISCV/rvv/zvfbfwma.s | 16
-rw-r--r-- llvm/test/MC/RISCV/rvv/zvkb.s | 18
-rw-r--r-- llvm/test/MC/RISCV/rvv/zvkg.s | 4
-rw-r--r-- llvm/test/MC/RISCV/rvv/zvkned.s | 26
-rw-r--r-- llvm/test/MC/RISCV/rvv/zvknh.s | 6
-rw-r--r-- llvm/test/MC/RISCV/rvv/zvksed.s | 8
-rw-r--r-- llvm/test/MC/RISCV/rvv/zvksh.s | 6
-rw-r--r-- llvm/test/MC/RISCV/rvv/zvlsseg.s | 1026
-rw-r--r-- llvm/test/MC/WebAssembly/global-ctor-dtor.ll | 12
-rw-r--r-- llvm/test/Other/new-pm-defaults.ll | 6
-rw-r--r-- llvm/test/Other/new-pm-thinlto-prelink-defaults.ll | 2
-rw-r--r-- llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll | 2
-rw-r--r-- llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll | 2
-rw-r--r-- llvm/test/TableGen/GlobalISelEmitter-frameindex.td | 29
-rw-r--r-- llvm/test/TableGen/riscv-target-def.td | 30
-rw-r--r-- llvm/test/Transforms/AtomicExpand/AMDGPU/no-expand-atomic-load.ll | 54
-rw-r--r-- llvm/test/Transforms/CallSiteSplitting/callsite-split-debug.ll | 36
-rw-r--r-- llvm/test/Transforms/CodeGenPrepare/ARM/branch-on-zero.ll | 23
-rw-r--r-- llvm/test/Transforms/CodeGenPrepare/RISCV/convert-to-eqz.ll | 80
-rw-r--r-- llvm/test/Transforms/FunctionSpecialization/discover-transitive-phis.ll | 58
-rw-r--r-- llvm/test/Transforms/FunctionSpecialization/global-var-constants.ll | 8
-rw-r--r-- llvm/test/Transforms/FunctionSpecialization/literal-const.ll | 4
-rw-r--r-- llvm/test/Transforms/GlobalOpt/basictest.ll | 20
-rw-r--r-- llvm/test/Transforms/GlobalOpt/constantfold-initializers.ll | 5
-rw-r--r-- llvm/test/Transforms/GlobalOpt/stored-once-forward-value.ll | 6
-rw-r--r-- llvm/test/Transforms/GlobalOpt/tls.ll | 12
-rw-r--r-- llvm/test/Transforms/IRCE/pr89959.ll | 33
-rw-r--r-- llvm/test/Transforms/InstCombine/array.ll | 160
-rw-r--r-- llvm/test/Transforms/InstCombine/icmp-of-trunc-ext.ll | 325
-rw-r--r-- llvm/test/Transforms/InstCombine/mul.ll | 80
-rw-r--r-- llvm/test/Transforms/InstCombine/sub.ll | 32
-rw-r--r-- llvm/test/Transforms/InstCombine/vector-reverse.ll | 292
-rw-r--r-- llvm/test/Transforms/InstSimplify/named-vector-shuffle-reverse.ll | 8
-rw-r--r-- llvm/test/Transforms/InstSimplify/select.ll | 6
-rw-r--r-- llvm/test/Transforms/InterleavedAccess/AArch64/fixed-deinterleave-intrinsics.ll | 96
-rw-r--r-- llvm/test/Transforms/InterleavedAccess/AArch64/scalable-deinterleave-intrinsics.ll | 68
-rw-r--r-- llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleaved-accesses.ll | 2
-rw-r--r-- llvm/test/Transforms/LoopUnroll/unroll-remove-redundant-dbg.ll | 45
-rw-r--r-- llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll | 889
-rw-r--r-- llvm/test/Transforms/LoopVectorize/AArch64/first-order-recurrence.ll | 6
-rw-r--r-- llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll | 969
-rw-r--r-- llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll | 292
-rw-r--r-- llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll | 482
-rw-r--r-- llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll | 8
-rw-r--r-- llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll | 341
-rw-r--r-- llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll | 48
-rw-r--r-- llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll | 16
-rw-r--r-- llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-option.ll | 24
-rw-r--r-- llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse-mask4.ll | 10
-rw-r--r-- llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll | 2
-rw-r--r-- llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll | 4
-rw-r--r-- llvm/test/Transforms/LoopVectorize/ARM/tail-folding-counting-down.ll | 3
-rw-r--r-- llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll | 2
-rw-r--r-- llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll | 8
-rw-r--r-- llvm/test/Transforms/LoopVectorize/runtime-checks-hoist.ll | 6
-rw-r--r-- llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll | 12
-rw-r--r-- llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll | 428
-rw-r--r-- llvm/test/Transforms/LowerTypeTests/cfi-nounwind-direct-call.ll | 24
-rw-r--r-- llvm/test/Transforms/OpenMP/add_attributes.ll | 11
-rw-r--r-- llvm/test/Transforms/PhaseOrdering/AArch64/quant_4x4.ll | 2
-rw-r--r-- llvm/test/Transforms/PhaseOrdering/icmp-ashr-breaking-select-idiom.ll | 4
-rw-r--r-- llvm/test/Transforms/PhaseOrdering/min_max_loop.ll | 4
-rw-r--r-- llvm/test/Transforms/SCCP/and-add-shl.ll | 4
-rw-r--r-- llvm/test/Transforms/SCCP/ip-add-range-to-call.ll | 73
-rw-r--r-- llvm/test/Transforms/SCCP/ip-ranges-casts.ll | 67
-rw-r--r-- llvm/test/Transforms/SCCP/ipsccp-basic.ll | 6
-rw-r--r-- llvm/test/Transforms/SCCP/switch.ll | 95
-rw-r--r-- llvm/test/Transforms/SCCP/trunc-nuw-nsw-flags.ll | 4
-rw-r--r-- llvm/test/Transforms/SLPVectorizer/AArch64/gather-buildvector-with-minbitwidth-user.ll | 7
-rw-r--r-- llvm/test/Transforms/SLPVectorizer/AArch64/gather-with-minbith-user.ll | 3
-rw-r--r-- llvm/test/Transforms/SLPVectorizer/AArch64/user-node-not-in-bitwidths.ll | 7
-rw-r--r-- llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll | 30
-rw-r--r-- llvm/test/Transforms/SLPVectorizer/RISCV/minbw-with-and-and-scalar-trunc.ll | 44
-rw-r--r-- llvm/test/Transforms/SLPVectorizer/RISCV/strided-stores-vectorized.ll | 65
-rw-r--r-- llvm/test/Transforms/SLPVectorizer/RISCV/unsigned-icmp-signed-op.ll | 43
-rw-r--r-- llvm/test/Transforms/SLPVectorizer/SystemZ/minbitwidth-root-trunc.ll | 6
-rw-r--r-- llvm/test/Transforms/SLPVectorizer/X86/gather-node-same-as-vect-but-order.ll | 143
-rw-r--r-- llvm/test/Transforms/SLPVectorizer/X86/gep-with-extractelement-many-users.ll | 26
-rw-r--r-- llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-node-with-multi-users.ll | 3
-rw-r--r-- llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll | 75
-rw-r--r-- llvm/test/Transforms/SampleProfile/pseudo-probe-profile.ll | 8
-rw-r--r-- llvm/test/Transforms/SimpleLoopUnswitch/endless-unswitch.ll | 93
-rw-r--r-- llvm/test/Transforms/SimpleLoopUnswitch/partial-unswitch.ll | 130
-rw-r--r-- llvm/test/Transforms/SimplifyCFG/hoist-dbgvalue-inlined.ll | 1
-rw-r--r-- llvm/test/Transforms/SimplifyCFG/hoist-dbgvalue.ll | 1
-rw-r--r-- llvm/test/Verifier/invalid-splice.ll | 14
-rw-r--r-- llvm/test/tools/llvm-mca/X86/BtVer2/skip-unsupported-instructions-none-remain.s | 14
-rw-r--r-- llvm/test/tools/llvm-mca/X86/BtVer2/unsupported-instruction.s | 55
-rw-r--r-- llvm/test/tools/llvm-rc/Inputs/dialog-with-menu.rc | 16
-rw-r--r-- llvm/test/tools/llvm-rc/dialog-with-menu.test | 32
-rw-r--r-- llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp | 2
-rw-r--r-- llvm/tools/llvm-exegesis/lib/SubprocessMemory.cpp | 7
-rw-r--r-- llvm/tools/llvm-lto2/llvm-lto2.cpp | 7
-rw-r--r-- llvm/tools/llvm-mca/CodeRegion.h | 15
-rw-r--r-- llvm/tools/llvm-mca/llvm-mca.cpp | 39
-rw-r--r-- llvm/tools/llvm-objdump/llvm-objdump.cpp | 52
-rw-r--r-- llvm/tools/llvm-rc/ResourceFileWriter.cpp | 10
-rw-r--r-- llvm/tools/llvm-rc/ResourceFileWriter.h | 5
-rw-r--r-- llvm/tools/llvm-rc/ResourceScriptParser.cpp | 7
-rw-r--r--llvm/tools/llvm-rc/ResourceScriptParser.h1
-rw-r--r--llvm/tools/llvm-rc/ResourceScriptStmt.cpp4
-rw-r--r--llvm/tools/llvm-rc/ResourceScriptStmt.h13
-rw-r--r--llvm/tools/llvm-rc/ResourceVisitor.h2
-rw-r--r--llvm/unittests/ADT/StringRefTest.cpp4
-rw-r--r--llvm/unittests/BinaryFormat/DwarfTest.cpp15
-rw-r--r--llvm/unittests/CodeGen/GlobalISel/KnownBitsTest.cpp114
-rw-r--r--llvm/unittests/Object/ELFObjectFileTest.cpp43
-rw-r--r--llvm/unittests/ProfileData/InstrProfTest.cpp70
-rw-r--r--llvm/unittests/ProfileData/MemProfTest.cpp96
-rw-r--r--llvm/unittests/Support/YAMLIOTest.cpp81
-rw-r--r--llvm/unittests/TargetParser/RISCVISAInfoTest.cpp78
-rw-r--r--llvm/unittests/TargetParser/TargetParserTest.cpp58
-rw-r--r--llvm/utils/TableGen/DAGISelMatcherEmitter.cpp2
-rw-r--r--llvm/utils/TableGen/DXILEmitter.cpp4
-rw-r--r--llvm/utils/TableGen/GlobalISelEmitter.cpp9
-rw-r--r--llvm/utils/TableGen/RISCVTargetDefEmitter.cpp76
-rw-r--r--llvm/utils/gn/secondary/llvm/lib/Target/Hexagon/BUILD.gn1
-rw-r--r--llvm/utils/gn/secondary/llvm/lib/Target/WebAssembly/BUILD.gn1
-rwxr-xr-xllvm/utils/release/test-release.sh3
-rw-r--r--mlir/include/mlir/Dialect/GPU/Transforms/Passes.td2
-rw-r--r--mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td4
-rw-r--r--mlir/include/mlir/Dialect/LLVMIR/VCIXOps.td4
-rw-r--r--mlir/include/mlir/Dialect/Linalg/IR/LinalgEnums.td12
-rw-r--r--mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml347
-rw-r--r--mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.td4
-rw-r--r--mlir/include/mlir/Dialect/OpenMP/OpenMPOpsInterfaces.td8
-rw-r--r--mlir/include/mlir/Dialect/Polynomial/IR/Polynomial.h5
-rw-r--r--mlir/include/mlir/Dialect/Polynomial/IR/Polynomial.td296
-rw-r--r--mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensor.h15
-rw-r--r--mlir/include/mlir/Dialect/Vector/Utils/VectorUtils.h2
-rw-r--r--mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td10
-rw-r--r--mlir/include/mlir/IR/BuiltinLocationAttributes.td13
-rw-r--r--mlir/include/mlir/IR/Value.h8
-rw-r--r--mlir/include/mlir/Tools/lsp-server-support/Transport.h11
-rw-r--r--mlir/lib/Conversion/ComplexToStandard/ComplexToStandard.cpp2
-rw-r--r--mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp2
-rw-r--r--mlir/lib/Dialect/Arith/IR/ArithCanonicalization.td1
-rw-r--r--mlir/lib/Dialect/ArmSME/Transforms/OuterProductFusion.cpp3
-rw-r--r--mlir/lib/Dialect/Bufferization/Transforms/OneShotAnalysis.cpp21
-rw-r--r--mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp34
-rw-r--r--mlir/lib/Dialect/GPU/Transforms/ModuleToBinary.cpp5
-rw-r--r--mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp21
-rw-r--r--mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp10
-rw-r--r--mlir/lib/Dialect/MemRef/Transforms/EmulateNarrowType.cpp32
-rw-r--r--mlir/lib/Dialect/MemRef/Transforms/ExpandStridedMetadata.cpp201
-rw-r--r--mlir/lib/Dialect/Polynomial/IR/CMakeLists.txt1
-rw-r--r--mlir/lib/Dialect/Polynomial/IR/PolynomialAttributes.cpp2
-rw-r--r--mlir/lib/Dialect/Polynomial/IR/PolynomialDialect.cpp9
-rw-r--r--mlir/lib/Dialect/Polynomial/IR/PolynomialOps.cpp95
-rw-r--r--mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp38
-rw-r--r--mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp44
-rw-r--r--mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.h2
-rw-r--r--mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorIterator.cpp65
-rw-r--r--mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorIterator.h34
-rw-r--r--mlir/lib/Dialect/Tensor/IR/TensorOps.cpp3
-rw-r--r--mlir/lib/Dialect/Vector/Transforms/VectorLinearize.cpp16
-rw-r--r--mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp4
-rw-r--r--mlir/lib/Tools/mlir-lsp-server/LSPServer.cpp2
-rw-r--r--mlir/lib/Tools/mlir-pdll-lsp-server/LSPServer.cpp2
-rw-r--r--mlir/lib/Tools/tblgen-lsp-server/LSPServer.cpp2
-rw-r--r--mlir/python/mlir/dialects/linalg/opdsl/lang/comprehension.py7
-rw-r--r--mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py114
-rw-r--r--mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir4
-rw-r--r--mlir/test/Dialect/Arith/canonicalize.mlir103
-rw-r--r--mlir/test/Dialect/ArmSME/outer-product-fusion.mlir32
-rw-r--r--mlir/test/Dialect/GPU/outlining.mlir35
-rw-r--r--mlir/test/Dialect/LLVMIR/invalid.mlir16
-rw-r--r--mlir/test/Dialect/LLVMIR/roundtrip.mlir8
-rw-r--r--mlir/test/Dialect/Linalg/generalize-named-ops.mlir201
-rw-r--r--mlir/test/Dialect/Linalg/named-ops-fail.mlir145
-rw-r--r--mlir/test/Dialect/Linalg/named-ops.mlir285
-rw-r--r--mlir/test/Dialect/MemRef/emulate-narrow-type.mlir20
-rw-r--r--mlir/test/Dialect/MemRef/expand-strided-metadata.mlir24
-rw-r--r--mlir/test/Dialect/Polynomial/ops.mlir82
-rw-r--r--mlir/test/Dialect/Polynomial/ops_errors.mlir53
-rw-r--r--mlir/test/Dialect/Polynomial/types.mlir14
-rw-r--r--mlir/test/Dialect/SparseTensor/fuse_sparse_convert_into_producer.mlir78
-rw-r--r--mlir/test/Dialect/SparseTensor/no_fold_into_consumer.mlir47
-rw-r--r--mlir/test/Dialect/Tensor/canonicalize.mlir9
-rw-r--r--mlir/unittests/Tools/lsp-server-support/Transport.cpp26
-rw-r--r--offload/CMakeLists.txt20
-rw-r--r--offload/include/Shared/Targets.def.in20
-rw-r--r--offload/plugins-nextgen/CMakeLists.txt9
-rw-r--r--offload/plugins-nextgen/common/CMakeLists.txt1
-rw-r--r--offload/plugins-nextgen/host/CMakeLists.txt40
-rw-r--r--offload/src/CMakeLists.txt22
-rw-r--r--offload/src/PluginManager.cpp27
-rw-r--r--offload/test/unified_shared_memory/api.c9
-rw-r--r--offload/test/unified_shared_memory/close_manual.c6
-rw-r--r--offload/test/unified_shared_memory/shared_update.c9
-rw-r--r--openmp/runtime/src/kmp_affinity.cpp10
-rw-r--r--openmp/runtime/src/kmp_affinity.h2
-rw-r--r--utils/bazel/llvm-project-overlay/llvm/BUILD.bazel27
-rw-r--r--utils/bazel/llvm-project-overlay/llvm/unittests/BUILD.bazel26
-rw-r--r--utils/bazel/llvm-project-overlay/mlir/BUILD.bazel1
1141 files changed, 34558 insertions, 27452 deletions
diff --git a/.ci/generate-buildkite-pipeline-premerge b/.ci/generate-buildkite-pipeline-premerge
index 81e9246de9b5..78a9cb77ff7d 100755
--- a/.ci/generate-buildkite-pipeline-premerge
+++ b/.ci/generate-buildkite-pipeline-premerge
@@ -91,7 +91,7 @@ function add-dependencies() {
echo "${project}"
case ${project} in
bolt)
- for p in lld llvm; do
+ for p in clang lld llvm; do
echo $p
done
;;
diff --git a/.ci/monolithic-linux.sh b/.ci/monolithic-linux.sh
index b347c443da67..b00a4b984a1d 100755
--- a/.ci/monolithic-linux.sh
+++ b/.ci/monolithic-linux.sh
@@ -48,7 +48,6 @@ cmake -S "${MONOREPO_ROOT}"/llvm -B "${BUILD_DIR}" \
-D LLVM_LIT_ARGS="-v --xunit-xml-output ${BUILD_DIR}/test-results.xml --timeout=1200 --time-tests" \
-D LLVM_ENABLE_LLD=ON \
-D CMAKE_CXX_FLAGS=-gmlt \
- -D BOLT_CLANG_EXE=/usr/bin/clang \
-D LLVM_CCACHE_BUILD=ON \
-D MLIR_ENABLE_BINDINGS_PYTHON=ON
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index 45da8af51bb9..0f178df1d18f 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -35,6 +35,10 @@
clang/lib/AST/Interp/ @tbaederr
clang/test/AST/Interp/ @tbaederr
+/clang/include/clang/CIR @lanza @bcardosolopes
+/clang/lib/CIR @lanza @bcardosolopes
+/clang/tools/cir-* @lanza @bcardosolopes
+
/lldb/ @JDevlieghere
# MLIR Interfaces.
diff --git a/bolt/include/bolt/Rewrite/RewriteInstance.h b/bolt/include/bolt/Rewrite/RewriteInstance.h
index 2561468a0f99..d37e6f5ed859 100644
--- a/bolt/include/bolt/Rewrite/RewriteInstance.h
+++ b/bolt/include/bolt/Rewrite/RewriteInstance.h
@@ -424,6 +424,7 @@ private:
/// Common section names.
static StringRef getEHFrameSectionName() { return ".eh_frame"; }
+ static StringRef getEHFrameHdrSectionName() { return ".eh_frame_hdr"; }
static StringRef getRelaDynSectionName() { return ".rela.dyn"; }
/// FILE symbol name used for local fragments of global functions.
@@ -493,6 +494,9 @@ private:
/// Store all non-zero symbols in this map for a quick address lookup.
std::map<uint64_t, llvm::object::SymbolRef> FileSymRefs;
+ /// FILE symbols used for disambiguating split function parents.
+ std::vector<ELFSymbolRef> FileSymbols;
+
std::unique_ptr<DWARFRewriter> DebugInfoRewriter;
std::unique_ptr<BoltAddressTranslation> BAT;
diff --git a/bolt/include/bolt/Utils/NameResolver.h b/bolt/include/bolt/Utils/NameResolver.h
index 2e3ac20a532d..ccffa5633245 100644
--- a/bolt/include/bolt/Utils/NameResolver.h
+++ b/bolt/include/bolt/Utils/NameResolver.h
@@ -28,10 +28,23 @@ class NameResolver {
static constexpr char Sep = '/';
public:
- /// Return unique version of the \p Name in the form "Name<Sep><Number>".
+ /// Return the number of uniquified versions of a given \p Name.
+ uint64_t getUniquifiedNameCount(StringRef Name) const {
+ if (Counters.contains(Name))
+ return Counters.at(Name);
+ return 0;
+ }
+
+ /// Return unique version of the \p Name in the form "Name<Sep><ID>".
+ std::string getUniqueName(StringRef Name, const uint64_t ID) const {
+ return (Name + Twine(Sep) + Twine(ID)).str();
+ }
+
+ /// Register new version of \p Name and return unique version in the form
+ /// "Name<Sep><Number>".
std::string uniquify(StringRef Name) {
const uint64_t ID = ++Counters[Name];
- return (Name + Twine(Sep) + Twine(ID)).str();
+ return getUniqueName(Name, ID);
}
/// For uniquified \p Name, return the original form (that may no longer be
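The new NameResolver helpers are designed to compose: uniquify() registers a
fresh local version of a name, while getUniquifiedNameCount() and
getUniqueName() let later passes enumerate the already-registered versions
without mutating the counters. A minimal usage sketch (illustrative only, not
part of this patch):

  #include "bolt/Utils/NameResolver.h"
  #include "llvm/Support/raw_ostream.h"

  int main() {
    llvm::bolt::NameResolver NR;
    // Registering two local "baz" symbols yields "baz/1" and "baz/2".
    llvm::outs() << NR.uniquify("baz") << '\n'; // baz/1
    llvm::outs() << NR.uniquify("baz") << '\n'; // baz/2

    // Re-derive every registered version without touching the counters.
    for (uint64_t ID = 1, E = NR.getUniquifiedNameCount("baz"); ID <= E; ++ID)
      llvm::outs() << NR.getUniqueName("baz", ID) << '\n'; // baz/1, baz/2
  }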
diff --git a/bolt/lib/Rewrite/RewriteInstance.cpp b/bolt/lib/Rewrite/RewriteInstance.cpp
index a6b2f3cc0850..329649c1ca67 100644
--- a/bolt/lib/Rewrite/RewriteInstance.cpp
+++ b/bolt/lib/Rewrite/RewriteInstance.cpp
@@ -840,6 +840,7 @@ void RewriteInstance::discoverFileObjects() {
continue;
if (cantFail(Symbol.getType()) == SymbolRef::ST_File) {
+ FileSymbols.emplace_back(Symbol);
StringRef Name =
cantFail(std::move(NameOrError), "cannot get symbol name for file");
// Ignore Clang LTO artificial FILE symbol as it is not always generated,
@@ -1340,6 +1341,7 @@ void RewriteInstance::discoverFileObjects() {
}
registerFragments();
+ FileSymbols.clear();
}
Error RewriteInstance::discoverRtFiniAddress() {
@@ -1417,50 +1419,116 @@ void RewriteInstance::registerFragments() {
if (!BC->HasSplitFunctions)
return;
+ // Process fragments with ambiguous parents separately as they are typically a
+ // vanishing minority of cases and require expensive symbol table lookups.
+ std::vector<std::pair<StringRef, BinaryFunction *>> AmbiguousFragments;
for (auto &BFI : BC->getBinaryFunctions()) {
BinaryFunction &Function = BFI.second;
if (!Function.isFragment())
continue;
- unsigned ParentsFound = 0;
for (StringRef Name : Function.getNames()) {
- StringRef BaseName, Suffix;
- std::tie(BaseName, Suffix) = Name.split('/');
+ StringRef BaseName = NR.restore(Name);
+ const bool IsGlobal = BaseName == Name;
const size_t ColdSuffixPos = BaseName.find(".cold");
if (ColdSuffixPos == StringRef::npos)
continue;
- // For cold function with local (foo.cold/1) symbol, prefer a parent with
- // local symbol as well (foo/1) over global symbol (foo).
- std::string ParentName = BaseName.substr(0, ColdSuffixPos).str();
+ StringRef ParentName = BaseName.substr(0, ColdSuffixPos);
const BinaryData *BD = BC->getBinaryDataByName(ParentName);
- if (Suffix != "") {
- ParentName.append(Twine("/", Suffix).str());
- const BinaryData *BDLocal = BC->getBinaryDataByName(ParentName);
- if (BDLocal || !BD)
- BD = BDLocal;
- }
- if (!BD) {
- if (opts::Verbosity >= 1)
- BC->outs() << "BOLT-INFO: parent function not found for " << Name
- << "\n";
+ const uint64_t NumPossibleLocalParents =
+ NR.getUniquifiedNameCount(ParentName);
+ // The most common case: single local parent fragment.
+ if (!BD && NumPossibleLocalParents == 1) {
+ BD = BC->getBinaryDataByName(NR.getUniqueName(ParentName, 1));
+ } else if (BD && (!NumPossibleLocalParents || IsGlobal)) {
+ // Global parent and either no local candidates (second most common), or
+ // the fragment is global as well (uncommon).
+ } else {
+ // Any other case: need to disambiguate using FILE symbols.
+ AmbiguousFragments.emplace_back(ParentName, &Function);
continue;
}
- const uint64_t Address = BD->getAddress();
- BinaryFunction *BF = BC->getBinaryFunctionAtAddress(Address);
- if (!BF) {
- if (opts::Verbosity >= 1)
- BC->outs() << formatv(
- "BOLT-INFO: parent function not found at {0:x}\n", Address);
- continue;
+ if (BD) {
+ BinaryFunction *BF = BC->getFunctionForSymbol(BD->getSymbol());
+ if (BF) {
+ BC->registerFragment(Function, *BF);
+ continue;
+ }
}
- BC->registerFragment(Function, *BF);
- ++ParentsFound;
- }
- if (!ParentsFound) {
BC->errs() << "BOLT-ERROR: parent function not found for " << Function
<< '\n';
exit(1);
}
}
+
+ if (AmbiguousFragments.empty())
+ return;
+
+ if (!BC->hasSymbolsWithFileName()) {
+ BC->errs() << "BOLT-ERROR: input file has split functions but does not "
+ "have FILE symbols. If the binary was stripped, preserve "
+ "FILE symbols with --keep-file-symbols strip option";
+ exit(1);
+ }
+
+ // The first global symbol is identified by the symbol table sh_info value.
+ // Used as local symbol search stopping point.
+ auto *ELF64LEFile = cast<ELF64LEObjectFile>(InputFile);
+ const ELFFile<ELF64LE> &Obj = ELF64LEFile->getELFFile();
+ auto *SymTab = llvm::find_if(cantFail(Obj.sections()), [](const auto &Sec) {
+ return Sec.sh_type == ELF::SHT_SYMTAB;
+ });
+ assert(SymTab);
+ // Symtab sh_info contains the value one greater than the symbol table index
+ // of the last local symbol.
+ ELFSymbolRef LocalSymEnd = ELF64LEFile->toSymbolRef(SymTab, SymTab->sh_info);
+
+ for (auto &[ParentName, BF] : AmbiguousFragments) {
+ const uint64_t Address = BF->getAddress();
+
+ // Get fragment's own symbol
+ const auto SymIt = FileSymRefs.find(Address);
+ if (SymIt == FileSymRefs.end()) {
+ BC->errs()
+ << "BOLT-ERROR: symbol lookup failed for function at address 0x"
+ << Twine::utohexstr(Address) << '\n';
+ exit(1);
+ }
+
+ // Find containing FILE symbol
+ ELFSymbolRef Symbol = SymIt->second;
+ auto FSI = llvm::upper_bound(FileSymbols, Symbol);
+ if (FSI == FileSymbols.begin()) {
+ BC->errs() << "BOLT-ERROR: owning FILE symbol not found for symbol "
+ << cantFail(Symbol.getName()) << '\n';
+ exit(1);
+ }
+
+ ELFSymbolRef StopSymbol = LocalSymEnd;
+ if (FSI != FileSymbols.end())
+ StopSymbol = *FSI;
+
+ uint64_t ParentAddress{0};
+ // Iterate over local file symbols and check symbol names to match parent.
+ for (ELFSymbolRef Symbol(FSI[-1]); Symbol < StopSymbol; Symbol.moveNext()) {
+ if (cantFail(Symbol.getName()) == ParentName) {
+ ParentAddress = cantFail(Symbol.getAddress());
+ break;
+ }
+ }
+
+    // If no local parent is found, use the global parent function.
+ if (!ParentAddress)
+ if (BinaryData *ParentBD = BC->getBinaryDataByName(ParentName))
+ ParentAddress = ParentBD->getAddress();
+
+ if (BinaryFunction *ParentBF =
+ BC->getBinaryFunctionAtAddress(ParentAddress)) {
+ BC->registerFragment(*BF, *ParentBF);
+ continue;
+ }
+ BC->errs() << "BOLT-ERROR: parent function not found for " << *BF << '\n';
+ exit(1);
+ }
}
void RewriteInstance::createPLTBinaryFunction(uint64_t TargetAddress,
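To make the fallthrough logic in registerFragments() concrete, here is a
hedged worked example (symbol names shown as BOLT's NameResolver uniquifies
them; this is not output from the patch itself):

  // The input binary defines "foo" twice as a local symbol (in a.o and b.o)
  // and once as a global; BOLT registers them as:
  //   foo           <- global definition
  //   foo/1         <- local definition from a.o
  //   foo/2         <- local definition from b.o
  //   foo.cold.1/1  <- split fragment from a.o, parent ambiguous
  // Stripping ".cold" yields ParentName "foo": a global "foo" exists *and*
  // getUniquifiedNameCount("foo") == 2, so neither fast path applies and the
  // fragment goes to AmbiguousFragments. It is then matched to the local
  // "foo" whose defining FILE symbol range contains foo.cold.1/1, falling
  // back to the global "foo" only if no such local symbol is found.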
@@ -1725,12 +1793,6 @@ void RewriteInstance::adjustFunctionBoundaries() {
if (!Function.isSymbolValidInScope(Symbol, SymbolSize))
break;
- // Ignore unnamed symbols. Used, for example, by debugging info on RISC-V.
- if (BC->isRISCV() && cantFail(Symbol.getName()).empty()) {
- ++NextSymRefI;
- continue;
- }
-
// Skip basic block labels. This happens on RISC-V with linker relaxation
// enabled because every branch needs a relocation and corresponding
// symbol. We don't want to add such symbols as entry points.
@@ -3932,11 +3994,6 @@ void RewriteInstance::patchELFPHDRTable() {
OS.seek(PHDRTableOffset);
- bool ModdedGnuStack = false;
- (void)ModdedGnuStack;
- bool AddedSegment = false;
- (void)AddedSegment;
-
auto createNewTextPhdr = [&]() {
ELF64LEPhdrTy NewPhdr;
NewPhdr.p_type = ELF::PT_LOAD;
@@ -3952,40 +4009,53 @@ void RewriteInstance::patchELFPHDRTable() {
NewPhdr.p_filesz = NewTextSegmentSize;
NewPhdr.p_memsz = NewTextSegmentSize;
NewPhdr.p_flags = ELF::PF_X | ELF::PF_R;
- // FIXME: Currently instrumentation is experimental and the runtime data
- // is emitted with code, thus everything needs to be writable
- if (opts::Instrument)
+ if (opts::Instrument) {
+ // FIXME: Currently instrumentation is experimental and the runtime data
+ // is emitted with code, thus everything needs to be writable.
NewPhdr.p_flags |= ELF::PF_W;
+ }
NewPhdr.p_align = BC->PageAlign;
return NewPhdr;
};
- auto createNewWritableSectionsPhdr = [&]() {
- ELF64LEPhdrTy NewPhdr;
- NewPhdr.p_type = ELF::PT_LOAD;
- NewPhdr.p_offset = getFileOffsetForAddress(NewWritableSegmentAddress);
- NewPhdr.p_vaddr = NewWritableSegmentAddress;
- NewPhdr.p_paddr = NewWritableSegmentAddress;
- NewPhdr.p_filesz = NewWritableSegmentSize;
- NewPhdr.p_memsz = NewWritableSegmentSize;
- NewPhdr.p_align = BC->RegularPageSize;
- NewPhdr.p_flags = ELF::PF_R | ELF::PF_W;
- return NewPhdr;
+ auto writeNewSegmentPhdrs = [&]() {
+ ELF64LE::Phdr NewTextPhdr = createNewTextPhdr();
+ OS.write(reinterpret_cast<const char *>(&NewTextPhdr), sizeof(NewTextPhdr));
+
+ if (NewWritableSegmentSize) {
+ ELF64LEPhdrTy NewPhdr;
+ NewPhdr.p_type = ELF::PT_LOAD;
+ NewPhdr.p_offset = getFileOffsetForAddress(NewWritableSegmentAddress);
+ NewPhdr.p_vaddr = NewWritableSegmentAddress;
+ NewPhdr.p_paddr = NewWritableSegmentAddress;
+ NewPhdr.p_filesz = NewWritableSegmentSize;
+ NewPhdr.p_memsz = NewWritableSegmentSize;
+ NewPhdr.p_align = BC->RegularPageSize;
+ NewPhdr.p_flags = ELF::PF_R | ELF::PF_W;
+ OS.write(reinterpret_cast<const char *>(&NewPhdr), sizeof(NewPhdr));
+ }
};
+ bool ModdedGnuStack = false;
+ bool AddedSegment = false;
+
// Copy existing program headers with modifications.
for (const ELF64LE::Phdr &Phdr : cantFail(Obj.program_headers())) {
ELF64LE::Phdr NewPhdr = Phdr;
- if (PHDRTableAddress && Phdr.p_type == ELF::PT_PHDR) {
- NewPhdr.p_offset = PHDRTableOffset;
- NewPhdr.p_vaddr = PHDRTableAddress;
- NewPhdr.p_paddr = PHDRTableAddress;
- NewPhdr.p_filesz = sizeof(NewPhdr) * Phnum;
- NewPhdr.p_memsz = sizeof(NewPhdr) * Phnum;
- } else if (Phdr.p_type == ELF::PT_GNU_EH_FRAME) {
- ErrorOr<BinarySection &> EHFrameHdrSec =
- BC->getUniqueSectionByName(getNewSecPrefix() + ".eh_frame_hdr");
+ switch (Phdr.p_type) {
+ case ELF::PT_PHDR:
+ if (PHDRTableAddress) {
+ NewPhdr.p_offset = PHDRTableOffset;
+ NewPhdr.p_vaddr = PHDRTableAddress;
+ NewPhdr.p_paddr = PHDRTableAddress;
+ NewPhdr.p_filesz = sizeof(NewPhdr) * Phnum;
+ NewPhdr.p_memsz = sizeof(NewPhdr) * Phnum;
+ }
+ break;
+ case ELF::PT_GNU_EH_FRAME: {
+ ErrorOr<BinarySection &> EHFrameHdrSec = BC->getUniqueSectionByName(
+ getNewSecPrefix() + getEHFrameHdrSectionName());
if (EHFrameHdrSec && EHFrameHdrSec->isAllocatable() &&
EHFrameHdrSec->isFinalized()) {
NewPhdr.p_offset = EHFrameHdrSec->getOutputFileOffset();
@@ -3994,37 +4064,36 @@ void RewriteInstance::patchELFPHDRTable() {
NewPhdr.p_filesz = EHFrameHdrSec->getOutputSize();
NewPhdr.p_memsz = EHFrameHdrSec->getOutputSize();
}
- } else if (opts::UseGnuStack && Phdr.p_type == ELF::PT_GNU_STACK) {
- NewPhdr = createNewTextPhdr();
- ModdedGnuStack = true;
- } else if (!opts::UseGnuStack && Phdr.p_type == ELF::PT_DYNAMIC) {
- // Insert the new header before DYNAMIC.
- ELF64LE::Phdr NewTextPhdr = createNewTextPhdr();
- OS.write(reinterpret_cast<const char *>(&NewTextPhdr),
- sizeof(NewTextPhdr));
- if (NewWritableSegmentSize) {
- ELF64LEPhdrTy NewWritablePhdr = createNewWritableSectionsPhdr();
- OS.write(reinterpret_cast<const char *>(&NewWritablePhdr),
- sizeof(NewWritablePhdr));
+ break;
+ }
+ case ELF::PT_GNU_STACK:
+ if (opts::UseGnuStack) {
+ // Overwrite the header with the new text segment header.
+ NewPhdr = createNewTextPhdr();
+ ModdedGnuStack = true;
+ }
+ break;
+ case ELF::PT_DYNAMIC:
+ if (!opts::UseGnuStack) {
+ // Insert new headers before DYNAMIC.
+ writeNewSegmentPhdrs();
+ AddedSegment = true;
}
- AddedSegment = true;
+ break;
}
OS.write(reinterpret_cast<const char *>(&NewPhdr), sizeof(NewPhdr));
}
if (!opts::UseGnuStack && !AddedSegment) {
- // Append the new header to the end of the table.
- ELF64LE::Phdr NewTextPhdr = createNewTextPhdr();
- OS.write(reinterpret_cast<const char *>(&NewTextPhdr), sizeof(NewTextPhdr));
- if (NewWritableSegmentSize) {
- ELF64LEPhdrTy NewWritablePhdr = createNewWritableSectionsPhdr();
- OS.write(reinterpret_cast<const char *>(&NewWritablePhdr),
- sizeof(NewWritablePhdr));
- }
+ // Append new headers to the end of the table.
+ writeNewSegmentPhdrs();
}
- assert((!opts::UseGnuStack || ModdedGnuStack) &&
- "could not find GNU_STACK program header to modify");
+ if (opts::UseGnuStack && !ModdedGnuStack) {
+ BC->errs()
+ << "BOLT-ERROR: could not find PT_GNU_STACK program header to modify\n";
+ exit(1);
+ }
}
namespace {
@@ -5698,7 +5767,8 @@ void RewriteInstance::writeEHFrameHeader() {
BC->AsmInfo->getCodePointerSize()));
check_error(std::move(Er), "failed to parse EH frame");
- LLVM_DEBUG(dbgs() << "BOLT: writing a new .eh_frame_hdr\n");
+ LLVM_DEBUG(dbgs() << "BOLT: writing a new " << getEHFrameHdrSectionName()
+ << '\n');
NextAvailableAddress =
appendPadding(Out->os(), NextAvailableAddress, EHFrameHdrAlign);
@@ -5716,16 +5786,17 @@ void RewriteInstance::writeEHFrameHeader() {
const unsigned Flags = BinarySection::getFlags(/*IsReadOnly=*/true,
/*IsText=*/false,
/*IsAllocatable=*/true);
- BinarySection *OldEHFrameHdrSection = getSection(".eh_frame_hdr");
+ BinarySection *OldEHFrameHdrSection = getSection(getEHFrameHdrSectionName());
if (OldEHFrameHdrSection)
- OldEHFrameHdrSection->setOutputName(getOrgSecPrefix() + ".eh_frame_hdr");
+ OldEHFrameHdrSection->setOutputName(getOrgSecPrefix() +
+ getEHFrameHdrSectionName());
BinarySection &EHFrameHdrSec = BC->registerOrUpdateSection(
- getNewSecPrefix() + ".eh_frame_hdr", ELF::SHT_PROGBITS, Flags, nullptr,
- NewEHFrameHdr.size(), /*Alignment=*/1);
+ getNewSecPrefix() + getEHFrameHdrSectionName(), ELF::SHT_PROGBITS, Flags,
+ nullptr, NewEHFrameHdr.size(), /*Alignment=*/1);
EHFrameHdrSec.setOutputFileOffset(EHFrameHdrFileOffset);
EHFrameHdrSec.setOutputAddress(EHFrameHdrOutputAddress);
- EHFrameHdrSec.setOutputName(".eh_frame_hdr");
+ EHFrameHdrSec.setOutputName(getEHFrameHdrSectionName());
NextAvailableAddress += EHFrameHdrSec.getOutputSize();
diff --git a/bolt/test/RISCV/unnamed-sym-no-entry.c b/bolt/test/RISCV/fake-label-no-entry.c
index b4173506b213..bd125263101b 100644
--- a/bolt/test/RISCV/unnamed-sym-no-entry.c
+++ b/bolt/test/RISCV/fake-label-no-entry.c
@@ -5,12 +5,12 @@
// RUN: %clang %cflags -g -Wl,-q -o %t %s
-/// Verify that the binary indeed contains an unnamed symbol at _start
+/// Verify that the binary indeed contains a fake label ".L0 " at _start.
// RUN: llvm-readelf -s %t | FileCheck %s --check-prefix=CHECK-ELF
// CHECK-ELF-DAG: [[#%x,START:]] {{.*}} FUNC GLOBAL DEFAULT [[#%d,SECTION:]] _start{{$}}
// CHECK-ELF-DAG: [[#%x,START]] {{.*}} NOTYPE LOCAL DEFAULT [[#SECTION]] .L0 {{$}}
-/// Verify that BOLT did not create an extra entry point for the unnamed symbol
+/// Verify that BOLT did not create an extra entry point for the fake label.
// RUN: llvm-bolt -o %t.bolt %t --print-cfg | FileCheck %s
// CHECK: Binary Function "_start" after building cfg {
// CHECK: IsMultiEntry: 0
diff --git a/bolt/test/X86/fragment-lite.s b/bolt/test/X86/fragment-lite.s
index 97069bf8096e..32d1f5a98b64 100644
--- a/bolt/test/X86/fragment-lite.s
+++ b/bolt/test/X86/fragment-lite.s
@@ -3,35 +3,42 @@
# RUN: split-file %s %t
# RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown %t/main.s -o %t.o
# RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown %t/baz.s -o %t.baz.o
+# RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown %t/baz2.s -o %t.baz2.o
# RUN: link_fdata %s %t.o %t.main.fdata
# RUN: link_fdata %s %t.baz.o %t.baz.fdata
-# RUN: merge-fdata %t.main.fdata %t.baz.fdata > %t.fdata
-# RUN: %clang %cflags %t.o %t.baz.o -o %t.exe -Wl,-q
+# RUN: link_fdata %s %t.baz2.o %t.baz2.fdata
+# RUN: merge-fdata %t.main.fdata %t.baz.fdata %t.baz2.fdata > %t.fdata
+# RUN: %clang %cflags %t.o %t.baz.o %t.baz2.o -o %t.exe -Wl,-q
# RUN: llvm-bolt %t.exe -o %t.out --lite=1 --data %t.fdata -v=1 -print-cfg \
# RUN: 2>&1 | FileCheck %s
# CHECK: BOLT-INFO: processing main.cold.1 as a sibling of non-ignored function
-# CHECK: BOLT-INFO: processing foo.cold.1/1 as a sibling of non-ignored function
-# CHECK: BOLT-INFO: processing bar.cold.1/1 as a sibling of non-ignored function
+# CHECK: BOLT-INFO: processing foo.cold.1/1(*2) as a sibling of non-ignored function
+# CHECK: BOLT-INFO: processing bar.cold.1/1(*2) as a sibling of non-ignored function
# CHECK: BOLT-INFO: processing baz.cold.1 as a sibling of non-ignored function
-# CHECK: BOLT-INFO: processing baz.cold.1/1 as a sibling of non-ignored function
+# CHECK: BOLT-INFO: processing baz.cold.1/1(*2) as a sibling of non-ignored function
+# CHECK: BOLT-INFO: processing baz.cold.1/2(*2) as a sibling of non-ignored function
# CHECK: Binary Function "main.cold.1" after building cfg
# CHECK: Parent : main
-# CHECK: Binary Function "foo.cold.1/1" after building cfg
+# CHECK: Binary Function "foo.cold.1/1(*2)" after building cfg
# CHECK: Parent : foo
-# CHECK: Binary Function "bar.cold.1/1" after building cfg
-# CHECK: Parent : bar/1
+# CHECK: Binary Function "bar.cold.1/1(*2)" after building cfg
+# CHECK: Parent : bar/1(*2)
# CHECK: Binary Function "baz.cold.1" after building cfg
# CHECK: Parent : baz{{$}}
-# CHECK: Binary Function "baz.cold.1/1" after building cfg
-# CHECK: Parent : baz/1
+# CHECK: Binary Function "baz.cold.1/1(*2)" after building cfg
+# CHECK: Parent : baz/1(*2)
+
+# CHECK: Binary Function "baz.cold.1/2(*2)" after building cfg
+# CHECK: Parent : baz/2(*2)
#--- main.s
+.file "main.s"
.globl main
.type main, %function
main:
@@ -126,6 +133,7 @@ baz.cold.1:
.size baz.cold.1, .-baz.cold.1
#--- baz.s
+.file "baz.s"
.local baz
.type baz, %function
baz:
@@ -149,3 +157,29 @@ baz.cold.1:
retq
.cfi_endproc
.size baz.cold.1, .-baz.cold.1
+
+#--- baz2.s
+.file "baz2.s"
+ .local baz
+ .type baz, %function
+baz:
+ .cfi_startproc
+# FDATA: 0 [unknown] 0 1 baz/2 0 1 0
+ cmpl $0x0, %eax
+ je baz.cold.1
+ retq
+ .cfi_endproc
+.size baz, .-baz
+
+ .section .text.cold
+ .local baz.cold.1
+ .type baz.cold.1, %function
+baz.cold.1:
+ .cfi_startproc
+ pushq %rbp
+ movq %rsp, %rbp
+ movl $0x0, %eax
+ popq %rbp
+ retq
+ .cfi_endproc
+.size baz.cold.1, .-baz.cold.1
diff --git a/clang-tools-extra/clang-tidy/bugprone/EasilySwappableParametersCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/EasilySwappableParametersCheck.cpp
index 84e99c7fafc7..10868129e76d 100644
--- a/clang-tools-extra/clang-tidy/bugprone/EasilySwappableParametersCheck.cpp
+++ b/clang-tools-extra/clang-tidy/bugprone/EasilySwappableParametersCheck.cpp
@@ -967,7 +967,8 @@ approximateStandardConversionSequence(const TheCheck &Check, QualType From,
// Get out the qualifiers of the original type. This will always be
// re-applied to the WorkType to ensure it is the same qualification as the
// original From was.
- auto QualifiersToApply = From.split().Quals.getAsOpaqueValue();
+ auto FastQualifiersToApply = static_cast<unsigned>(
+ From.split().Quals.getAsOpaqueValue() & Qualifiers::FastMask);
// LValue->RValue is irrelevant for the check, because it is a thing to be
// done at a call site, and will be performed if need be performed.
@@ -993,7 +994,7 @@ approximateStandardConversionSequence(const TheCheck &Check, QualType From,
// "const double -> double".
LLVM_DEBUG(llvm::dbgs()
<< "--- approximateStdConv. Conversion between numerics.\n");
- WorkType = QualType{ToBuiltin, QualifiersToApply};
+ WorkType = QualType{ToBuiltin, FastQualifiersToApply};
}
const auto *FromEnum = WorkType->getAs<EnumType>();
@@ -1002,7 +1003,7 @@ approximateStandardConversionSequence(const TheCheck &Check, QualType From,
// Unscoped enumerations (or enumerations in C) convert to numerics.
LLVM_DEBUG(llvm::dbgs()
<< "--- approximateStdConv. Unscoped enum to numeric.\n");
- WorkType = QualType{ToBuiltin, QualifiersToApply};
+ WorkType = QualType{ToBuiltin, FastQualifiersToApply};
} else if (FromNumeric && ToEnum && ToEnum->isUnscopedEnumerationType()) {
// Numeric types convert to enumerations only in C.
if (Ctx.getLangOpts().CPlusPlus) {
@@ -1013,7 +1014,7 @@ approximateStandardConversionSequence(const TheCheck &Check, QualType From,
LLVM_DEBUG(llvm::dbgs()
<< "--- approximateStdConv. Numeric to unscoped enum.\n");
- WorkType = QualType{ToEnum, QualifiersToApply};
+ WorkType = QualType{ToEnum, FastQualifiersToApply};
}
// Check for pointer conversions.
@@ -1022,14 +1023,14 @@ approximateStandardConversionSequence(const TheCheck &Check, QualType From,
if (FromPtr && ToPtr) {
if (ToPtr->isVoidPointerType()) {
LLVM_DEBUG(llvm::dbgs() << "--- approximateStdConv. To void pointer.\n");
- WorkType = QualType{ToPtr, QualifiersToApply};
+ WorkType = QualType{ToPtr, FastQualifiersToApply};
}
const auto *FromRecordPtr = FromPtr->getPointeeCXXRecordDecl();
const auto *ToRecordPtr = ToPtr->getPointeeCXXRecordDecl();
if (isDerivedToBase(FromRecordPtr, ToRecordPtr)) {
LLVM_DEBUG(llvm::dbgs() << "--- approximateStdConv. Derived* to Base*\n");
- WorkType = QualType{ToPtr, QualifiersToApply};
+ WorkType = QualType{ToPtr, FastQualifiersToApply};
}
}
@@ -1039,7 +1040,7 @@ approximateStandardConversionSequence(const TheCheck &Check, QualType From,
const auto *ToRecord = To->getAsCXXRecordDecl();
if (isDerivedToBase(FromRecord, ToRecord)) {
LLVM_DEBUG(llvm::dbgs() << "--- approximateStdConv. Derived To Base.\n");
- WorkType = QualType{ToRecord->getTypeForDecl(), QualifiersToApply};
+ WorkType = QualType{ToRecord->getTypeForDecl(), FastQualifiersToApply};
}
if (Ctx.getLangOpts().CPlusPlus17 && FromPtr && ToPtr) {
@@ -1054,7 +1055,7 @@ approximateStandardConversionSequence(const TheCheck &Check, QualType From,
!ToFunctionPtr->hasNoexceptExceptionSpec()) {
LLVM_DEBUG(llvm::dbgs() << "--- approximateStdConv. noexcept function "
"pointer to non-noexcept.\n");
- WorkType = QualType{ToPtr, QualifiersToApply};
+ WorkType = QualType{ToPtr, FastQualifiersToApply};
}
}
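Background for the qualifier masking above: Qualifiers::getAsOpaqueValue() can
encode non-fast qualifiers (address spaces, Objective-C lifetime) that do not
fit in the low pointer bits that QualType's (Type *, unsigned) constructor
packs, so only the fast subset (const/volatile/restrict) may be re-applied
that way. A sketch of the pattern as a stand-alone helper (an illustrative
assumption, not the check's actual code):

  #include "clang/AST/Type.h"

  // Keep only the fast qualifiers of a type so they can safely be re-applied
  // through QualType{Type *, unsigned}.
  unsigned fastQualifiersOf(clang::QualType From) {
    return static_cast<unsigned>(From.split().Quals.getAsOpaqueValue() &
                                 clang::Qualifiers::FastMask);
  }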
diff --git a/clang-tools-extra/clang-tidy/modernize/UseNullptrCheck.h b/clang-tools-extra/clang-tidy/modernize/UseNullptrCheck.h
index 6c32a4edb4ff..f1591bae4465 100644
--- a/clang-tools-extra/clang-tidy/modernize/UseNullptrCheck.h
+++ b/clang-tools-extra/clang-tidy/modernize/UseNullptrCheck.h
@@ -19,7 +19,7 @@ public:
bool isLanguageVersionSupported(const LangOptions &LangOpts) const override {
// FIXME this should be CPlusPlus11 but that causes test cases to
// erroneously fail.
- return LangOpts.CPlusPlus;
+ return LangOpts.CPlusPlus || LangOpts.C23;
}
void storeOptions(ClangTidyOptions::OptionMap &Opts) override;
void registerMatchers(ast_matchers::MatchFinder *Finder) override;
diff --git a/clang-tools-extra/clang-tidy/readability/AvoidReturnWithVoidValueCheck.cpp b/clang-tools-extra/clang-tidy/readability/AvoidReturnWithVoidValueCheck.cpp
index 48bca41f4a3b..f077040a3529 100644
--- a/clang-tools-extra/clang-tidy/readability/AvoidReturnWithVoidValueCheck.cpp
+++ b/clang-tools-extra/clang-tidy/readability/AvoidReturnWithVoidValueCheck.cpp
@@ -64,8 +64,11 @@ void AvoidReturnWithVoidValueCheck::check(
<< BraceInsertionHints.closingBraceFixIt();
}
Diag << FixItHint::CreateRemoval(VoidReturn->getReturnLoc());
- if (!Result.Nodes.getNodeAs<FunctionDecl>("function_parent") ||
- SurroundingBlock->body_back() != VoidReturn)
+ const auto *FunctionParent =
+ Result.Nodes.getNodeAs<FunctionDecl>("function_parent");
+ if (!FunctionParent ||
+ (SurroundingBlock && SurroundingBlock->body_back() != VoidReturn))
+ // If this is not the last statement in a function body, we add a `return`.
Diag << FixItHint::CreateInsertion(SemicolonPos.getLocWithOffset(1),
" return;", true);
}
diff --git a/clang-tools-extra/clangd/CodeCompletionStrings.cpp b/clang-tools-extra/clangd/CodeCompletionStrings.cpp
index 2075e5965f18..9b4442b0bb76 100644
--- a/clang-tools-extra/clangd/CodeCompletionStrings.cpp
+++ b/clang-tools-extra/clangd/CodeCompletionStrings.cpp
@@ -253,7 +253,7 @@ void getSignature(const CodeCompletionString &CCS, std::string *Signature,
if (!IncludeFunctionArguments &&
ResultKind == CodeCompletionResult::RK_Declaration)
TruncateSnippetAt.emplace(Snippet->size());
- LLVM_FALLTHROUGH;
+ [[fallthrough]];
case CodeCompletionString::CK_RightParen:
case CodeCompletionString::CK_LeftBracket:
case CodeCompletionString::CK_RightBracket:
diff --git a/clang-tools-extra/clangd/unittests/FindTargetTests.cpp b/clang-tools-extra/clangd/unittests/FindTargetTests.cpp
index 94437857cecc..799a549ff081 100644
--- a/clang-tools-extra/clangd/unittests/FindTargetTests.cpp
+++ b/clang-tools-extra/clangd/unittests/FindTargetTests.cpp
@@ -854,7 +854,7 @@ TEST_F(TargetDeclTest, DependentExprs) {
}
};
)cpp";
- EXPECT_DECLS("MemberExpr", "void foo()");
+ EXPECT_DECLS("CXXDependentScopeMemberExpr", "void foo()");
// Similar to above but base expression involves a function call.
Code = R"cpp(
@@ -872,7 +872,7 @@ TEST_F(TargetDeclTest, DependentExprs) {
}
};
)cpp";
- EXPECT_DECLS("MemberExpr", "void foo()");
+ EXPECT_DECLS("CXXDependentScopeMemberExpr", "void foo()");
// Similar to above but uses a function pointer.
Code = R"cpp(
@@ -891,7 +891,7 @@ TEST_F(TargetDeclTest, DependentExprs) {
}
};
)cpp";
- EXPECT_DECLS("MemberExpr", "void foo()");
+ EXPECT_DECLS("CXXDependentScopeMemberExpr", "void foo()");
// Base expression involves a member access into this.
Code = R"cpp(
@@ -962,7 +962,7 @@ TEST_F(TargetDeclTest, DependentExprs) {
void Foo() { this->[[find]](); }
};
)cpp";
- EXPECT_DECLS("MemberExpr", "void find()");
+ EXPECT_DECLS("CXXDependentScopeMemberExpr", "void find()");
}
TEST_F(TargetDeclTest, DependentTypes) {
diff --git a/clang-tools-extra/clangd/unittests/SemanticHighlightingTests.cpp b/clang-tools-extra/clangd/unittests/SemanticHighlightingTests.cpp
index 30b9b1902aa9..4156921d83ed 100644
--- a/clang-tools-extra/clangd/unittests/SemanticHighlightingTests.cpp
+++ b/clang-tools-extra/clangd/unittests/SemanticHighlightingTests.cpp
@@ -621,7 +621,7 @@ sizeof...($TemplateParameter[[Elements]]);
struct $Class_def[[Foo]] {
int $Field_decl[[Waldo]];
void $Method_def[[bar]]() {
- $Class[[Foo]]().$Field[[Waldo]];
+ $Class[[Foo]]().$Field_dependentName[[Waldo]];
}
template $Bracket[[<]]typename $TemplateParameter_def[[U]]$Bracket[[>]]
void $Method_def[[bar1]]() {
diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst
index 2867fc958030..3038d2b125f2 100644
--- a/clang-tools-extra/docs/ReleaseNotes.rst
+++ b/clang-tools-extra/docs/ReleaseNotes.rst
@@ -100,13 +100,15 @@ Improvements to clang-tidy
- Improved :program:`run-clang-tidy.py` script. Added argument `-source-filter`
to filter source files from the compilation database, via a RegEx. In a
similar fashion to what `-header-filter` does for header files.
+
- Improved :program:`check_clang_tidy.py` script. Added argument `-export-fixes`
to aid in clang-tidy and test development.
+
- Fixed bug where big values for unsigned check options overflowed into negative values
- when being printed with ``--dump-config``.
+ when being printed with `--dump-config`.
-- Fixed ``--verify-config`` option not properly parsing checks when using the
- literal operator in the ``.clang-tidy`` config.
+- Fixed `--verify-config` option not properly parsing checks when using the
+ literal operator in the `.clang-tidy` config.
New checks
^^^^^^^^^^
@@ -236,7 +238,7 @@ Changes in existing checks
- Improved :doc:`google-explicit-constructor
<clang-tidy/checks/google/explicit-constructor>` check to better handle
- ``C++-20`` `explicit(bool)`.
+ C++20 `explicit(bool)`.
- Improved :doc:`google-global-names-in-headers
<clang-tidy/checks/google/global-names-in-headers>` check by replacing the local
@@ -249,6 +251,10 @@ Changes in existing checks
check by ignoring other functions with same prefixes as the target specific
functions.
+- Improved :doc:`linuxkernel-must-check-errs
+ <clang-tidy/checks/linuxkernel/must-check-errs>` check documentation to
+ consistently use the check's proper name.
+
- Improved :doc:`llvm-header-guard
<clang-tidy/checks/llvm/header-guard>` check by replacing the local
option `HeaderFileExtensions` by the global option of the same name.
@@ -281,6 +287,10 @@ Changes in existing checks
don't remove parentheses used in ``sizeof`` calls when they have array index
accesses as arguments.
+- Improved :doc:`modernize-use-nullptr
+ <clang-tidy/checks/modernize/use-nullptr>` check to include support for C23,
+ which also has introduced the ``nullptr`` keyword.
+
- Improved :doc:`modernize-use-override
<clang-tidy/checks/modernize/use-override>` check to also remove any trailing
whitespace when deleting the ``virtual`` keyword.
@@ -336,13 +346,9 @@ Miscellaneous
^^^^^^^^^^^^^
- Fixed incorrect formatting in :program:`clang-apply-replacements` when no
- ``--format`` option is specified. Now :program:`clang-apply-replacements`
+ `--format` option is specified. Now :program:`clang-apply-replacements`
applies formatting only with the option.
-- Fixed the :doc:`linuxkernel-must-check-errs
- <clang-tidy/checks/linuxkernel/must-check-errs>` documentation to consistently
- use the check's proper name.
-
Improvements to include-fixer
-----------------------------
diff --git a/clang-tools-extra/docs/clang-tidy/checks/cert/env33-c.rst b/clang-tools-extra/docs/clang-tidy/checks/cert/env33-c.rst
index c5321b07f7f8..9271c9ecccc0 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/cert/env33-c.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/cert/env33-c.rst
@@ -10,4 +10,4 @@ but does not actually attempt to execute a command.
This check corresponds to the CERT C Coding Standard rule
`ENV33-C. Do not call system()
-<https://www.securecoding.cert.org/confluence/pages/viewpage.action?pageId=2130132>`_.
+<https://www.securecoding.cert.org/confluence/display/c/ENV33-C.+Do+not+call+system()>`_.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/modernize/use-nullptr.rst b/clang-tools-extra/docs/clang-tidy/checks/modernize/use-nullptr.rst
index 5e1ba858adf3..25e17fee0a3d 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/modernize/use-nullptr.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/modernize/use-nullptr.rst
@@ -4,7 +4,7 @@ modernize-use-nullptr
=====================
The check converts the usage of null pointer constants (e.g. ``NULL``, ``0``)
-to use the new C++11 ``nullptr`` keyword.
+to use the new C++11 and C23 ``nullptr`` keyword.
Example
-------
diff --git a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/owning-memory.cpp b/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/owning-memory.cpp
index ae61b17ca14d..574efe7bd914 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/owning-memory.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/owning-memory.cpp
@@ -309,8 +309,6 @@ struct HeapArray { // Ok, since destruc
HeapArray(HeapArray &&other) : _data(other._data), size(other.size) { // Ok
other._data = nullptr; // Ok
- // CHECK-NOTES: [[@LINE-1]]:5: warning: expected assignment source to be of type 'gsl::owner<>'; got 'std::nullptr_t'
- // FIXME: This warning is emitted because an ImplicitCastExpr for the NullToPointer conversion isn't created for dependent types.
other.size = 0;
}
diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-equals-default-copy.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-equals-default-copy.cpp
index 4abb9c855597..559031cf4d9b 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-equals-default-copy.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-equals-default-copy.cpp
@@ -260,8 +260,6 @@ template <class T>
struct Template {
Template() = default;
Template(const Template &Other) : Field(Other.Field) {}
- // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: use '= default'
- // CHECK-FIXES: Template(const Template &Other) = default;
Template &operator=(const Template &Other);
void foo(const T &t);
int Field;
@@ -271,12 +269,8 @@ Template<T> &Template<T>::operator=(const Template<T> &Other) {
Field = Other.Field;
return *this;
}
-// CHECK-MESSAGES: :[[@LINE-4]]:27: warning: use '= default'
-// CHECK-FIXES: Template<T> &Template<T>::operator=(const Template<T> &Other) = default;
-
Template<int> T1;
-
// Dependent types.
template <class T>
struct DT1 {
@@ -290,9 +284,6 @@ DT1<T> &DT1<T>::operator=(const DT1<T> &Other) {
Field = Other.Field;
return *this;
}
-// CHECK-MESSAGES: :[[@LINE-4]]:17: warning: use '= default'
-// CHECK-FIXES: DT1<T> &DT1<T>::operator=(const DT1<T> &Other) = default;
-
DT1<int> Dt1;
template <class T>
@@ -312,9 +303,6 @@ DT2<T> &DT2<T>::operator=(const DT2<T> &Other) {
struct T {
typedef int TT;
};
-// CHECK-MESSAGES: :[[@LINE-8]]:17: warning: use '= default'
-// CHECK-FIXES: DT2<T> &DT2<T>::operator=(const DT2<T> &Other) = default;
-
DT2<T> Dt2;
// Default arguments.
diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-nullptr-c23.c b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-nullptr-c23.c
new file mode 100644
index 000000000000..6fb879b91e41
--- /dev/null
+++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-nullptr-c23.c
@@ -0,0 +1,139 @@
+// RUN: %check_clang_tidy %s modernize-use-nullptr %t -- -- -std=c23
+
+#define NULL 0
+
+void test_assignment() {
+ int *p1 = 0;
+ // CHECK-MESSAGES: :[[@LINE-1]]:13: warning: use nullptr [modernize-use-nullptr]
+ // CHECK-FIXES: int *p1 = nullptr;
+ p1 = 0;
+ // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: use nullptr
+ // CHECK-FIXES: p1 = nullptr;
+
+ int *p2 = NULL;
+ // CHECK-MESSAGES: :[[@LINE-1]]:13: warning: use nullptr
+ // CHECK-FIXES: int *p2 = nullptr;
+
+ p2 = p1;
+ // CHECK-FIXES: p2 = p1;
+
+ const int null = 0;
+ int *p3 = &null;
+
+ p3 = NULL;
+ // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: use nullptr
+ // CHECK-FIXES: p3 = nullptr;
+
+ int *p4 = p3;
+
+ int i1 = 0;
+
+ int i2 = NULL;
+
+ int i3 = null;
+
+ int *p5, *p6, *p7;
+ p5 = p6 = p7 = NULL;
+ // CHECK-MESSAGES: :[[@LINE-1]]:18: warning: use nullptr
+ // CHECK-FIXES: p5 = p6 = p7 = nullptr;
+}
+
+void test_function(int *p) {}
+
+void test_function_no_ptr_param(int i) {}
+
+void test_function_call() {
+ test_function(0);
+ // CHECK-MESSAGES: :[[@LINE-1]]:17: warning: use nullptr
+ // CHECK-FIXES: test_function(nullptr);
+
+ test_function(NULL);
+ // CHECK-MESSAGES: :[[@LINE-1]]:17: warning: use nullptr
+ // CHECK-FIXES: test_function(nullptr);
+
+ test_function_no_ptr_param(0);
+}
+
+char *test_function_return1() {
+ return 0;
+ // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: use nullptr
+ // CHECK-FIXES: return nullptr;
+}
+
+void *test_function_return2() {
+ return NULL;
+ // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: use nullptr
+ // CHECK-FIXES: return nullptr;
+}
+
+int test_function_return4() {
+ return 0;
+}
+
+int test_function_return5() {
+ return NULL;
+}
+
+int *test_function_return_cast1() {
+ return(int)0;
+ // CHECK-MESSAGES: :[[@LINE-1]]:9: warning: use nullptr
+ // CHECK-FIXES: return nullptr;
+}
+
+int *test_function_return_cast2() {
+#define RET return
+ RET(int)0;
+ // CHECK-MESSAGES: :[[@LINE-1]]:6: warning: use nullptr
+ // CHECK-FIXES: RET nullptr;
+#undef RET
+}
+
+// Test parentheses expressions resulting in a nullptr.
+int *test_parentheses_expression1() {
+ return(0);
+ // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: use nullptr
+ // CHECK-FIXES: return(nullptr);
+}
+
+int *test_parentheses_expression2() {
+ return((int)(0.0f));
+ // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: use nullptr
+ // CHECK-FIXES: return(nullptr);
+}
+
+int *test_nested_parentheses_expression() {
+ return((((0))));
+ // CHECK-MESSAGES: :[[@LINE-1]]:13: warning: use nullptr
+ // CHECK-FIXES: return((((nullptr))));
+}
+
+void test_const_pointers() {
+ const int *const_p1 = 0;
+ // CHECK-MESSAGES: :[[@LINE-1]]:25: warning: use nullptr
+ // CHECK-FIXES: const int *const_p1 = nullptr;
+ const int *const_p2 = NULL;
+ // CHECK-MESSAGES: :[[@LINE-1]]:25: warning: use nullptr
+ // CHECK-FIXES: const int *const_p2 = nullptr;
+ const int *const_p3 = (int)0;
+ // CHECK-MESSAGES: :[[@LINE-1]]:25: warning: use nullptr
+ // CHECK-FIXES: const int *const_p3 = nullptr;
+ const int *const_p4 = (int)0.0f;
+ // CHECK-MESSAGES: :[[@LINE-1]]:25: warning: use nullptr
+ // CHECK-FIXES: const int *const_p4 = nullptr;
+}
+
+void test_nested_implicit_cast_expr() {
+ int func0(void*, void*);
+ int func1(int, void*, void*);
+
+ (double)func1(0, 0, 0);
+ // CHECK-MESSAGES: :[[@LINE-1]]:20: warning: use nullptr
+ // CHECK-MESSAGES: :[[@LINE-2]]:23: warning: use nullptr
+ // CHECK-FIXES: (double)func1(0, nullptr, nullptr);
+ (double)func1(func0(0, 0), 0, 0);
+ // CHECK-MESSAGES: :[[@LINE-1]]:23: warning: use nullptr
+ // CHECK-MESSAGES: :[[@LINE-2]]:26: warning: use nullptr
+ // CHECK-MESSAGES: :[[@LINE-3]]:30: warning: use nullptr
+ // CHECK-MESSAGES: :[[@LINE-4]]:33: warning: use nullptr
+ // CHECK-FIXES: (double)func1(func0(nullptr, nullptr), nullptr, nullptr);
+}
diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-nullptr.c b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-nullptr.c
index c2ccbbd81171..1218b837199c 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-nullptr.c
+++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-nullptr.c
@@ -1,4 +1,4 @@
-// RUN: clang-tidy %s -checks=-*,modernize-use-nullptr -- | count 0
+// RUN: clang-tidy %s -checks=-*,modernize-use-nullptr -- -std=c17 | count 0
// Note: this test expects no diagnostics, but FileCheck cannot handle that,
// hence the use of | count 0.
diff --git a/clang-tools-extra/test/pp-trace/pp-trace-pragma-general.cpp b/clang-tools-extra/test/pp-trace/pp-trace-pragma-general.cpp
index f01ebd1ec67d..b16ec56e321b 100644
--- a/clang-tools-extra/test/pp-trace/pp-trace-pragma-general.cpp
+++ b/clang-tools-extra/test/pp-trace/pp-trace-pragma-general.cpp
@@ -21,6 +21,12 @@ void foo() {
// CHECK: ---
// CHECK-NEXT: - Callback: PragmaDirective
+// CHECK-NEXT: Loc: "<built-in>:{{.+}}:1"
+// CHECK-NEXT: Introducer: PIK_HashPragma
+// CHECK-NEXT: - Callback: PragmaDirective
+// CHECK-NEXT: Loc: "<built-in>:{{.+}}:1"
+// CHECK-NEXT: Introducer: PIK_HashPragma
+// CHECK-NEXT: - Callback: PragmaDirective
// CHECK-NEXT: Loc: "{{.*}}{{[/\\]}}pp-trace-pragma-general.cpp:3:1"
// CHECK-NEXT: Introducer: PIK_HashPragma
// CHECK-NEXT: - Callback: PragmaDiagnosticPush
diff --git a/clang-tools-extra/test/pp-trace/pp-trace-pragma-ms.cpp b/clang-tools-extra/test/pp-trace/pp-trace-pragma-ms.cpp
index 932b0eb93c90..f5bf9ac2b955 100644
--- a/clang-tools-extra/test/pp-trace/pp-trace-pragma-ms.cpp
+++ b/clang-tools-extra/test/pp-trace/pp-trace-pragma-ms.cpp
@@ -18,6 +18,12 @@
// CHECK: ---
// CHECK-NEXT: - Callback: PragmaDirective
+// CHECK-NEXT: Loc: "<built-in>:{{.+}}:1"
+// CHECK-NEXT: Introducer: PIK_HashPragma
+// CHECK-NEXT: - Callback: PragmaDirective
+// CHECK-NEXT: Loc: "<built-in>:{{.+}}:1"
+// CHECK-NEXT: Introducer: PIK_HashPragma
+// CHECK-NEXT: - Callback: PragmaDirective
// CHECK-NEXT: Loc: "{{.*}}{{[/\\]}}pp-trace-pragma-ms.cpp:3:1"
// CHECK-NEXT: Introducer: PIK_HashPragma
// CHECK-NEXT: - Callback: PragmaComment
@@ -67,7 +73,7 @@
// CHECK-NEXT: Introducer: PIK_HashPragma
// CHECK-NEXT: - Callback: PragmaMessage
// CHECK-NEXT: Loc: "{{.*}}{{[/\\]}}pp-trace-pragma-ms.cpp:13:9"
-// CHECK-NEXT: Namespace:
+// CHECK-NEXT: Namespace:
// CHECK-NEXT: Kind: PMK_Message
// CHECK-NEXT: Str: message argument
// CHECK-NEXT: - Callback: PragmaDirective
diff --git a/clang-tools-extra/test/pp-trace/pp-trace-pragma-opencl.cpp b/clang-tools-extra/test/pp-trace/pp-trace-pragma-opencl.cpp
index 31f61027994f..ed33d37eb3d5 100644
--- a/clang-tools-extra/test/pp-trace/pp-trace-pragma-opencl.cpp
+++ b/clang-tools-extra/test/pp-trace/pp-trace-pragma-opencl.cpp
@@ -6,6 +6,12 @@
// CHECK: ---
// CHECK-NEXT: - Callback: PragmaDirective
+// CHECK-NEXT: Loc: "<built-in>:{{.+}}:1"
+// CHECK-NEXT: Introducer: PIK_HashPragma
+// CHECK-NEXT: - Callback: PragmaDirective
+// CHECK-NEXT: Loc: "<built-in>:{{.+}}:1"
+// CHECK-NEXT: Introducer: PIK_HashPragma
+// CHECK-NEXT: - Callback: PragmaDirective
// CHECK-NEXT: Loc: "{{.*}}{{[/\\]}}pp-trace-pragma-opencl.cpp:3:1"
// CHECK-NEXT: Introducer: PIK_HashPragma
// CHECK-NEXT: - Callback: PragmaOpenCLExtension
diff --git a/clang/cmake/caches/Release.cmake b/clang/cmake/caches/Release.cmake
index c164d5497275..c0bfcbdfc1c2 100644
--- a/clang/cmake/caches/Release.cmake
+++ b/clang/cmake/caches/Release.cmake
@@ -82,6 +82,7 @@ set(LLVM_ENABLE_PROJECTS ${STAGE1_PROJECTS} CACHE STRING "")
# stage2-instrumented and Final Stage Config:
# Options that need to be set in both the instrumented stage (if we are doing
# a pgo build) and the final stage.
+set_instrument_and_final_stage_var(CMAKE_POSITION_INDEPENDENT_CODE "ON" STRING)
set_instrument_and_final_stage_var(LLVM_ENABLE_LTO "${LLVM_RELEASE_ENABLE_LTO}" STRING)
if (LLVM_RELEASE_ENABLE_LTO)
set_instrument_and_final_stage_var(LLVM_ENABLE_LLD "ON" BOOL)
diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst
index 84fc4dee02fa..c2e90f4e7d58 100644
--- a/clang/docs/LanguageExtensions.rst
+++ b/clang/docs/LanguageExtensions.rst
@@ -711,6 +711,8 @@ even-odd element pair with indices ``i * 2`` and ``i * 2 + 1`` with
power of 2, the vector is widened with neutral elements for the reduction
at the end to the next power of 2.
+These reductions support both fixed-sized and scalable vector types.
+
Example:
.. code-block:: c++
@@ -1493,6 +1495,7 @@ Conditional ``explicit`` __cpp_conditional_explicit C+
``if consteval`` __cpp_if_consteval C++23 C++20
``static operator()`` __cpp_static_call_operator C++23 C++03
Attributes on Lambda-Expressions C++23 C++11
+Attributes on Structured Bindings __cpp_structured_bindings C++26 C++03
``= delete ("should have a reason");`` __cpp_deleted_function C++26 C++03
-------------------------------------------- -------------------------------- ------------- -------------
Designated initializers (N494) C99 C89
@@ -2928,7 +2931,7 @@ Query for this feature with ``__has_builtin(__builtin_dump_struct)``
``__builtin_shufflevector`` is used to express generic vector
permutation/shuffle/swizzle operations. This builtin is also very important
for the implementation of various target-specific header files like
-``<xmmintrin.h>``.
+``<xmmintrin.h>``. This builtin can be used within constant expressions.
**Syntax**:
@@ -2955,7 +2958,7 @@ for the implementation of various target-specific header files like
// Concatenate every other element of 8-element vectors V1 and V2.
__builtin_shufflevector(V1, V2, 0, 2, 4, 6, 8, 10, 12, 14)
- // Shuffle v1 with some elements being undefined
+ // Shuffle v1 with some elements being undefined. Not allowed in constexpr.
__builtin_shufflevector(v1, v1, 3, -1, 1, -1)
**Description**:
@@ -2968,6 +2971,7 @@ starting with the first vector, continuing into the second vector. Thus, if
``vec1`` is a 4-element vector, index 5 would refer to the second element of
``vec2``. An index of -1 can be used to indicate that the corresponding element
in the returned vector is a don't care and can be optimized by the backend.
+Values of -1 are not supported in constant expressions.
The result of ``__builtin_shufflevector`` is a vector with the same element
type as ``vec1``/``vec2`` but that has an element count equal to the number of
@@ -2982,7 +2986,8 @@ Query for this feature with ``__has_builtin(__builtin_shufflevector)``.
``__builtin_convertvector`` is used to express generic vector
type-conversion operations. The input vector and the output vector
-type must have the same number of elements.
+type must have the same number of elements. This builtin can be used within
+constant expressions.
**Syntax**:
@@ -5572,3 +5577,25 @@ but the expression has no runtime effects.
Type- and value-dependent expressions are not supported yet.
This facility is designed to aid with testing name lookup machinery.
+
+Predefined Macros
+=================
+
+``__GCC_DESTRUCTIVE_SIZE`` and ``__GCC_CONSTRUCTIVE_SIZE``
+------------------------------------------------------------
+Specify the mimum offset between two objects to avoid false sharing and the
+maximum size of contiguous memory to promote true sharing, respectively. These
+macros are predefined in all C and C++ language modes, but can be redefined on
+the command line with ``-D`` to specify different values as needed or can be
+undefined on the command line with ``-U`` to disable support for the feature.
+
+**Note: the values the macros expand to are not guaranteed to be stable. They
+are affected by architecture and CPU tuning flags, can change between
+releases of Clang and will not match the values defined by other compilers such
+as GCC.**
+
+Compiling different TUs depending on these flags (including use of
+``std::hardware_constructive_interference_size`` or
+``std::hardware_destructive_interference_size``) with different compilers, macro
+definitions, or architecture flags will lead to ODR violations and should be
+avoided.
\ No newline at end of file
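A short illustration of how code typically consumes these macros (hypothetical
usage, not part of this patch; the type names are made up): pad
independently-updated data to the destructive size to avoid false sharing, and
keep data that is accessed together within the constructive size.

  #include <atomic>

  // Each counter gets its own cache line so concurrent writers do not
  // false-share.
  struct alignas(__GCC_DESTRUCTIVE_SIZE) PaddedCounter {
    std::atomic<long> Value{0};
  };

  // Data meant to be read together should fit in one constructively shared
  // block.
  struct HotPair {
    int Key;
    int Flags;
  };
  static_assert(sizeof(HotPair) <= __GCC_CONSTRUCTIVE_SIZE,
                "HotPair should fit within the constructive region");

As the note above warns, any such uses must be compiled with consistent macro
values across TUs, or the differing values become an ODR hazard.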
diff --git a/clang/docs/LibTooling.rst b/clang/docs/LibTooling.rst
index df50dcebf9b8..87d84321ab28 100644
--- a/clang/docs/LibTooling.rst
+++ b/clang/docs/LibTooling.rst
@@ -63,15 +63,22 @@ and automatic location of the compilation database using source files paths.
#include "llvm/Support/CommandLine.h"
using namespace clang::tooling;
+ using namespace llvm;
// Apply a custom category to all command-line options so that they are the
// only ones displayed.
- static llvm::cl::OptionCategory MyToolCategory("my-tool options");
+ static cl::OptionCategory MyToolCategory("my-tool options");
int main(int argc, const char **argv) {
- // CommonOptionsParser constructor will parse arguments and create a
- // CompilationDatabase. In case of error it will terminate the program.
- CommonOptionsParser OptionsParser(argc, argv, MyToolCategory);
+ // CommonOptionsParser::create will parse arguments and create a
+ // CompilationDatabase.
+ auto ExpectedParser = CommonOptionsParser::create(argc, argv, MyToolCategory);
+ if (!ExpectedParser) {
+ // Fail gracefully for unsupported options.
+ llvm::errs() << ExpectedParser.takeError();
+ return 1;
+ }
+ CommonOptionsParser& OptionsParser = ExpectedParser.get();
// Use OptionsParser.getCompilations() and OptionsParser.getSourcePathList()
// to retrieve CompilationDatabase and the list of input file paths.
@@ -133,7 +140,12 @@ version of this example tool is also checked into the clang tree at
static cl::extrahelp MoreHelp("\nMore help text...\n");
int main(int argc, const char **argv) {
- CommonOptionsParser OptionsParser(argc, argv, MyToolCategory);
+ auto ExpectedParser = CommonOptionsParser::create(argc, argv, MyToolCategory);
+ if (!ExpectedParser) {
+ llvm::errs() << ExpectedParser.takeError();
+ return 1;
+ }
+ CommonOptionsParser& OptionsParser = ExpectedParser.get();
ClangTool Tool(OptionsParser.getCompilations(),
OptionsParser.getSourcePathList());
return Tool.run(newFrontendActionFactory<clang::SyntaxOnlyAction>().get());
diff --git a/clang/docs/OpenMPSupport.rst b/clang/docs/OpenMPSupport.rst
index f8146bc365e8..5e63b2c0f0be 100644
--- a/clang/docs/OpenMPSupport.rst
+++ b/clang/docs/OpenMPSupport.rst
@@ -310,7 +310,9 @@ implementation.
+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
| misc | dispatch construct and function variant argument adjustment | :part:`worked on` | D99537, D99679 |
+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
-| misc | assume and assumes directives | :part:`worked on` | |
+| misc | assumes directives | :part:`worked on` | |
++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
+| misc | assume directive | :part:`worked on` | |
+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
| misc | nothing directive | :good:`done` | D123286 |
+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 00c684e773a2..2d0e663d93e4 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -90,6 +90,18 @@ C++ Language Changes
--------------------
- Implemented ``_BitInt`` literal suffixes ``__wb`` and ``__WB`` as a Clang extension; ``unsigned`` modifiers are also allowed. (#GH85223).
+C++17 Feature Support
+^^^^^^^^^^^^^^^^^^^^^
+- Clang now exposes ``__GCC_DESTRUCTIVE_SIZE`` and ``__GCC_CONSTRUCTIVE_SIZE``
+ predefined macros to support standard library implementations of
+ ``std::hardware_destructive_interference_size`` and
+ ``std::hardware_constructive_interference_size``, respectively. These macros
+  are predefined in all C and C++ language modes. The values the macros
+  expand to are not stable between releases of Clang and do not need to match
+  the values produced by GCC, so these macros should not be used from header
+  files: the values may vary across TUs based on compiler version as well as
+  CPU tuning. (#GH60174)
+
C++20 Feature Support
^^^^^^^^^^^^^^^^^^^^^
@@ -131,6 +143,9 @@ C++2c Feature Support
- Implemented `P2573R2: = delete("should have a reason"); <https://wg21.link/P2573R2>`_
+- Implemented `P0609R3: Attributes for Structured Bindings <https://wg21.link/P0609R3>`_
+
+- Implemented `P2748R5: Disallow Binding a Returned Glvalue to a Temporary <https://wg21.link/P2748R5>`_.
Resolutions to C++ Defect Reports
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -207,6 +222,7 @@ Non-comprehensive list of changes in this release
- ``__typeof_unqual__`` is available in all C modes as an extension, which behaves
like ``typeof_unqual`` from C23, similar to ``__typeof__`` and ``typeof``.
+- ``__builtin_reduce_{add|mul|xor|or|and|min|max}`` builtins now support scalable vectors.
* Shared libraries linked with either the ``-ffast-math``, ``-Ofast``, or
``-funsafe-math-optimizations`` flags will no longer enable flush-to-zero
@@ -217,6 +233,9 @@ Non-comprehensive list of changes in this release
* ``-fdenormal-fp-math=preserve-sign`` is no longer implied by ``-ffast-math``
on x86 systems.
+- Builtins ``__builtin_shufflevector()`` and ``__builtin_convertvector()`` may
+ now be used within constant expressions.
+
New Compiler Flags
------------------
- ``-fsanitize=implicit-bitfield-conversion`` checks implicit truncation and
@@ -385,18 +404,6 @@ Improvements to Clang's diagnostics
- Clang now diagnoses requires expressions with explicit object parameters.
-- Clang now looks up members of the current instantiation in the template definition context
- if the current instantiation has no dependent base classes.
-
- .. code-block:: c++
-
- template<typename T>
- struct A {
- int f() {
- return this->x; // error: no member named 'x' in 'A<T>'
- }
- };
-
Improvements to Clang's time-trace
----------------------------------
@@ -458,6 +465,10 @@ Bug Fixes in This Version
- Fixed an assertion failure on invalid InitListExpr in C89 mode (#GH88008).
+- Fixed missing destructor calls when a branch exits the middle of an
+  expression, e.g. via a branch in a statement expression or an expression
+  containing a coroutine suspension. Fixes (#GH63818) (#GH88478).
+
- Clang will no longer diagnose an erroneous non-dependent ``switch`` condition
during instantiation, and instead will only diagnose it once, during checking
of the function template.
@@ -588,6 +599,8 @@ Bug Fixes to C++ Support
- Fixed a use-after-free bug in parsing of type constraints with default arguments that involve lambdas. (#GH67235)
- Fixed bug in which the body of a consteval lambda within a template was not parsed as within an
immediate function context.
+- Fix CTAD for ``std::initializer_list``. This allows ``std::initializer_list{1, 2, 3}`` to be deduced as
+ ``std::initializer_list<int>`` as intended.
Bug Fixes to AST Handling
^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -640,6 +653,9 @@ Arm and AArch64 Support
* Arm Cortex-A78AE (cortex-a78ae).
* Arm Cortex-A520AE (cortex-a520ae).
* Arm Cortex-A720AE (cortex-a720ae).
+ * Arm Neoverse-N3 (neoverse-n3).
+ * Arm Neoverse-V3 (neoverse-v3).
+ * Arm Neoverse-V3AE (neoverse-v3ae).
Android Support
^^^^^^^^^^^^^^^
diff --git a/clang/docs/UsersManual.rst b/clang/docs/UsersManual.rst
index d0326f01d251..a7fc7ec87a16 100644
--- a/clang/docs/UsersManual.rst
+++ b/clang/docs/UsersManual.rst
@@ -2319,6 +2319,8 @@ are listed below.
on ELF targets when using the integrated assembler. This flag currently
only has an effect on ELF targets.
+.. _funique_internal_linkage_names:
+
.. option:: -f[no]-unique-internal-linkage-names
Controls whether Clang emits a unique (best-effort) symbol name for internal
@@ -2448,27 +2450,41 @@ usual build cycle when using sample profilers for optimization:
usual build flags that you always build your application with. The only
requirement is that DWARF debug info including source line information is
generated. This DWARF information is important for the profiler to be able
- to map instructions back to source line locations.
+ to map instructions back to source line locations. The usefulness of this
+ DWARF information can be improved with the ``-fdebug-info-for-profiling``
+ and ``-funique-internal-linkage-names`` options.
- On Linux, ``-g`` or just ``-gline-tables-only`` is sufficient:
+ On Linux:
.. code-block:: console
- $ clang++ -O2 -gline-tables-only code.cc -o code
+ $ clang++ -O2 -gline-tables-only \
+ -fdebug-info-for-profiling -funique-internal-linkage-names \
+ code.cc -o code
While MSVC-style targets default to CodeView debug information, DWARF debug
information is required to generate source-level LLVM profiles. Use
``-gdwarf`` to include DWARF debug information:
- .. code-block:: console
+ .. code-block:: winbatch
+
+ > clang-cl /O2 -gdwarf -gline-tables-only ^
+ /clang:-fdebug-info-for-profiling /clang:-funique-internal-linkage-names ^
+ code.cc /Fe:code /fuse-ld=lld /link /debug:dwarf
+
+.. note::
- $ clang-cl -O2 -gdwarf -gline-tables-only coff-profile.cpp -fuse-ld=lld -link -debug:dwarf
+ :ref:`-funique-internal-linkage-names <funique_internal_linkage_names>`
+   generates unique names based on the source file paths passed on the command
+   line. If your build system uses absolute source paths and these paths may
+   change between steps 1 and 4, the uniqued function names will also change,
+   leaving profile data unused. Consider omitting this option in such cases.
2. Run the executable under a sampling profiler. The specific profiler
you use does not really matter, as long as its output can be converted
into the format that the LLVM optimizer understands.
- Two such profilers are the the Linux Perf profiler
+ Two such profilers are the Linux Perf profiler
(https://perf.wiki.kernel.org/) and Intel's Sampling Enabling Product (SEP),
available as part of `Intel VTune
<https://software.intel.com/content/www/us/en/develop/tools/oneapi/components/vtune-profiler.html>`_.
@@ -2482,7 +2498,9 @@ usual build cycle when using sample profilers for optimization:
.. code-block:: console
- $ perf record -b ./code
+ $ perf record -b -e BR_INST_RETIRED.NEAR_TAKEN:uppp ./code
+
+ If the event above is unavailable, ``branches:u`` is probably next-best.
Note the use of the ``-b`` flag. This tells Perf to use the Last Branch
Record (LBR) to record call chains. While this is not strictly required,
@@ -2532,21 +2550,42 @@ usual build cycle when using sample profilers for optimization:
that executes faster than the original one. Note that you are not
required to build the code with the exact same arguments that you
used in the first step. The only requirement is that you build the code
- with ``-gline-tables-only`` and ``-fprofile-sample-use``.
+ with the same debug info options and ``-fprofile-sample-use``.
+
+ On Linux:
.. code-block:: console
- $ clang++ -O2 -gline-tables-only -fprofile-sample-use=code.prof code.cc -o code
+ $ clang++ -O2 -gline-tables-only \
+ -fdebug-info-for-profiling -funique-internal-linkage-names \
+ -fprofile-sample-use=code.prof code.cc -o code
- [OPTIONAL] Sampling-based profiles can have inaccuracies or missing block/
- edge counters. The profile inference algorithm (profi) can be used to infer
- missing blocks and edge counts, and improve the quality of profile data.
- Enable it with ``-fsample-profile-use-profi``.
+ On Windows:
- .. code-block:: console
+ .. code-block:: winbatch
+
+ > clang-cl /O2 -gdwarf -gline-tables-only ^
+ /clang:-fdebug-info-for-profiling /clang:-funique-internal-linkage-names ^
+ /fprofile-sample-use=code.prof code.cc /Fe:code /fuse-ld=lld /link /debug:dwarf
+
+ [OPTIONAL] Sampling-based profiles can have inaccuracies or missing block/
+ edge counters. The profile inference algorithm (profi) can be used to infer
+ missing blocks and edge counts, and improve the quality of profile data.
+ Enable it with ``-fsample-profile-use-profi``. For example, on Linux:
+
+ .. code-block:: console
+
+ $ clang++ -fsample-profile-use-profi -O2 -gline-tables-only \
+ -fdebug-info-for-profiling -funique-internal-linkage-names \
+ -fprofile-sample-use=code.prof code.cc -o code
+
+ On Windows:
+
+ .. code-block:: winbatch
- $ clang++ -O2 -gline-tables-only -fprofile-sample-use=code.prof \
- -fsample-profile-use-profi code.cc -o code
+ > clang-cl /clang:-fsample-profile-use-profi /O2 -gdwarf -gline-tables-only ^
+ /clang:-fdebug-info-for-profiling /clang:-funique-internal-linkage-names ^
+ /fprofile-sample-use=code.prof code.cc /Fe:code /fuse-ld=lld /link /debug:dwarf
Sample Profile Formats
""""""""""""""""""""""
diff --git a/clang/include/clang/APINotes/Types.h b/clang/include/clang/APINotes/Types.h
index 93bb045d6a66..026a4a431e73 100644
--- a/clang/include/clang/APINotes/Types.h
+++ b/clang/include/clang/APINotes/Types.h
@@ -675,6 +675,11 @@ class TagInfo : public CommonTypeInfo {
LLVM_PREFERRED_TYPE(bool)
unsigned IsFlagEnum : 1;
+ LLVM_PREFERRED_TYPE(bool)
+ unsigned SwiftCopyableSpecified : 1;
+ LLVM_PREFERRED_TYPE(bool)
+ unsigned SwiftCopyable : 1;
+
public:
std::optional<std::string> SwiftImportAs;
std::optional<std::string> SwiftRetainOp;
@@ -682,7 +687,9 @@ public:
std::optional<EnumExtensibilityKind> EnumExtensibility;
- TagInfo() : HasFlagEnum(0), IsFlagEnum(0) {}
+ TagInfo()
+ : HasFlagEnum(0), IsFlagEnum(0), SwiftCopyableSpecified(false),
+ SwiftCopyable(false) {}
std::optional<bool> isFlagEnum() const {
if (HasFlagEnum)
@@ -694,6 +701,15 @@ public:
IsFlagEnum = Value.value_or(false);
}
+ std::optional<bool> isSwiftCopyable() const {
+ return SwiftCopyableSpecified ? std::optional<bool>(SwiftCopyable)
+ : std::nullopt;
+ }
+ void setSwiftCopyable(std::optional<bool> Value) {
+ SwiftCopyableSpecified = Value.has_value();
+ SwiftCopyable = Value.value_or(false);
+ }
+
TagInfo &operator|=(const TagInfo &RHS) {
static_cast<CommonTypeInfo &>(*this) |= RHS;
@@ -710,6 +726,9 @@ public:
if (!EnumExtensibility)
EnumExtensibility = RHS.EnumExtensibility;
+ if (!SwiftCopyableSpecified)
+ setSwiftCopyable(RHS.isSwiftCopyable());
+
return *this;
}
@@ -724,6 +743,7 @@ inline bool operator==(const TagInfo &LHS, const TagInfo &RHS) {
LHS.SwiftRetainOp == RHS.SwiftRetainOp &&
LHS.SwiftReleaseOp == RHS.SwiftReleaseOp &&
LHS.isFlagEnum() == RHS.isFlagEnum() &&
+ LHS.isSwiftCopyable() == RHS.isSwiftCopyable() &&
LHS.EnumExtensibility == RHS.EnumExtensibility;
}
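A short sketch of the tri-state semantics these new fields implement; the
demo function is a hypothetical illustration, not code from the patch:

.. code-block:: c++

  #include "clang/APINotes/Types.h"
  #include <cassert>
  #include <optional>

  // Illustrative only: exercises the SwiftCopyable tri-state added above.
  void demoSwiftCopyable() {
    clang::api_notes::TagInfo A;
    assert(!A.isSwiftCopyable().has_value()); // unspecified by default

    A.setSwiftCopyable(false); // e.g. from "SwiftCopyable: false" in API notes
    assert(A.isSwiftCopyable() == std::optional<bool>(false));

    // operator|= fills the field in only when the left side leaves it unset.
    clang::api_notes::TagInfo B;
    B |= A;
    assert(B.isSwiftCopyable() == std::optional<bool>(false));
  }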
diff --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h
index a662d94994ec..6dbd06251dda 100644
--- a/clang/include/clang/AST/ASTContext.h
+++ b/clang/include/clang/AST/ASTContext.h
@@ -2197,6 +2197,16 @@ public:
return getQualifiedType(type.getUnqualifiedType(), Qs);
}
+ /// \brief Return a type with the given __ptrauth qualifier.
+ QualType getPointerAuthType(QualType Ty, PointerAuthQualifier PointerAuth) {
+ assert(!Ty.getPointerAuth());
+ assert(PointerAuth);
+
+ Qualifiers Qs;
+ Qs.setPointerAuth(PointerAuth);
+ return getQualifiedType(Ty, Qs);
+ }
+
unsigned char getFixedPointScale(QualType Ty) const;
unsigned char getFixedPointIBits(QualType Ty) const;
llvm::FixedPointSemantics getFixedPointSemantics(QualType Ty) const;
diff --git a/clang/include/clang/AST/ASTNodeTraverser.h b/clang/include/clang/AST/ASTNodeTraverser.h
index 216dc9eef08b..bf7c204e4ad7 100644
--- a/clang/include/clang/AST/ASTNodeTraverser.h
+++ b/clang/include/clang/AST/ASTNodeTraverser.h
@@ -844,6 +844,12 @@ public:
}
}
+ void VisitUnresolvedLookupExpr(const UnresolvedLookupExpr *E) {
+ if (E->hasExplicitTemplateArgs())
+ for (auto Arg : E->template_arguments())
+ Visit(Arg.getArgument());
+ }
+
void VisitRequiresExpr(const RequiresExpr *E) {
for (auto *D : E->getLocalParameters())
Visit(D);
diff --git a/clang/include/clang/AST/AbstractBasicReader.h b/clang/include/clang/AST/AbstractBasicReader.h
index 1f2797cc7014..ab036f1d445a 100644
--- a/clang/include/clang/AST/AbstractBasicReader.h
+++ b/clang/include/clang/AST/AbstractBasicReader.h
@@ -213,9 +213,9 @@ public:
}
Qualifiers readQualifiers() {
- static_assert(sizeof(Qualifiers().getAsOpaqueValue()) <= sizeof(uint32_t),
+ static_assert(sizeof(Qualifiers().getAsOpaqueValue()) <= sizeof(uint64_t),
"update this if the value size changes");
- uint32_t value = asImpl().readUInt32();
+ uint64_t value = asImpl().readUInt64();
return Qualifiers::fromOpaqueValue(value);
}
diff --git a/clang/include/clang/AST/AbstractBasicWriter.h b/clang/include/clang/AST/AbstractBasicWriter.h
index 07afa388de2c..8e42fcaad1d3 100644
--- a/clang/include/clang/AST/AbstractBasicWriter.h
+++ b/clang/include/clang/AST/AbstractBasicWriter.h
@@ -196,9 +196,9 @@ public:
}
void writeQualifiers(Qualifiers value) {
- static_assert(sizeof(value.getAsOpaqueValue()) <= sizeof(uint32_t),
+ static_assert(sizeof(value.getAsOpaqueValue()) <= sizeof(uint64_t),
"update this if the value size changes");
- asImpl().writeUInt32(value.getAsOpaqueValue());
+ asImpl().writeUInt64(value.getAsOpaqueValue());
}
void writeExceptionSpecInfo(
diff --git a/clang/include/clang/AST/DeclContextInternals.h b/clang/include/clang/AST/DeclContextInternals.h
index c4734ab57895..e169c4859219 100644
--- a/clang/include/clang/AST/DeclContextInternals.h
+++ b/clang/include/clang/AST/DeclContextInternals.h
@@ -42,11 +42,12 @@ class StoredDeclsList {
/// external declarations.
DeclsAndHasExternalTy Data;
- template<typename Fn>
- void erase_if(Fn ShouldErase) {
+ template <typename Fn> DeclListNode::Decls *erase_if(Fn ShouldErase) {
Decls List = Data.getPointer();
+
if (!List)
- return;
+ return nullptr;
+
ASTContext &C = getASTContext();
DeclListNode::Decls NewHead = nullptr;
DeclListNode::Decls *NewLast = nullptr;
@@ -79,6 +80,17 @@ class StoredDeclsList {
Data.setPointer(NewHead);
assert(llvm::none_of(getLookupResult(), ShouldErase) && "Still exists!");
+
+ if (!Data.getPointer())
+ // All declarations are erased.
+ return nullptr;
+ else if (NewHead.is<NamedDecl *>())
+      // The list now contains only a single declaration, stored inline.
+ return (DeclListNode::Decls *)&Data;
+ else {
+ assert(NewLast && NewLast->is<NamedDecl *>() && "Not the tail?");
+ return NewLast;
+ }
}
void erase(NamedDecl *ND) {
@@ -160,12 +172,16 @@ public:
void replaceExternalDecls(ArrayRef<NamedDecl*> Decls) {
// Remove all declarations that are either external or are replaced with
- // external declarations.
- erase_if([Decls](NamedDecl *ND) {
+    // external declarations of higher visibility.
+ DeclListNode::Decls *Tail = erase_if([Decls](NamedDecl *ND) {
if (ND->isFromASTFile())
return true;
+ // FIXME: Can we get rid of this loop completely?
for (NamedDecl *D : Decls)
- if (D->declarationReplaces(ND, /*IsKnownNewer=*/false))
+ // Only replace the local declaration if the external declaration has
+      // higher visibility.
+ if (D->getModuleOwnershipKind() <= ND->getModuleOwnershipKind() &&
+ D->declarationReplaces(ND, /*IsKnownNewer=*/false))
return true;
return false;
});
@@ -185,24 +201,15 @@ public:
DeclsAsList = Node;
}
- DeclListNode::Decls Head = Data.getPointer();
- if (Head.isNull()) {
+ if (!Data.getPointer()) {
Data.setPointer(DeclsAsList);
return;
}
- // Find the end of the existing list.
- // FIXME: It would be possible to preserve information from erase_if to
- // avoid this rescan looking for the end of the list.
- DeclListNode::Decls *Tail = &Head;
- while (DeclListNode *Node = Tail->dyn_cast<DeclListNode *>())
- Tail = &Node->Rest;
-
// Append the Decls.
DeclListNode *Node = C.AllocateDeclListNode(Tail->get<NamedDecl *>());
Node->Rest = DeclsAsList;
*Tail = Node;
- Data.setPointer(Head);
}
/// Return the list of all the decls.
diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h
index 99f45d518c79..e6643469e0b3 100644
--- a/clang/include/clang/AST/Type.h
+++ b/clang/include/clang/AST/Type.h
@@ -25,8 +25,10 @@
#include "clang/Basic/Diagnostic.h"
#include "clang/Basic/ExceptionSpecificationType.h"
#include "clang/Basic/LLVM.h"
+#include "clang/Basic/LangOptions.h"
#include "clang/Basic/Linkage.h"
#include "clang/Basic/PartialDiagnostic.h"
+#include "clang/Basic/PointerAuthOptions.h"
#include "clang/Basic/SourceLocation.h"
#include "clang/Basic/Specifiers.h"
#include "clang/Basic/Visibility.h"
@@ -139,6 +141,174 @@ using CanQualType = CanQual<Type>;
#define TYPE(Class, Base) class Class##Type;
#include "clang/AST/TypeNodes.inc"
+/// Pointer-authentication qualifiers.
+class PointerAuthQualifier {
+ enum : uint32_t {
+ EnabledShift = 0,
+ EnabledBits = 1,
+ EnabledMask = 1 << EnabledShift,
+ AddressDiscriminatedShift = EnabledShift + EnabledBits,
+ AddressDiscriminatedBits = 1,
+ AddressDiscriminatedMask = 1 << AddressDiscriminatedShift,
+ AuthenticationModeShift =
+ AddressDiscriminatedShift + AddressDiscriminatedBits,
+ AuthenticationModeBits = 2,
+ AuthenticationModeMask = ((1 << AuthenticationModeBits) - 1)
+ << AuthenticationModeShift,
+ IsaPointerShift = AuthenticationModeShift + AuthenticationModeBits,
+ IsaPointerBits = 1,
+ IsaPointerMask = ((1 << IsaPointerBits) - 1) << IsaPointerShift,
+ AuthenticatesNullValuesShift = IsaPointerShift + IsaPointerBits,
+ AuthenticatesNullValuesBits = 1,
+ AuthenticatesNullValuesMask = ((1 << AuthenticatesNullValuesBits) - 1)
+ << AuthenticatesNullValuesShift,
+ KeyShift = AuthenticatesNullValuesShift + AuthenticatesNullValuesBits,
+ KeyBits = 10,
+ KeyMask = ((1 << KeyBits) - 1) << KeyShift,
+ DiscriminatorShift = KeyShift + KeyBits,
+ DiscriminatorBits = 16,
+ DiscriminatorMask = ((1u << DiscriminatorBits) - 1) << DiscriminatorShift,
+ };
+
+ // bits: |0 |1 |2..3 |4 |
+ // |Enabled|Address|AuthenticationMode|ISA pointer|
+ // bits: |5 |6..15| 16...31 |
+ // |AuthenticatesNull|Key |Discriminator|
+ uint32_t Data = 0;
+
+ // The following static assertions check that each of the 32 bits is present
+ // exactly in one of the constants.
+ static_assert((EnabledBits + AddressDiscriminatedBits +
+ AuthenticationModeBits + IsaPointerBits +
+ AuthenticatesNullValuesBits + KeyBits + DiscriminatorBits) ==
+ 32,
+ "PointerAuthQualifier should be exactly 32 bits");
+ static_assert((EnabledMask + AddressDiscriminatedMask +
+ AuthenticationModeMask + IsaPointerMask +
+ AuthenticatesNullValuesMask + KeyMask + DiscriminatorMask) ==
+ 0xFFFFFFFF,
+ "All masks should cover the entire bits");
+ static_assert((EnabledMask ^ AddressDiscriminatedMask ^
+ AuthenticationModeMask ^ IsaPointerMask ^
+ AuthenticatesNullValuesMask ^ KeyMask ^ DiscriminatorMask) ==
+ 0xFFFFFFFF,
+ "All masks should cover the entire bits");
+
+ PointerAuthQualifier(unsigned Key, bool IsAddressDiscriminated,
+ unsigned ExtraDiscriminator,
+ PointerAuthenticationMode AuthenticationMode,
+ bool IsIsaPointer, bool AuthenticatesNullValues)
+ : Data(EnabledMask |
+ (IsAddressDiscriminated
+ ? llvm::to_underlying(AddressDiscriminatedMask)
+ : 0) |
+ (Key << KeyShift) |
+ (llvm::to_underlying(AuthenticationMode)
+ << AuthenticationModeShift) |
+ (ExtraDiscriminator << DiscriminatorShift) |
+ (IsIsaPointer << IsaPointerShift) |
+ (AuthenticatesNullValues << AuthenticatesNullValuesShift)) {
+ assert(Key <= KeyNoneInternal);
+ assert(ExtraDiscriminator <= MaxDiscriminator);
+ assert((Data == 0) ==
+ (getAuthenticationMode() == PointerAuthenticationMode::None));
+ }
+
+public:
+ enum {
+ KeyNoneInternal = (1u << KeyBits) - 1,
+
+ /// The maximum supported pointer-authentication key.
+ MaxKey = KeyNoneInternal - 1,
+
+ /// The maximum supported pointer-authentication discriminator.
+ MaxDiscriminator = (1u << DiscriminatorBits) - 1
+ };
+
+public:
+ PointerAuthQualifier() = default;
+
+ static PointerAuthQualifier
+ Create(unsigned Key, bool IsAddressDiscriminated, unsigned ExtraDiscriminator,
+ PointerAuthenticationMode AuthenticationMode, bool IsIsaPointer,
+ bool AuthenticatesNullValues) {
+ if (Key == PointerAuthKeyNone)
+ Key = KeyNoneInternal;
+ assert(Key <= KeyNoneInternal && "out-of-range key value");
+ return PointerAuthQualifier(Key, IsAddressDiscriminated, ExtraDiscriminator,
+ AuthenticationMode, IsIsaPointer,
+ AuthenticatesNullValues);
+ }
+
+ bool isPresent() const {
+ assert((Data == 0) ==
+ (getAuthenticationMode() == PointerAuthenticationMode::None));
+ return Data != 0;
+ }
+
+ explicit operator bool() const { return isPresent(); }
+
+ unsigned getKey() const {
+ assert(isPresent());
+ return (Data & KeyMask) >> KeyShift;
+ }
+
+ bool hasKeyNone() const { return isPresent() && getKey() == KeyNoneInternal; }
+
+ bool isAddressDiscriminated() const {
+ assert(isPresent());
+ return (Data & AddressDiscriminatedMask) >> AddressDiscriminatedShift;
+ }
+
+ unsigned getExtraDiscriminator() const {
+ assert(isPresent());
+ return (Data >> DiscriminatorShift);
+ }
+
+ PointerAuthenticationMode getAuthenticationMode() const {
+ return PointerAuthenticationMode((Data & AuthenticationModeMask) >>
+ AuthenticationModeShift);
+ }
+
+ bool isIsaPointer() const {
+ assert(isPresent());
+ return (Data & IsaPointerMask) >> IsaPointerShift;
+ }
+
+ bool authenticatesNullValues() const {
+ assert(isPresent());
+ return (Data & AuthenticatesNullValuesMask) >> AuthenticatesNullValuesShift;
+ }
+
+ PointerAuthQualifier withoutKeyNone() const {
+ return hasKeyNone() ? PointerAuthQualifier() : *this;
+ }
+
+ friend bool operator==(PointerAuthQualifier Lhs, PointerAuthQualifier Rhs) {
+ return Lhs.Data == Rhs.Data;
+ }
+ friend bool operator!=(PointerAuthQualifier Lhs, PointerAuthQualifier Rhs) {
+ return Lhs.Data != Rhs.Data;
+ }
+
+ bool isEquivalent(PointerAuthQualifier Other) const {
+ return withoutKeyNone() == Other.withoutKeyNone();
+ }
+
+ uint32_t getAsOpaqueValue() const { return Data; }
+
+ // Deserialize pointer-auth qualifiers from an opaque representation.
+ static PointerAuthQualifier fromOpaqueValue(uint32_t Opaque) {
+ PointerAuthQualifier Result;
+ Result.Data = Opaque;
+ assert((Result.Data == 0) ==
+ (Result.getAuthenticationMode() == PointerAuthenticationMode::None));
+ return Result;
+ }
+
+ void Profile(llvm::FoldingSetNodeID &ID) const { ID.AddInteger(Data); }
+};
+
/// The collection of all-type qualifiers we support.
/// Clang supports five independent qualifiers:
/// * C99: const, volatile, and restrict
@@ -147,8 +317,9 @@ using CanQualType = CanQual<Type>;
/// * Objective C: the GC attributes (none, weak, or strong)
class Qualifiers {
public:
- enum TQ { // NOTE: These flags must be kept in sync with DeclSpec::TQ.
- Const = 0x1,
+ enum TQ : uint64_t {
+ // NOTE: These flags must be kept in sync with DeclSpec::TQ.
+ Const = 0x1,
Restrict = 0x2,
Volatile = 0x4,
CVRMask = Const | Volatile | Restrict
@@ -182,7 +353,7 @@ public:
OCL_Autoreleasing
};
- enum {
+ enum : uint64_t {
/// The maximum supported address space number.
/// 23 bits should be enough for anyone.
MaxAddressSpace = 0x7fffffu,
@@ -197,16 +368,25 @@ public:
/// Returns the common set of qualifiers while removing them from
/// the given sets.
static Qualifiers removeCommonQualifiers(Qualifiers &L, Qualifiers &R) {
+ Qualifiers Q;
+ PointerAuthQualifier LPtrAuth = L.getPointerAuth();
+ if (LPtrAuth.isPresent() &&
+ LPtrAuth.getKey() != PointerAuthQualifier::KeyNoneInternal &&
+ LPtrAuth == R.getPointerAuth()) {
+ Q.setPointerAuth(LPtrAuth);
+ PointerAuthQualifier Empty;
+ L.setPointerAuth(Empty);
+ R.setPointerAuth(Empty);
+ }
+
// If both are only CVR-qualified, bit operations are sufficient.
if (!(L.Mask & ~CVRMask) && !(R.Mask & ~CVRMask)) {
- Qualifiers Q;
Q.Mask = L.Mask & R.Mask;
L.Mask &= ~Q.Mask;
R.Mask &= ~Q.Mask;
return Q;
}
- Qualifiers Q;
unsigned CommonCRV = L.getCVRQualifiers() & R.getCVRQualifiers();
Q.addCVRQualifiers(CommonCRV);
L.removeCVRQualifiers(CommonCRV);
@@ -251,16 +431,14 @@ public:
}
// Deserialize qualifiers from an opaque representation.
- static Qualifiers fromOpaqueValue(unsigned opaque) {
+ static Qualifiers fromOpaqueValue(uint64_t opaque) {
Qualifiers Qs;
Qs.Mask = opaque;
return Qs;
}
// Serialize these qualifiers into an opaque representation.
- unsigned getAsOpaqueValue() const {
- return Mask;
- }
+ uint64_t getAsOpaqueValue() const { return Mask; }
bool hasConst() const { return Mask & Const; }
bool hasOnlyConst() const { return Mask == Const; }
@@ -302,7 +480,7 @@ public:
}
void removeCVRQualifiers(unsigned mask) {
assert(!(mask & ~CVRMask) && "bitmask contains non-CVR bits");
- Mask &= ~mask;
+ Mask &= ~static_cast<uint64_t>(mask);
}
void removeCVRQualifiers() {
removeCVRQualifiers(CVRMask);
@@ -407,6 +585,20 @@ public:
setAddressSpace(space);
}
+ bool hasPointerAuth() const { return Mask & PtrAuthMask; }
+ PointerAuthQualifier getPointerAuth() const {
+ return PointerAuthQualifier::fromOpaqueValue(Mask >> PtrAuthShift);
+ }
+ void setPointerAuth(PointerAuthQualifier Q) {
+ Mask = (Mask & ~PtrAuthMask) |
+ (uint64_t(Q.getAsOpaqueValue()) << PtrAuthShift);
+ }
+ void removePointerAuth() { Mask &= ~PtrAuthMask; }
+ void addPointerAuth(PointerAuthQualifier Q) {
+ assert(Q.isPresent());
+ setPointerAuth(Q);
+ }
+
// Fast qualifiers are those that can be allocated directly
// on a QualType object.
bool hasFastQualifiers() const { return getFastQualifiers(); }
@@ -417,7 +609,7 @@ public:
}
void removeFastQualifiers(unsigned mask) {
assert(!(mask & ~FastMask) && "bitmask contains non-fast qualifier bits");
- Mask &= ~mask;
+ Mask &= ~static_cast<uint64_t>(mask);
}
void removeFastQualifiers() {
removeFastQualifiers(FastMask);
@@ -454,6 +646,8 @@ public:
addObjCGCAttr(Q.getObjCGCAttr());
if (Q.hasObjCLifetime())
addObjCLifetime(Q.getObjCLifetime());
+ if (Q.hasPointerAuth())
+ addPointerAuth(Q.getPointerAuth());
}
}
@@ -471,6 +665,8 @@ public:
removeObjCLifetime();
if (getAddressSpace() == Q.getAddressSpace())
removeAddressSpace();
+ if (getPointerAuth() == Q.getPointerAuth())
+ removePointerAuth();
}
}
@@ -483,6 +679,8 @@ public:
!hasObjCGCAttr() || !qs.hasObjCGCAttr());
assert(getObjCLifetime() == qs.getObjCLifetime() ||
!hasObjCLifetime() || !qs.hasObjCLifetime());
+ assert(!hasPointerAuth() || !qs.hasPointerAuth() ||
+ getPointerAuth() == qs.getPointerAuth());
Mask |= qs.Mask;
}
@@ -536,6 +734,8 @@ public:
// be changed.
(getObjCGCAttr() == other.getObjCGCAttr() || !hasObjCGCAttr() ||
!other.hasObjCGCAttr()) &&
+ // Pointer-auth qualifiers must match exactly.
+ getPointerAuth() == other.getPointerAuth() &&
// ObjC lifetime qualifiers must match exactly.
getObjCLifetime() == other.getObjCLifetime() &&
// CVR qualifiers may subset.
@@ -605,24 +805,26 @@ public:
void print(raw_ostream &OS, const PrintingPolicy &Policy,
bool appendSpaceIfNonEmpty = false) const;
- void Profile(llvm::FoldingSetNodeID &ID) const {
- ID.AddInteger(Mask);
- }
+ void Profile(llvm::FoldingSetNodeID &ID) const { ID.AddInteger(Mask); }
private:
- // bits: |0 1 2|3|4 .. 5|6 .. 8|9 ... 31|
- // |C R V|U|GCAttr|Lifetime|AddressSpace|
- uint32_t Mask = 0;
-
- static const uint32_t UMask = 0x8;
- static const uint32_t UShift = 3;
- static const uint32_t GCAttrMask = 0x30;
- static const uint32_t GCAttrShift = 4;
- static const uint32_t LifetimeMask = 0x1C0;
- static const uint32_t LifetimeShift = 6;
- static const uint32_t AddressSpaceMask =
+ // bits: |0 1 2|3|4 .. 5|6 .. 8|9 ... 31|32 ... 63|
+ // |C R V|U|GCAttr|Lifetime|AddressSpace| PtrAuth |
+ uint64_t Mask = 0;
+ static_assert(sizeof(PointerAuthQualifier) == sizeof(uint32_t),
+ "PointerAuthQualifier must be 32 bits");
+
+ static constexpr uint64_t UMask = 0x8;
+ static constexpr uint64_t UShift = 3;
+ static constexpr uint64_t GCAttrMask = 0x30;
+ static constexpr uint64_t GCAttrShift = 4;
+ static constexpr uint64_t LifetimeMask = 0x1C0;
+ static constexpr uint64_t LifetimeShift = 6;
+ static constexpr uint64_t AddressSpaceMask =
~(CVRMask | UMask | GCAttrMask | LifetimeMask);
- static const uint32_t AddressSpaceShift = 9;
+ static constexpr uint64_t AddressSpaceShift = 9;
+ static constexpr uint64_t PtrAuthShift = 32;
+ static constexpr uint64_t PtrAuthMask = uint64_t(0xffffffff) << PtrAuthShift;
};
class QualifiersAndAtomic {
@@ -1242,6 +1444,10 @@ public:
// true when Type is objc's weak and weak is enabled but ARC isn't.
bool isNonWeakInMRRWithObjCWeak(const ASTContext &Context) const;
+ PointerAuthQualifier getPointerAuth() const {
+ return getQualifiers().getPointerAuth();
+ }
+
enum PrimitiveDefaultInitializeKind {
/// The type does not fall into any of the following categories. Note that
/// this case is zero-valued so that values of this enum can be used as a
@@ -2172,6 +2378,10 @@ public:
/// 'riscv_rvv_vector_bits' type attribute as VectorType.
QualType getRVVEltType(const ASTContext &Ctx) const;
+ /// Returns the representative type for the element of a sizeless vector
+ /// builtin type.
+ QualType getSizelessVectorEltType(const ASTContext &Ctx) const;
+
/// Types are partitioned into 3 broad categories (C99 6.2.5p1):
/// object types, function types, and incomplete types.
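Because the Type.h changes above are dense, a minimal sketch of the
round-trip the new qualifier supports may help; the key and discriminator
values below are arbitrary examples:

.. code-block:: c++

  #include "clang/AST/Type.h"
  #include <cassert>
  #include <cstdint>

  // Illustrative only: pack, serialize, and restore a pointer-auth qualifier.
  void demoPointerAuthQualifier() {
    using clang::PointerAuthQualifier;
    PointerAuthQualifier Q = PointerAuthQualifier::Create(
        /*Key=*/1, /*IsAddressDiscriminated=*/true,
        /*ExtraDiscriminator=*/0x1234,
        clang::PointerAuthenticationMode::SignAndAuth,
        /*IsIsaPointer=*/false, /*AuthenticatesNullValues=*/false);
    assert(Q.isPresent() && Q.getKey() == 1 && Q.isAddressDiscriminated());
    assert(Q.getExtraDiscriminator() == 0x1234);

    // The 32-bit opaque form is what occupies bits 32..63 of the widened
    // 64-bit Qualifiers::Mask.
    uint32_t Opaque = Q.getAsOpaqueValue();
    assert(PointerAuthQualifier::fromOpaqueValue(Opaque) == Q);
  }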
diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
index 4408d517e70e..97e06fe7d2e6 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -3211,7 +3211,7 @@ def ObjCRequiresPropertyDefs : InheritableAttr {
def Unused : InheritableAttr {
let Spellings = [CXX11<"", "maybe_unused", 201603>, GCC<"unused">,
C23<"", "maybe_unused", 202106>];
- let Subjects = SubjectList<[Var, ObjCIvar, Type, Enum, EnumConstant, Label,
+ let Subjects = SubjectList<[Var, Binding, ObjCIvar, Type, Enum, EnumConstant, Label,
Field, ObjCMethod, FunctionLike]>;
let Documentation = [WarnMaybeUnusedDocs];
}
diff --git a/clang/include/clang/Basic/DiagnosticParseKinds.td b/clang/include/clang/Basic/DiagnosticParseKinds.td
index 38174cf3549f..fdffb35ea0d9 100644
--- a/clang/include/clang/Basic/DiagnosticParseKinds.td
+++ b/clang/include/clang/Basic/DiagnosticParseKinds.td
@@ -478,6 +478,15 @@ def ext_decomp_decl_empty : ExtWarn<
"ISO C++17 does not allow a decomposition group to be empty">,
InGroup<DiagGroup<"empty-decomposition">>;
+// C++26 structured bindings
+def ext_decl_attrs_on_binding : ExtWarn<
+ "an attribute specifier sequence attached to a structured binding declaration "
+ "is a C++2c extension">, InGroup<CXX26>;
+def warn_cxx23_compat_decl_attrs_on_binding : Warning<
+ "an attribute specifier sequence attached to a structured binding declaration "
+ "is incompatible with C++ standards before C++2c">,
+ InGroup<CXXPre26Compat>, DefaultIgnore;
+
/// Objective-C parser diagnostics
def err_expected_minus_or_plus : Error<
"method type specifier must start with '-' or '+'">;
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index fdca82934cb4..f72d5c252b86 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -9901,6 +9901,9 @@ def warn_format_invalid_annotation : Warning<
def warn_format_P_no_precision : Warning<
"using '%%P' format specifier without precision">,
InGroup<Format>;
+def warn_format_P_with_objc_pointer : Warning<
+ "using '%%P' format specifier with an Objective-C pointer results in dumping runtime object structure, not object value">,
+ InGroup<Format>;
def warn_printf_ignored_flag: Warning<
"flag '%0' is ignored when flag '%1' is present">,
InGroup<Format>;
@@ -9950,6 +9953,8 @@ def warn_ret_stack_addr_ref : Warning<
def warn_ret_local_temp_addr_ref : Warning<
"returning %select{address of|reference to}0 local temporary object">,
InGroup<ReturnStackAddress>;
+def err_ret_local_temp_ref : Error<
+ "returning reference to local temporary object">;
def warn_ret_addr_label : Warning<
"returning address of label, which is local">,
InGroup<ReturnStackAddress>;
@@ -10328,9 +10333,13 @@ def err_shufflevector_nonconstant_argument : Error<
def err_shufflevector_argument_too_large : Error<
"index for __builtin_shufflevector must be less than the total number "
"of vector elements">;
+def err_shufflevector_minus_one_is_undefined_behavior_constexpr : Error<
+ "index for __builtin_shufflevector not within the bounds of the input vectors; index of -1 found at position %0 not permitted in a constexpr context.">;
def err_convertvector_non_vector : Error<
"first argument to __builtin_convertvector must be a vector">;
+def err_convertvector_constexpr_unsupported_vector_cast : Error<
+ "unsupported vector cast from %0 to %1 in a constant expression.">;
def err_builtin_non_vector_type : Error<
"%0 argument to %1 must be of vector type">;
def err_convertvector_incompatible_vector : Error<
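For context, a small sketch of code that would now hit the first of these
diagnostics during constant evaluation (the typedef is an assumption for the
example):

.. code-block:: c++

  typedef int v4si __attribute__((vector_size(16)));

  constexpr v4si v = {0, 1, 2, 3};
  // The -1 "don't care" indices at positions 1 and 3 trigger
  // err_shufflevector_minus_one_is_undefined_behavior_constexpr.
  constexpr v4si bad = __builtin_shufflevector(v, v, 3, -1, 1, -1); // error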
diff --git a/clang/include/clang/Basic/LangOptions.h b/clang/include/clang/Basic/LangOptions.h
index ae4715921d16..e2a2aa71b880 100644
--- a/clang/include/clang/Basic/LangOptions.h
+++ b/clang/include/clang/Basic/LangOptions.h
@@ -57,6 +57,13 @@ enum class ShaderStage {
Invalid,
};
+enum class PointerAuthenticationMode : unsigned {
+ None,
+ Strip,
+ SignAndStrip,
+ SignAndAuth
+};
+
/// Bitfields of LangOptions, split out from LangOptions in order to ensure that
/// this large collection of bitfields is a trivial class type.
class LangOptionsBase {
diff --git a/clang/include/clang/Basic/PointerAuthOptions.h b/clang/include/clang/Basic/PointerAuthOptions.h
new file mode 100644
index 000000000000..e5cdcc31ebfb
--- /dev/null
+++ b/clang/include/clang/Basic/PointerAuthOptions.h
@@ -0,0 +1,23 @@
+//===--- PointerAuthOptions.h -----------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines options for configuring pointer-auth technologies
+// like ARMv8.3.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_BASIC_POINTERAUTHOPTIONS_H
+#define LLVM_CLANG_BASIC_POINTERAUTHOPTIONS_H
+
+namespace clang {
+
+constexpr unsigned PointerAuthKeyNone = -1;
+
+} // end namespace clang
+
+#endif
diff --git a/clang/include/clang/Basic/TargetInfo.h b/clang/include/clang/Basic/TargetInfo.h
index e1ef7454f016..3ced2e7397a7 100644
--- a/clang/include/clang/Basic/TargetInfo.h
+++ b/clang/include/clang/Basic/TargetInfo.h
@@ -40,6 +40,7 @@
#include <cassert>
#include <optional>
#include <string>
+#include <utility>
#include <vector>
namespace llvm {
@@ -1792,6 +1793,15 @@ public:
/// Whether to support HIP image/texture API's.
virtual bool hasHIPImageSupport() const { return true; }
+ /// The first value in the pair is the minimum offset between two objects to
+ /// avoid false sharing (destructive interference). The second value in the
+  /// pair is the maximum size of contiguous memory to promote true sharing
+  /// (constructive interference). Neither value is considered part of the
+  /// ABI, and targets may change them at any time.
+ virtual std::pair<unsigned, unsigned> hardwareInterferenceSizes() const {
+ return std::make_pair(64, 64);
+ }
+
protected:
/// Copy type and layout related info.
void copyAuxTarget(const TargetInfo *Aux);
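A sketch of the consumer side: roughly how a frontend could publish the new
hook's result as the ``__GCC_*_SIZE`` macros mentioned in the release notes
above. The function below is an illustration, not the patch's actual wiring:

.. code-block:: c++

  #include "clang/Basic/MacroBuilder.h"
  #include "clang/Basic/TargetInfo.h"
  #include "llvm/ADT/Twine.h"

  // Illustrative only: define the interference sizes as predefined macros.
  void defineInterferenceMacros(const clang::TargetInfo &TI,
                                clang::MacroBuilder &Builder) {
    auto [Destructive, Constructive] = TI.hardwareInterferenceSizes();
    Builder.defineMacro("__GCC_DESTRUCTIVE_SIZE", llvm::Twine(Destructive));
    Builder.defineMacro("__GCC_CONSTRUCTIVE_SIZE", llvm::Twine(Constructive));
  }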
diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td
index 6d655c39360d..6390ba3f9fe5 100644
--- a/clang/include/clang/Basic/arm_neon.td
+++ b/clang/include/clang/Basic/arm_neon.td
@@ -275,7 +275,7 @@ def OP_VCVT_BF16_F32_HI_A32
(call "vget_low", $p0))>;
def OP_CVT_F32_BF16
- : Op<(bitcast "R", (op "<<", (bitcast "int32_t", $p0),
+ : Op<(bitcast "R", (op "<<", (cast "int32_t", (bitcast "int16_t", $p0)),
(literal "int32_t", "16")))>;
//===----------------------------------------------------------------------===//
diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td
index 6cc249837d3f..15340ebb62b3 100644
--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -1961,19 +1961,20 @@ def SVPSEL_D : SInst<"svpsel_lane_b64", "PPPm", "Pl", MergeNone, "", [IsStreamin
// Standalone sve2.1 builtins
let TargetGuard = "sve2p1" in {
-def SVORQV : SInst<"svorqv[_{d}]", "{Pd", "csilUcUsUiUl", MergeNone, "aarch64_sve_orqv", [IsReductionQV]>;
-def SVEORQV : SInst<"sveorqv[_{d}]", "{Pd", "csilUcUsUiUl", MergeNone, "aarch64_sve_eorqv", [IsReductionQV]>;
-def SVADDQV : SInst<"svaddqv[_{d}]", "{Pd", "hfdcsilUcUsUiUl", MergeNone, "aarch64_sve_addqv", [IsReductionQV]>;
-def SVANDQV : SInst<"svandqv[_{d}]", "{Pd", "csilUcUsUiUl", MergeNone, "aarch64_sve_andqv", [IsReductionQV]>;
-def SVSMAXQV : SInst<"svmaxqv[_{d}]", "{Pd", "csil", MergeNone, "aarch64_sve_smaxqv", [IsReductionQV]>;
-def SVUMAXQV : SInst<"svmaxqv[_{d}]", "{Pd", "UcUsUiUl", MergeNone, "aarch64_sve_umaxqv", [IsReductionQV]>;
-def SVSMINQV : SInst<"svminqv[_{d}]", "{Pd", "csil", MergeNone, "aarch64_sve_sminqv", [IsReductionQV]>;
-def SVUMINQV : SInst<"svminqv[_{d}]", "{Pd", "UcUsUiUl", MergeNone, "aarch64_sve_uminqv", [IsReductionQV]>;
-
-def SVFMAXNMQV: SInst<"svmaxnmqv[_{d}]", "{Pd", "hfd", MergeNone, "aarch64_sve_fmaxnmqv", [IsReductionQV]>;
-def SVFMINNMQV: SInst<"svminnmqv[_{d}]", "{Pd", "hfd", MergeNone, "aarch64_sve_fminnmqv", [IsReductionQV]>;
-def SVFMAXQV: SInst<"svmaxqv[_{d}]", "{Pd", "hfd", MergeNone, "aarch64_sve_fmaxqv", [IsReductionQV]>;
-def SVFMINQV: SInst<"svminqv[_{d}]", "{Pd", "hfd", MergeNone, "aarch64_sve_fminqv", [IsReductionQV]>;
+def SVORQV : SInst<"svorqv[_{d}]", "{Pd", "csilUcUsUiUl", MergeNone, "aarch64_sve_orqv", [IsReductionQV]>;
+def SVEORQV : SInst<"sveorqv[_{d}]", "{Pd", "csilUcUsUiUl", MergeNone, "aarch64_sve_eorqv", [IsReductionQV]>;
+def SVADDQV : SInst<"svaddqv[_{d}]", "{Pd", "csilUcUsUiUl", MergeNone, "aarch64_sve_addqv", [IsReductionQV]>;
+def SVANDQV : SInst<"svandqv[_{d}]", "{Pd", "csilUcUsUiUl", MergeNone, "aarch64_sve_andqv", [IsReductionQV]>;
+def SVSMAXQV : SInst<"svmaxqv[_{d}]", "{Pd", "csil", MergeNone, "aarch64_sve_smaxqv", [IsReductionQV]>;
+def SVUMAXQV : SInst<"svmaxqv[_{d}]", "{Pd", "UcUsUiUl", MergeNone, "aarch64_sve_umaxqv", [IsReductionQV]>;
+def SVSMINQV : SInst<"svminqv[_{d}]", "{Pd", "csil", MergeNone, "aarch64_sve_sminqv", [IsReductionQV]>;
+def SVUMINQV : SInst<"svminqv[_{d}]", "{Pd", "UcUsUiUl", MergeNone, "aarch64_sve_uminqv", [IsReductionQV]>;
+
+def SVFADDQV : SInst<"svaddqv[_{d}]", "{Pd", "hfd", MergeNone, "aarch64_sve_faddqv", [IsReductionQV]>;
+def SVFMAXNMQV : SInst<"svmaxnmqv[_{d}]", "{Pd", "hfd", MergeNone, "aarch64_sve_fmaxnmqv", [IsReductionQV]>;
+def SVFMINNMQV : SInst<"svminnmqv[_{d}]", "{Pd", "hfd", MergeNone, "aarch64_sve_fminnmqv", [IsReductionQV]>;
+def SVFMAXQV : SInst<"svmaxqv[_{d}]", "{Pd", "hfd", MergeNone, "aarch64_sve_fmaxqv", [IsReductionQV]>;
+def SVFMINQV : SInst<"svminqv[_{d}]", "{Pd", "hfd", MergeNone, "aarch64_sve_fminqv", [IsReductionQV]>;
}
let TargetGuard = "sve2p1|sme2" in {
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 4cb0b840df87..263d1edf141a 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -4881,6 +4881,8 @@ def msimd128 : Flag<["-"], "msimd128">, Group<m_wasm_Features_Group>;
def mno_simd128 : Flag<["-"], "mno-simd128">, Group<m_wasm_Features_Group>;
def mrelaxed_simd : Flag<["-"], "mrelaxed-simd">, Group<m_wasm_Features_Group>;
def mno_relaxed_simd : Flag<["-"], "mno-relaxed-simd">, Group<m_wasm_Features_Group>;
+def mhalf_precision : Flag<["-"], "mhalf-precision">, Group<m_wasm_Features_Group>;
+def mno_half_precision : Flag<["-"], "mno-half-precision">, Group<m_wasm_Features_Group>;
def mnontrapping_fptoint : Flag<["-"], "mnontrapping-fptoint">, Group<m_wasm_Features_Group>;
def mno_nontrapping_fptoint : Flag<["-"], "mno-nontrapping-fptoint">, Group<m_wasm_Features_Group>;
def msign_ext : Flag<["-"], "msign-ext">, Group<m_wasm_Features_Group>;
@@ -6587,12 +6589,6 @@ def J : JoinedOrSeparate<["-"], "J">,
Group<gfortran_Group>,
Alias<module_dir>;
-let Visibility = [FlangOption] in {
-def no_fortran_main : Flag<["-"], "fno-fortran-main">,
- Visibility<[FlangOption]>, Group<f_Group>,
- HelpText<"Do not include Fortran_main.a (provided by Flang) when linking">;
-} // let Visibility = [ FlangOption ]
-
//===----------------------------------------------------------------------===//
// FC1 Options
//===----------------------------------------------------------------------===//
diff --git a/clang/include/clang/Sema/DeclSpec.h b/clang/include/clang/Sema/DeclSpec.h
index c9eecdafe62c..760c7980be52 100644
--- a/clang/include/clang/Sema/DeclSpec.h
+++ b/clang/include/clang/Sema/DeclSpec.h
@@ -36,6 +36,7 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
+#include <optional>
namespace clang {
class ASTContext;
@@ -1790,6 +1791,7 @@ public:
struct Binding {
IdentifierInfo *Name;
SourceLocation NameLoc;
+ std::optional<ParsedAttributes> Attrs;
};
private:
@@ -2339,10 +2341,10 @@ public:
}
/// Set the decomposition bindings for this declarator.
- void
- setDecompositionBindings(SourceLocation LSquareLoc,
- ArrayRef<DecompositionDeclarator::Binding> Bindings,
- SourceLocation RSquareLoc);
+ void setDecompositionBindings(
+ SourceLocation LSquareLoc,
+ MutableArrayRef<DecompositionDeclarator::Binding> Bindings,
+ SourceLocation RSquareLoc);
/// AddTypeInfo - Add a chunk to this declarator. Also extend the range to
/// EndLoc, which should be the last token of the chunk.
diff --git a/clang/include/clang/Sema/Lookup.h b/clang/include/clang/Sema/Lookup.h
index b0a08a05ac6a..0db5b847038f 100644
--- a/clang/include/clang/Sema/Lookup.h
+++ b/clang/include/clang/Sema/Lookup.h
@@ -499,9 +499,7 @@ public:
/// Note that while no result was found in the current instantiation,
/// there were dependent base classes that could not be searched.
void setNotFoundInCurrentInstantiation() {
- assert((ResultKind == NotFound ||
- ResultKind == NotFoundInCurrentInstantiation) &&
- Decls.empty());
+ assert(ResultKind == NotFound && Decls.empty());
ResultKind = NotFoundInCurrentInstantiation;
}
diff --git a/clang/include/clang/Sema/ParsedAttr.h b/clang/include/clang/Sema/ParsedAttr.h
index 25a5fa05b21c..8368d9ce6146 100644
--- a/clang/include/clang/Sema/ParsedAttr.h
+++ b/clang/include/clang/Sema/ParsedAttr.h
@@ -948,6 +948,7 @@ public:
ParsedAttributes(AttributeFactory &factory) : pool(factory) {}
ParsedAttributes(const ParsedAttributes &) = delete;
ParsedAttributes &operator=(const ParsedAttributes &) = delete;
+ ParsedAttributes(ParsedAttributes &&G) = default;
AttributePool &getPool() const { return pool; }
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index aa182b15e66e..1ca523ec88c2 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -7472,7 +7472,7 @@ public:
bool LookupQualifiedName(LookupResult &R, DeclContext *LookupCtx,
CXXScopeSpec &SS);
bool LookupParsedName(LookupResult &R, Scope *S, CXXScopeSpec *SS,
- QualType ObjectType, bool AllowBuiltinCreation = false,
+ bool AllowBuiltinCreation = false,
bool EnteringContext = false);
ObjCProtocolDecl *LookupProtocol(
IdentifierInfo *II, SourceLocation IdLoc,
@@ -8881,13 +8881,11 @@ public:
/// functions (but no function templates).
FoundFunctions,
};
-
- bool
- LookupTemplateName(LookupResult &R, Scope *S, CXXScopeSpec &SS,
- QualType ObjectType, bool EnteringContext,
- RequiredTemplateKind RequiredTemplate = SourceLocation(),
- AssumedTemplateKind *ATK = nullptr,
- bool AllowTypoCorrection = true);
+ bool LookupTemplateName(
+ LookupResult &R, Scope *S, CXXScopeSpec &SS, QualType ObjectType,
+ bool EnteringContext, bool &MemberOfUnknownSpecialization,
+ RequiredTemplateKind RequiredTemplate = SourceLocation(),
+ AssumedTemplateKind *ATK = nullptr, bool AllowTypoCorrection = true);
TemplateNameKind isTemplateName(Scope *S, CXXScopeSpec &SS,
bool hasTemplateKeyword,
diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/Store.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/Store.h
index fac0c04ae2ca..ef23b160a3c0 100644
--- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/Store.h
+++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/Store.h
@@ -225,15 +225,11 @@ public:
/// invalidated. This should include any regions explicitly invalidated
/// even if they do not currently have bindings. Pass \c NULL if this
/// information will not be used.
- virtual StoreRef invalidateRegions(Store store,
- ArrayRef<SVal> Values,
- const Expr *E, unsigned Count,
- const LocationContext *LCtx,
- const CallEvent *Call,
- InvalidatedSymbols &IS,
- RegionAndSymbolInvalidationTraits &ITraits,
- InvalidatedRegions *InvalidatedTopLevel,
- InvalidatedRegions *Invalidated) = 0;
+ virtual StoreRef invalidateRegions(
+ Store store, ArrayRef<SVal> Values, const Expr *Ex, unsigned Count,
+ const LocationContext *LCtx, const CallEvent *Call,
+ InvalidatedSymbols &IS, RegionAndSymbolInvalidationTraits &ITraits,
+ InvalidatedRegions *TopLevelRegions, InvalidatedRegions *Invalidated) = 0;
/// enterStackFrame - Let the StoreManager to do something when execution
/// engine is about to execute into a callee.
diff --git a/clang/lib/APINotes/APINotesFormat.h b/clang/lib/APINotes/APINotesFormat.h
index 615314c46f09..97e630e97fdc 100644
--- a/clang/lib/APINotes/APINotesFormat.h
+++ b/clang/lib/APINotes/APINotesFormat.h
@@ -24,7 +24,10 @@ const uint16_t VERSION_MAJOR = 0;
/// API notes file minor version number.
///
/// When the format changes IN ANY WAY, this number should be incremented.
-const uint16_t VERSION_MINOR = 25; // SwiftImportAs
+const uint16_t VERSION_MINOR = 26; // SwiftCopyable
+
+const uint8_t kSwiftCopyable = 1;
+const uint8_t kSwiftNonCopyable = 2;
using IdentifierID = llvm::PointerEmbeddedInt<unsigned, 31>;
using IdentifierIDField = llvm::BCVBR<16>;
diff --git a/clang/lib/APINotes/APINotesReader.cpp b/clang/lib/APINotes/APINotesReader.cpp
index dfc3beb6fa13..b60ca685f62c 100644
--- a/clang/lib/APINotes/APINotesReader.cpp
+++ b/clang/lib/APINotes/APINotesReader.cpp
@@ -527,6 +527,13 @@ public:
Info.EnumExtensibility =
static_cast<EnumExtensibilityKind>((Payload & 0x3) - 1);
+ uint8_t Copyable =
+ endian::readNext<uint8_t, llvm::endianness::little>(Data);
+ if (Copyable == kSwiftNonCopyable)
+ Info.setSwiftCopyable(std::optional(false));
+ else if (Copyable == kSwiftCopyable)
+ Info.setSwiftCopyable(std::optional(true));
+
unsigned ImportAsLength =
endian::readNext<uint16_t, llvm::endianness::little>(Data);
if (ImportAsLength > 0) {
diff --git a/clang/lib/APINotes/APINotesWriter.cpp b/clang/lib/APINotes/APINotesWriter.cpp
index e3f5d102fcd0..3e6159763150 100644
--- a/clang/lib/APINotes/APINotesWriter.cpp
+++ b/clang/lib/APINotes/APINotesWriter.cpp
@@ -1128,7 +1128,7 @@ public:
return 2 + (TI.SwiftImportAs ? TI.SwiftImportAs->size() : 0) +
2 + (TI.SwiftRetainOp ? TI.SwiftRetainOp->size() : 0) +
2 + (TI.SwiftReleaseOp ? TI.SwiftReleaseOp->size() : 0) +
- 1 + getCommonTypeInfoSize(TI);
+ 2 + getCommonTypeInfoSize(TI);
}
void emitUnversionedInfo(raw_ostream &OS, const TagInfo &TI) {
@@ -1146,6 +1146,11 @@ public:
writer.write<uint8_t>(Flags);
+ if (auto Copyable = TI.isSwiftCopyable())
+ writer.write<uint8_t>(*Copyable ? kSwiftCopyable : kSwiftNonCopyable);
+ else
+ writer.write<uint8_t>(0);
+
if (auto ImportAs = TI.SwiftImportAs) {
writer.write<uint16_t>(ImportAs->size() + 1);
OS.write(ImportAs->c_str(), ImportAs->size());
diff --git a/clang/lib/APINotes/APINotesYAMLCompiler.cpp b/clang/lib/APINotes/APINotesYAMLCompiler.cpp
index 57d6da7a1775..2295d769d344 100644
--- a/clang/lib/APINotes/APINotesYAMLCompiler.cpp
+++ b/clang/lib/APINotes/APINotesYAMLCompiler.cpp
@@ -419,6 +419,7 @@ struct Tag {
std::optional<EnumExtensibilityKind> EnumExtensibility;
std::optional<bool> FlagEnum;
std::optional<EnumConvenienceAliasKind> EnumConvenienceKind;
+ std::optional<bool> SwiftCopyable;
};
typedef std::vector<Tag> TagsSeq;
@@ -452,6 +453,7 @@ template <> struct MappingTraits<Tag> {
IO.mapOptional("EnumExtensibility", T.EnumExtensibility);
IO.mapOptional("FlagEnum", T.FlagEnum);
IO.mapOptional("EnumKind", T.EnumConvenienceKind);
+ IO.mapOptional("SwiftCopyable", T.SwiftCopyable);
}
};
} // namespace yaml
@@ -1009,6 +1011,9 @@ public:
if (Tag.SwiftReleaseOp)
TI.SwiftReleaseOp = Tag.SwiftReleaseOp;
+ if (Tag.SwiftCopyable)
+ TI.setSwiftCopyable(Tag.SwiftCopyable);
+
if (Tag.EnumConvenienceKind) {
if (Tag.EnumExtensibility) {
emitError(
diff --git a/clang/lib/AST/DeclBase.cpp b/clang/lib/AST/DeclBase.cpp
index c33babf8d1df..f341c74cf86e 100644
--- a/clang/lib/AST/DeclBase.cpp
+++ b/clang/lib/AST/DeclBase.cpp
@@ -1115,7 +1115,9 @@ int64_t Decl::getID() const {
const FunctionType *Decl::getFunctionType(bool BlocksToo) const {
QualType Ty;
- if (const auto *D = dyn_cast<ValueDecl>(this))
+  if (isa<BindingDecl>(this))
+ return nullptr;
+ else if (const auto *D = dyn_cast<ValueDecl>(this))
Ty = D->getType();
else if (const auto *D = dyn_cast<TypedefNameDecl>(this))
Ty = D->getUnderlyingType();
diff --git a/clang/lib/AST/Expr.cpp b/clang/lib/AST/Expr.cpp
index d2e40be59d6f..63dcdb919c71 100644
--- a/clang/lib/AST/Expr.cpp
+++ b/clang/lib/AST/Expr.cpp
@@ -103,7 +103,7 @@ const Expr *Expr::skipRValueSubobjectAdjustments(
}
} else if (const auto *ME = dyn_cast<MemberExpr>(E)) {
if (!ME->isArrow()) {
- assert(ME->getBase()->getType()->getAsRecordDecl());
+ assert(ME->getBase()->getType()->isRecordType());
if (const auto *Field = dyn_cast<FieldDecl>(ME->getMemberDecl())) {
if (!Field->isBitField() && !Field->getType()->isReferenceType()) {
E = ME->getBase();
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index ea3e7304a742..f1aa19e4409e 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -2706,7 +2706,11 @@ static bool checkFloatingPointResult(EvalInfo &Info, const Expr *E,
static bool HandleFloatToFloatCast(EvalInfo &Info, const Expr *E,
QualType SrcType, QualType DestType,
APFloat &Result) {
- assert(isa<CastExpr>(E) || isa<CompoundAssignOperator>(E));
+ assert((isa<CastExpr>(E) || isa<CompoundAssignOperator>(E) ||
+ isa<ConvertVectorExpr>(E)) &&
+ "HandleFloatToFloatCast has been checked with only CastExpr, "
+ "CompoundAssignOperator and ConvertVectorExpr. Please either validate "
+ "the new expression or address the root cause of this usage.");
llvm::RoundingMode RM = getActiveRoundingMode(Info, E);
APFloat::opStatus St;
APFloat Value = Result;
@@ -9237,9 +9241,10 @@ bool PointerExprEvaluator::VisitCastExpr(const CastExpr *E) {
bool HasValidResult = !Result.InvalidBase && !Result.Designator.Invalid &&
!Result.IsNullPtr;
bool VoidPtrCastMaybeOK =
- HasValidResult &&
- Info.Ctx.hasSameUnqualifiedType(Result.Designator.getType(Info.Ctx),
- E->getType()->getPointeeType());
+ Result.IsNullPtr ||
+ (HasValidResult &&
+ Info.Ctx.hasSimilarType(Result.Designator.getType(Info.Ctx),
+ E->getType()->getPointeeType()));
// 1. We'll allow it in std::allocator::allocate, and anything which that
// calls.
// 2. HACK 2022-03-28: Work around an issue with libstdc++'s
@@ -10709,8 +10714,11 @@ namespace {
bool VisitUnaryImag(const UnaryOperator *E);
bool VisitBinaryOperator(const BinaryOperator *E);
bool VisitUnaryOperator(const UnaryOperator *E);
+ bool VisitConvertVectorExpr(const ConvertVectorExpr *E);
+ bool VisitShuffleVectorExpr(const ShuffleVectorExpr *E);
+
// FIXME: Missing: conditional operator (for GNU
- // conditional select), shufflevector, ExtVectorElementExpr
+ // conditional select), ExtVectorElementExpr
};
} // end anonymous namespace
@@ -10961,6 +10969,122 @@ bool VectorExprEvaluator::VisitUnaryOperator(const UnaryOperator *E) {
return Success(APValue(ResultElements.data(), ResultElements.size()), E);
}
+static bool handleVectorElementCast(EvalInfo &Info, const FPOptions FPO,
+ const Expr *E, QualType SourceTy,
+ QualType DestTy, APValue const &Original,
+ APValue &Result) {
+ if (SourceTy->isIntegerType()) {
+ if (DestTy->isRealFloatingType()) {
+ Result = APValue(APFloat(0.0));
+ return HandleIntToFloatCast(Info, E, FPO, SourceTy, Original.getInt(),
+ DestTy, Result.getFloat());
+ }
+ if (DestTy->isIntegerType()) {
+ Result = APValue(
+ HandleIntToIntCast(Info, E, DestTy, SourceTy, Original.getInt()));
+ return true;
+ }
+ } else if (SourceTy->isRealFloatingType()) {
+ if (DestTy->isRealFloatingType()) {
+ Result = Original;
+ return HandleFloatToFloatCast(Info, E, SourceTy, DestTy,
+ Result.getFloat());
+ }
+ if (DestTy->isIntegerType()) {
+ Result = APValue(APSInt());
+ return HandleFloatToIntCast(Info, E, SourceTy, Original.getFloat(),
+ DestTy, Result.getInt());
+ }
+ }
+
+ Info.FFDiag(E, diag::err_convertvector_constexpr_unsupported_vector_cast)
+ << SourceTy << DestTy;
+ return false;
+}
+
+bool VectorExprEvaluator::VisitConvertVectorExpr(const ConvertVectorExpr *E) {
+ APValue Source;
+ QualType SourceVecType = E->getSrcExpr()->getType();
+ if (!EvaluateAsRValue(Info, E->getSrcExpr(), Source))
+ return false;
+
+ QualType DestTy = E->getType()->castAs<VectorType>()->getElementType();
+ QualType SourceTy = SourceVecType->castAs<VectorType>()->getElementType();
+
+ const FPOptions FPO = E->getFPFeaturesInEffect(Info.Ctx.getLangOpts());
+
+ auto SourceLen = Source.getVectorLength();
+ SmallVector<APValue, 4> ResultElements;
+ ResultElements.reserve(SourceLen);
+ for (unsigned EltNum = 0; EltNum < SourceLen; ++EltNum) {
+ APValue Elt;
+ if (!handleVectorElementCast(Info, FPO, E, SourceTy, DestTy,
+ Source.getVectorElt(EltNum), Elt))
+ return false;
+ ResultElements.push_back(std::move(Elt));
+ }
+
+ return Success(APValue(ResultElements.data(), ResultElements.size()), E);
+}
+
+static bool handleVectorShuffle(EvalInfo &Info, const ShuffleVectorExpr *E,
+ QualType ElemType, APValue const &VecVal1,
+ APValue const &VecVal2, unsigned EltNum,
+ APValue &Result) {
+ unsigned const TotalElementsInInputVector1 = VecVal1.getVectorLength();
+ unsigned const TotalElementsInInputVector2 = VecVal2.getVectorLength();
+
+ APSInt IndexVal = E->getShuffleMaskIdx(Info.Ctx, EltNum);
+ int64_t index = IndexVal.getExtValue();
+ // The spec says that -1 should be treated as undef for optimizations,
+ // but in constexpr we'd have to produce an APValue::Indeterminate,
+ // which is prohibited from being a top-level constant value. Emit a
+ // diagnostic instead.
+ if (index == -1) {
+ Info.FFDiag(
+ E, diag::err_shufflevector_minus_one_is_undefined_behavior_constexpr)
+ << EltNum;
+ return false;
+ }
+
+ if (index < 0 ||
+ index >= TotalElementsInInputVector1 + TotalElementsInInputVector2)
+ llvm_unreachable("Out of bounds shuffle index");
+
+ if (index >= TotalElementsInInputVector1)
+ Result = VecVal2.getVectorElt(index - TotalElementsInInputVector1);
+ else
+ Result = VecVal1.getVectorElt(index);
+ return true;
+}
+
+bool VectorExprEvaluator::VisitShuffleVectorExpr(const ShuffleVectorExpr *E) {
+ APValue VecVal1;
+ const Expr *Vec1 = E->getExpr(0);
+ if (!EvaluateAsRValue(Info, Vec1, VecVal1))
+ return false;
+ APValue VecVal2;
+ const Expr *Vec2 = E->getExpr(1);
+ if (!EvaluateAsRValue(Info, Vec2, VecVal2))
+ return false;
+
+ VectorType const *DestVecTy = E->getType()->castAs<VectorType>();
+ QualType DestElTy = DestVecTy->getElementType();
+
+ auto TotalElementsInOutputVector = DestVecTy->getNumElements();
+
+ SmallVector<APValue, 4> ResultElements;
+ ResultElements.reserve(TotalElementsInOutputVector);
+ for (unsigned EltNum = 0; EltNum < TotalElementsInOutputVector; ++EltNum) {
+ APValue Elt;
+ if (!handleVectorShuffle(Info, E, DestElTy, VecVal1, VecVal2, EltNum, Elt))
+ return false;
+ ResultElements.push_back(std::move(Elt));
+ }
+
+ return Success(APValue(ResultElements.data(), ResultElements.size()), E);
+}
+
//===----------------------------------------------------------------------===//
// Array Evaluation
//===----------------------------------------------------------------------===//
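A minimal sketch of what the two evaluators above unlock — illustrative only, assuming a Clang carrying this patch and the ext_vector_type extension; the variable names are invented for the example:

    typedef float float4 __attribute__((ext_vector_type(4)));
    typedef int   int4   __attribute__((ext_vector_type(4)));

    // Element-wise casts now fold via handleVectorElementCast.
    constexpr float4 F = {1.0f, 2.5f, -3.0f, 4.0f};
    constexpr int4   C = __builtin_convertvector(F, int4);   // {1, 2, -3, 4}

    // Constant shuffle masks now fold via handleVectorShuffle; a -1 (undef)
    // mask element is diagnosed rather than folded, per the comment above.
    constexpr int4   S = __builtin_shufflevector(C, C, 3, 2, 1, 0);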
diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.cpp b/clang/lib/AST/Interp/ByteCodeExprGen.cpp
index 8cd0c198d9a8..17f95e7f3cac 100644
--- a/clang/lib/AST/Interp/ByteCodeExprGen.cpp
+++ b/clang/lib/AST/Interp/ByteCodeExprGen.cpp
@@ -110,18 +110,37 @@ bool ByteCodeExprGen<Emitter>::VisitCastExpr(const CastExpr *CE) {
if (!this->visit(SubExpr))
return false;
- unsigned DerivedOffset = collectBaseOffset(getRecordTy(CE->getType()),
- getRecordTy(SubExpr->getType()));
+ const auto extractRecordDecl = [](QualType Ty) -> const CXXRecordDecl * {
+ if (const auto *PT = dyn_cast<PointerType>(Ty))
+ return PT->getPointeeType()->getAsCXXRecordDecl();
+ return Ty->getAsCXXRecordDecl();
+ };
+
+ // FIXME: We can express a series of non-virtual casts as a single
+ // GetPtrBasePop op.
+ QualType CurType = SubExpr->getType();
+ for (const CXXBaseSpecifier *B : CE->path()) {
+ if (B->isVirtual()) {
+ if (!this->emitGetPtrVirtBasePop(extractRecordDecl(B->getType()), CE))
+ return false;
+ CurType = B->getType();
+ } else {
+ unsigned DerivedOffset = collectBaseOffset(B->getType(), CurType);
+ if (!this->emitGetPtrBasePop(DerivedOffset, CE))
+ return false;
+ CurType = B->getType();
+ }
+ }
- return this->emitGetPtrBasePop(DerivedOffset, CE);
+ return true;
}
case CK_BaseToDerived: {
if (!this->visit(SubExpr))
return false;
- unsigned DerivedOffset = collectBaseOffset(getRecordTy(SubExpr->getType()),
- getRecordTy(CE->getType()));
+ unsigned DerivedOffset =
+ collectBaseOffset(SubExpr->getType(), CE->getType());
return this->emitGetPtrDerivedPop(DerivedOffset, CE);
}
@@ -193,6 +212,13 @@ bool ByteCodeExprGen<Emitter>::VisitCastExpr(const CastExpr *CE) {
if (!this->visit(SubExpr))
return false;
+ // If SubExpr doesn't result in a pointer, make it one.
+ if (PrimType FromT = classifyPrim(SubExpr->getType()); FromT != PT_Ptr) {
+ assert(isPtrType(FromT));
+ if (!this->emitDecayPtr(FromT, PT_Ptr, CE))
+ return false;
+ }
+
PrimType T = classifyPrim(CE->getType());
if (T == PT_IntAP)
return this->emitCastPointerIntegralAP(Ctx.getBitWidth(CE->getType()),
@@ -905,8 +931,31 @@ bool ByteCodeExprGen<Emitter>::VisitImplicitValueInitExpr(const ImplicitValueIni
if (std::optional<PrimType> T = classify(QT))
return this->visitZeroInitializer(*T, QT, E);
- if (QT->isRecordType())
- return false;
+ if (QT->isRecordType()) {
+ const RecordDecl *RD = QT->getAsRecordDecl();
+ assert(RD);
+ if (RD->isInvalidDecl())
+ return false;
+ if (RD->isUnion()) {
+ // C++11 [dcl.init]p5: If T is a (possibly cv-qualified) union type, the
+ // object's first non-static named data member is zero-initialized
+ // FIXME
+ return false;
+ }
+
+ if (const auto *CXXRD = dyn_cast<CXXRecordDecl>(RD);
+ CXXRD && CXXRD->getNumVBases() > 0) {
+ // TODO: Diagnose.
+ return false;
+ }
+
+ const Record *R = getRecord(QT);
+ if (!R)
+ return false;
+
+ assert(Initializing);
+ return this->visitZeroRecordInitializer(R, E);
+ }
if (QT->isIncompleteArrayType())
return true;
@@ -981,122 +1030,98 @@ bool ByteCodeExprGen<Emitter>::VisitArraySubscriptExpr(
template <class Emitter>
bool ByteCodeExprGen<Emitter>::visitInitList(ArrayRef<const Expr *> Inits,
+ const Expr *ArrayFiller,
const Expr *E) {
- assert(E->getType()->isRecordType());
- const Record *R = getRecord(E->getType());
+ if (E->getType()->isVoidType())
+ return this->emitInvalid(E);
- if (Inits.size() == 1 && E->getType() == Inits[0]->getType()) {
- return this->visitInitializer(Inits[0]);
+ // Handle discarding first.
+ if (DiscardResult) {
+ for (const Expr *Init : Inits) {
+ if (!this->discard(Init))
+ return false;
+ }
+ return true;
}
- unsigned InitIndex = 0;
- for (const Expr *Init : Inits) {
- // Skip unnamed bitfields.
- while (InitIndex < R->getNumFields() &&
- R->getField(InitIndex)->Decl->isUnnamedBitField())
- ++InitIndex;
+ // Primitive values.
+ if (std::optional<PrimType> T = classify(E->getType())) {
+ assert(!DiscardResult);
+ if (Inits.size() == 0)
+ return this->visitZeroInitializer(*T, E->getType(), E);
+ assert(Inits.size() == 1);
+ return this->delegate(Inits[0]);
+ }
- if (!this->emitDupPtr(E))
- return false;
+ QualType T = E->getType();
+ if (T->isRecordType()) {
+ const Record *R = getRecord(E->getType());
- if (std::optional<PrimType> T = classify(Init)) {
- const Record::Field *FieldToInit = R->getField(InitIndex);
- if (!this->visit(Init))
- return false;
+ if (Inits.size() == 1 && E->getType() == Inits[0]->getType()) {
+ return this->visitInitializer(Inits[0]);
+ }
- if (FieldToInit->isBitField()) {
- if (!this->emitInitBitField(*T, FieldToInit, E))
- return false;
- } else {
- if (!this->emitInitField(*T, FieldToInit->Offset, E))
- return false;
- }
+ unsigned InitIndex = 0;
+ for (const Expr *Init : Inits) {
+ // Skip unnamed bitfields.
+ while (InitIndex < R->getNumFields() &&
+ R->getField(InitIndex)->Decl->isUnnamedBitField())
+ ++InitIndex;
- if (!this->emitPopPtr(E))
+ if (!this->emitDupPtr(E))
return false;
- ++InitIndex;
- } else {
- // Initializer for a direct base class.
- if (const Record::Base *B = R->getBase(Init->getType())) {
- if (!this->emitGetPtrBasePop(B->Offset, Init))
- return false;
- if (!this->visitInitializer(Init))
- return false;
-
- if (!this->emitFinishInitPop(E))
- return false;
- // Base initializers don't increase InitIndex, since they don't count
- // into the Record's fields.
- } else {
+ if (std::optional<PrimType> T = classify(Init)) {
const Record::Field *FieldToInit = R->getField(InitIndex);
- // Non-primitive case. Get a pointer to the field-to-initialize
- // on the stack and recurse into visitInitializer().
- if (!this->emitGetPtrField(FieldToInit->Offset, Init))
+ if (!this->visit(Init))
return false;
- if (!this->visitInitializer(Init))
- return false;
+ if (FieldToInit->isBitField()) {
+ if (!this->emitInitBitField(*T, FieldToInit, E))
+ return false;
+ } else {
+ if (!this->emitInitField(*T, FieldToInit->Offset, E))
+ return false;
+ }
if (!this->emitPopPtr(E))
return false;
++InitIndex;
- }
- }
- }
- return true;
-}
+ } else {
+ // Initializer for a direct base class.
+ if (const Record::Base *B = R->getBase(Init->getType())) {
+ if (!this->emitGetPtrBasePop(B->Offset, Init))
+ return false;
-/// Pointer to the array(not the element!) must be on the stack when calling
-/// this.
-template <class Emitter>
-bool ByteCodeExprGen<Emitter>::visitArrayElemInit(unsigned ElemIndex,
- const Expr *Init) {
- if (std::optional<PrimType> T = classify(Init->getType())) {
- // Visit the primitive element like normal.
- if (!this->visit(Init))
- return false;
- return this->emitInitElem(*T, ElemIndex, Init);
- }
+ if (!this->visitInitializer(Init))
+ return false;
- // Advance the pointer currently on the stack to the given
- // dimension.
- if (!this->emitConstUint32(ElemIndex, Init))
- return false;
- if (!this->emitArrayElemPtrUint32(Init))
- return false;
- if (!this->visitInitializer(Init))
- return false;
- return this->emitFinishInitPop(Init);
-}
+ if (!this->emitFinishInitPop(E))
+ return false;
+ // Base initializers don't increase InitIndex, since they don't count
+ // into the Record's fields.
+ } else {
+ const Record::Field *FieldToInit = R->getField(InitIndex);
+ // Non-primitive case. Get a pointer to the field-to-initialize
+ // on the stack and recurse into visitInitializer().
+ if (!this->emitGetPtrField(FieldToInit->Offset, Init))
+ return false;
-template <class Emitter>
-bool ByteCodeExprGen<Emitter>::VisitInitListExpr(const InitListExpr *E) {
- // Handle discarding first.
- if (DiscardResult) {
- for (const Expr *Init : E->inits()) {
- if (!this->discard(Init))
- return false;
+ if (!this->visitInitializer(Init))
+ return false;
+
+ if (!this->emitPopPtr(E))
+ return false;
+ ++InitIndex;
+ }
+ }
}
return true;
}
- // Primitive values.
- if (std::optional<PrimType> T = classify(E->getType())) {
- assert(!DiscardResult);
- if (E->getNumInits() == 0)
- return this->visitZeroInitializer(*T, E->getType(), E);
- assert(E->getNumInits() == 1);
- return this->delegate(E->inits()[0]);
- }
-
- QualType T = E->getType();
- if (T->isRecordType())
- return this->visitInitList(E->inits(), E);
-
if (T->isArrayType()) {
unsigned ElementIndex = 0;
- for (const Expr *Init : E->inits()) {
+ for (const Expr *Init : Inits) {
if (!this->visitArrayElemInit(ElementIndex, Init))
return false;
++ElementIndex;
@@ -1104,13 +1129,13 @@ bool ByteCodeExprGen<Emitter>::VisitInitListExpr(const InitListExpr *E) {
// Expand the filler expression.
// FIXME: This should go away.
- if (const Expr *Filler = E->getArrayFiller()) {
+ if (ArrayFiller) {
const ConstantArrayType *CAT =
Ctx.getASTContext().getAsConstantArrayType(E->getType());
uint64_t NumElems = CAT->getZExtSize();
for (; ElementIndex != NumElems; ++ElementIndex) {
- if (!this->visitArrayElemInit(ElementIndex, Filler))
+ if (!this->visitArrayElemInit(ElementIndex, ArrayFiller))
return false;
}
}
@@ -1119,10 +1144,10 @@ bool ByteCodeExprGen<Emitter>::VisitInitListExpr(const InitListExpr *E) {
}
if (const auto *ComplexTy = E->getType()->getAs<ComplexType>()) {
- unsigned NumInits = E->getNumInits();
+ unsigned NumInits = Inits.size();
if (NumInits == 1)
- return this->delegate(E->inits()[0]);
+ return this->delegate(Inits[0]);
QualType ElemQT = ComplexTy->getElementType();
PrimType ElemT = classifyPrim(ElemQT);
@@ -1136,7 +1161,7 @@ bool ByteCodeExprGen<Emitter>::VisitInitListExpr(const InitListExpr *E) {
}
} else if (NumInits == 2) {
unsigned InitIndex = 0;
- for (const Expr *Init : E->inits()) {
+ for (const Expr *Init : Inits) {
if (!this->visit(Init))
return false;
@@ -1150,14 +1175,14 @@ bool ByteCodeExprGen<Emitter>::VisitInitListExpr(const InitListExpr *E) {
if (const auto *VecT = E->getType()->getAs<VectorType>()) {
unsigned NumVecElements = VecT->getNumElements();
- assert(NumVecElements >= E->getNumInits());
+ assert(NumVecElements >= Inits.size());
QualType ElemQT = VecT->getElementType();
PrimType ElemT = classifyPrim(ElemQT);
// All initializer elements.
unsigned InitIndex = 0;
- for (const Expr *Init : E->inits()) {
+ for (const Expr *Init : Inits) {
if (!this->visit(Init))
return false;
@@ -1179,19 +1204,38 @@ bool ByteCodeExprGen<Emitter>::VisitInitListExpr(const InitListExpr *E) {
return false;
}
+/// Pointer to the array (not the element!) must be on the stack when calling
+/// this.
template <class Emitter>
-bool ByteCodeExprGen<Emitter>::VisitCXXParenListInitExpr(
- const CXXParenListInitExpr *E) {
- if (DiscardResult) {
- for (const Expr *Init : E->getInitExprs()) {
- if (!this->discard(Init))
- return false;
- }
- return true;
+bool ByteCodeExprGen<Emitter>::visitArrayElemInit(unsigned ElemIndex,
+ const Expr *Init) {
+ if (std::optional<PrimType> T = classify(Init->getType())) {
+ // Visit the primitive element like normal.
+ if (!this->visit(Init))
+ return false;
+ return this->emitInitElem(*T, ElemIndex, Init);
}
- assert(E->getType()->isRecordType());
- return this->visitInitList(E->getInitExprs(), E);
+ // Advance the pointer currently on the stack to the given
+ // dimension.
+ if (!this->emitConstUint32(ElemIndex, Init))
+ return false;
+ if (!this->emitArrayElemPtrUint32(Init))
+ return false;
+ if (!this->visitInitializer(Init))
+ return false;
+ return this->emitFinishInitPop(Init);
+}
+
+template <class Emitter>
+bool ByteCodeExprGen<Emitter>::VisitInitListExpr(const InitListExpr *E) {
+ return this->visitInitList(E->inits(), E->getArrayFiller(), E);
+}
+
+template <class Emitter>
+bool ByteCodeExprGen<Emitter>::VisitCXXParenListInitExpr(
+ const CXXParenListInitExpr *E) {
+ return this->visitInitList(E->getInitExprs(), E->getArrayFiller(), E);
}
template <class Emitter>
@@ -1314,6 +1358,20 @@ bool ByteCodeExprGen<Emitter>::VisitUnaryExprOrTypeTraitExpr(
assert(E->getTypeOfArgument()->isSizelessVectorType());
}
+ if (Kind == UETT_VecStep) {
+ if (const auto *VT = E->getTypeOfArgument()->getAs<VectorType>()) {
+ unsigned N = VT->getNumElements();
+
+ // The vec_step built-in functions that take a 3-component
+ // vector return 4. (OpenCL 1.1 spec 6.11.12)
+ if (N == 3)
+ N = 4;
+
+ return this->emitConst(N, E);
+ }
+ return this->emitConst(1, E);
+ }
+
return false;
}
@@ -2321,8 +2379,7 @@ bool ByteCodeExprGen<Emitter>::VisitCXXUuidofExpr(const CXXUuidofExpr *E) {
if (!this->emitGetPtrGlobal(*GlobalIndex, E))
return false;
- const Record *R = this->getRecord(E->getType());
- assert(R);
+ assert(this->getRecord(E->getType()));
const APValue &V = E->getGuidDecl()->getAsAPValue();
if (V.getKind() == APValue::None)
@@ -2330,41 +2387,8 @@ bool ByteCodeExprGen<Emitter>::VisitCXXUuidofExpr(const CXXUuidofExpr *E) {
assert(V.isStruct());
assert(V.getStructNumBases() == 0);
- // FIXME: This could be useful in visitAPValue, too.
- for (unsigned I = 0, N = V.getStructNumFields(); I != N; ++I) {
- const APValue &F = V.getStructField(I);
- const Record::Field *RF = R->getField(I);
-
- if (F.isInt()) {
- PrimType T = classifyPrim(RF->Decl->getType());
- if (!this->visitAPValue(F, T, E))
- return false;
- if (!this->emitInitField(T, RF->Offset, E))
- return false;
- } else if (F.isArray()) {
- assert(RF->Desc->isPrimitiveArray());
- const auto *ArrType = RF->Decl->getType()->getAsArrayTypeUnsafe();
- PrimType ElemT = classifyPrim(ArrType->getElementType());
- assert(ArrType);
-
- if (!this->emitDupPtr(E))
- return false;
- if (!this->emitGetPtrField(RF->Offset, E))
- return false;
-
- for (unsigned A = 0, AN = F.getArraySize(); A != AN; ++A) {
- if (!this->visitAPValue(F.getArrayInitializedElt(A), ElemT, E))
- return false;
- if (!this->emitInitElem(ElemT, A, E))
- return false;
- }
-
- if (!this->emitPopPtr(E))
- return false;
- } else {
- assert(false && "I don't think this should be possible");
- }
- }
+ if (!this->visitAPValueInitializer(V, E))
+ return false;
return this->emitFinishInit(E);
}
@@ -2930,6 +2954,54 @@ bool ByteCodeExprGen<Emitter>::visitAPValue(const APValue &Val,
}
template <class Emitter>
+bool ByteCodeExprGen<Emitter>::visitAPValueInitializer(const APValue &Val,
+ const Expr *E) {
+ if (Val.isStruct()) {
+ const Record *R = this->getRecord(E->getType());
+ assert(R);
+
+ for (unsigned I = 0, N = Val.getStructNumFields(); I != N; ++I) {
+ const APValue &F = Val.getStructField(I);
+ const Record::Field *RF = R->getField(I);
+
+ if (F.isInt()) {
+ PrimType T = classifyPrim(RF->Decl->getType());
+ if (!this->visitAPValue(F, T, E))
+ return false;
+ if (!this->emitInitField(T, RF->Offset, E))
+ return false;
+ } else if (F.isArray()) {
+ assert(RF->Desc->isPrimitiveArray());
+ const auto *ArrType = RF->Decl->getType()->getAsArrayTypeUnsafe();
+ PrimType ElemT = classifyPrim(ArrType->getElementType());
+ assert(ArrType);
+
+ if (!this->emitDupPtr(E))
+ return false;
+ if (!this->emitGetPtrField(RF->Offset, E))
+ return false;
+
+ for (unsigned A = 0, AN = F.getArraySize(); A != AN; ++A) {
+ if (!this->visitAPValue(F.getArrayInitializedElt(A), ElemT, E))
+ return false;
+ if (!this->emitInitElem(ElemT, A, E))
+ return false;
+ }
+
+ if (!this->emitPopPtr(E))
+ return false;
+ } else {
+ assert(false && "I don't think this should be possible");
+ }
+ }
+ return true;
+ }
+ // TODO: Other types.
+
+ return false;
+}
+
+template <class Emitter>
bool ByteCodeExprGen<Emitter>::VisitBuiltinCallExpr(const CallExpr *E) {
const Function *Func = getFunction(E->getDirectCallee());
if (!Func)
@@ -3450,9 +3522,17 @@ bool ByteCodeExprGen<Emitter>::VisitDeclRefExpr(const DeclRefExpr *E) {
} else if (const auto *FuncDecl = dyn_cast<FunctionDecl>(D)) {
const Function *F = getFunction(FuncDecl);
return F && this->emitGetFnPtr(F, E);
- } else if (isa<TemplateParamObjectDecl>(D)) {
- if (std::optional<unsigned> Index = P.getOrCreateGlobal(D))
- return this->emitGetPtrGlobal(*Index, E);
+ } else if (const auto *TPOD = dyn_cast<TemplateParamObjectDecl>(D)) {
+ if (std::optional<unsigned> Index = P.getOrCreateGlobal(D)) {
+ if (!this->emitGetPtrGlobal(*Index, E))
+ return false;
+ if (std::optional<PrimType> T = classify(E->getType())) {
+ if (!this->visitAPValue(TPOD->getValue(), *T, E))
+ return false;
+ return this->emitInitGlobal(*T, *Index, E);
+ }
+ return this->visitAPValueInitializer(TPOD->getValue(), E);
+ }
return false;
}
@@ -3529,35 +3609,17 @@ void ByteCodeExprGen<Emitter>::emitCleanup() {
template <class Emitter>
unsigned
-ByteCodeExprGen<Emitter>::collectBaseOffset(const RecordType *BaseType,
- const RecordType *DerivedType) {
- assert(BaseType);
- assert(DerivedType);
- const auto *FinalDecl = cast<CXXRecordDecl>(BaseType->getDecl());
- const RecordDecl *CurDecl = DerivedType->getDecl();
- const Record *CurRecord = getRecord(CurDecl);
- assert(CurDecl && FinalDecl);
-
- unsigned OffsetSum = 0;
- for (;;) {
- assert(CurRecord->getNumBases() > 0);
- // One level up
- for (const Record::Base &B : CurRecord->bases()) {
- const auto *BaseDecl = cast<CXXRecordDecl>(B.Decl);
-
- if (BaseDecl == FinalDecl || BaseDecl->isDerivedFrom(FinalDecl)) {
- OffsetSum += B.Offset;
- CurRecord = B.R;
- CurDecl = BaseDecl;
- break;
- }
- }
- if (CurDecl == FinalDecl)
- break;
- }
+ByteCodeExprGen<Emitter>::collectBaseOffset(const QualType BaseType,
+ const QualType DerivedType) {
+ const auto extractRecordDecl = [](QualType Ty) -> const CXXRecordDecl * {
+ if (const auto *PT = dyn_cast<PointerType>(Ty))
+ return PT->getPointeeType()->getAsCXXRecordDecl();
+ return Ty->getAsCXXRecordDecl();
+ };
+ const CXXRecordDecl *BaseDecl = extractRecordDecl(BaseType);
+ const CXXRecordDecl *DerivedDecl = extractRecordDecl(DerivedType);
- assert(OffsetSum > 0);
- return OffsetSum;
+ return Ctx.collectBaseOffset(BaseDecl, DerivedDecl);
}
/// Emit casts from a PrimType to another PrimType.
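A hedged example of the TemplateParamObjectDecl hunk: under -fexperimental-new-constant-interpreter, a C++20 structural class-type template argument is materialized as a global and filled in through visitAPValueInitializer (the names below are invented for illustration):

    struct Pair { int x, y; };

    template <Pair P> constexpr int sum() { return P.x + P.y; }

    // P is a TemplateParamObjectDecl; reading its fields goes through the
    // global created by getOrCreateGlobal above.
    static_assert(sum<Pair{1, 2}>() == 3);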
diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.h b/clang/lib/AST/Interp/ByteCodeExprGen.h
index 7e9dc8631fc0..a89e37c67aa6 100644
--- a/clang/lib/AST/Interp/ByteCodeExprGen.h
+++ b/clang/lib/AST/Interp/ByteCodeExprGen.h
@@ -181,6 +181,7 @@ protected:
bool visitVarDecl(const VarDecl *VD);
/// Visit an APValue.
bool visitAPValue(const APValue &Val, PrimType ValType, const Expr *E);
+ bool visitAPValueInitializer(const APValue &Val, const Expr *E);
/// Visits an expression and converts it to a boolean.
bool visitBool(const Expr *E);
@@ -224,7 +225,8 @@ protected:
return this->emitFinishInitPop(I);
}
- bool visitInitList(ArrayRef<const Expr *> Inits, const Expr *E);
+ bool visitInitList(ArrayRef<const Expr *> Inits, const Expr *ArrayFiller,
+ const Expr *E);
bool visitArrayElemInit(unsigned ElemIndex, const Expr *Init);
/// Creates a local primitive value.
@@ -283,8 +285,8 @@ private:
bool emitRecordDestruction(const Record *R);
bool emitDestruction(const Descriptor *Desc);
- unsigned collectBaseOffset(const RecordType *BaseType,
- const RecordType *DerivedType);
+ unsigned collectBaseOffset(const QualType BaseType,
+ const QualType DerivedType);
protected:
/// Variable to storage mapping.
diff --git a/clang/lib/AST/Interp/ByteCodeStmtGen.cpp b/clang/lib/AST/Interp/ByteCodeStmtGen.cpp
index 36dab6252ece..ff91baf595f1 100644
--- a/clang/lib/AST/Interp/ByteCodeStmtGen.cpp
+++ b/clang/lib/AST/Interp/ByteCodeStmtGen.cpp
@@ -189,14 +189,23 @@ bool ByteCodeStmtGen<Emitter>::visitFunc(const FunctionDecl *F) {
if (!emitFieldInitializer(F, F->Offset, InitExpr))
return false;
} else if (const Type *Base = Init->getBaseClass()) {
- // Base class initializer.
- // Get This Base and call initializer on it.
const auto *BaseDecl = Base->getAsCXXRecordDecl();
assert(BaseDecl);
- const Record::Base *B = R->getBase(BaseDecl);
- assert(B);
- if (!this->emitGetPtrThisBase(B->Offset, InitExpr))
- return false;
+
+ if (Init->isBaseVirtual()) {
+ assert(R->getVirtualBase(BaseDecl));
+ if (!this->emitGetPtrThisVirtBase(BaseDecl, InitExpr))
+ return false;
+
+ } else {
+ // Base class initializer.
+ // Get This Base and call initializer on it.
+ const Record::Base *B = R->getBase(BaseDecl);
+ assert(B);
+ if (!this->emitGetPtrThisBase(B->Offset, InitExpr))
+ return false;
+ }
+
if (!this->visitInitializer(InitExpr))
return false;
if (!this->emitFinishInitPop(InitExpr))
@@ -323,7 +332,8 @@ bool ByteCodeStmtGen<Emitter>::visitCompoundStmt(
template <class Emitter>
bool ByteCodeStmtGen<Emitter>::visitDeclStmt(const DeclStmt *DS) {
for (auto *D : DS->decls()) {
- if (isa<StaticAssertDecl, TagDecl, TypedefNameDecl, UsingEnumDecl>(D))
+ if (isa<StaticAssertDecl, TagDecl, TypedefNameDecl, UsingEnumDecl,
+ FunctionDecl>(D))
continue;
const auto *VD = dyn_cast<VarDecl>(D);
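Sketch of what the visitDeclStmt change permits (an assumption inferred from the added FunctionDecl case): a local function declaration inside a constant-evaluated body is skipped instead of aborting evaluation.

    constexpr int f() {
      void g();          // a FunctionDecl inside a DeclStmt: now ignored
      return 1;
    }
    static_assert(f() == 1);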
diff --git a/clang/lib/AST/Interp/Context.cpp b/clang/lib/AST/Interp/Context.cpp
index 274178837bf0..d51a57e5e92e 100644
--- a/clang/lib/AST/Interp/Context.cpp
+++ b/clang/lib/AST/Interp/Context.cpp
@@ -262,3 +262,36 @@ const Function *Context::getOrCreateFunction(const FunctionDecl *FD) {
return Func;
}
+
+unsigned Context::collectBaseOffset(const RecordDecl *BaseDecl,
+ const RecordDecl *DerivedDecl) const {
+ assert(BaseDecl);
+ assert(DerivedDecl);
+ const auto *FinalDecl = cast<CXXRecordDecl>(BaseDecl);
+ const RecordDecl *CurDecl = DerivedDecl;
+ const Record *CurRecord = P->getOrCreateRecord(CurDecl);
+ assert(CurDecl && FinalDecl);
+
+ unsigned OffsetSum = 0;
+ for (;;) {
+ assert(CurRecord->getNumBases() > 0);
+ // One level up
+ for (const Record::Base &B : CurRecord->bases()) {
+ const auto *BaseDecl = cast<CXXRecordDecl>(B.Decl);
+
+ if (BaseDecl == FinalDecl || BaseDecl->isDerivedFrom(FinalDecl)) {
+ OffsetSum += B.Offset;
+ CurRecord = B.R;
+ CurDecl = BaseDecl;
+ break;
+ }
+ }
+ if (CurDecl == FinalDecl)
+ break;
+ }
+
+ assert(OffsetSum > 0);
+ return OffsetSum;
+}
diff --git a/clang/lib/AST/Interp/Context.h b/clang/lib/AST/Interp/Context.h
index 23c439ad8912..360e9499d084 100644
--- a/clang/lib/AST/Interp/Context.h
+++ b/clang/lib/AST/Interp/Context.h
@@ -104,6 +104,9 @@ public:
/// Returns the program. This is only needed for unittests.
Program &getProgram() const { return *P.get(); }
+ unsigned collectBaseOffset(const RecordDecl *BaseDecl,
+ const RecordDecl *DerivedDecl) const;
+
private:
/// Runs a function.
bool Run(State &Parent, const Function *Func, APValue &Result);
diff --git a/clang/lib/AST/Interp/Descriptor.cpp b/clang/lib/AST/Interp/Descriptor.cpp
index a4ccc0236d29..954c58c8cb37 100644
--- a/clang/lib/AST/Interp/Descriptor.cpp
+++ b/clang/lib/AST/Interp/Descriptor.cpp
@@ -136,28 +136,66 @@ static void moveArrayDesc(Block *B, const std::byte *Src, std::byte *Dst,
}
}
+static void initField(Block *B, std::byte *Ptr, bool IsConst, bool IsMutable,
+ bool IsActive, const Descriptor *D,
+ unsigned FieldOffset) {
+ bool IsUnion = false; // FIXME
+ auto *Desc = reinterpret_cast<InlineDescriptor *>(Ptr + FieldOffset) - 1;
+ Desc->Offset = FieldOffset;
+ Desc->Desc = D;
+ Desc->IsInitialized = D->IsArray;
+ Desc->IsBase = false;
+ Desc->IsActive = IsActive && !IsUnion;
+ Desc->IsConst = IsConst || D->IsConst;
+ Desc->IsFieldMutable = IsMutable || D->IsMutable;
+
+ if (auto Fn = D->CtorFn)
+ Fn(B, Ptr + FieldOffset, Desc->IsConst, Desc->IsFieldMutable,
+ Desc->IsActive, D);
+}
+
+static void initBase(Block *B, std::byte *Ptr, bool IsConst, bool IsMutable,
+ bool IsActive, const Descriptor *D, unsigned FieldOffset,
+ bool IsVirtualBase) {
+ assert(D);
+ assert(D->ElemRecord);
+
+ bool IsUnion = D->ElemRecord->isUnion();
+ auto *Desc = reinterpret_cast<InlineDescriptor *>(Ptr + FieldOffset) - 1;
+ Desc->Offset = FieldOffset;
+ Desc->Desc = D;
+ Desc->IsInitialized = D->IsArray;
+ Desc->IsBase = true;
+ Desc->IsActive = IsActive && !IsUnion;
+ Desc->IsConst = IsConst || D->IsConst;
+ Desc->IsFieldMutable = IsMutable || D->IsMutable;
+
+ for (const auto &V : D->ElemRecord->bases())
+ initBase(B, Ptr + FieldOffset, IsConst, IsMutable, IsActive, V.Desc,
+ V.Offset, false);
+ for (const auto &F : D->ElemRecord->fields())
+ initField(B, Ptr + FieldOffset, IsConst, IsMutable, IsActive, F.Desc,
+ F.Offset);
+
+ // If this is initializing a virtual base, we do NOT want to consider its
+  // virtual bases; those are already flattened into the parent record when
+ // creating it.
+ if (IsVirtualBase)
+ return;
+
+ for (const auto &V : D->ElemRecord->virtual_bases())
+ initBase(B, Ptr + FieldOffset, IsConst, IsMutable, IsActive, V.Desc,
+ V.Offset, true);
+}
+
static void ctorRecord(Block *B, std::byte *Ptr, bool IsConst, bool IsMutable,
bool IsActive, const Descriptor *D) {
- const bool IsUnion = D->ElemRecord->isUnion();
- auto CtorSub = [=](unsigned SubOff, const Descriptor *F, bool IsBase) {
- auto *Desc = reinterpret_cast<InlineDescriptor *>(Ptr + SubOff) - 1;
- Desc->Offset = SubOff;
- Desc->Desc = F;
- Desc->IsInitialized = F->IsArray && !IsBase;
- Desc->IsBase = IsBase;
- Desc->IsActive = IsActive && !IsUnion;
- Desc->IsConst = IsConst || F->IsConst;
- Desc->IsFieldMutable = IsMutable || F->IsMutable;
- if (auto Fn = F->CtorFn)
- Fn(B, Ptr + SubOff, Desc->IsConst, Desc->IsFieldMutable, Desc->IsActive,
- F);
- };
- for (const auto &B : D->ElemRecord->bases())
- CtorSub(B.Offset, B.Desc, /*isBase=*/true);
+ for (const auto &V : D->ElemRecord->bases())
+ initBase(B, Ptr, IsConst, IsMutable, IsActive, V.Desc, V.Offset, false);
for (const auto &F : D->ElemRecord->fields())
- CtorSub(F.Offset, F.Desc, /*isBase=*/false);
+ initField(B, Ptr, IsConst, IsMutable, IsActive, F.Desc, F.Offset);
for (const auto &V : D->ElemRecord->virtual_bases())
- CtorSub(V.Offset, V.Desc, /*isBase=*/true);
+ initBase(B, Ptr, IsConst, IsMutable, IsActive, V.Desc, V.Offset, true);
}
static void dtorRecord(Block *B, std::byte *Ptr, const Descriptor *D) {
diff --git a/clang/lib/AST/Interp/Descriptor.h b/clang/lib/AST/Interp/Descriptor.h
index c386fc8ac7b0..cd20495c259c 100644
--- a/clang/lib/AST/Interp/Descriptor.h
+++ b/clang/lib/AST/Interp/Descriptor.h
@@ -82,6 +82,9 @@ struct InlineDescriptor {
InlineDescriptor(const Descriptor *D)
: Offset(sizeof(InlineDescriptor)), IsConst(false), IsInitialized(false),
IsBase(false), IsActive(false), IsFieldMutable(false), Desc(D) {}
+
+ void dump() const { dump(llvm::errs()); }
+ void dump(llvm::raw_ostream &OS) const;
};
/// Describes a memory block created by an allocation site.
diff --git a/clang/lib/AST/Interp/Disasm.cpp b/clang/lib/AST/Interp/Disasm.cpp
index d127f33223e8..ccdc96a79436 100644
--- a/clang/lib/AST/Interp/Disasm.cpp
+++ b/clang/lib/AST/Interp/Disasm.cpp
@@ -200,7 +200,7 @@ LLVM_DUMP_METHOD void Descriptor::dump(llvm::raw_ostream &OS) const {
OS << " primitive";
if (isZeroSizeArray())
- OS << " zero-size-arrary";
+ OS << " zero-size-array";
else if (isUnknownSizeArray())
OS << " unknown-size-array";
@@ -208,6 +208,25 @@ LLVM_DUMP_METHOD void Descriptor::dump(llvm::raw_ostream &OS) const {
OS << " dummy";
}
+LLVM_DUMP_METHOD void InlineDescriptor::dump(llvm::raw_ostream &OS) const {
+ {
+ ColorScope SC(OS, true, {llvm::raw_ostream::BLUE, true});
+ OS << "InlineDescriptor " << (const void *)this << "\n";
+ }
+ OS << "Offset: " << Offset << "\n";
+ OS << "IsConst: " << IsConst << "\n";
+ OS << "IsInitialized: " << IsInitialized << "\n";
+ OS << "IsBase: " << IsBase << "\n";
+ OS << "IsActive: " << IsActive << "\n";
+ OS << "IsFieldMutable: " << IsFieldMutable << "\n";
+ OS << "Desc: ";
+ if (Desc)
+ Desc->dump(OS);
+ else
+ OS << "nullptr";
+ OS << "\n";
+}
+
LLVM_DUMP_METHOD void InterpFrame::dump(llvm::raw_ostream &OS,
unsigned Indent) const {
unsigned Spaces = Indent * 2;
@@ -251,8 +270,6 @@ LLVM_DUMP_METHOD void Record::dump(llvm::raw_ostream &OS, unsigned Indentation,
++I;
}
- // FIXME: Virtual bases.
-
I = 0;
for (const Record::Field &F : fields()) {
OS.indent(Indent) << "- Field " << I << ": ";
@@ -263,6 +280,14 @@ LLVM_DUMP_METHOD void Record::dump(llvm::raw_ostream &OS, unsigned Indentation,
OS << ". Offset " << (Offset + F.Offset) << "\n";
++I;
}
+
+ I = 0;
+ for (const Record::Base &B : virtual_bases()) {
+ OS.indent(Indent) << "- Virtual Base " << I << ". Offset "
+ << (Offset + B.Offset) << "\n";
+ B.R->dump(OS, Indentation + 1, Offset + B.Offset);
+ ++I;
+ }
}
LLVM_DUMP_METHOD void Block::dump(llvm::raw_ostream &OS) const {
diff --git a/clang/lib/AST/Interp/Interp.h b/clang/lib/AST/Interp/Interp.h
index 9283f697c007..9da0286deada 100644
--- a/clang/lib/AST/Interp/Interp.h
+++ b/clang/lib/AST/Interp/Interp.h
@@ -1355,20 +1355,26 @@ inline bool VirtBaseHelper(InterpState &S, CodePtr OpPC, const RecordDecl *Decl,
while (Base.isBaseClass())
Base = Base.getBase();
- auto *Field = Base.getRecord()->getVirtualBase(Decl);
- S.Stk.push<Pointer>(Base.atField(Field->Offset));
+ const Record::Base *VirtBase = Base.getRecord()->getVirtualBase(Decl);
+ S.Stk.push<Pointer>(Base.atField(VirtBase->Offset));
return true;
}
-inline bool GetPtrVirtBase(InterpState &S, CodePtr OpPC, const RecordDecl *D) {
+inline bool GetPtrVirtBasePop(InterpState &S, CodePtr OpPC,
+ const RecordDecl *D) {
+ assert(D);
const Pointer &Ptr = S.Stk.pop<Pointer>();
if (!CheckNull(S, OpPC, Ptr, CSK_Base))
return false;
+ if (Ptr.isDummy()) // FIXME: Once we have type info for dummy pointers, this
+ // needs to go.
+ return false;
return VirtBaseHelper(S, OpPC, D, Ptr);
}
inline bool GetPtrThisVirtBase(InterpState &S, CodePtr OpPC,
const RecordDecl *D) {
+ assert(D);
if (S.checkingPotentialConstantExpression())
return false;
const Pointer &This = S.Current->getThis();
diff --git a/clang/lib/AST/Interp/Opcodes.td b/clang/lib/AST/Interp/Opcodes.td
index 742785b28eb4..2a97b978b523 100644
--- a/clang/lib/AST/Interp/Opcodes.td
+++ b/clang/lib/AST/Interp/Opcodes.td
@@ -336,7 +336,7 @@ def GetPtrDerivedPop : Opcode {
}
// [Pointer] -> [Pointer]
-def GetPtrVirtBase : Opcode {
+def GetPtrVirtBasePop : Opcode {
// RecordDecl of base class.
let Args = [ArgRecordDecl];
}
diff --git a/clang/lib/AST/Interp/Program.cpp b/clang/lib/AST/Interp/Program.cpp
index 3773e0662f78..02075c20cf55 100644
--- a/clang/lib/AST/Interp/Program.cpp
+++ b/clang/lib/AST/Interp/Program.cpp
@@ -173,7 +173,8 @@ std::optional<unsigned> Program::createGlobal(const ValueDecl *VD,
if (const auto *Var = dyn_cast<VarDecl>(VD)) {
IsStatic = Context::shouldBeGloballyIndexed(VD);
IsExtern = Var->hasExternalStorage();
- } else if (isa<UnnamedGlobalConstantDecl, MSGuidDecl>(VD)) {
+ } else if (isa<UnnamedGlobalConstantDecl, MSGuidDecl,
+ TemplateParamObjectDecl>(VD)) {
IsStatic = true;
IsExtern = false;
} else {
diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp
index 8aaa6801d85b..68e81f45b4c2 100644
--- a/clang/lib/AST/Type.cpp
+++ b/clang/lib/AST/Type.cpp
@@ -2510,6 +2510,18 @@ bool Type::isSveVLSBuiltinType() const {
return false;
}
+QualType Type::getSizelessVectorEltType(const ASTContext &Ctx) const {
+ assert(isSizelessVectorType() && "Must be sizeless vector type");
+ // Currently supports SVE and RVV
+ if (isSVESizelessBuiltinType())
+ return getSveEltType(Ctx);
+
+ if (isRVVSizelessBuiltinType())
+ return getRVVEltType(Ctx);
+
+ llvm_unreachable("Unhandled type");
+}
+
QualType Type::getSveEltType(const ASTContext &Ctx) const {
assert(isSveVLSBuiltinType() && "unsupported type!");
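The intended call pattern for the new helper, mirroring the CGBuiltin.cpp hunks later in this diff (Ctx and QT stand in for whatever the caller has in scope):

    QualType Elt = QT;
    if (const auto *VecTy = QT->getAs<VectorType>())
      Elt = VecTy->getElementType();               // fixed-width vectors
    else if (QT->isSizelessVectorType())
      Elt = QT->getSizelessVectorEltType(Ctx);     // SVE/RVV; asserts otherwise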
diff --git a/clang/lib/Analysis/FlowSensitive/ASTOps.cpp b/clang/lib/Analysis/FlowSensitive/ASTOps.cpp
index 619bf772bba5..bd1676583ecc 100644
--- a/clang/lib/Analysis/FlowSensitive/ASTOps.cpp
+++ b/clang/lib/Analysis/FlowSensitive/ASTOps.cpp
@@ -33,12 +33,20 @@ namespace clang::dataflow {
const Expr &ignoreCFGOmittedNodes(const Expr &E) {
const Expr *Current = &E;
- if (auto *EWC = dyn_cast<ExprWithCleanups>(Current)) {
- Current = EWC->getSubExpr();
+ const Expr *Last = nullptr;
+ while (Current != Last) {
+ Last = Current;
+ if (auto *EWC = dyn_cast<ExprWithCleanups>(Current)) {
+ Current = EWC->getSubExpr();
+ assert(Current != nullptr);
+ }
+ if (auto *CE = dyn_cast<ConstantExpr>(Current)) {
+ Current = CE->getSubExpr();
+ assert(Current != nullptr);
+ }
+ Current = Current->IgnoreParens();
assert(Current != nullptr);
}
- Current = Current->IgnoreParens();
- assert(Current != nullptr);
return *Current;
}
diff --git a/clang/lib/Analysis/FlowSensitive/Transfer.cpp b/clang/lib/Analysis/FlowSensitive/Transfer.cpp
index 43fdfa5abcbb..fd224aeb79b1 100644
--- a/clang/lib/Analysis/FlowSensitive/Transfer.cpp
+++ b/clang/lib/Analysis/FlowSensitive/Transfer.cpp
@@ -41,7 +41,11 @@ namespace dataflow {
const Environment *StmtToEnvMap::getEnvironment(const Stmt &S) const {
auto BlockIt = ACFG.getStmtToBlock().find(&ignoreCFGOmittedNodes(S));
- assert(BlockIt != ACFG.getStmtToBlock().end());
+ if (BlockIt == ACFG.getStmtToBlock().end()) {
+ assert(false);
+ // Return null to avoid dereferencing the end iterator in non-assert builds.
+ return nullptr;
+ }
if (!ACFG.isBlockReachable(*BlockIt->getSecond()))
return nullptr;
if (BlockIt->getSecond()->getBlockID() == CurBlockID)
diff --git a/clang/lib/Basic/Targets/ARM.h b/clang/lib/Basic/Targets/ARM.h
index e69adbe75473..df9855a52e61 100644
--- a/clang/lib/Basic/Targets/ARM.h
+++ b/clang/lib/Basic/Targets/ARM.h
@@ -225,6 +225,10 @@ public:
bool hasBitIntType() const override { return true; }
const char *getBFloat16Mangling() const override { return "u6__bf16"; };
+
+ std::pair<unsigned, unsigned> hardwareInterferenceSizes() const override {
+ return std::make_pair(getTriple().isArch64Bit() ? 256 : 64, 64);
+ }
};
class LLVM_LIBRARY_VISIBILITY ARMleTargetInfo : public ARMTargetInfo {
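These per-target overrides plausibly feed the C++17 interference-size constants; a usage sketch under that assumption (pair order taken to be destructive-then-constructive):

    #include <atomic>
    #include <new>

    // Pad to the destructive size so two hot counters never share a line.
    struct alignas(std::hardware_destructive_interference_size) Counter {
      std::atomic<long> n{0};
    };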
diff --git a/clang/lib/Basic/Targets/AVR.h b/clang/lib/Basic/Targets/AVR.h
index 9376c46cd98c..feeb04f37eeb 100644
--- a/clang/lib/Basic/Targets/AVR.h
+++ b/clang/lib/Basic/Targets/AVR.h
@@ -175,6 +175,10 @@ public:
std::optional<std::string> handleAsmEscapedChar(char EscChar) const override;
StringRef getABI() const override { return ABI; }
+ std::pair<unsigned, unsigned> hardwareInterferenceSizes() const override {
+ return std::make_pair(32, 32);
+ }
+
protected:
std::string CPU;
StringRef ABI;
diff --git a/clang/lib/Basic/Targets/BPF.h b/clang/lib/Basic/Targets/BPF.h
index 489f29fc4fea..d19b37dd4df7 100644
--- a/clang/lib/Basic/Targets/BPF.h
+++ b/clang/lib/Basic/Targets/BPF.h
@@ -113,6 +113,10 @@ public:
StringRef CPUName(Name);
return isValidCPUName(CPUName);
}
+
+ std::pair<unsigned, unsigned> hardwareInterferenceSizes() const override {
+ return std::make_pair(32, 32);
+ }
};
} // namespace targets
} // namespace clang
diff --git a/clang/lib/Basic/Targets/M68k.h b/clang/lib/Basic/Targets/M68k.h
index 7ffa901127e5..b732add77e03 100644
--- a/clang/lib/Basic/Targets/M68k.h
+++ b/clang/lib/Basic/Targets/M68k.h
@@ -56,6 +56,10 @@ public:
BuiltinVaListKind getBuiltinVaListKind() const override;
bool setCPU(const std::string &Name) override;
CallingConvCheckResult checkCallingConvention(CallingConv CC) const override;
+
+ std::pair<unsigned, unsigned> hardwareInterferenceSizes() const override {
+ return std::make_pair(32, 32);
+ }
};
} // namespace targets
diff --git a/clang/lib/Basic/Targets/Mips.h b/clang/lib/Basic/Targets/Mips.h
index 0d6e4b4d0808..730deb674aa5 100644
--- a/clang/lib/Basic/Targets/Mips.h
+++ b/clang/lib/Basic/Targets/Mips.h
@@ -431,6 +431,10 @@ public:
bool validateTarget(DiagnosticsEngine &Diags) const override;
bool hasBitIntType() const override { return true; }
+
+ std::pair<unsigned, unsigned> hardwareInterferenceSizes() const override {
+ return std::make_pair(32, 32);
+ }
};
} // namespace targets
} // namespace clang
diff --git a/clang/lib/Basic/Targets/PPC.h b/clang/lib/Basic/Targets/PPC.h
index 60bc1dec8f95..cd0f08dfb3bc 100644
--- a/clang/lib/Basic/Targets/PPC.h
+++ b/clang/lib/Basic/Targets/PPC.h
@@ -423,6 +423,10 @@ public:
// This is the ELF definition
return TargetInfo::PowerABIBuiltinVaList;
}
+
+ std::pair<unsigned, unsigned> hardwareInterferenceSizes() const override {
+ return std::make_pair(32, 32);
+ }
};
// Note: ABI differences may eventually require us to have a separate
@@ -503,6 +507,10 @@ public:
return CCCR_Warning;
}
}
+
+ std::pair<unsigned, unsigned> hardwareInterferenceSizes() const override {
+ return std::make_pair(128, 128);
+ }
};
class LLVM_LIBRARY_VISIBILITY AIXPPC32TargetInfo :
diff --git a/clang/lib/Basic/Targets/RISCV.h b/clang/lib/Basic/Targets/RISCV.h
index 9fa42e75bbfd..d0e9cdc6da07 100644
--- a/clang/lib/Basic/Targets/RISCV.h
+++ b/clang/lib/Basic/Targets/RISCV.h
@@ -122,6 +122,10 @@ public:
void fillValidTuneCPUList(SmallVectorImpl<StringRef> &Values) const override;
bool supportsTargetAttributeTune() const override { return true; }
ParsedTargetAttr parseTargetAttr(StringRef Str) const override;
+
+ std::pair<unsigned, unsigned> hardwareInterferenceSizes() const override {
+ return std::make_pair(32, 32);
+ }
};
class LLVM_LIBRARY_VISIBILITY RISCV32TargetInfo : public RISCVTargetInfo {
public:
diff --git a/clang/lib/Basic/Targets/Sparc.h b/clang/lib/Basic/Targets/Sparc.h
index 214fef88e1dc..3357bee33e1a 100644
--- a/clang/lib/Basic/Targets/Sparc.h
+++ b/clang/lib/Basic/Targets/Sparc.h
@@ -140,6 +140,10 @@ public:
CPU = getCPUKind(Name);
return CPU != CK_GENERIC;
}
+
+ std::pair<unsigned, unsigned> hardwareInterferenceSizes() const override {
+ return std::make_pair(32, 32);
+ }
};
// SPARC v8 is the 32-bit mode selected by Triple::sparc.
diff --git a/clang/lib/Basic/Targets/SystemZ.h b/clang/lib/Basic/Targets/SystemZ.h
index 8e302acd51b8..73d3aa01a043 100644
--- a/clang/lib/Basic/Targets/SystemZ.h
+++ b/clang/lib/Basic/Targets/SystemZ.h
@@ -220,6 +220,10 @@ public:
int getEHDataRegisterNumber(unsigned RegNo) const override {
return RegNo < 4 ? 6 + RegNo : -1;
}
+
+ std::pair<unsigned, unsigned> hardwareInterferenceSizes() const override {
+ return std::make_pair(256, 256);
+ }
};
} // namespace targets
} // namespace clang
diff --git a/clang/lib/Basic/Targets/WebAssembly.cpp b/clang/lib/Basic/Targets/WebAssembly.cpp
index d473fd190864..3d76411f890a 100644
--- a/clang/lib/Basic/Targets/WebAssembly.cpp
+++ b/clang/lib/Basic/Targets/WebAssembly.cpp
@@ -47,6 +47,7 @@ bool WebAssemblyTargetInfo::hasFeature(StringRef Feature) const {
return llvm::StringSwitch<bool>(Feature)
.Case("simd128", SIMDLevel >= SIMD128)
.Case("relaxed-simd", SIMDLevel >= RelaxedSIMD)
+ .Case("half-precision", HasHalfPrecision)
.Case("nontrapping-fptoint", HasNontrappingFPToInt)
.Case("sign-ext", HasSignExt)
.Case("exception-handling", HasExceptionHandling)
@@ -156,6 +157,7 @@ bool WebAssemblyTargetInfo::initFeatureMap(
Features["reference-types"] = true;
Features["sign-ext"] = true;
Features["tail-call"] = true;
+ Features["half-precision"] = true;
setSIMDLevel(Features, SIMD128, true);
} else if (CPU == "generic") {
Features["mutable-globals"] = true;
@@ -216,6 +218,15 @@ bool WebAssemblyTargetInfo::handleTargetFeatures(
HasBulkMemory = false;
continue;
}
+ if (Feature == "+half-precision") {
+ SIMDLevel = std::max(SIMDLevel, SIMD128);
+ HasHalfPrecision = true;
+ continue;
+ }
+ if (Feature == "-half-precision") {
+ HasHalfPrecision = false;
+ continue;
+ }
if (Feature == "+atomics") {
HasAtomics = true;
continue;
diff --git a/clang/lib/Basic/Targets/WebAssembly.h b/clang/lib/Basic/Targets/WebAssembly.h
index 5568aa28eaef..e4c18879182e 100644
--- a/clang/lib/Basic/Targets/WebAssembly.h
+++ b/clang/lib/Basic/Targets/WebAssembly.h
@@ -64,6 +64,7 @@ class LLVM_LIBRARY_VISIBILITY WebAssemblyTargetInfo : public TargetInfo {
bool HasReferenceTypes = false;
bool HasExtendedConst = false;
bool HasMultiMemory = false;
+ bool HasHalfPrecision = false;
std::string ABI;
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index d08ab5391489..a370734e00d3 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -3885,9 +3885,12 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
}
case Builtin::BI__builtin_reduce_max: {
- auto GetIntrinsicID = [](QualType QT) {
+ auto GetIntrinsicID = [this](QualType QT) {
if (auto *VecTy = QT->getAs<VectorType>())
QT = VecTy->getElementType();
+ else if (QT->isSizelessVectorType())
+ QT = QT->getSizelessVectorEltType(CGM.getContext());
+
if (QT->isSignedIntegerType())
return llvm::Intrinsic::vector_reduce_smax;
if (QT->isUnsignedIntegerType())
@@ -3900,9 +3903,12 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
}
case Builtin::BI__builtin_reduce_min: {
- auto GetIntrinsicID = [](QualType QT) {
+ auto GetIntrinsicID = [this](QualType QT) {
if (auto *VecTy = QT->getAs<VectorType>())
QT = VecTy->getElementType();
+ else if (QT->isSizelessVectorType())
+ QT = QT->getSizelessVectorEltType(CGM.getContext());
+
if (QT->isSignedIntegerType())
return llvm::Intrinsic::vector_reduce_smin;
if (QT->isUnsignedIntegerType())
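A hedged example of what the lambda change targets: __builtin_reduce_max/min applied to a sizeless (here SVE) vector, which previously fell through the VectorType-only path. Assumes an SVE-enabled target:

    #include <arm_sve.h>

    // Signed element type, so this lowers to llvm.vector.reduce.smax.
    int64_t rmax(svint64_t v) { return __builtin_reduce_max(v); }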
diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
index d2d92140b6b2..69548902dc43 100644
--- a/clang/lib/CodeGen/CGCall.cpp
+++ b/clang/lib/CodeGen/CGCall.cpp
@@ -4698,11 +4698,11 @@ void CodeGenFunction::EmitCallArg(CallArgList &args, const Expr *E,
AggValueSlot Slot = args.isUsingInAlloca()
? createPlaceholderSlot(*this, type) : CreateAggTemp(type, "agg.tmp");
- bool DestroyedInCallee = true, NeedsEHCleanup = true;
+ bool DestroyedInCallee = true, NeedsCleanup = true;
if (const auto *RD = type->getAsCXXRecordDecl())
DestroyedInCallee = RD->hasNonTrivialDestructor();
else
- NeedsEHCleanup = needsEHCleanup(type.isDestructedType());
+ NeedsCleanup = type.isDestructedType();
if (DestroyedInCallee)
Slot.setExternallyDestructed();
@@ -4711,14 +4711,15 @@ void CodeGenFunction::EmitCallArg(CallArgList &args, const Expr *E,
RValue RV = Slot.asRValue();
args.add(RV, type);
- if (DestroyedInCallee && NeedsEHCleanup) {
+ if (DestroyedInCallee && NeedsCleanup) {
// Create a no-op GEP between the placeholder and the cleanup so we can
// RAUW it successfully. It also serves as a marker of the first
// instruction where the cleanup is active.
- pushFullExprCleanup<DestroyUnpassedArg>(EHCleanup, Slot.getAddress(),
- type);
+ pushFullExprCleanup<DestroyUnpassedArg>(NormalAndEHCleanup,
+ Slot.getAddress(), type);
// This unreachable is a temporary marker which will be removed later.
- llvm::Instruction *IsActive = Builder.CreateUnreachable();
+ llvm::Instruction *IsActive =
+ Builder.CreateFlagLoad(llvm::Constant::getNullValue(Int8PtrTy));
args.addArgCleanupDeactivation(EHStack.stable_begin(), IsActive);
}
return;
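An illustrative reproducer for the cleanup-kind switch above (an assumption, consistent with the "branch out before the end of the full-expression" comment in the CGDecl.cpp hunk below): once an argument temporary is callee-destroyed, branching out of the full-expression needs a normal cleanup, not only an EH one. Argument evaluation order is simplified here:

    struct S { ~S(); };
    void take2(S, int);

    void g(bool c) {
      // If the statement expression returns after S{} was constructed, the
      // temporary must still be destroyed on that normal (non-EH) path.
      take2(S{}, ({ if (c) return; 0; }));
    }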
diff --git a/clang/lib/CodeGen/CGCleanup.cpp b/clang/lib/CodeGen/CGCleanup.cpp
index e6f8e6873004..469e0363b744 100644
--- a/clang/lib/CodeGen/CGCleanup.cpp
+++ b/clang/lib/CodeGen/CGCleanup.cpp
@@ -634,12 +634,19 @@ static void destroyOptimisticNormalEntry(CodeGenFunction &CGF,
/// Pops a cleanup block. If the block includes a normal cleanup, the
/// current insertion point is threaded through the cleanup, as are
/// any branch fixups on the cleanup.
-void CodeGenFunction::PopCleanupBlock(bool FallthroughIsBranchThrough) {
+void CodeGenFunction::PopCleanupBlock(bool FallthroughIsBranchThrough,
+ bool ForDeactivation) {
assert(!EHStack.empty() && "cleanup stack is empty!");
assert(isa<EHCleanupScope>(*EHStack.begin()) && "top not a cleanup!");
EHCleanupScope &Scope = cast<EHCleanupScope>(*EHStack.begin());
assert(Scope.getFixupDepth() <= EHStack.getNumBranchFixups());
+ // If we are deactivating a normal cleanup, we need to pretend that the
+ // fallthrough is unreachable. We restore this IP before returning.
+ CGBuilderTy::InsertPoint NormalDeactivateOrigIP;
+ if (ForDeactivation && (Scope.isNormalCleanup() || !getLangOpts().EHAsynch)) {
+ NormalDeactivateOrigIP = Builder.saveAndClearIP();
+ }
// Remember activation information.
bool IsActive = Scope.isActive();
Address NormalActiveFlag =
@@ -667,7 +674,8 @@ void CodeGenFunction::PopCleanupBlock(bool FallthroughIsBranchThrough) {
// - whether there's a fallthrough
llvm::BasicBlock *FallthroughSource = Builder.GetInsertBlock();
- bool HasFallthrough = (FallthroughSource != nullptr && IsActive);
+ bool HasFallthrough =
+ FallthroughSource != nullptr && (IsActive || HasExistingBranches);
// Branch-through fall-throughs leave the insertion point set to the
// end of the last cleanup, which points to the current scope. The
@@ -692,7 +700,11 @@ void CodeGenFunction::PopCleanupBlock(bool FallthroughIsBranchThrough) {
// If we have a prebranched fallthrough into an inactive normal
// cleanup, rewrite it so that it leads to the appropriate place.
- if (Scope.isNormalCleanup() && HasPrebranchedFallthrough && !IsActive) {
+ if (Scope.isNormalCleanup() && HasPrebranchedFallthrough &&
+ !RequiresNormalCleanup) {
+ // FIXME: Come up with a program which would need forwarding prebranched
+ // fallthrough and add tests. Otherwise delete this and assert against it.
+ assert(!IsActive);
llvm::BasicBlock *prebranchDest;
// If the prebranch is semantically branching through the next
@@ -724,6 +736,8 @@ void CodeGenFunction::PopCleanupBlock(bool FallthroughIsBranchThrough) {
EHStack.popCleanup(); // safe because there are no fixups
assert(EHStack.getNumBranchFixups() == 0 ||
EHStack.hasNormalCleanups());
+ if (NormalDeactivateOrigIP.isSet())
+ Builder.restoreIP(NormalDeactivateOrigIP);
return;
}
@@ -760,11 +774,19 @@ void CodeGenFunction::PopCleanupBlock(bool FallthroughIsBranchThrough) {
if (!RequiresNormalCleanup) {
// Mark CPP scope end for passed-by-value Arg temp
// per Windows ABI which is "normally" Cleanup in callee
- if (IsEHa && getInvokeDest() && Builder.GetInsertBlock()) {
- if (Personality.isMSVCXXPersonality())
+ if (IsEHa && getInvokeDest()) {
+ // If we are deactivating a normal cleanup then we don't have a
+ // fallthrough. Restore original IP to emit CPP scope ends in the correct
+ // block.
+ if (NormalDeactivateOrigIP.isSet())
+ Builder.restoreIP(NormalDeactivateOrigIP);
+ if (Personality.isMSVCXXPersonality() && Builder.GetInsertBlock())
EmitSehCppScopeEnd();
+ if (NormalDeactivateOrigIP.isSet())
+ NormalDeactivateOrigIP = Builder.saveAndClearIP();
}
destroyOptimisticNormalEntry(*this, Scope);
+ Scope.MarkEmitted();
EHStack.popCleanup();
} else {
// If we have a fallthrough and no other need for the cleanup,
@@ -781,6 +803,7 @@ void CodeGenFunction::PopCleanupBlock(bool FallthroughIsBranchThrough) {
}
destroyOptimisticNormalEntry(*this, Scope);
+ Scope.MarkEmitted();
EHStack.popCleanup();
EmitCleanup(*this, Fn, cleanupFlags, NormalActiveFlag);
@@ -916,6 +939,7 @@ void CodeGenFunction::PopCleanupBlock(bool FallthroughIsBranchThrough) {
}
// IV. Pop the cleanup and emit it.
+ Scope.MarkEmitted();
EHStack.popCleanup();
assert(EHStack.hasNormalCleanups() == HasEnclosingCleanups);
@@ -984,6 +1008,8 @@ void CodeGenFunction::PopCleanupBlock(bool FallthroughIsBranchThrough) {
}
}
+ if (NormalDeactivateOrigIP.isSet())
+ Builder.restoreIP(NormalDeactivateOrigIP);
assert(EHStack.hasNormalCleanups() || EHStack.getNumBranchFixups() == 0);
// Emit the EH cleanup if required.
@@ -1143,25 +1169,6 @@ void CodeGenFunction::EmitBranchThroughCleanup(JumpDest Dest) {
Builder.ClearInsertionPoint();
}
-static bool IsUsedAsNormalCleanup(EHScopeStack &EHStack,
- EHScopeStack::stable_iterator C) {
- // If we needed a normal block for any reason, that counts.
- if (cast<EHCleanupScope>(*EHStack.find(C)).getNormalBlock())
- return true;
-
- // Check whether any enclosed cleanups were needed.
- for (EHScopeStack::stable_iterator
- I = EHStack.getInnermostNormalCleanup();
- I != C; ) {
- assert(C.strictlyEncloses(I));
- EHCleanupScope &S = cast<EHCleanupScope>(*EHStack.find(I));
- if (S.getNormalBlock()) return true;
- I = S.getEnclosingNormalCleanup();
- }
-
- return false;
-}
-
static bool IsUsedAsEHCleanup(EHScopeStack &EHStack,
EHScopeStack::stable_iterator cleanup) {
// If we needed an EH block for any reason, that counts.
@@ -1210,8 +1217,7 @@ static void SetupCleanupBlockActivation(CodeGenFunction &CGF,
// Calculate whether the cleanup was used:
// - as a normal cleanup
- if (Scope.isNormalCleanup() &&
- (isActivatedInConditional || IsUsedAsNormalCleanup(CGF.EHStack, C))) {
+ if (Scope.isNormalCleanup()) {
Scope.setTestFlagInNormalCleanup();
needFlag = true;
}
@@ -1224,13 +1230,16 @@ static void SetupCleanupBlockActivation(CodeGenFunction &CGF,
}
// If it hasn't yet been used as either, we're done.
- if (!needFlag) return;
+ if (!needFlag)
+ return;
Address var = Scope.getActiveFlag();
if (!var.isValid()) {
+ CodeGenFunction::AllocaTrackerRAII AllocaTracker(CGF);
var = CGF.CreateTempAlloca(CGF.Builder.getInt1Ty(), CharUnits::One(),
"cleanup.isactive");
Scope.setActiveFlag(var);
+ Scope.AddAuxAllocas(AllocaTracker.Take());
assert(dominatingIP && "no existing variable and no dominating IP!");
@@ -1273,17 +1282,8 @@ void CodeGenFunction::DeactivateCleanupBlock(EHScopeStack::stable_iterator C,
// to the current RunCleanupsScope.
if (C == EHStack.stable_begin() &&
CurrentCleanupScopeDepth.strictlyEncloses(C)) {
- // Per comment below, checking EHAsynch is not really necessary
- // it's there to assure zero-impact w/o EHAsynch option
- if (!Scope.isNormalCleanup() && getLangOpts().EHAsynch) {
- PopCleanupBlock();
- } else {
- // If it's a normal cleanup, we need to pretend that the
- // fallthrough is unreachable.
- CGBuilderTy::InsertPoint SavedIP = Builder.saveAndClearIP();
- PopCleanupBlock();
- Builder.restoreIP(SavedIP);
- }
+ PopCleanupBlock(/*FallthroughIsBranchThrough=*/false,
+ /*ForDeactivation=*/true);
return;
}
diff --git a/clang/lib/CodeGen/CGCleanup.h b/clang/lib/CodeGen/CGCleanup.h
index 03e4a29d7b3d..c73c97146abc 100644
--- a/clang/lib/CodeGen/CGCleanup.h
+++ b/clang/lib/CodeGen/CGCleanup.h
@@ -16,8 +16,11 @@
#include "EHScopeStack.h"
#include "Address.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/Instruction.h"
namespace llvm {
class BasicBlock;
@@ -266,6 +269,51 @@ class alignas(8) EHCleanupScope : public EHScope {
};
mutable struct ExtInfo *ExtInfo;
+  /// Erases auxillary allocas and their uses for an unused cleanup.
+  /// Cleanups should mark these allocas as 'used' if the cleanup is
+  /// emitted; otherwise these instructions would be erased.
+ struct AuxillaryAllocas {
+ SmallVector<llvm::Instruction *, 1> AuxAllocas;
+ bool used = false;
+
+ // Records a potentially unused instruction to be erased later.
+ void Add(llvm::AllocaInst *Alloca) { AuxAllocas.push_back(Alloca); }
+
+ // Mark all recorded instructions as used. These will not be erased later.
+ void MarkUsed() {
+ used = true;
+ AuxAllocas.clear();
+ }
+
+ ~AuxillaryAllocas() {
+ if (used)
+ return;
+ llvm::SetVector<llvm::Instruction *> Uses;
+ for (auto *Inst : llvm::reverse(AuxAllocas))
+ CollectUses(Inst, Uses);
+ // Delete uses in the reverse order of insertion.
+ for (auto *I : llvm::reverse(Uses))
+ I->eraseFromParent();
+ }
+
+ private:
+ void CollectUses(llvm::Instruction *I,
+ llvm::SetVector<llvm::Instruction *> &Uses) {
+ if (!I || !Uses.insert(I))
+ return;
+ for (auto *User : I->users())
+ CollectUses(cast<llvm::Instruction>(User), Uses);
+ }
+ };
+ mutable struct AuxillaryAllocas *AuxAllocas;
+
+ AuxillaryAllocas &getAuxillaryAllocas() {
+ if (!AuxAllocas) {
+ AuxAllocas = new struct AuxillaryAllocas();
+ }
+ return *AuxAllocas;
+ }
+
/// The number of fixups required by enclosing scopes (not including
/// this one). If this is the top cleanup scope, all the fixups
/// from this index onwards belong to this scope.
@@ -298,7 +346,7 @@ public:
EHScopeStack::stable_iterator enclosingEH)
: EHScope(EHScope::Cleanup, enclosingEH),
EnclosingNormal(enclosingNormal), NormalBlock(nullptr),
- ActiveFlag(Address::invalid()), ExtInfo(nullptr),
+ ActiveFlag(Address::invalid()), ExtInfo(nullptr), AuxAllocas(nullptr),
FixupDepth(fixupDepth) {
CleanupBits.IsNormalCleanup = isNormal;
CleanupBits.IsEHCleanup = isEH;
@@ -312,8 +360,15 @@ public:
}
void Destroy() {
+ if (AuxAllocas)
+ delete AuxAllocas;
delete ExtInfo;
}
+ void AddAuxAllocas(llvm::SmallVector<llvm::AllocaInst *> Allocas) {
+ for (auto *Alloca : Allocas)
+ getAuxillaryAllocas().Add(Alloca);
+ }
+ void MarkEmitted() { getAuxillaryAllocas().MarkUsed(); }
// Objects of EHCleanupScope are not destructed. Use Destroy().
~EHCleanupScope() = delete;
diff --git a/clang/lib/CodeGen/CGDecl.cpp b/clang/lib/CodeGen/CGDecl.cpp
index ce6d6d895607..9cc67cdbe424 100644
--- a/clang/lib/CodeGen/CGDecl.cpp
+++ b/clang/lib/CodeGen/CGDecl.cpp
@@ -19,6 +19,7 @@
#include "CodeGenFunction.h"
#include "CodeGenModule.h"
#include "ConstantEmitter.h"
+#include "EHScopeStack.h"
#include "PatternInit.h"
#include "TargetInfo.h"
#include "clang/AST/ASTContext.h"
@@ -35,6 +36,7 @@
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Type.h"
#include <optional>
@@ -2201,6 +2203,27 @@ void CodeGenFunction::pushDestroy(CleanupKind cleanupKind, Address addr,
destroyer, useEHCleanupForArray);
}
+// Pushes a destroy and defers its deactivation until its
+// CleanupDeactivationScope is exited.
+void CodeGenFunction::pushDestroyAndDeferDeactivation(
+ QualType::DestructionKind dtorKind, Address addr, QualType type) {
+ assert(dtorKind && "cannot push destructor for trivial type");
+
+ CleanupKind cleanupKind = getCleanupKind(dtorKind);
+ pushDestroyAndDeferDeactivation(
+ cleanupKind, addr, type, getDestroyer(dtorKind), cleanupKind & EHCleanup);
+}
+
+void CodeGenFunction::pushDestroyAndDeferDeactivation(
+ CleanupKind cleanupKind, Address addr, QualType type, Destroyer *destroyer,
+ bool useEHCleanupForArray) {
+ llvm::Instruction *DominatingIP =
+ Builder.CreateFlagLoad(llvm::Constant::getNullValue(Int8PtrTy));
+ pushDestroy(cleanupKind, addr, type, destroyer, useEHCleanupForArray);
+ DeferredDeactivationCleanupStack.push_back(
+ {EHStack.stable_begin(), DominatingIP});
+}
+
void CodeGenFunction::pushStackRestore(CleanupKind Kind, Address SPMem) {
EHStack.pushCleanup<CallStackRestore>(Kind, SPMem);
}
@@ -2217,39 +2240,48 @@ void CodeGenFunction::pushLifetimeExtendedDestroy(CleanupKind cleanupKind,
// If we're not in a conditional branch, we don't need to bother generating a
// conditional cleanup.
if (!isInConditionalBranch()) {
- // Push an EH-only cleanup for the object now.
// FIXME: When popping normal cleanups, we need to keep this EH cleanup
// around in case a temporary's destructor throws an exception.
- if (cleanupKind & EHCleanup)
- EHStack.pushCleanup<DestroyObject>(
- static_cast<CleanupKind>(cleanupKind & ~NormalCleanup), addr, type,
- destroyer, useEHCleanupForArray);
+    // Add the cleanup to the EHStack. After the full-expr, it will be
+    // deactivated before being popped from the stack.
+ pushDestroyAndDeferDeactivation(cleanupKind, addr, type, destroyer,
+ useEHCleanupForArray);
+
+ // Since this is lifetime-extended, push it once again to the EHStack after
+ // the full expression.
return pushCleanupAfterFullExprWithActiveFlag<DestroyObject>(
- cleanupKind, Address::invalid(), addr, type, destroyer, useEHCleanupForArray);
+ cleanupKind, Address::invalid(), addr, type, destroyer,
+ useEHCleanupForArray);
}
// Otherwise, we should only destroy the object if it's been initialized.
- // Re-use the active flag and saved address across both the EH and end of
- // scope cleanups.
- using SavedType = typename DominatingValue<Address>::saved_type;
using ConditionalCleanupType =
EHScopeStack::ConditionalCleanup<DestroyObject, Address, QualType,
Destroyer *, bool>;
-
- Address ActiveFlag = createCleanupActiveFlag();
- SavedType SavedAddr = saveValueInCond(addr);
-
- if (cleanupKind & EHCleanup) {
- EHStack.pushCleanup<ConditionalCleanupType>(
- static_cast<CleanupKind>(cleanupKind & ~NormalCleanup), SavedAddr, type,
- destroyer, useEHCleanupForArray);
- initFullExprCleanupWithFlag(ActiveFlag);
- }
-
+ DominatingValue<Address>::saved_type SavedAddr = saveValueInCond(addr);
+
+  // Remember to emit the cleanup if we branch out before the end of the
+  // full-expression (e.g. through a stmt-expr or a coroutine suspension).
+ AllocaTrackerRAII DeactivationAllocas(*this);
+ Address ActiveFlagForDeactivation = createCleanupActiveFlag();
+
+ pushCleanupAndDeferDeactivation<ConditionalCleanupType>(
+ cleanupKind, SavedAddr, type, destroyer, useEHCleanupForArray);
+ initFullExprCleanupWithFlag(ActiveFlagForDeactivation);
+ EHCleanupScope &cleanup = cast<EHCleanupScope>(*EHStack.begin());
+  // Track the active flag as an auxiliary alloca so that it is erased if the
+  // cleanup is never emitted.
+ cleanup.AddAuxAllocas(std::move(DeactivationAllocas).Take());
+
+ // Since this is lifetime-extended, push it once again to the EHStack after
+ // the full expression.
+  // The previous active flag will always be 'false' due to forced deferred
+  // deactivation. Use a separate flag for lifetime-extension to correctly
+  // remember whether this branch was taken and the object was initialized.
+ Address ActiveFlagForLifetimeExt = createCleanupActiveFlag();
pushCleanupAfterFullExprWithActiveFlag<ConditionalCleanupType>(
- cleanupKind, ActiveFlag, SavedAddr, type, destroyer,
+ cleanupKind, ActiveFlagForLifetimeExt, SavedAddr, type, destroyer,
useEHCleanupForArray);
}
@@ -2442,9 +2474,9 @@ namespace {
};
} // end anonymous namespace
-/// pushIrregularPartialArrayCleanup - Push an EH cleanup to destroy
-/// already-constructed elements of the given array. The cleanup
-/// may be popped with DeactivateCleanupBlock or PopCleanupBlock.
+/// pushIrregularPartialArrayCleanup - Push a NormalAndEHCleanup to
+/// destroy already-constructed elements of the given array. The cleanup may be
+/// popped with DeactivateCleanupBlock or PopCleanupBlock.
///
/// \param elementType - the immediate element type of the array;
/// possibly still an array type
@@ -2453,10 +2485,9 @@ void CodeGenFunction::pushIrregularPartialArrayCleanup(llvm::Value *arrayBegin,
QualType elementType,
CharUnits elementAlign,
Destroyer *destroyer) {
- pushFullExprCleanup<IrregularPartialArrayDestroy>(EHCleanup,
- arrayBegin, arrayEndPointer,
- elementType, elementAlign,
- destroyer);
+ pushFullExprCleanup<IrregularPartialArrayDestroy>(
+ NormalAndEHCleanup, arrayBegin, arrayEndPointer, elementType,
+ elementAlign, destroyer);
}
/// pushRegularPartialArrayCleanup - Push an EH cleanup to destroy
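
The conditional branch handled above corresponds to lifetime-extended temporaries materialized under only one arm of a branch, which is why the active flag is split in two. A hypothetical source shape (not from the patch):

    struct T { T() {} ~T() {} };
    bool cond();

    void h() {
      // Each arm materializes its own temporary. The deactivation flag is
      // forced to 'false' after the full-expression, while the separate
      // lifetime-extension flag remembers which arm actually constructed its
      // temporary, so the right destructor runs at the end of h().
      const T &r = cond() ? T() : T();
      (void)r;
    }
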
diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp
index c94322f51e46..d96c7bb1e568 100644
--- a/clang/lib/CodeGen/CGExpr.cpp
+++ b/clang/lib/CodeGen/CGExpr.cpp
@@ -115,10 +115,16 @@ RawAddress CodeGenFunction::CreateTempAlloca(llvm::Type *Ty, CharUnits Align,
llvm::AllocaInst *CodeGenFunction::CreateTempAlloca(llvm::Type *Ty,
const Twine &Name,
llvm::Value *ArraySize) {
+ llvm::AllocaInst *Alloca;
if (ArraySize)
- return Builder.CreateAlloca(Ty, ArraySize, Name);
- return new llvm::AllocaInst(Ty, CGM.getDataLayout().getAllocaAddrSpace(),
- ArraySize, Name, AllocaInsertPt);
+ Alloca = Builder.CreateAlloca(Ty, ArraySize, Name);
+ else
+ Alloca = new llvm::AllocaInst(Ty, CGM.getDataLayout().getAllocaAddrSpace(),
+ ArraySize, Name, AllocaInsertPt);
+ if (Allocas) {
+ Allocas->Add(Alloca);
+ }
+ return Alloca;
}
/// CreateDefaultAlignTempAlloca - This creates an alloca with the
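
CreateTempAlloca is now the single funnel through which new allocas reach the optional tracker, so a cleanup's auxiliary allocas (for example, its active flag) can be deleted later if the cleanup is never emitted. A standalone analogue of the optional-tracker pattern, with toy types in place of clang's:

    #include <vector>

    struct AllocaLike {};                  // stands in for llvm::AllocaInst

    struct Tracker {
      std::vector<AllocaLike *> Items;
      void Add(AllocaLike *A) { Items.push_back(A); }
    };

    struct FunctionEmitter {
      Tracker *Active = nullptr;           // set while a tracker RAII is live

      AllocaLike *createTemp() {
        auto *A = new AllocaLike();        // stands in for emitting an alloca
        if (Active)                        // record only when tracking
          Active->Add(A);
        return A;                          // leaks deliberately; a sketch
      }
    };
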
diff --git a/clang/lib/CodeGen/CGExprAgg.cpp b/clang/lib/CodeGen/CGExprAgg.cpp
index 355fec42be44..44d476976a55 100644
--- a/clang/lib/CodeGen/CGExprAgg.cpp
+++ b/clang/lib/CodeGen/CGExprAgg.cpp
@@ -15,6 +15,7 @@
#include "CodeGenFunction.h"
#include "CodeGenModule.h"
#include "ConstantEmitter.h"
+#include "EHScopeStack.h"
#include "TargetInfo.h"
#include "clang/AST/ASTContext.h"
#include "clang/AST/Attr.h"
@@ -24,6 +25,7 @@
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Instruction.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
using namespace clang;
@@ -558,24 +560,27 @@ void AggExprEmitter::EmitArrayInit(Address DestPtr, llvm::ArrayType *AType,
// For that, we'll need an EH cleanup.
QualType::DestructionKind dtorKind = elementType.isDestructedType();
Address endOfInit = Address::invalid();
- EHScopeStack::stable_iterator cleanup;
- llvm::Instruction *cleanupDominator = nullptr;
- if (CGF.needsEHCleanup(dtorKind)) {
+ CodeGenFunction::CleanupDeactivationScope deactivation(CGF);
+
+ if (dtorKind) {
+ CodeGenFunction::AllocaTrackerRAII allocaTracker(CGF);
// In principle we could tell the cleanup where we are more
// directly, but the control flow can get so varied here that it
// would actually be quite complex. Therefore we go through an
// alloca.
+ llvm::Instruction *dominatingIP =
+ Builder.CreateFlagLoad(llvm::ConstantInt::getNullValue(CGF.Int8PtrTy));
endOfInit = CGF.CreateTempAlloca(begin->getType(), CGF.getPointerAlign(),
"arrayinit.endOfInit");
- cleanupDominator = Builder.CreateStore(begin, endOfInit);
+ Builder.CreateStore(begin, endOfInit);
CGF.pushIrregularPartialArrayCleanup(begin, endOfInit, elementType,
elementAlign,
CGF.getDestroyer(dtorKind));
- cleanup = CGF.EHStack.stable_begin();
+ cast<EHCleanupScope>(*CGF.EHStack.find(CGF.EHStack.stable_begin()))
+ .AddAuxAllocas(allocaTracker.Take());
- // Otherwise, remember that we didn't need a cleanup.
- } else {
- dtorKind = QualType::DK_none;
+ CGF.DeferredDeactivationCleanupStack.push_back(
+ {CGF.EHStack.stable_begin(), dominatingIP});
}
llvm::Value *one = llvm::ConstantInt::get(CGF.SizeTy, 1);
@@ -671,9 +676,6 @@ void AggExprEmitter::EmitArrayInit(Address DestPtr, llvm::ArrayType *AType,
CGF.EmitBlock(endBB);
}
-
- // Leave the partial-array cleanup if we entered one.
- if (dtorKind) CGF.DeactivateCleanupBlock(cleanup, cleanupDominator);
}
//===----------------------------------------------------------------------===//
@@ -1374,9 +1376,8 @@ AggExprEmitter::VisitLambdaExpr(LambdaExpr *E) {
LValue SlotLV = CGF.MakeAddrLValue(Slot.getAddress(), E->getType());
// We'll need to enter cleanup scopes in case any of the element
- // initializers throws an exception.
- SmallVector<EHScopeStack::stable_iterator, 16> Cleanups;
- llvm::Instruction *CleanupDominator = nullptr;
+  // initializers throws an exception or contains a branch out of the
+  // expression.
+ CodeGenFunction::CleanupDeactivationScope scope(CGF);
CXXRecordDecl::field_iterator CurField = E->getLambdaClass()->field_begin();
for (LambdaExpr::const_capture_init_iterator i = E->capture_init_begin(),
@@ -1395,28 +1396,12 @@ AggExprEmitter::VisitLambdaExpr(LambdaExpr *E) {
if (QualType::DestructionKind DtorKind =
CurField->getType().isDestructedType()) {
assert(LV.isSimple());
- if (CGF.needsEHCleanup(DtorKind)) {
- if (!CleanupDominator)
- CleanupDominator = CGF.Builder.CreateAlignedLoad(
- CGF.Int8Ty,
- llvm::Constant::getNullValue(CGF.Int8PtrTy),
- CharUnits::One()); // placeholder
-
- CGF.pushDestroy(EHCleanup, LV.getAddress(CGF), CurField->getType(),
- CGF.getDestroyer(DtorKind), false);
- Cleanups.push_back(CGF.EHStack.stable_begin());
- }
+ if (DtorKind)
+ CGF.pushDestroyAndDeferDeactivation(
+ NormalAndEHCleanup, LV.getAddress(CGF), CurField->getType(),
+ CGF.getDestroyer(DtorKind), false);
}
}
-
- // Deactivate all the partial cleanups in reverse order, which
- // generally means popping them.
- for (unsigned i = Cleanups.size(); i != 0; --i)
- CGF.DeactivateCleanupBlock(Cleanups[i-1], CleanupDominator);
-
- // Destroy the placeholder if we made one.
- if (CleanupDominator)
- CleanupDominator->eraseFromParent();
}
void AggExprEmitter::VisitExprWithCleanups(ExprWithCleanups *E) {
@@ -1705,14 +1690,7 @@ void AggExprEmitter::VisitCXXParenListOrInitListExpr(
// We'll need to enter cleanup scopes in case any of the element
// initializers throws an exception.
SmallVector<EHScopeStack::stable_iterator, 16> cleanups;
- llvm::Instruction *cleanupDominator = nullptr;
- auto addCleanup = [&](const EHScopeStack::stable_iterator &cleanup) {
- cleanups.push_back(cleanup);
- if (!cleanupDominator) // create placeholder once needed
- cleanupDominator = CGF.Builder.CreateAlignedLoad(
- CGF.Int8Ty, llvm::Constant::getNullValue(CGF.Int8PtrTy),
- CharUnits::One());
- };
+ CodeGenFunction::CleanupDeactivationScope DeactivateCleanups(CGF);
unsigned curInitIndex = 0;
@@ -1735,10 +1713,8 @@ void AggExprEmitter::VisitCXXParenListOrInitListExpr(
CGF.EmitAggExpr(InitExprs[curInitIndex++], AggSlot);
if (QualType::DestructionKind dtorKind =
- Base.getType().isDestructedType()) {
- CGF.pushDestroy(dtorKind, V, Base.getType());
- addCleanup(CGF.EHStack.stable_begin());
- }
+ Base.getType().isDestructedType())
+ CGF.pushDestroyAndDeferDeactivation(dtorKind, V, Base.getType());
}
}
@@ -1815,10 +1791,10 @@ void AggExprEmitter::VisitCXXParenListOrInitListExpr(
if (QualType::DestructionKind dtorKind
= field->getType().isDestructedType()) {
assert(LV.isSimple());
- if (CGF.needsEHCleanup(dtorKind)) {
- CGF.pushDestroy(EHCleanup, LV.getAddress(CGF), field->getType(),
- CGF.getDestroyer(dtorKind), false);
- addCleanup(CGF.EHStack.stable_begin());
+ if (dtorKind) {
+ CGF.pushDestroyAndDeferDeactivation(
+ NormalAndEHCleanup, LV.getAddress(CGF), field->getType(),
+ CGF.getDestroyer(dtorKind), false);
pushedCleanup = true;
}
}
@@ -1831,17 +1807,6 @@ void AggExprEmitter::VisitCXXParenListOrInitListExpr(
if (GEP->use_empty())
GEP->eraseFromParent();
}
-
- // Deactivate all the partial cleanups in reverse order, which
- // generally means popping them.
- assert((cleanupDominator || cleanups.empty()) &&
- "Missing cleanupDominator before deactivating cleanup blocks");
- for (unsigned i = cleanups.size(); i != 0; --i)
- CGF.DeactivateCleanupBlock(cleanups[i-1], cleanupDominator);
-
- // Destroy the placeholder if we made one.
- if (cleanupDominator)
- cleanupDominator->eraseFromParent();
}
void AggExprEmitter::VisitArrayInitLoopExpr(const ArrayInitLoopExpr *E,
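
The aggregate, lambda-capture, and init-list paths above all serve source like the following, where an element initializer can throw partway through (hypothetical example):

    struct D { D(int); ~D(); };     // constructor may throw
    struct Agg { D a, b, c; };

    Agg makeAgg() {
      // If D(2) throws, the already-constructed D(1) must be destroyed. The
      // cleanups pushed per element are now deactivated only when the
      // CleanupDeactivationScope for the full-expression exits.
      return Agg{D(1), D(2), D(3)};
    }
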
diff --git a/clang/lib/CodeGen/CGExprCXX.cpp b/clang/lib/CodeGen/CGExprCXX.cpp
index 673ccef84d67..c18c36d3f3f3 100644
--- a/clang/lib/CodeGen/CGExprCXX.cpp
+++ b/clang/lib/CodeGen/CGExprCXX.cpp
@@ -1008,8 +1008,8 @@ void CodeGenFunction::EmitNewArrayInitializer(
const Expr *Init = E->getInitializer();
Address EndOfInit = Address::invalid();
QualType::DestructionKind DtorKind = ElementType.isDestructedType();
- EHScopeStack::stable_iterator Cleanup;
- llvm::Instruction *CleanupDominator = nullptr;
+ CleanupDeactivationScope deactivation(*this);
+ bool pushedCleanup = false;
CharUnits ElementSize = getContext().getTypeSizeInChars(ElementType);
CharUnits ElementAlign =
@@ -1105,19 +1105,24 @@ void CodeGenFunction::EmitNewArrayInitializer(
}
// Enter a partial-destruction Cleanup if necessary.
- if (needsEHCleanup(DtorKind)) {
+ if (DtorKind) {
+ AllocaTrackerRAII AllocaTracker(*this);
// In principle we could tell the Cleanup where we are more
// directly, but the control flow can get so varied here that it
// would actually be quite complex. Therefore we go through an
// alloca.
+ llvm::Instruction *DominatingIP =
+ Builder.CreateFlagLoad(llvm::ConstantInt::getNullValue(Int8PtrTy));
EndOfInit = CreateTempAlloca(BeginPtr.getType(), getPointerAlign(),
"array.init.end");
- CleanupDominator =
- Builder.CreateStore(BeginPtr.emitRawPointer(*this), EndOfInit);
pushIrregularPartialArrayCleanup(BeginPtr.emitRawPointer(*this),
EndOfInit, ElementType, ElementAlign,
getDestroyer(DtorKind));
- Cleanup = EHStack.stable_begin();
+ cast<EHCleanupScope>(*EHStack.find(EHStack.stable_begin()))
+ .AddAuxAllocas(AllocaTracker.Take());
+ DeferredDeactivationCleanupStack.push_back(
+ {EHStack.stable_begin(), DominatingIP});
+ pushedCleanup = true;
}
CharUnits StartAlign = CurPtr.getAlignment();
@@ -1164,9 +1169,6 @@ void CodeGenFunction::EmitNewArrayInitializer(
// initialization.
llvm::ConstantInt *ConstNum = dyn_cast<llvm::ConstantInt>(NumElements);
if (ConstNum && ConstNum->getZExtValue() <= InitListElements) {
- // If there was a Cleanup, deactivate it.
- if (CleanupDominator)
- DeactivateCleanupBlock(Cleanup, CleanupDominator);
return;
}
@@ -1281,13 +1283,14 @@ void CodeGenFunction::EmitNewArrayInitializer(
Builder.CreateStore(CurPtr.emitRawPointer(*this), EndOfInit);
// Enter a partial-destruction Cleanup if necessary.
- if (!CleanupDominator && needsEHCleanup(DtorKind)) {
- llvm::Value *BeginPtrRaw = BeginPtr.emitRawPointer(*this);
- llvm::Value *CurPtrRaw = CurPtr.emitRawPointer(*this);
- pushRegularPartialArrayCleanup(BeginPtrRaw, CurPtrRaw, ElementType,
+ if (!pushedCleanup && needsEHCleanup(DtorKind)) {
+ llvm::Instruction *DominatingIP =
+ Builder.CreateFlagLoad(llvm::ConstantInt::getNullValue(Int8PtrTy));
+ pushRegularPartialArrayCleanup(BeginPtr.emitRawPointer(*this),
+ CurPtr.emitRawPointer(*this), ElementType,
ElementAlign, getDestroyer(DtorKind));
- Cleanup = EHStack.stable_begin();
- CleanupDominator = Builder.CreateUnreachable();
+ DeferredDeactivationCleanupStack.push_back(
+ {EHStack.stable_begin(), DominatingIP});
}
// Emit the initializer into this element.
@@ -1295,10 +1298,7 @@ void CodeGenFunction::EmitNewArrayInitializer(
AggValueSlot::DoesNotOverlap);
// Leave the Cleanup if we entered one.
- if (CleanupDominator) {
- DeactivateCleanupBlock(Cleanup, CleanupDominator);
- CleanupDominator->eraseFromParent();
- }
+ deactivation.ForceDeactivate();
// Advance to the next element by adjusting the pointer type as necessary.
llvm::Value *NextPtr = Builder.CreateConstInBoundsGEP1_32(
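
The new-array initializer uses the same scheme; a sketch of the source shape it protects (constructor bodies omitted):

    struct E { E(); ~E(); };        // constructor may throw

    E *make(unsigned n) {
      // If the k-th E() throws, the partial-destruction cleanup destroys
      // elements [0, k) before the exception leaves the new-expression; its
      // deactivation is deferred via the CleanupDeactivationScope above.
      return new E[n];
    }
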
diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp
index 40a5cd20c3d7..af48e8d2b839 100644
--- a/clang/lib/CodeGen/CGExprScalar.cpp
+++ b/clang/lib/CodeGen/CGExprScalar.cpp
@@ -2330,7 +2330,7 @@ Value *ScalarExprEmitter::VisitCastExpr(CastExpr *CE) {
}
// Perform VLAT <-> VLST bitcast through memory.
- // TODO: since the llvm.experimental.vector.{insert,extract} intrinsics
+ // TODO: since the llvm.vector.{insert,extract} intrinsics
// require the element types of the vectors to be the same, we
// need to keep this around for bitcasts between VLAT <-> VLST where
// the element types of the vectors are not the same, until we figure
diff --git a/clang/lib/CodeGen/CGLoopInfo.cpp b/clang/lib/CodeGen/CGLoopInfo.cpp
index 72d1471021ac..0d4800b90a2f 100644
--- a/clang/lib/CodeGen/CGLoopInfo.cpp
+++ b/clang/lib/CodeGen/CGLoopInfo.cpp
@@ -673,8 +673,6 @@ void LoopInfoStack::push(BasicBlock *Header, clang::ASTContext &Ctx,
setPipelineDisabled(true);
break;
case LoopHintAttr::UnrollCount:
- setUnrollState(LoopAttributes::Disable);
- break;
case LoopHintAttr::UnrollAndJamCount:
case LoopHintAttr::VectorizeWidth:
case LoopHintAttr::InterleaveCount:
diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp
index 86a6ddd80cc1..87766a758311 100644
--- a/clang/lib/CodeGen/CodeGenFunction.cpp
+++ b/clang/lib/CodeGen/CodeGenFunction.cpp
@@ -91,6 +91,8 @@ CodeGenFunction::CodeGenFunction(CodeGenModule &cgm, bool suppressNewContext)
CodeGenFunction::~CodeGenFunction() {
assert(LifetimeExtendedCleanupStack.empty() && "failed to emit a cleanup");
+  assert(DeferredDeactivationCleanupStack.empty() &&
+         "failed to deactivate a cleanup");
if (getLangOpts().OpenMP && CurFn)
CGM.getOpenMPRuntime().functionFinished(*this);
@@ -346,6 +348,10 @@ static void EmitIfUsed(CodeGenFunction &CGF, llvm::BasicBlock *BB) {
void CodeGenFunction::FinishFunction(SourceLocation EndLoc) {
assert(BreakContinueStack.empty() &&
"mismatched push/pop in break/continue stack!");
+ assert(LifetimeExtendedCleanupStack.empty() &&
+ "mismatched push/pop of cleanups in EHStack!");
+ assert(DeferredDeactivationCleanupStack.empty() &&
+ "mismatched activate/deactivate of cleanups!");
bool OnlySimpleReturnStmts = NumSimpleReturnExprs > 0
&& NumSimpleReturnExprs == NumReturnExprs
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index 33fb7a41912b..6e7417fc7f52 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -39,6 +39,7 @@
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
+#include "llvm/IR/Instructions.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/Utils/SanitizerStats.h"
@@ -670,6 +671,51 @@ public:
EHScopeStack EHStack;
llvm::SmallVector<char, 256> LifetimeExtendedCleanupStack;
+
+  // A stack of cleanups which were added to the EHStack but have to be
+  // deactivated later, before being popped or emitted. These are usually
+  // deactivated on exiting a `CleanupDeactivationScope`, for instance at the
+  // end of a full-expression.
+  //
+  // These are especially useful for correctly emitting cleanups when
+  // encountering branches out of an expression (through stmt-exprs or
+  // coroutine suspensions).
+ struct DeferredDeactivateCleanup {
+ EHScopeStack::stable_iterator Cleanup;
+ llvm::Instruction *DominatingIP;
+ };
+ llvm::SmallVector<DeferredDeactivateCleanup> DeferredDeactivationCleanupStack;
+
+  // Enters a new scope for capturing cleanups whose deactivation has been
+  // deferred; all of them are deactivated once the scope is exited.
+ struct CleanupDeactivationScope {
+ CodeGenFunction &CGF;
+ size_t OldDeactivateCleanupStackSize;
+ bool Deactivated;
+ CleanupDeactivationScope(CodeGenFunction &CGF)
+ : CGF(CGF), OldDeactivateCleanupStackSize(
+ CGF.DeferredDeactivationCleanupStack.size()),
+ Deactivated(false) {}
+
+ void ForceDeactivate() {
+ assert(!Deactivated && "Deactivating already deactivated scope");
+ auto &Stack = CGF.DeferredDeactivationCleanupStack;
+ for (size_t I = Stack.size(); I > OldDeactivateCleanupStackSize; I--) {
+ CGF.DeactivateCleanupBlock(Stack[I - 1].Cleanup,
+ Stack[I - 1].DominatingIP);
+ Stack[I - 1].DominatingIP->eraseFromParent();
+ }
+ Stack.resize(OldDeactivateCleanupStackSize);
+ Deactivated = true;
+ }
+
+ ~CleanupDeactivationScope() {
+ if (Deactivated)
+ return;
+ ForceDeactivate();
+ }
+ };
+
llvm::SmallVector<const JumpDest *, 2> SEHTryEpilogueStack;
llvm::Instruction *CurrentFuncletPad = nullptr;
@@ -875,6 +921,19 @@ public:
new (Buffer + sizeof(Header) + sizeof(T)) RawAddress(ActiveFlag);
}
+  // Push a cleanup onto the EHStack and defer its deactivation. It is usually
+  // deactivated when exiting a `CleanupDeactivationScope` (for example, at the
+  // end of a full-expression).
+ template <class T, class... As>
+ void pushCleanupAndDeferDeactivation(CleanupKind Kind, As... A) {
+ // Placeholder dominating IP for this cleanup.
+ llvm::Instruction *DominatingIP =
+ Builder.CreateFlagLoad(llvm::Constant::getNullValue(Int8PtrTy));
+ EHStack.pushCleanup<T>(Kind, A...);
+ DeferredDeactivationCleanupStack.push_back(
+ {EHStack.stable_begin(), DominatingIP});
+ }
+
/// Set up the last cleanup that was pushed as a conditional
/// full-expression cleanup.
void initFullExprCleanup() {
@@ -898,7 +957,8 @@ public:
/// PopCleanupBlock - Will pop the cleanup entry on the stack and
/// process all branch fixups.
- void PopCleanupBlock(bool FallThroughIsBranchThrough = false);
+ void PopCleanupBlock(bool FallThroughIsBranchThrough = false,
+ bool ForDeactivation = false);
/// DeactivateCleanupBlock - Deactivates the given cleanup block.
/// The block cannot be reactivated. Pops it if it's the top of the
@@ -926,6 +986,7 @@ public:
class RunCleanupsScope {
EHScopeStack::stable_iterator CleanupStackDepth, OldCleanupScopeDepth;
size_t LifetimeExtendedCleanupStackSize;
+ CleanupDeactivationScope DeactivateCleanups;
bool OldDidCallStackSave;
protected:
bool PerformCleanup;
@@ -940,8 +1001,7 @@ public:
public:
/// Enter a new cleanup scope.
explicit RunCleanupsScope(CodeGenFunction &CGF)
- : PerformCleanup(true), CGF(CGF)
- {
+ : DeactivateCleanups(CGF), PerformCleanup(true), CGF(CGF) {
CleanupStackDepth = CGF.EHStack.stable_begin();
LifetimeExtendedCleanupStackSize =
CGF.LifetimeExtendedCleanupStack.size();
@@ -971,6 +1031,7 @@ public:
void ForceCleanup(std::initializer_list<llvm::Value**> ValuesToReload = {}) {
assert(PerformCleanup && "Already forced cleanup");
CGF.DidCallStackSave = OldDidCallStackSave;
+ DeactivateCleanups.ForceDeactivate();
CGF.PopCleanupBlocks(CleanupStackDepth, LifetimeExtendedCleanupStackSize,
ValuesToReload);
PerformCleanup = false;
@@ -2160,6 +2221,11 @@ public:
Address addr, QualType type);
void pushDestroy(CleanupKind kind, Address addr, QualType type,
Destroyer *destroyer, bool useEHCleanupForArray);
+ void pushDestroyAndDeferDeactivation(QualType::DestructionKind dtorKind,
+ Address addr, QualType type);
+ void pushDestroyAndDeferDeactivation(CleanupKind cleanupKind, Address addr,
+ QualType type, Destroyer *destroyer,
+ bool useEHCleanupForArray);
void pushLifetimeExtendedDestroy(CleanupKind kind, Address addr,
QualType type, Destroyer *destroyer,
bool useEHCleanupForArray);
@@ -2698,6 +2764,33 @@ public:
TBAAAccessInfo *TBAAInfo = nullptr);
LValue EmitLoadOfPointerLValue(Address Ptr, const PointerType *PtrTy);
+private:
+ struct AllocaTracker {
+ void Add(llvm::AllocaInst *I) { Allocas.push_back(I); }
+ llvm::SmallVector<llvm::AllocaInst *> Take() { return std::move(Allocas); }
+
+ private:
+ llvm::SmallVector<llvm::AllocaInst *> Allocas;
+ };
+ AllocaTracker *Allocas = nullptr;
+
+public:
+  // Captures all the allocas created while an object of this RAII type is
+  // live.
+ struct AllocaTrackerRAII {
+ AllocaTrackerRAII(CodeGenFunction &CGF)
+ : CGF(CGF), OldTracker(CGF.Allocas) {
+ CGF.Allocas = &Tracker;
+ }
+ ~AllocaTrackerRAII() { CGF.Allocas = OldTracker; }
+
+ llvm::SmallVector<llvm::AllocaInst *> Take() { return Tracker.Take(); }
+
+ private:
+ CodeGenFunction &CGF;
+ AllocaTracker *OldTracker;
+ AllocaTracker Tracker;
+ };
+
/// CreateTempAlloca - This creates an alloca and inserts it into the entry
/// block if \p ArraySize is nullptr, otherwise inserts it at the current
/// insertion point of the builder. The caller is responsible for setting an
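
A self-contained analogue of the two structures above may make the contract clearer: cleanups are pushed eagerly, the scope records which of them to switch off, and deactivation happens only at scope exit, so an early unwind still runs them. This is a toy model, not clang's implementation:

    #include <cstdio>
    #include <functional>
    #include <vector>

    struct CleanupStack {
      struct Entry {
        std::function<void()> Run;
        bool Active = true;
      };
      std::vector<Entry> Entries;

      size_t push(std::function<void()> Fn) {
        Entries.push_back({std::move(Fn), true});
        return Entries.size() - 1;
      }
      void deactivate(size_t I) { Entries[I].Active = false; }

      // Run all still-active cleanups, as codegen would on an early branch.
      void unwind() {
        for (auto It = Entries.rbegin(); It != Entries.rend(); ++It)
          if (It->Active)
            It->Run();
      }
    };

    struct DeactivationScope {             // cf. CleanupDeactivationScope
      CleanupStack &Stack;
      std::vector<size_t> Deferred;
      ~DeactivationScope() {
        for (size_t I : Deferred)
          Stack.deactivate(I);             // deferred, not immediate
      }
    };

    int main() {
      CleanupStack Stack;
      {
        DeactivationScope Scope{Stack, {}};
        Scope.Deferred.push_back(
            Stack.push([] { std::puts("destroy temporary"); }));
        // A branch out of the "full-expression" here still sees the cleanup
        // as active, because deactivation is deferred to scope exit.
        Stack.unwind();                    // prints "destroy temporary"
      }
    }
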
diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp
index b65b96db16bd..fec11c7e716f 100644
--- a/clang/lib/Driver/ToolChains/CommonArgs.cpp
+++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp
@@ -1191,118 +1191,10 @@ bool tools::addOpenMPRuntime(const Compilation &C, ArgStringList &CmdArgs,
return true;
}
-/// Determines if --whole-archive is active in the list of arguments.
-static bool isWholeArchivePresent(const ArgList &Args) {
- bool WholeArchiveActive = false;
- for (auto *Arg : Args.filtered(options::OPT_Wl_COMMA)) {
- if (Arg) {
- for (StringRef ArgValue : Arg->getValues()) {
- if (ArgValue == "--whole-archive")
- WholeArchiveActive = true;
- if (ArgValue == "--no-whole-archive")
- WholeArchiveActive = false;
- }
- }
- }
-
- return WholeArchiveActive;
-}
-
-/// Determine if driver is invoked to create a shared object library (-static)
-static bool isSharedLinkage(const ArgList &Args) {
- return Args.hasArg(options::OPT_shared);
-}
-
-/// Determine if driver is invoked to create a static object library (-shared)
-static bool isStaticLinkage(const ArgList &Args) {
- return Args.hasArg(options::OPT_static);
-}
-
-/// Add Fortran runtime libs for MSVC
-static void addFortranRuntimeLibsMSVC(const ArgList &Args,
- llvm::opt::ArgStringList &CmdArgs) {
- unsigned RTOptionID = options::OPT__SLASH_MT;
- if (auto *rtl = Args.getLastArg(options::OPT_fms_runtime_lib_EQ)) {
- RTOptionID = llvm::StringSwitch<unsigned>(rtl->getValue())
- .Case("static", options::OPT__SLASH_MT)
- .Case("static_dbg", options::OPT__SLASH_MTd)
- .Case("dll", options::OPT__SLASH_MD)
- .Case("dll_dbg", options::OPT__SLASH_MDd)
- .Default(options::OPT__SLASH_MT);
- }
- switch (RTOptionID) {
- case options::OPT__SLASH_MT:
- CmdArgs.push_back("/WHOLEARCHIVE:Fortran_main.static.lib");
- break;
- case options::OPT__SLASH_MTd:
- CmdArgs.push_back("/WHOLEARCHIVE:Fortran_main.static_dbg.lib");
- break;
- case options::OPT__SLASH_MD:
- CmdArgs.push_back("/WHOLEARCHIVE:Fortran_main.dynamic.lib");
- break;
- case options::OPT__SLASH_MDd:
- CmdArgs.push_back("/WHOLEARCHIVE:Fortran_main.dynamic_dbg.lib");
- break;
- }
-}
-
-// Add FortranMain runtime lib
-static void addFortranMain(const ToolChain &TC, const ArgList &Args,
- llvm::opt::ArgStringList &CmdArgs) {
- // 0. Shared-library linkage
- // If we are attempting to link a library, we should not add
- // -lFortran_main.a to the link line, as the `main` symbol is not
- // required for a library and should also be provided by one of
- // the translation units of the code that this shared library
- // will be linked against eventually.
- if (isSharedLinkage(Args) || isStaticLinkage(Args)) {
- return;
- }
-
- // 1. MSVC
- if (TC.getTriple().isKnownWindowsMSVCEnvironment()) {
- addFortranRuntimeLibsMSVC(Args, CmdArgs);
- return;
- }
-
- // 2. GNU and similar
- const Driver &D = TC.getDriver();
- const char *FortranMainLinkFlag = "-lFortran_main";
-
- // Warn if the user added `-lFortran_main` - this library is an implementation
- // detail of Flang and should be handled automaticaly by the driver.
- for (const char *arg : CmdArgs) {
- if (strncmp(arg, FortranMainLinkFlag, strlen(FortranMainLinkFlag)) == 0)
- D.Diag(diag::warn_drv_deprecated_custom)
- << FortranMainLinkFlag
- << "see the Flang driver documentation for correct usage";
- }
-
- // The --whole-archive option needs to be part of the link line to make
- // sure that the main() function from Fortran_main.a is pulled in by the
- // linker. However, it shouldn't be used if it's already active.
- // TODO: Find an equivalent of `--whole-archive` for Darwin and AIX.
- if (!isWholeArchivePresent(Args) && !TC.getTriple().isMacOSX() &&
- !TC.getTriple().isOSAIX()) {
- CmdArgs.push_back("--whole-archive");
- CmdArgs.push_back(FortranMainLinkFlag);
- CmdArgs.push_back("--no-whole-archive");
- return;
- }
-
- CmdArgs.push_back(FortranMainLinkFlag);
-}
-
/// Add Fortran runtime libs
void tools::addFortranRuntimeLibs(const ToolChain &TC, const ArgList &Args,
llvm::opt::ArgStringList &CmdArgs) {
- // 1. Link FortranMain
- // FortranMain depends on FortranRuntime, so needs to be listed first. If
- // -fno-fortran-main has been passed, skip linking Fortran_main.a
- if (!Args.hasArg(options::OPT_no_fortran_main))
- addFortranMain(TC, Args, CmdArgs);
-
- // 2. Link FortranRuntime and FortranDecimal
+ // Link FortranRuntime and FortranDecimal
// These are handled earlier on Windows by telling the frontend driver to
// add the correct libraries to link against as dependents in the object
// file.
diff --git a/clang/lib/Driver/ToolChains/Flang.cpp b/clang/lib/Driver/ToolChains/Flang.cpp
index 6d93c1f3d703..8955b9fb653c 100644
--- a/clang/lib/Driver/ToolChains/Flang.cpp
+++ b/clang/lib/Driver/ToolChains/Flang.cpp
@@ -282,7 +282,6 @@ static void processVSRuntimeLibrary(const ToolChain &TC, const ArgList &Args,
assert(TC.getTriple().isKnownWindowsMSVCEnvironment() &&
"can only add VS runtime library on Windows!");
// if -fno-fortran-main has been passed, skip linking Fortran_main.a
- bool LinkFortranMain = !Args.hasArg(options::OPT_no_fortran_main);
if (TC.getTriple().isKnownWindowsMSVCEnvironment()) {
CmdArgs.push_back(Args.MakeArgString(
"--dependent-lib=" + TC.getCompilerRTBasename(Args, "builtins")));
@@ -300,8 +299,6 @@ static void processVSRuntimeLibrary(const ToolChain &TC, const ArgList &Args,
case options::OPT__SLASH_MT:
CmdArgs.push_back("-D_MT");
CmdArgs.push_back("--dependent-lib=libcmt");
- if (LinkFortranMain)
- CmdArgs.push_back("--dependent-lib=Fortran_main.static.lib");
CmdArgs.push_back("--dependent-lib=FortranRuntime.static.lib");
CmdArgs.push_back("--dependent-lib=FortranDecimal.static.lib");
break;
@@ -309,8 +306,6 @@ static void processVSRuntimeLibrary(const ToolChain &TC, const ArgList &Args,
CmdArgs.push_back("-D_MT");
CmdArgs.push_back("-D_DEBUG");
CmdArgs.push_back("--dependent-lib=libcmtd");
- if (LinkFortranMain)
- CmdArgs.push_back("--dependent-lib=Fortran_main.static_dbg.lib");
CmdArgs.push_back("--dependent-lib=FortranRuntime.static_dbg.lib");
CmdArgs.push_back("--dependent-lib=FortranDecimal.static_dbg.lib");
break;
@@ -318,8 +313,6 @@ static void processVSRuntimeLibrary(const ToolChain &TC, const ArgList &Args,
CmdArgs.push_back("-D_MT");
CmdArgs.push_back("-D_DLL");
CmdArgs.push_back("--dependent-lib=msvcrt");
- if (LinkFortranMain)
- CmdArgs.push_back("--dependent-lib=Fortran_main.dynamic.lib");
CmdArgs.push_back("--dependent-lib=FortranRuntime.dynamic.lib");
CmdArgs.push_back("--dependent-lib=FortranDecimal.dynamic.lib");
break;
@@ -328,8 +321,6 @@ static void processVSRuntimeLibrary(const ToolChain &TC, const ArgList &Args,
CmdArgs.push_back("-D_DEBUG");
CmdArgs.push_back("-D_DLL");
CmdArgs.push_back("--dependent-lib=msvcrtd");
- if (LinkFortranMain)
- CmdArgs.push_back("--dependent-lib=Fortran_main.dynamic_dbg.lib");
CmdArgs.push_back("--dependent-lib=FortranRuntime.dynamic_dbg.lib");
CmdArgs.push_back("--dependent-lib=FortranDecimal.dynamic_dbg.lib");
break;
diff --git a/clang/lib/Driver/ToolChains/Gnu.cpp b/clang/lib/Driver/ToolChains/Gnu.cpp
index f55b8bf48c13..9849c59685cc 100644
--- a/clang/lib/Driver/ToolChains/Gnu.cpp
+++ b/clang/lib/Driver/ToolChains/Gnu.cpp
@@ -1796,9 +1796,7 @@ selectRISCVMultilib(const MultilibSet &RISCVMultilibSet, StringRef Arch,
}
auto &MLConfigISAInfo = *MLConfigParseResult;
- const llvm::RISCVISAInfo::OrderedExtensionMap &MLConfigArchExts =
- MLConfigISAInfo->getExtensions();
- for (auto MLConfigArchExt : MLConfigArchExts) {
+ for (auto &MLConfigArchExt : MLConfigISAInfo->getExtensions()) {
auto ExtName = MLConfigArchExt.first;
NewMultilib.flag(Twine("-", ExtName).str());
diff --git a/clang/lib/Format/Format.cpp b/clang/lib/Format/Format.cpp
index 373dd4e60bf3..c8d8ec3afbd9 100644
--- a/clang/lib/Format/Format.cpp
+++ b/clang/lib/Format/Format.cpp
@@ -3116,6 +3116,7 @@ static void sortCppIncludes(const FormatStyle &Style,
return;
}
+ const auto OldCursor = Cursor ? *Cursor : 0;
std::string result;
for (unsigned Index : Indices) {
if (!result.empty()) {
@@ -3139,6 +3140,8 @@ static void sortCppIncludes(const FormatStyle &Style,
// the entire range of blocks. Otherwise, no replacement is generated.
if (replaceCRLF(result) == replaceCRLF(std::string(Code.substr(
IncludesBeginOffset, IncludesBlockSize)))) {
+ if (Cursor)
+ *Cursor = OldCursor;
return;
}
diff --git a/clang/lib/Format/FormatToken.h b/clang/lib/Format/FormatToken.h
index f651e6228c20..28b6488e54a4 100644
--- a/clang/lib/Format/FormatToken.h
+++ b/clang/lib/Format/FormatToken.h
@@ -1623,10 +1623,10 @@ struct AdditionalKeywords {
IdentifierInfo *kw_then;
/// Returns \c true if \p Tok is a keyword or an identifier.
- bool isWordLike(const FormatToken &Tok) const {
+ bool isWordLike(const FormatToken &Tok, bool IsVerilog = true) const {
// getIdentifierinfo returns non-null for keywords as well as identifiers.
return Tok.Tok.getIdentifierInfo() &&
- !Tok.isOneOf(kw_verilogHash, kw_verilogHashHash, kw_apostrophe);
+ (!IsVerilog || !isVerilogKeywordSymbol(Tok));
}
/// Returns \c true if \p Tok is a true JavaScript identifier, returns
@@ -1755,6 +1755,10 @@ struct AdditionalKeywords {
}
}
+ bool isVerilogKeywordSymbol(const FormatToken &Tok) const {
+ return Tok.isOneOf(kw_verilogHash, kw_verilogHashHash, kw_apostrophe);
+ }
+
bool isVerilogWordOperator(const FormatToken &Tok) const {
return Tok.isOneOf(kw_before, kw_intersect, kw_dist, kw_iff, kw_inside,
kw_with);
diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp
index cdfb4256e41d..d366ae2080bc 100644
--- a/clang/lib/Format/TokenAnnotator.cpp
+++ b/clang/lib/Format/TokenAnnotator.cpp
@@ -4780,9 +4780,14 @@ bool TokenAnnotator::spaceRequiredBefore(const AnnotatedLine &Line,
if (Left.Finalized)
return Right.hasWhitespaceBefore();
+ const bool IsVerilog = Style.isVerilog();
+ assert(!IsVerilog || !IsCpp);
+
// Never ever merge two words.
- if (Keywords.isWordLike(Right) && Keywords.isWordLike(Left))
+ if (Keywords.isWordLike(Right, IsVerilog) &&
+ Keywords.isWordLike(Left, IsVerilog)) {
return true;
+ }
// Leave a space between * and /* to avoid C4138 `comment end` found outside
// of comment.
@@ -4834,10 +4839,8 @@ bool TokenAnnotator::spaceRequiredBefore(const AnnotatedLine &Line,
Right.is(TT_TemplateOpener)) {
return true;
}
- if (Left.is(tok::identifier) && Right.is(tok::numeric_constant) &&
- Right.TokenText[0] == '.') {
- return false;
- }
+ if (Left.Tok.getIdentifierInfo() && Right.is(tok::numeric_constant))
+ return Right.TokenText[0] != '.';
} else if (Style.isProto()) {
if (Right.is(tok::period) &&
Left.isOneOf(Keywords.kw_optional, Keywords.kw_required,
@@ -5065,12 +5068,10 @@ bool TokenAnnotator::spaceRequiredBefore(const AnnotatedLine &Line,
Right.is(TT_TemplateOpener)) {
return true;
}
- } else if (Style.isVerilog()) {
+ } else if (IsVerilog) {
// An escaped identifier ends with whitespace.
- if (Style.isVerilog() && Left.is(tok::identifier) &&
- Left.TokenText[0] == '\\') {
+ if (Left.is(tok::identifier) && Left.TokenText[0] == '\\')
return true;
- }
// Add space between things in a primitive's state table unless in a
// transition like `(0?)`.
if ((Left.is(TT_VerilogTableItem) &&
@@ -5266,21 +5267,11 @@ bool TokenAnnotator::spaceRequiredBefore(const AnnotatedLine &Line,
return true;
}
if (Left.is(TT_UnaryOperator)) {
- if (Right.isNot(tok::l_paren)) {
- // The alternative operators for ~ and ! are "compl" and "not".
- // If they are used instead, we do not want to combine them with
- // the token to the right, unless that is a left paren.
- if (Left.is(tok::exclaim) && Left.TokenText == "not")
- return true;
- if (Left.is(tok::tilde) && Left.TokenText == "compl")
- return true;
- // Lambda captures allow for a lone &, so "&]" needs to be properly
- // handled.
- if (Left.is(tok::amp) && Right.is(tok::r_square))
- return Style.SpacesInSquareBrackets;
- }
- return (Style.SpaceAfterLogicalNot && Left.is(tok::exclaim)) ||
- Right.is(TT_BinaryOperator);
+ // Lambda captures allow for a lone &, so "&]" needs to be properly
+ // handled.
+ if (Left.is(tok::amp) && Right.is(tok::r_square))
+ return Style.SpacesInSquareBrackets;
+ return Style.SpaceAfterLogicalNot && Left.is(tok::exclaim);
}
// If the next token is a binary operator or a selector name, we have
diff --git a/clang/lib/Format/WhitespaceManager.cpp b/clang/lib/Format/WhitespaceManager.cpp
index 4f822807dd98..44fd807ec27e 100644
--- a/clang/lib/Format/WhitespaceManager.cpp
+++ b/clang/lib/Format/WhitespaceManager.cpp
@@ -128,11 +128,14 @@ const tooling::Replacements &WhitespaceManager::generateReplacements() {
void WhitespaceManager::calculateLineBreakInformation() {
Changes[0].PreviousEndOfTokenColumn = 0;
Change *LastOutsideTokenChange = &Changes[0];
- for (unsigned i = 1, e = Changes.size(); i != e; ++i) {
+ for (unsigned I = 1, e = Changes.size(); I != e; ++I) {
+ auto &C = Changes[I];
+ auto &P = Changes[I - 1];
+ auto &PrevTokLength = P.TokenLength;
SourceLocation OriginalWhitespaceStart =
- Changes[i].OriginalWhitespaceRange.getBegin();
+ C.OriginalWhitespaceRange.getBegin();
SourceLocation PreviousOriginalWhitespaceEnd =
- Changes[i - 1].OriginalWhitespaceRange.getEnd();
+ P.OriginalWhitespaceRange.getEnd();
unsigned OriginalWhitespaceStartOffset =
SourceMgr.getFileOffset(OriginalWhitespaceStart);
unsigned PreviousOriginalWhitespaceEndOffset =
@@ -167,31 +170,28 @@ void WhitespaceManager::calculateLineBreakInformation() {
// line of the token.
auto NewlinePos = Text.find_first_of('\n');
if (NewlinePos == StringRef::npos) {
- Changes[i - 1].TokenLength = OriginalWhitespaceStartOffset -
- PreviousOriginalWhitespaceEndOffset +
- Changes[i].PreviousLinePostfix.size() +
- Changes[i - 1].CurrentLinePrefix.size();
+ PrevTokLength = OriginalWhitespaceStartOffset -
+ PreviousOriginalWhitespaceEndOffset +
+ C.PreviousLinePostfix.size() + P.CurrentLinePrefix.size();
+ if (!P.IsInsideToken)
+ PrevTokLength = std::min(PrevTokLength, P.Tok->ColumnWidth);
} else {
- Changes[i - 1].TokenLength =
- NewlinePos + Changes[i - 1].CurrentLinePrefix.size();
+ PrevTokLength = NewlinePos + P.CurrentLinePrefix.size();
}
// If there are multiple changes in this token, sum up all the changes until
// the end of the line.
- if (Changes[i - 1].IsInsideToken && Changes[i - 1].NewlinesBefore == 0) {
- LastOutsideTokenChange->TokenLength +=
- Changes[i - 1].TokenLength + Changes[i - 1].Spaces;
- } else {
- LastOutsideTokenChange = &Changes[i - 1];
- }
+ if (P.IsInsideToken && P.NewlinesBefore == 0)
+ LastOutsideTokenChange->TokenLength += PrevTokLength + P.Spaces;
+ else
+ LastOutsideTokenChange = &P;
- Changes[i].PreviousEndOfTokenColumn =
- Changes[i - 1].StartOfTokenColumn + Changes[i - 1].TokenLength;
+ C.PreviousEndOfTokenColumn = P.StartOfTokenColumn + PrevTokLength;
- Changes[i - 1].IsTrailingComment =
- (Changes[i].NewlinesBefore > 0 || Changes[i].Tok->is(tok::eof) ||
- (Changes[i].IsInsideToken && Changes[i].Tok->is(tok::comment))) &&
- Changes[i - 1].Tok->is(tok::comment) &&
+ P.IsTrailingComment =
+ (C.NewlinesBefore > 0 || C.Tok->is(tok::eof) ||
+ (C.IsInsideToken && C.Tok->is(tok::comment))) &&
+ P.Tok->is(tok::comment) &&
// FIXME: This is a dirty hack. The problem is that
// BreakableLineCommentSection does comment reflow changes and here is
// the aligning of trailing comments. Consider the case where we reflow
diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp
index 6bdd734e8a27..c1d209466ffe 100644
--- a/clang/lib/Frontend/InitPreprocessor.cpp
+++ b/clang/lib/Frontend/InitPreprocessor.cpp
@@ -703,7 +703,7 @@ static void InitializeCPlusPlusFeatureTestMacros(const LangOptions &LangOpts,
Builder.defineMacro("__cpp_nested_namespace_definitions", "201411L");
Builder.defineMacro("__cpp_variadic_using", "201611L");
Builder.defineMacro("__cpp_aggregate_bases", "201603L");
- Builder.defineMacro("__cpp_structured_bindings", "201606L");
+ Builder.defineMacro("__cpp_structured_bindings", "202403L");
Builder.defineMacro("__cpp_nontype_template_args",
"201411L"); // (not latest)
Builder.defineMacro("__cpp_fold_expressions", "201603L");
@@ -1308,6 +1308,16 @@ static void InitializePredefinedMacros(const TargetInfo &TI,
Builder.defineMacro("__GCC_ATOMIC_TEST_AND_SET_TRUEVAL", "1");
}
+ // GCC defines these macros in both C and C++ modes despite them being needed
+ // mostly for STL implementations in C++.
+ auto [Destructive, Constructive] = TI.hardwareInterferenceSizes();
+ Builder.defineMacro("__GCC_DESTRUCTIVE_SIZE", Twine(Destructive));
+ Builder.defineMacro("__GCC_CONSTRUCTIVE_SIZE", Twine(Constructive));
+ // We need to use push_macro to allow users to redefine these macros from the
+ // command line with -D and not issue a -Wmacro-redefined warning.
+ Builder.append("#pragma push_macro(\"__GCC_DESTRUCTIVE_SIZE\")");
+ Builder.append("#pragma push_macro(\"__GCC_CONSTRUCTIVE_SIZE\")");
+
auto addLockFreeMacros = [&](const llvm::Twine &Prefix) {
// Used by libc++ and libstdc++ to implement ATOMIC_<foo>_LOCK_FREE.
#define DEFINE_LOCK_FREE_MACRO(TYPE, Type) \
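
These predefines back the C++17 interference-size constants; a sketch of how a standard library might consume them (simplified, not libstdc++'s actual code):

    #include <cstddef>

    #if defined(__GCC_DESTRUCTIVE_SIZE) && defined(__GCC_CONSTRUCTIVE_SIZE)
    inline constexpr std::size_t hardware_destructive_interference_size =
        __GCC_DESTRUCTIVE_SIZE;
    inline constexpr std::size_t hardware_constructive_interference_size =
        __GCC_CONSTRUCTIVE_SIZE;

    // Typical use: keep two hot counters on separate cache lines to avoid
    // false sharing between threads.
    struct Counters {
      alignas(hardware_destructive_interference_size) unsigned long A;
      alignas(hardware_destructive_interference_size) unsigned long B;
    };
    #endif
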
diff --git a/clang/lib/Headers/cpuid.h b/clang/lib/Headers/cpuid.h
index 0bb9912b465f..bb7692efb78f 100644
--- a/clang/lib/Headers/cpuid.h
+++ b/clang/lib/Headers/cpuid.h
@@ -10,7 +10,7 @@
#ifndef __CPUID_H
#define __CPUID_H
-#if !(__x86_64__ || __i386__)
+#if !defined(__x86_64__) && !defined(__i386__)
#error this header is for x86 only
#endif
@@ -256,7 +256,7 @@
#define bit_AVX10_256 0x00020000
#define bit_AVX10_512 0x00040000
-#if __i386__
+#ifdef __i386__
#define __cpuid(__leaf, __eax, __ebx, __ecx, __edx) \
__asm("cpuid" : "=a"(__eax), "=b" (__ebx), "=c"(__ecx), "=d"(__edx) \
: "0"(__leaf))
@@ -285,7 +285,7 @@ static __inline unsigned int __get_cpuid_max (unsigned int __leaf,
unsigned int *__sig)
{
unsigned int __eax, __ebx, __ecx, __edx;
-#if __i386__
+#ifdef __i386__
int __cpuid_supported;
__asm(" pushfl\n"
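
The switch to defined() matters under -Wundef: an identifier with no macro definition still evaluates to 0 inside #if, but the compiler diagnoses the use, whereas defined() queries definedness explicitly. On a non-x86 target, for example:

    #if !(__x86_64__ || __i386__)                   // -Wundef warns here
    #endif

    #if !defined(__x86_64__) && !defined(__i386__)  // no warning
    #endif
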
diff --git a/clang/lib/Parse/ParseDecl.cpp b/clang/lib/Parse/ParseDecl.cpp
index 53a33fa4add5..a7846e102a43 100644
--- a/clang/lib/Parse/ParseDecl.cpp
+++ b/clang/lib/Parse/ParseDecl.cpp
@@ -2998,7 +2998,7 @@ bool Parser::ParseImplicitInt(DeclSpec &DS, CXXScopeSpec *SS,
<< TokenName << TagName << getLangOpts().CPlusPlus
<< FixItHint::CreateInsertion(Tok.getLocation(), FixitTagName);
- if (Actions.LookupName(R, getCurScope())) {
+ if (Actions.LookupParsedName(R, getCurScope(), SS)) {
for (LookupResult::iterator I = R.begin(), IEnd = R.end();
I != IEnd; ++I)
Diag((*I)->getLocation(), diag::note_decl_hiding_tag_type)
@@ -7038,18 +7038,23 @@ void Parser::ParseDirectDeclarator(Declarator &D) {
void Parser::ParseDecompositionDeclarator(Declarator &D) {
assert(Tok.is(tok::l_square));
+ TentativeParsingAction PA(*this);
+ BalancedDelimiterTracker T(*this, tok::l_square);
+ T.consumeOpen();
+
+ if (isCXX11AttributeSpecifier())
+ DiagnoseAndSkipCXX11Attributes();
+
// If this doesn't look like a structured binding, maybe it's a misplaced
// array declarator.
- // FIXME: Consume the l_square first so we don't need extra lookahead for
- // this.
- if (!(NextToken().is(tok::identifier) &&
- GetLookAheadToken(2).isOneOf(tok::comma, tok::r_square)) &&
- !(NextToken().is(tok::r_square) &&
- GetLookAheadToken(2).isOneOf(tok::equal, tok::l_brace)))
+ if (!(Tok.is(tok::identifier) &&
+ NextToken().isOneOf(tok::comma, tok::r_square, tok::kw_alignas,
+ tok::l_square)) &&
+ !(Tok.is(tok::r_square) &&
+ NextToken().isOneOf(tok::equal, tok::l_brace))) {
+ PA.Revert();
return ParseMisplacedBracketDeclarator(D);
-
- BalancedDelimiterTracker T(*this, tok::l_square);
- T.consumeOpen();
+ }
SmallVector<DecompositionDeclarator::Binding, 32> Bindings;
while (Tok.isNot(tok::r_square)) {
@@ -7074,13 +7079,27 @@ void Parser::ParseDecompositionDeclarator(Declarator &D) {
}
}
+ if (isCXX11AttributeSpecifier())
+ DiagnoseAndSkipCXX11Attributes();
+
if (Tok.isNot(tok::identifier)) {
Diag(Tok, diag::err_expected) << tok::identifier;
break;
}
- Bindings.push_back({Tok.getIdentifierInfo(), Tok.getLocation()});
+ IdentifierInfo *II = Tok.getIdentifierInfo();
+ SourceLocation Loc = Tok.getLocation();
ConsumeToken();
+
+ ParsedAttributes Attrs(AttrFactory);
+ if (isCXX11AttributeSpecifier()) {
+ Diag(Tok, getLangOpts().CPlusPlus26
+ ? diag::warn_cxx23_compat_decl_attrs_on_binding
+ : diag::ext_decl_attrs_on_binding);
+ MaybeParseCXX11Attributes(Attrs);
+ }
+
+ Bindings.push_back({II, Loc, std::move(Attrs)});
}
if (Tok.isNot(tok::r_square))
@@ -7095,6 +7114,8 @@ void Parser::ParseDecompositionDeclarator(Declarator &D) {
T.consumeClose();
}
+ PA.Commit();
+
return D.setDecompositionBindings(T.getOpenLocation(), Bindings,
T.getCloseLocation());
}
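
The rewritten parse accepts an attribute on each individual binding (C++26, P0609); earlier language modes get the extension warning referenced above. For example:

    #include <utility>

    std::pair<int, int> get();

    void use() {
      // C++26 allows an attribute to appertain to a single binding.
      auto [first [[maybe_unused]], second] = get();
      (void)second;
    }

The SemaDecl.cpp change later in the patch complements this by suppressing the unused-variable diagnostic for a decomposition when one of its bindings carries [[maybe_unused]].
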
diff --git a/clang/lib/Sema/DeclSpec.cpp b/clang/lib/Sema/DeclSpec.cpp
index b79683bb32a6..5f63c857c430 100644
--- a/clang/lib/Sema/DeclSpec.cpp
+++ b/clang/lib/Sema/DeclSpec.cpp
@@ -293,7 +293,7 @@ DeclaratorChunk DeclaratorChunk::getFunction(bool hasProto,
void Declarator::setDecompositionBindings(
SourceLocation LSquareLoc,
- ArrayRef<DecompositionDeclarator::Binding> Bindings,
+ MutableArrayRef<DecompositionDeclarator::Binding> Bindings,
SourceLocation RSquareLoc) {
assert(!hasName() && "declarator given multiple names!");
@@ -317,7 +317,7 @@ void Declarator::setDecompositionBindings(
new DecompositionDeclarator::Binding[Bindings.size()];
BindingGroup.DeleteBindings = true;
}
- std::uninitialized_copy(Bindings.begin(), Bindings.end(),
+ std::uninitialized_move(Bindings.begin(), Bindings.end(),
BindingGroup.Bindings);
}
}
diff --git a/clang/lib/Sema/HLSLExternalSemaSource.cpp b/clang/lib/Sema/HLSLExternalSemaSource.cpp
index bb283c54b3d2..1a1febf7a352 100644
--- a/clang/lib/Sema/HLSLExternalSemaSource.cpp
+++ b/clang/lib/Sema/HLSLExternalSemaSource.cpp
@@ -126,15 +126,12 @@ struct BuiltinTypeDeclBuilder {
static DeclRefExpr *lookupBuiltinFunction(ASTContext &AST, Sema &S,
StringRef Name) {
+ CXXScopeSpec SS;
IdentifierInfo &II = AST.Idents.get(Name, tok::TokenKind::identifier);
DeclarationNameInfo NameInfo =
DeclarationNameInfo(DeclarationName(&II), SourceLocation());
LookupResult R(S, NameInfo, Sema::LookupOrdinaryName);
- // AllowBuiltinCreation is false but LookupDirect will create
- // the builtin when searching the global scope anyways...
- S.LookupName(R, S.getCurScope());
- // FIXME: If the builtin function was user-declared in global scope,
- // this assert *will* fail. Should this call LookupBuiltin instead?
+ S.LookupParsedName(R, S.getCurScope(), &SS, false);
assert(R.isSingleResult() &&
"Since this is a builtin it should always resolve!");
auto *VD = cast<ValueDecl>(R.getFoundDecl());
diff --git a/clang/lib/Sema/SemaAPINotes.cpp b/clang/lib/Sema/SemaAPINotes.cpp
index 4c445f28bba8..c5998aca0d72 100644
--- a/clang/lib/Sema/SemaAPINotes.cpp
+++ b/clang/lib/Sema/SemaAPINotes.cpp
@@ -594,6 +594,11 @@ static void ProcessAPINotes(Sema &S, TagDecl *D, const api_notes::TagInfo &Info,
D->addAttr(
SwiftAttrAttr::Create(S.Context, "release:" + ReleaseOp.value()));
+ if (auto Copyable = Info.isSwiftCopyable()) {
+ if (!*Copyable)
+ D->addAttr(SwiftAttrAttr::Create(S.Context, "~Copyable"));
+ }
+
if (auto Extensibility = Info.EnumExtensibility) {
using api_notes::EnumExtensibilityKind;
bool ShouldAddAttribute = (*Extensibility != EnumExtensibilityKind::None);
diff --git a/clang/lib/Sema/SemaAttr.cpp b/clang/lib/Sema/SemaAttr.cpp
index a83b1e8afadb..a5dd158808f2 100644
--- a/clang/lib/Sema/SemaAttr.cpp
+++ b/clang/lib/Sema/SemaAttr.cpp
@@ -837,7 +837,7 @@ void Sema::ActOnPragmaUnused(const Token &IdTok, Scope *curScope,
IdentifierInfo *Name = IdTok.getIdentifierInfo();
LookupResult Lookup(*this, Name, IdTok.getLocation(), LookupOrdinaryName);
- LookupName(Lookup, curScope, /*AllowBuiltinCreation=*/true);
+ LookupParsedName(Lookup, curScope, nullptr, true);
if (Lookup.empty()) {
Diag(PragmaLoc, diag::warn_pragma_unused_undeclared_var)
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index e33113ab9c4c..cf8840c63024 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -3164,13 +3164,20 @@ Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID,
const Expr *Arg = TheCall->getArg(0);
const auto *TyA = Arg->getType()->getAs<VectorType>();
- if (!TyA) {
+
+ QualType ElTy;
+ if (TyA)
+ ElTy = TyA->getElementType();
+ else if (Arg->getType()->isSizelessVectorType())
+ ElTy = Arg->getType()->getSizelessVectorEltType(Context);
+
+ if (ElTy.isNull()) {
Diag(Arg->getBeginLoc(), diag::err_builtin_invalid_arg_type)
<< 1 << /* vector ty*/ 4 << Arg->getType();
return ExprError();
}
- TheCall->setType(TyA->getElementType());
+ TheCall->setType(ElTy);
break;
}
@@ -3186,12 +3193,20 @@ Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID,
const Expr *Arg = TheCall->getArg(0);
const auto *TyA = Arg->getType()->getAs<VectorType>();
- if (!TyA || !TyA->getElementType()->isIntegerType()) {
+
+ QualType ElTy;
+ if (TyA)
+ ElTy = TyA->getElementType();
+ else if (Arg->getType()->isSizelessVectorType())
+ ElTy = Arg->getType()->getSizelessVectorEltType(Context);
+
+ if (ElTy.isNull() || !ElTy->isIntegerType()) {
Diag(Arg->getBeginLoc(), diag::err_builtin_invalid_arg_type)
<< 1 << /* vector of integers */ 6 << Arg->getType();
return ExprError();
}
- TheCall->setType(TyA->getElementType());
+
+ TheCall->setType(ElTy);
break;
}
@@ -12544,6 +12559,17 @@ CheckPrintfHandler::checkFormatExpr(const analyze_printf::PrintfSpecifier &FS,
return true;
}
+ // Diagnose attempts to use '%P' with ObjC object types, which will result in
+ // dumping raw class data (like is-a pointer), not actual data.
+ if (FS.getConversionSpecifier().getKind() == ConversionSpecifier::PArg &&
+ ExprTy->isObjCObjectPointerType()) {
+ const CharSourceRange &CSR =
+ getSpecifierRange(StartSpecifier, SpecifierLen);
+ EmitFormatDiagnostic(S.PDiag(diag::warn_format_P_with_objc_pointer),
+ E->getExprLoc(), false, CSR);
+ return true;
+ }
+
ArgType::MatchKind ImplicitMatch = ArgType::NoMatch;
ArgType::MatchKind Match = AT.matchesType(S.Context, ExprTy);
ArgType::MatchKind OrigMatch = Match;
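
With the element type now derived via getSizelessVectorEltType, the reduction builtins accept scalable vectors as well as fixed ones; a sketch (assumes an SVE-enabled AArch64 target):

    #include <arm_sve.h>
    #include <cstdint>

    // The builtin's result type is taken from the element type of the
    // sizeless vector instead of the call being rejected outright.
    int64_t sum(svint64_t v) { return __builtin_reduce_add(v); }
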
diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index 4e275dc15fbb..671752b56e01 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -832,7 +832,7 @@ static bool isTagTypeWithMissingTag(Sema &SemaRef, LookupResult &Result,
IdentifierInfo *&Name,
SourceLocation NameLoc) {
LookupResult R(SemaRef, Name, NameLoc, Sema::LookupTagName);
- SemaRef.LookupParsedName(R, S, &SS, /*ObjectType=*/QualType());
+ SemaRef.LookupParsedName(R, S, &SS);
if (TagDecl *Tag = R.getAsSingle<TagDecl>()) {
StringRef FixItTagName;
switch (Tag->getTagKind()) {
@@ -869,7 +869,7 @@ static bool isTagTypeWithMissingTag(Sema &SemaRef, LookupResult &Result,
// Replace lookup results with just the tag decl.
Result.clear(Sema::LookupTagName);
- SemaRef.LookupParsedName(Result, S, &SS, /*ObjectType=*/QualType());
+ SemaRef.LookupParsedName(Result, S, &SS);
return true;
}
@@ -896,8 +896,7 @@ Sema::NameClassification Sema::ClassifyName(Scope *S, CXXScopeSpec &SS,
}
LookupResult Result(*this, Name, NameLoc, LookupOrdinaryName);
- LookupParsedName(Result, S, &SS, /*ObjectType=*/QualType(),
- /*AllowBuiltinCreation=*/!CurMethod);
+ LookupParsedName(Result, S, &SS, !CurMethod);
if (SS.isInvalid())
return NameClassification::Error();
@@ -1975,7 +1974,7 @@ static bool ShouldDiagnoseUnusedDecl(const LangOptions &LangOpts,
// it is, by the bindings' expressions).
bool IsAllPlaceholders = true;
for (const auto *BD : DD->bindings()) {
- if (BD->isReferenced())
+ if (BD->isReferenced() || BD->hasAttr<UnusedAttr>())
return false;
IsAllPlaceholders = IsAllPlaceholders && BD->isPlaceholderVar(LangOpts);
}
diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp
index 4d5836720a65..338b0ec1e099 100644
--- a/clang/lib/Sema/SemaDeclCXX.cpp
+++ b/clang/lib/Sema/SemaDeclCXX.cpp
@@ -910,6 +910,8 @@ Sema::ActOnDecompositionDeclarator(Scope *S, Declarator &D,
auto *BD = BindingDecl::Create(Context, DC, B.NameLoc, VarName);
+ ProcessDeclAttributeList(S, BD, *B.Attrs);
+
// Find the shadowed declaration before filtering for scope.
NamedDecl *ShadowedDecl = D.getCXXScopeSpec().isEmpty()
? getShadowedDeclaration(BD, Previous)
@@ -4517,7 +4519,7 @@ Sema::BuildMemInitializer(Decl *ConstructorD,
DS.getBeginLoc(), DS.getEllipsisLoc());
} else {
LookupResult R(*this, MemberOrBase, IdLoc, LookupOrdinaryName);
- LookupParsedName(R, S, &SS, /*ObjectType=*/QualType());
+ LookupParsedName(R, S, &SS);
TypeDecl *TyD = R.getAsSingle<TypeDecl>();
if (!TyD) {
@@ -12052,11 +12054,17 @@ bool Sema::isStdInitializerList(QualType Ty, QualType *Element) {
Template = Specialization->getSpecializedTemplate();
Arguments = Specialization->getTemplateArgs().data();
- } else if (const TemplateSpecializationType *TST =
- Ty->getAs<TemplateSpecializationType>()) {
- Template = dyn_cast_or_null<ClassTemplateDecl>(
- TST->getTemplateName().getAsTemplateDecl());
- Arguments = TST->template_arguments().begin();
+ } else {
+ const TemplateSpecializationType *TST = nullptr;
+ if (auto *ICN = Ty->getAs<InjectedClassNameType>())
+ TST = ICN->getInjectedTST();
+ else
+ TST = Ty->getAs<TemplateSpecializationType>();
+ if (TST) {
+ Template = dyn_cast_or_null<ClassTemplateDecl>(
+ TST->getTemplateName().getAsTemplateDecl());
+ Arguments = TST->template_arguments().begin();
+ }
}
if (!Template)
return false;
@@ -12262,7 +12270,7 @@ Decl *Sema::ActOnUsingDirective(Scope *S, SourceLocation UsingLoc,
// Lookup namespace name.
LookupResult R(*this, NamespcName, IdentLoc, LookupNamespaceName);
- LookupParsedName(R, S, &SS, /*ObjectType=*/QualType());
+ LookupParsedName(R, S, &SS);
if (R.isAmbiguous())
return nullptr;
@@ -13721,7 +13729,7 @@ Decl *Sema::ActOnNamespaceAliasDef(Scope *S, SourceLocation NamespaceLoc,
// Lookup the namespace name.
LookupResult R(*this, Ident, IdentLoc, LookupNamespaceName);
- LookupParsedName(R, S, &SS, /*ObjectType=*/QualType());
+ LookupParsedName(R, S, &SS);
if (R.isAmbiguous())
return nullptr;
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index 0c37f43f7540..50f92c496a53 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -673,9 +673,8 @@ ExprResult Sema::DefaultLvalueConversion(Expr *E) {
// expressions of certain types in C++.
if (getLangOpts().CPlusPlus &&
(E->getType() == Context.OverloadTy ||
- // FIXME: This is a hack! We want the lvalue-to-rvalue conversion applied
- // to pointer types even if the pointee type is dependent.
- (T->isDependentType() && !T->isPointerType()) || T->isRecordType()))
+ T->isDependentType() ||
+ T->isRecordType()))
return E;
// The C standard is actually really unclear on this point, and
@@ -2752,8 +2751,8 @@ Sema::ActOnIdExpression(Scope *S, CXXScopeSpec &SS,
if (isBoundsAttrContext() && !getLangOpts().CPlusPlus && S->isClassScope()) {
// See if this is reference to a field of struct.
LookupResult R(*this, NameInfo, LookupMemberName);
- // LookupName handles a name lookup from within anonymous struct.
- if (LookupName(R, S)) {
+ // LookupParsedName handles a name lookup from within anonymous struct.
+ if (LookupParsedName(R, S, &SS)) {
if (auto *VD = dyn_cast<ValueDecl>(R.getFoundDecl())) {
QualType type = VD->getType().getNonReferenceType();
// This will eventually be translated into MemberExpr upon
@@ -2774,19 +2773,20 @@ Sema::ActOnIdExpression(Scope *S, CXXScopeSpec &SS,
// lookup to determine that it was a template name in the first place. If
// this becomes a performance hit, we can work harder to preserve those
// results until we get here but it's likely not worth it.
+ bool MemberOfUnknownSpecialization;
AssumedTemplateKind AssumedTemplate;
- if (LookupTemplateName(R, S, SS, /*ObjectType=*/QualType(),
- /*EnteringContext=*/false, TemplateKWLoc,
+ if (LookupTemplateName(R, S, SS, QualType(), /*EnteringContext=*/false,
+ MemberOfUnknownSpecialization, TemplateKWLoc,
&AssumedTemplate))
return ExprError();
- if (R.wasNotFoundInCurrentInstantiation())
+ if (MemberOfUnknownSpecialization ||
+ (R.getResultKind() == LookupResult::NotFoundInCurrentInstantiation))
return ActOnDependentIdExpression(SS, TemplateKWLoc, NameInfo,
IsAddressOfOperand, TemplateArgs);
} else {
bool IvarLookupFollowUp = II && !SS.isSet() && getCurMethodDecl();
- LookupParsedName(R, S, &SS, /*ObjectType=*/QualType(),
- /*AllowBuiltinCreation=*/!IvarLookupFollowUp);
+ LookupParsedName(R, S, &SS, !IvarLookupFollowUp);
// If the result might be in a dependent base class, this is a dependent
// id-expression.
diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp
index c1cb03e4ec7a..779a41620033 100644
--- a/clang/lib/Sema/SemaExprCXX.cpp
+++ b/clang/lib/Sema/SemaExprCXX.cpp
@@ -9157,7 +9157,7 @@ Sema::CheckMicrosoftIfExistsSymbol(Scope *S,
// Do the redeclaration lookup in the current scope.
LookupResult R(*this, TargetNameInfo, Sema::LookupAnyName,
RedeclarationKind::NotForRedeclaration);
- LookupParsedName(R, S, &SS, /*ObjectType=*/QualType());
+ LookupParsedName(R, S, &SS);
R.suppressDiagnostics();
switch (R.getResultKind()) {
diff --git a/clang/lib/Sema/SemaExprMember.cpp b/clang/lib/Sema/SemaExprMember.cpp
index 14dde1bff8fb..6e30716b9ae4 100644
--- a/clang/lib/Sema/SemaExprMember.cpp
+++ b/clang/lib/Sema/SemaExprMember.cpp
@@ -667,8 +667,8 @@ namespace {
// classes, one of its base classes.
class RecordMemberExprValidatorCCC final : public CorrectionCandidateCallback {
public:
- explicit RecordMemberExprValidatorCCC(QualType RTy)
- : Record(RTy->getAsRecordDecl()) {
+ explicit RecordMemberExprValidatorCCC(const RecordType *RTy)
+ : Record(RTy->getDecl()) {
// Don't add bare keywords to the consumer since they will always fail
// validation by virtue of not being associated with any decls.
WantTypeSpecifiers = false;
@@ -713,36 +713,58 @@ private:
}
static bool LookupMemberExprInRecord(Sema &SemaRef, LookupResult &R,
- Expr *BaseExpr, QualType RTy,
+ Expr *BaseExpr,
+ const RecordType *RTy,
SourceLocation OpLoc, bool IsArrow,
CXXScopeSpec &SS, bool HasTemplateArgs,
SourceLocation TemplateKWLoc,
TypoExpr *&TE) {
SourceRange BaseRange = BaseExpr ? BaseExpr->getSourceRange() : SourceRange();
- if (!RTy->isDependentType() &&
- !SemaRef.isThisOutsideMemberFunctionBody(RTy) &&
- SemaRef.RequireCompleteType(
- OpLoc, RTy, diag::err_typecheck_incomplete_tag, BaseRange))
+ RecordDecl *RDecl = RTy->getDecl();
+ if (!SemaRef.isThisOutsideMemberFunctionBody(QualType(RTy, 0)) &&
+ SemaRef.RequireCompleteType(OpLoc, QualType(RTy, 0),
+ diag::err_typecheck_incomplete_tag,
+ BaseRange))
return true;
- // LookupTemplateName/LookupParsedName don't expect these both to exist
- // simultaneously.
- QualType ObjectType = SS.isSet() ? QualType() : RTy;
- if (HasTemplateArgs || TemplateKWLoc.isValid())
- return SemaRef.LookupTemplateName(R,
- /*S=*/nullptr, SS, ObjectType,
- /*EnteringContext=*/false, TemplateKWLoc);
+ if (HasTemplateArgs || TemplateKWLoc.isValid()) {
+ // LookupTemplateName doesn't expect these both to exist simultaneously.
+ QualType ObjectType = SS.isSet() ? QualType() : QualType(RTy, 0);
- SemaRef.LookupParsedName(R, /*S=*/nullptr, &SS, ObjectType);
+ bool MOUS;
+ return SemaRef.LookupTemplateName(R, nullptr, SS, ObjectType, false, MOUS,
+ TemplateKWLoc);
+ }
+
+ DeclContext *DC = RDecl;
+ if (SS.isSet()) {
+ // If the member name was a qualified-id, look into the
+ // nested-name-specifier.
+ DC = SemaRef.computeDeclContext(SS, false);
+
+ if (SemaRef.RequireCompleteDeclContext(SS, DC)) {
+ SemaRef.Diag(SS.getRange().getEnd(), diag::err_typecheck_incomplete_tag)
+ << SS.getRange() << DC;
+ return true;
+ }
+
+ assert(DC && "Cannot handle non-computable dependent contexts in lookup");
+
+ if (!isa<TypeDecl>(DC)) {
+ SemaRef.Diag(R.getNameLoc(), diag::err_qualified_member_nonclass)
+ << DC << SS.getRange();
+ return true;
+ }
+ }
- if (!R.empty() || R.wasNotFoundInCurrentInstantiation())
+ // The record definition is complete, now look up the member.
+ SemaRef.LookupQualifiedName(R, DC, SS);
+
+ if (!R.empty())
return false;
DeclarationName Typo = R.getLookupName();
SourceLocation TypoLoc = R.getNameLoc();
- // Recompute the lookup context.
- DeclContext *DC = SS.isSet() ? SemaRef.computeDeclContext(SS)
- : SemaRef.computeDeclContext(RTy);
struct QueryState {
Sema &SemaRef;
@@ -766,8 +788,7 @@ static bool LookupMemberExprInRecord(Sema &SemaRef, LookupResult &R,
<< Typo << DC << DroppedSpecifier
<< SS.getRange());
} else {
- SemaRef.Diag(TypoLoc, diag::err_no_member)
- << Typo << DC << (SS.isSet() ? SS.getRange() : BaseRange);
+ SemaRef.Diag(TypoLoc, diag::err_no_member) << Typo << DC << BaseRange;
}
},
[=](Sema &SemaRef, TypoExpr *TE, TypoCorrection TC) mutable {
@@ -793,25 +814,34 @@ static ExprResult LookupMemberExpr(Sema &S, LookupResult &R,
Decl *ObjCImpDecl, bool HasTemplateArgs,
SourceLocation TemplateKWLoc);
-ExprResult Sema::BuildMemberReferenceExpr(
- Expr *Base, QualType BaseType, SourceLocation OpLoc, bool IsArrow,
- CXXScopeSpec &SS, SourceLocation TemplateKWLoc,
- NamedDecl *FirstQualifierInScope, const DeclarationNameInfo &NameInfo,
- const TemplateArgumentListInfo *TemplateArgs, const Scope *S,
- ActOnMemberAccessExtraArgs *ExtraArgs) {
- LookupResult R(*this, NameInfo, LookupMemberName);
+ExprResult
+Sema::BuildMemberReferenceExpr(Expr *Base, QualType BaseType,
+ SourceLocation OpLoc, bool IsArrow,
+ CXXScopeSpec &SS,
+ SourceLocation TemplateKWLoc,
+ NamedDecl *FirstQualifierInScope,
+ const DeclarationNameInfo &NameInfo,
+ const TemplateArgumentListInfo *TemplateArgs,
+ const Scope *S,
+ ActOnMemberAccessExtraArgs *ExtraArgs) {
+ if (BaseType->isDependentType() ||
+ (SS.isSet() && isDependentScopeSpecifier(SS)) ||
+ NameInfo.getName().isDependentName())
+ return ActOnDependentMemberExpr(Base, BaseType,
+ IsArrow, OpLoc,
+ SS, TemplateKWLoc, FirstQualifierInScope,
+ NameInfo, TemplateArgs);
- if (SS.isInvalid())
- return ExprError();
+ LookupResult R(*this, NameInfo, LookupMemberName);
// Implicit member accesses.
if (!Base) {
TypoExpr *TE = nullptr;
QualType RecordTy = BaseType;
if (IsArrow) RecordTy = RecordTy->castAs<PointerType>()->getPointeeType();
- if (LookupMemberExprInRecord(*this, R, nullptr, RecordTy, OpLoc, IsArrow,
- SS, TemplateArgs != nullptr, TemplateKWLoc,
- TE))
+ if (LookupMemberExprInRecord(
+ *this, R, nullptr, RecordTy->castAs<RecordType>(), OpLoc, IsArrow,
+ SS, TemplateArgs != nullptr, TemplateKWLoc, TE))
return ExprError();
if (TE)
return TE;
@@ -1003,12 +1033,6 @@ Sema::BuildMemberReferenceExpr(Expr *BaseExpr, QualType BaseExprType,
const Scope *S,
bool SuppressQualifierCheck,
ActOnMemberAccessExtraArgs *ExtraArgs) {
- assert(!SS.isInvalid() && "nested-name-specifier cannot be invalid");
- if (R.wasNotFoundInCurrentInstantiation())
- return ActOnDependentMemberExpr(BaseExpr, BaseExprType, IsArrow, OpLoc, SS,
- TemplateKWLoc, FirstQualifierInScope,
- R.getLookupNameInfo(), TemplateArgs);
-
QualType BaseType = BaseExprType;
if (IsArrow) {
assert(BaseType->isPointerType());
@@ -1016,11 +1040,6 @@ Sema::BuildMemberReferenceExpr(Expr *BaseExpr, QualType BaseExprType,
}
R.setBaseObjectType(BaseType);
- assert((SS.isEmpty()
- ? !BaseType->isDependentType() || computeDeclContext(BaseType)
- : !isDependentScopeSpecifier(SS) || computeDeclContext(SS)) &&
- "dependent lookup context that isn't the current instantiation?");
-
// C++1z [expr.ref]p2:
// For the first option (dot) the first expression shall be a glvalue [...]
if (!IsArrow && BaseExpr && BaseExpr->isPRValue()) {
@@ -1050,11 +1069,13 @@ Sema::BuildMemberReferenceExpr(Expr *BaseExpr, QualType BaseExprType,
if (R.empty()) {
// Rederive where we looked up.
- DeclContext *DC =
- (SS.isSet() ? computeDeclContext(SS) : computeDeclContext(BaseType));
+ DeclContext *DC = (SS.isSet()
+ ? computeDeclContext(SS, false)
+ : BaseType->castAs<RecordType>()->getDecl());
+
if (ExtraArgs) {
ExprResult RetryExpr;
- if (!IsArrow && BaseExpr && !BaseExpr->isTypeDependent()) {
+ if (!IsArrow && BaseExpr) {
SFINAETrap Trap(*this, true);
ParsedType ObjectType;
bool MayBePseudoDestructor = false;
@@ -1077,12 +1098,9 @@ Sema::BuildMemberReferenceExpr(Expr *BaseExpr, QualType BaseExprType,
}
}
- assert(DC);
Diag(R.getNameLoc(), diag::err_no_member)
- << MemberName << DC
- << (SS.isSet()
- ? SS.getRange()
- : (BaseExpr ? BaseExpr->getSourceRange() : SourceRange()));
+ << MemberName << DC
+ << (BaseExpr ? BaseExpr->getSourceRange() : SourceRange());
return ExprError();
}
@@ -1312,6 +1330,7 @@ static ExprResult LookupMemberExpr(Sema &S, LookupResult &R,
return ExprError();
QualType BaseType = BaseExpr.get()->getType();
+ assert(!BaseType->isDependentType());
DeclarationName MemberName = R.getLookupName();
SourceLocation MemberLoc = R.getNameLoc();
@@ -1323,31 +1342,29 @@ static ExprResult LookupMemberExpr(Sema &S, LookupResult &R,
if (IsArrow) {
if (const PointerType *Ptr = BaseType->getAs<PointerType>())
BaseType = Ptr->getPointeeType();
- else if (!BaseType->isDependentType()) {
- if (const ObjCObjectPointerType *Ptr =
- BaseType->getAs<ObjCObjectPointerType>())
- BaseType = Ptr->getPointeeType();
- else if (BaseType->isRecordType()) {
- // Recover from arrow accesses to records, e.g.:
- // struct MyRecord foo;
- // foo->bar
- // This is actually well-formed in C++ if MyRecord has an
- // overloaded operator->, but that should have been dealt with
- // by now--or a diagnostic message already issued if a problem
- // was encountered while looking for the overloaded operator->.
- if (!S.getLangOpts().CPlusPlus) {
- S.Diag(OpLoc, diag::err_typecheck_member_reference_suggestion)
- << BaseType << int(IsArrow) << BaseExpr.get()->getSourceRange()
- << FixItHint::CreateReplacement(OpLoc, ".");
- }
- IsArrow = false;
- } else if (BaseType->isFunctionType()) {
- goto fail;
- } else {
- S.Diag(MemberLoc, diag::err_typecheck_member_reference_arrow)
- << BaseType << BaseExpr.get()->getSourceRange();
- return ExprError();
+ else if (const ObjCObjectPointerType *Ptr
+ = BaseType->getAs<ObjCObjectPointerType>())
+ BaseType = Ptr->getPointeeType();
+ else if (BaseType->isRecordType()) {
+ // Recover from arrow accesses to records, e.g.:
+ // struct MyRecord foo;
+ // foo->bar
+ // This is actually well-formed in C++ if MyRecord has an
+ // overloaded operator->, but that should have been dealt with
+ // by now--or a diagnostic message already issued if a problem
+ // was encountered while looking for the overloaded operator->.
+ if (!S.getLangOpts().CPlusPlus) {
+ S.Diag(OpLoc, diag::err_typecheck_member_reference_suggestion)
+ << BaseType << int(IsArrow) << BaseExpr.get()->getSourceRange()
+ << FixItHint::CreateReplacement(OpLoc, ".");
}
+ IsArrow = false;
+ } else if (BaseType->isFunctionType()) {
+ goto fail;
+ } else {
+ S.Diag(MemberLoc, diag::err_typecheck_member_reference_arrow)
+ << BaseType << BaseExpr.get()->getSourceRange();
+ return ExprError();
}
}
@@ -1367,10 +1384,10 @@ static ExprResult LookupMemberExpr(Sema &S, LookupResult &R,
}
// Handle field access to simple records.
- if (BaseType->getAsRecordDecl() || BaseType->isDependentType()) {
+ if (const RecordType *RTy = BaseType->getAs<RecordType>()) {
TypoExpr *TE = nullptr;
- if (LookupMemberExprInRecord(S, R, BaseExpr.get(), BaseType, OpLoc, IsArrow,
- SS, HasTemplateArgs, TemplateKWLoc, TE))
+ if (LookupMemberExprInRecord(S, R, BaseExpr.get(), RTy, OpLoc, IsArrow, SS,
+ HasTemplateArgs, TemplateKWLoc, TE))
return ExprError();
// Returning valid-but-null is how we indicate to the caller that
@@ -1793,6 +1810,7 @@ ExprResult Sema::ActOnMemberAccessExpr(Scope *S, Expr *Base,
DecomposeUnqualifiedId(Id, TemplateArgsBuffer,
NameInfo, TemplateArgs);
+ DeclarationName Name = NameInfo.getName();
bool IsArrow = (OpKind == tok::arrow);
if (getLangOpts().HLSL && IsArrow)
@@ -1806,6 +1824,13 @@ ExprResult Sema::ActOnMemberAccessExpr(Scope *S, Expr *Base,
if (Result.isInvalid()) return ExprError();
Base = Result.get();
+ if (Base->getType()->isDependentType() || Name.isDependentName() ||
+ isDependentScopeSpecifier(SS)) {
+ return ActOnDependentMemberExpr(Base, Base->getType(), IsArrow, OpLoc, SS,
+ TemplateKWLoc, FirstQualifierInScope,
+ NameInfo, TemplateArgs);
+ }
+
ActOnMemberAccessExtraArgs ExtraArgs = {S, Id, ObjCImpDecl};
ExprResult Res = BuildMemberReferenceExpr(
Base, Base->getType(), OpLoc, IsArrow, SS, TemplateKWLoc,
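The net effect of the BuildMemberReferenceExpr changes is that dependent
member accesses are diverted to ActOnDependentMemberExpr before any lookup,
as in this sketch (it mirrors the HLSL test update further below):

    template <typename K, typename V>
    struct Pair {
      K First;
      K getFirst() {
        // Pair<K, V> is dependent, so this now builds a
        // CXXDependentScopeMemberExpr instead of a MemberExpr.
        return this->First;
      }
    };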
diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp
index 003a157990d3..7d9eaf672046 100644
--- a/clang/lib/Sema/SemaInit.cpp
+++ b/clang/lib/Sema/SemaInit.cpp
@@ -8340,8 +8340,17 @@ void Sema::checkInitializerLifetime(const InitializedEntity &Entity,
<< Entity.getType()->isReferenceType() << CLE->getInitializer() << 2
<< DiagRange;
} else {
- Diag(DiagLoc, diag::warn_ret_local_temp_addr_ref)
- << Entity.getType()->isReferenceType() << DiagRange;
+ // P2748R5: Disallow Binding a Returned Glvalue to a Temporary.
+ // [stmt.return]/p6: In a function whose return type is a reference,
+ // other than an invented function for std::is_convertible ([meta.rel]),
+ // a return statement that binds the returned reference to a temporary
+ // expression ([class.temporary]) is ill-formed.
+ if (getLangOpts().CPlusPlus26 && Entity.getType()->isReferenceType())
+ Diag(DiagLoc, diag::err_ret_local_temp_ref)
+ << Entity.getType()->isReferenceType() << DiagRange;
+ else
+ Diag(DiagLoc, diag::warn_ret_local_temp_addr_ref)
+ << Entity.getType()->isReferenceType() << DiagRange;
}
break;
}
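A sketch of what the new err_ret_local_temp_ref path rejects in C++26
(diagnostic placement illustrative):

    const int &dangling() {
      return 42;  // C++26 (P2748R5): error; earlier modes keep the
                  // warn_ret_local_temp_addr_ref warning.
    }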
@@ -10790,8 +10799,6 @@ QualType Sema::DeduceTemplateSpecializationFromInitializer(
// FIXME: Perform "exact type" matching first, per CWG discussion?
// Or implement this via an implied 'T(T) -> T' deduction guide?
- // FIXME: Do we need/want a std::initializer_list<T> special case?
-
// Look up deduction guides, including those synthesized from constructors.
//
// C++1z [over.match.class.deduct]p1:
diff --git a/clang/lib/Sema/SemaLookup.cpp b/clang/lib/Sema/SemaLookup.cpp
index 2f6ad49fc08b..55af414df39f 100644
--- a/clang/lib/Sema/SemaLookup.cpp
+++ b/clang/lib/Sema/SemaLookup.cpp
@@ -1282,31 +1282,6 @@ bool Sema::CppLookupName(LookupResult &R, Scope *S) {
if (DeclContext *DC = PreS->getEntity())
DeclareImplicitMemberFunctionsWithName(*this, Name, R.getNameLoc(), DC);
}
- // C++23 [temp.dep.general]p2:
- // The component name of an unqualified-id is dependent if
- // - it is a conversion-function-id whose conversion-type-id
- // is dependent, or
- // - it is operator= and the current class is a templated entity, or
- // - the unqualified-id is the postfix-expression in a dependent call.
- if (Name.getNameKind() == DeclarationName::CXXConversionFunctionName &&
- Name.getCXXNameType()->isDependentType()) {
- R.setNotFoundInCurrentInstantiation();
- return false;
- }
-
- // If this is the name of an implicitly-declared special member function,
- // go through the scope stack to implicitly declare
- if (isImplicitlyDeclaredMemberFunctionName(Name)) {
- for (Scope *PreS = S; PreS; PreS = PreS->getParent())
- if (DeclContext *DC = PreS->getEntity()) {
- if (DC->isDependentContext() && isa<CXXRecordDecl>(DC) &&
- Name.getCXXOverloadedOperator() == OO_Equal) {
- R.setNotFoundInCurrentInstantiation();
- return false;
- }
- DeclareImplicitMemberFunctionsWithName(*this, Name, R.getNameLoc(), DC);
- }
- }
// Implicitly declare member functions with the name we're looking for, if in
// fact we are in a scope where it matters.
@@ -2471,33 +2446,10 @@ bool Sema::LookupQualifiedName(LookupResult &R, DeclContext *LookupCtx,
}
} QL(LookupCtx);
- CXXRecordDecl *LookupRec = dyn_cast<CXXRecordDecl>(LookupCtx);
- // FIXME: Per [temp.dep.general]p2, an unqualified name is also dependent
- // if it's a dependent conversion-function-id or operator= where the current
- // class is a templated entity. This should be handled in LookupName.
- if (!InUnqualifiedLookup && !R.isForRedeclaration()) {
- // C++23 [temp.dep.type]p5:
- // A qualified name is dependent if
- // - it is a conversion-function-id whose conversion-type-id
- // is dependent, or
- // - [...]
- // - its lookup context is the current instantiation and it
- // is operator=, or
- // - [...]
- if (DeclarationName Name = R.getLookupName();
- (Name.getNameKind() == DeclarationName::CXXConversionFunctionName &&
- Name.getCXXNameType()->isDependentType()) ||
- (Name.getCXXOverloadedOperator() == OO_Equal && LookupRec &&
- LookupRec->isDependentContext())) {
- R.setNotFoundInCurrentInstantiation();
- return false;
- }
- }
-
if (LookupDirect(*this, R, LookupCtx)) {
R.resolveKind();
- if (LookupRec)
- R.setNamingClass(LookupRec);
+ if (isa<CXXRecordDecl>(LookupCtx))
+ R.setNamingClass(cast<CXXRecordDecl>(LookupCtx));
return true;
}
@@ -2519,6 +2471,7 @@ bool Sema::LookupQualifiedName(LookupResult &R, DeclContext *LookupCtx,
// If this isn't a C++ class, we aren't allowed to look into base
// classes, we're done.
+ CXXRecordDecl *LookupRec = dyn_cast<CXXRecordDecl>(LookupCtx);
if (!LookupRec || !LookupRec->getDefinition())
return false;
@@ -2765,54 +2718,38 @@ bool Sema::LookupQualifiedName(LookupResult &R, DeclContext *LookupCtx,
///
/// @returns True if any decls were found (but possibly ambiguous)
bool Sema::LookupParsedName(LookupResult &R, Scope *S, CXXScopeSpec *SS,
- QualType ObjectType, bool AllowBuiltinCreation,
- bool EnteringContext) {
- // When the scope specifier is invalid, don't even look for anything.
- if (SS && SS->isInvalid())
+ bool AllowBuiltinCreation, bool EnteringContext) {
+ if (SS && SS->isInvalid()) {
+ // When the scope specifier is invalid, don't even look for
+ // anything.
return false;
+ }
- // Determine where to perform name lookup
- DeclContext *DC = nullptr;
- bool IsDependent = false;
- if (!ObjectType.isNull()) {
- // This nested-name-specifier occurs in a member access expression, e.g.,
- // x->B::f, and we are looking into the type of the object.
- assert((!SS || SS->isEmpty()) &&
- "ObjectType and scope specifier cannot coexist");
- DC = computeDeclContext(ObjectType);
- IsDependent = !DC && ObjectType->isDependentType();
- assert(((!DC && ObjectType->isDependentType()) ||
- !ObjectType->isIncompleteType() || !ObjectType->getAs<TagType>() ||
- ObjectType->castAs<TagType>()->isBeingDefined()) &&
- "Caller should have completed object type");
- } else if (SS && SS->isNotEmpty()) {
- if (NestedNameSpecifier *NNS = SS->getScopeRep();
- NNS->getKind() == NestedNameSpecifier::Super)
+ if (SS && SS->isSet()) {
+ NestedNameSpecifier *NNS = SS->getScopeRep();
+ if (NNS->getKind() == NestedNameSpecifier::Super)
return LookupInSuper(R, NNS->getAsRecordDecl());
- // This nested-name-specifier occurs after another nested-name-specifier,
- // so long into the context associated with the prior nested-name-specifier.
- if ((DC = computeDeclContext(*SS, EnteringContext))) {
- // The declaration context must be complete.
+
+ if (DeclContext *DC = computeDeclContext(*SS, EnteringContext)) {
+ // We have resolved the scope specifier to a particular declaration
+      // context, and will perform name lookup in that context.
if (!DC->isDependentContext() && RequireCompleteDeclContext(*SS, DC))
return false;
+
R.setContextRange(SS->getRange());
+ return LookupQualifiedName(R, DC);
}
- IsDependent = !DC && isDependentScopeSpecifier(*SS);
- } else {
- // Perform unqualified name lookup starting in the given scope.
- return LookupName(R, S, AllowBuiltinCreation);
- }
- // If we were able to compute a declaration context, perform qualified name
- // lookup in that context.
- if (DC)
- return LookupQualifiedName(R, DC);
- else if (IsDependent)
// We could not resolve the scope specified to a specific declaration
// context, which means that SS refers to an unknown specialization.
// Name lookup can't find anything in this case.
R.setNotFoundInCurrentInstantiation();
- return false;
+ R.setContextRange(SS->getRange());
+ return false;
+ }
+
+ // Perform unqualified name lookup starting in the given scope.
+ return LookupName(R, S, AllowBuiltinCreation);
}
/// Perform qualified name lookup into all base classes of the given
@@ -5081,9 +5018,8 @@ static void LookupPotentialTypoResult(Sema &SemaRef,
return;
}
- SemaRef.LookupParsedName(Res, S, SS,
- /*ObjectType=*/QualType(),
- /*AllowBuiltinCreation=*/false, EnteringContext);
+ SemaRef.LookupParsedName(Res, S, SS, /*AllowBuiltinCreation=*/false,
+ EnteringContext);
// Fake ivar lookup; this should really be part of
// LookupParsedName.
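In source terms, the rewritten LookupParsedName records
not-found-in-current-instantiation whenever the scope specifier names an
unknown specialization, e.g. (sketch):

    template <typename T>
    struct S {
      void f() {
        // 'T::' has no computable declaration context here, so the result
        // is marked NotFoundInCurrentInstantiation and stays dependent.
        T::make();
      }
    };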
diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp
index cf5447f223d4..cee8da495c54 100644
--- a/clang/lib/Sema/SemaOpenMP.cpp
+++ b/clang/lib/Sema/SemaOpenMP.cpp
@@ -3061,9 +3061,7 @@ ExprResult SemaOpenMP::ActOnOpenMPIdExpression(Scope *CurScope,
OpenMPDirectiveKind Kind) {
ASTContext &Context = getASTContext();
LookupResult Lookup(SemaRef, Id, Sema::LookupOrdinaryName);
- SemaRef.LookupParsedName(Lookup, CurScope, &ScopeSpec,
- /*ObjectType=*/QualType(),
- /*AllowBuiltinCreation=*/true);
+ SemaRef.LookupParsedName(Lookup, CurScope, &ScopeSpec, true);
if (Lookup.isAmbiguous())
return ExprError();
@@ -7409,8 +7407,7 @@ void SemaOpenMP::ActOnStartOfFunctionDefinitionInOpenMPDeclareVariantScope(
const IdentifierInfo *BaseII = D.getIdentifier();
LookupResult Lookup(SemaRef, DeclarationName(BaseII), D.getIdentifierLoc(),
Sema::LookupOrdinaryName);
- SemaRef.LookupParsedName(Lookup, S, &D.getCXXScopeSpec(),
- /*ObjectType=*/QualType());
+ SemaRef.LookupParsedName(Lookup, S, &D.getCXXScopeSpec());
TypeSourceInfo *TInfo = SemaRef.GetTypeForDeclarator(D);
QualType FType = TInfo->getType();
@@ -19314,8 +19311,7 @@ buildDeclareReductionRef(Sema &SemaRef, SourceLocation Loc, SourceRange Range,
if (S) {
LookupResult Lookup(SemaRef, ReductionId, Sema::LookupOMPReductionName);
Lookup.suppressDiagnostics();
- while (S && SemaRef.LookupParsedName(Lookup, S, &ReductionIdScopeSpec,
- /*ObjectType=*/QualType())) {
+ while (S && SemaRef.LookupParsedName(Lookup, S, &ReductionIdScopeSpec)) {
NamedDecl *D = Lookup.getRepresentativeDecl();
do {
S = S->getParent();
@@ -22184,8 +22180,7 @@ static ExprResult buildUserDefinedMapperRef(Sema &SemaRef, Scope *S,
LookupResult Lookup(SemaRef, MapperId, Sema::LookupOMPMapperName);
Lookup.suppressDiagnostics();
if (S) {
- while (S && SemaRef.LookupParsedName(Lookup, S, &MapperIdScopeSpec,
- /*ObjectType=*/QualType())) {
+ while (S && SemaRef.LookupParsedName(Lookup, S, &MapperIdScopeSpec)) {
NamedDecl *D = Lookup.getRepresentativeDecl();
while (S && !S->isDeclScope(D))
S = S->getParent();
@@ -23502,9 +23497,7 @@ void SemaOpenMP::DiagnoseUnterminatedOpenMPDeclareTarget() {
NamedDecl *SemaOpenMP::lookupOpenMPDeclareTargetName(
Scope *CurScope, CXXScopeSpec &ScopeSpec, const DeclarationNameInfo &Id) {
LookupResult Lookup(SemaRef, Id, Sema::LookupOrdinaryName);
- SemaRef.LookupParsedName(Lookup, CurScope, &ScopeSpec,
- /*ObjectType=*/QualType(),
- /*AllowBuiltinCreation=*/true);
+ SemaRef.LookupParsedName(Lookup, CurScope, &ScopeSpec, true);
if (Lookup.isAmbiguous())
return nullptr;
diff --git a/clang/lib/Sema/SemaStmtAttr.cpp b/clang/lib/Sema/SemaStmtAttr.cpp
index 9d44c22c8ddc..1c84830b6ddd 100644
--- a/clang/lib/Sema/SemaStmtAttr.cpp
+++ b/clang/lib/Sema/SemaStmtAttr.cpp
@@ -109,16 +109,14 @@ static Attr *handleLoopHintAttr(Sema &S, Stmt *St, const ParsedAttr &A,
SetHints(LoopHintAttr::Unroll, LoopHintAttr::Disable);
} else if (PragmaName == "unroll") {
// #pragma unroll N
- if (ValueExpr && !ValueExpr->isValueDependent()) {
- llvm::APSInt ValueAPS;
- ExprResult R = S.VerifyIntegerConstantExpression(ValueExpr, &ValueAPS);
- assert(!R.isInvalid() && "unroll count value must be a valid value, it's "
- "should be checked in Sema::CheckLoopHintExpr");
- (void)R;
- // The values of 0 and 1 block any unrolling of the loop.
- if (ValueAPS.isZero() || ValueAPS.isOne())
- SetHints(LoopHintAttr::UnrollCount, LoopHintAttr::Disable);
- else
+ if (ValueExpr) {
+ if (!ValueExpr->isValueDependent()) {
+ auto Value = ValueExpr->EvaluateKnownConstInt(S.getASTContext());
+ if (Value.isZero() || Value.isOne())
+ SetHints(LoopHintAttr::Unroll, LoopHintAttr::Disable);
+ else
+ SetHints(LoopHintAttr::UnrollCount, LoopHintAttr::Numeric);
+ } else
SetHints(LoopHintAttr::UnrollCount, LoopHintAttr::Numeric);
} else
SetHints(LoopHintAttr::Unroll, LoopHintAttr::Enable);
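In pragma terms, the rewritten handling maps constant unroll counts as in
this sketch:

    void f(int *a, int n) {
    #pragma unroll 1   // 0 or 1 -> LoopHintAttr Unroll/Disable
      for (int i = 0; i < n; ++i) a[i] = i;
    #pragma unroll 8   // other constants -> UnrollCount/Numeric
      for (int i = 0; i < n; ++i) a[i] += 1;
    }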
diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp
index 72bf6370ca82..bbcb7c33a985 100644
--- a/clang/lib/Sema/SemaTemplate.cpp
+++ b/clang/lib/Sema/SemaTemplate.cpp
@@ -210,11 +210,10 @@ TemplateNameKind Sema::isTemplateName(Scope *S,
AssumedTemplateKind AssumedTemplate;
LookupResult R(*this, TName, Name.getBeginLoc(), LookupOrdinaryName);
if (LookupTemplateName(R, S, SS, ObjectType, EnteringContext,
- /*RequiredTemplate=*/SourceLocation(),
+ MemberOfUnknownSpecialization, SourceLocation(),
&AssumedTemplate,
/*AllowTypoCorrection=*/!Disambiguation))
return TNK_Non_template;
- MemberOfUnknownSpecialization = R.wasNotFoundInCurrentInstantiation();
if (AssumedTemplate != AssumedTemplateKind::None) {
TemplateResult = TemplateTy::make(Context.getAssumedTemplateName(TName));
@@ -321,12 +320,15 @@ TemplateNameKind Sema::isTemplateName(Scope *S,
bool Sema::isDeductionGuideName(Scope *S, const IdentifierInfo &Name,
SourceLocation NameLoc, CXXScopeSpec &SS,
ParsedTemplateTy *Template /*=nullptr*/) {
+ bool MemberOfUnknownSpecialization = false;
+
// We could use redeclaration lookup here, but we don't need to: the
// syntactic form of a deduction guide is enough to identify it even
// if we can't look up the template name at all.
LookupResult R(*this, DeclarationName(&Name), NameLoc, LookupOrdinaryName);
if (LookupTemplateName(R, S, SS, /*ObjectType*/ QualType(),
- /*EnteringContext*/ false))
+ /*EnteringContext*/ false,
+ MemberOfUnknownSpecialization))
return false;
if (R.empty()) return false;
@@ -372,8 +374,11 @@ bool Sema::DiagnoseUnknownTemplateName(const IdentifierInfo &II,
return true;
}
-bool Sema::LookupTemplateName(LookupResult &Found, Scope *S, CXXScopeSpec &SS,
- QualType ObjectType, bool EnteringContext,
+bool Sema::LookupTemplateName(LookupResult &Found,
+ Scope *S, CXXScopeSpec &SS,
+ QualType ObjectType,
+ bool EnteringContext,
+ bool &MemberOfUnknownSpecialization,
RequiredTemplateKind RequiredTemplate,
AssumedTemplateKind *ATK,
bool AllowTypoCorrection) {
@@ -386,6 +391,7 @@ bool Sema::LookupTemplateName(LookupResult &Found, Scope *S, CXXScopeSpec &SS,
Found.setTemplateNameLookup(true);
// Determine where to perform name lookup
+ MemberOfUnknownSpecialization = false;
DeclContext *LookupCtx = nullptr;
bool IsDependent = false;
if (!ObjectType.isNull()) {
@@ -542,7 +548,7 @@ bool Sema::LookupTemplateName(LookupResult &Found, Scope *S, CXXScopeSpec &SS,
FilterAcceptableTemplateNames(Found, AllowFunctionTemplatesInLookup);
if (Found.empty()) {
if (IsDependent) {
- Found.setNotFoundInCurrentInstantiation();
+ MemberOfUnknownSpecialization = true;
return false;
}
@@ -5589,9 +5595,11 @@ Sema::BuildQualifiedTemplateIdExpr(CXXScopeSpec &SS,
RequireCompleteDeclContext(SS, DC))
return BuildDependentDeclRefExpr(SS, TemplateKWLoc, NameInfo, TemplateArgs);
+ bool MemberOfUnknownSpecialization;
LookupResult R(*this, NameInfo, LookupOrdinaryName);
if (LookupTemplateName(R, (Scope *)nullptr, SS, QualType(),
- /*Entering*/ false, TemplateKWLoc))
+ /*Entering*/false, MemberOfUnknownSpecialization,
+ TemplateKWLoc))
return ExprError();
if (R.isAmbiguous())
@@ -5712,13 +5720,14 @@ TemplateNameKind Sema::ActOnTemplateName(Scope *S,
DeclarationNameInfo DNI = GetNameFromUnqualifiedId(Name);
LookupResult R(*this, DNI.getName(), Name.getBeginLoc(),
LookupOrdinaryName);
+ bool MOUS;
// Tell LookupTemplateName that we require a template so that it diagnoses
// cases where it finds a non-template.
RequiredTemplateKind RTK = TemplateKWLoc.isValid()
? RequiredTemplateKind(TemplateKWLoc)
: TemplateNameIsRequired;
- if (!LookupTemplateName(R, S, SS, ObjectType.get(), EnteringContext, RTK,
- /*ATK=*/nullptr, /*AllowTypoCorrection=*/false) &&
+ if (!LookupTemplateName(R, S, SS, ObjectType.get(), EnteringContext, MOUS,
+ RTK, nullptr, /*AllowTypoCorrection=*/false) &&
!R.isAmbiguous()) {
if (LookupCtx)
Diag(Name.getBeginLoc(), diag::err_no_member)
@@ -5807,7 +5816,7 @@ bool Sema::CheckTemplateTypeArgument(
if (auto *II = NameInfo.getName().getAsIdentifierInfo()) {
LookupResult Result(*this, NameInfo, LookupOrdinaryName);
- LookupParsedName(Result, CurScope, &SS, /*ObjectType=*/QualType());
+ LookupParsedName(Result, CurScope, &SS);
if (Result.getAsSingle<TypeDecl>() ||
Result.getResultKind() ==
@@ -11170,8 +11179,7 @@ DeclResult Sema::ActOnExplicitInstantiation(Scope *S,
: TSK_ExplicitInstantiationDeclaration;
LookupResult Previous(*this, NameInfo, LookupOrdinaryName);
- LookupParsedName(Previous, S, &D.getCXXScopeSpec(),
- /*ObjectType=*/QualType());
+ LookupParsedName(Previous, S, &D.getCXXScopeSpec());
if (!R->isFunctionType()) {
// C++ [temp.explicit]p1:
diff --git a/clang/lib/Sema/SemaTemplateInstantiate.cpp b/clang/lib/Sema/SemaTemplateInstantiate.cpp
index 98d5c7cb3a8a..3a9fd906b7af 100644
--- a/clang/lib/Sema/SemaTemplateInstantiate.cpp
+++ b/clang/lib/Sema/SemaTemplateInstantiate.cpp
@@ -2151,13 +2151,25 @@ TemplateInstantiator::TransformLoopHintAttr(const LoopHintAttr *LH) {
// Generate error if there is a problem with the value.
if (getSema().CheckLoopHintExpr(TransformedExpr, LH->getLocation(),
- LH->getOption() == LoopHintAttr::UnrollCount))
+ LH->getSemanticSpelling() ==
+ LoopHintAttr::Pragma_unroll))
return LH;
+ LoopHintAttr::OptionType Option = LH->getOption();
+ LoopHintAttr::LoopHintState State = LH->getState();
+
+ llvm::APSInt ValueAPS =
+ TransformedExpr->EvaluateKnownConstInt(getSema().getASTContext());
+ // The values of 0 and 1 block any unrolling of the loop.
+ if (ValueAPS.isZero() || ValueAPS.isOne()) {
+ Option = LoopHintAttr::Unroll;
+ State = LoopHintAttr::Disable;
+ }
+
// Create new LoopHintValueAttr with integral expression in place of the
// non-type template parameter.
- return LoopHintAttr::CreateImplicit(getSema().Context, LH->getOption(),
- LH->getState(), TransformedExpr, *LH);
+ return LoopHintAttr::CreateImplicit(getSema().Context, Option, State,
+ TransformedExpr, *LH);
}
const NoInlineAttr *TemplateInstantiator::TransformStmtNoInlineAttr(
const Stmt *OrigS, const Stmt *InstS, const NoInlineAttr *A) {
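TransformLoopHintAttr now re-evaluates a value-dependent count at
instantiation time, matching the new ast-dump-pragma-unroll.cpp test added
below; a sketch:

    template <bool Flag>
    int g(int n) {
    #pragma unroll Flag ? 1 : 100   // dependent until instantiation; with
      for (int i = 0; i < 4; ++i)   // Flag = true it folds to 1, so the
        n += i;                     // attribute becomes Unroll/Disable.
      return n;
    }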
diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h
index 28d3d1b79a74..f47bc219e6fa 100644
--- a/clang/lib/Sema/TreeTransform.h
+++ b/clang/lib/Sema/TreeTransform.h
@@ -13217,26 +13217,6 @@ bool TreeTransform<Derived>::TransformOverloadExprDecls(OverloadExpr *Old,
// Resolve a kind, but don't do any further analysis. If it's
// ambiguous, the callee needs to deal with it.
R.resolveKind();
-
- if (Old->hasTemplateKeyword() && !R.empty()) {
- NamedDecl *FoundDecl = R.getRepresentativeDecl()->getUnderlyingDecl();
- getSema().FilterAcceptableTemplateNames(R,
- /*AllowFunctionTemplates=*/true,
- /*AllowDependent=*/true);
- if (R.empty()) {
- // If a 'template' keyword was used, a lookup that finds only non-template
- // names is an error.
- getSema().Diag(R.getNameLoc(),
- diag::err_template_kw_refers_to_non_template)
- << R.getLookupName() << Old->getQualifierLoc().getSourceRange()
- << Old->hasTemplateKeyword() << Old->getTemplateKeywordLoc();
- getSema().Diag(FoundDecl->getLocation(),
- diag::note_template_kw_refers_to_non_template)
- << R.getLookupName();
- return true;
- }
- }
-
return false;
}
diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.h b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.h
index 9ed8e7cab6ab..ec1db1cc3358 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.h
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.h
@@ -50,6 +50,9 @@ std::optional<bool> isUncounted(const clang::CXXRecordDecl* Class);
/// class, false if not, std::nullopt if inconclusive.
std::optional<bool> isUncountedPtr(const clang::Type* T);
+/// \returns true if Name is a RefPtr, Ref, or its variant, false if not.
+bool isRefType(const std::string &Name);
+
/// \returns true if \p F creates ref-countable object from uncounted parameter,
/// false if not.
bool isCtorOfRefCounted(const clang::FunctionDecl *F);
diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedCallArgsChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedCallArgsChecker.cpp
index 8b41a949fd67..ae494de58da3 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedCallArgsChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedCallArgsChecker.cpp
@@ -53,6 +53,13 @@ public:
bool shouldVisitTemplateInstantiations() const { return true; }
bool shouldVisitImplicitCode() const { return false; }
+ bool TraverseClassTemplateDecl(ClassTemplateDecl *Decl) {
+ if (isRefType(safeGetName(Decl)))
+ return true;
+ return RecursiveASTVisitor<LocalVisitor>::TraverseClassTemplateDecl(
+ Decl);
+ }
+
bool VisitCallExpr(const CallExpr *CE) {
Checker->visitCallExpr(CE);
return true;
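A sketch of what the new TraverseClassTemplateDecl override skips, assuming
the intent is to avoid analyzing the smart-pointer templates themselves:

    template <typename T> struct Ref {
      T *t;
      T *operator->() { return t; }  // traversal stops at the Ref template,
                                     // so calls inside it are not visited.
    };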
diff --git a/clang/test/APINotes/Inputs/Headers/SwiftImportAs.apinotes b/clang/test/APINotes/Inputs/Headers/SwiftImportAs.apinotes
index 5dbb83cab86b..b0eead42869a 100644
--- a/clang/test/APINotes/Inputs/Headers/SwiftImportAs.apinotes
+++ b/clang/test/APINotes/Inputs/Headers/SwiftImportAs.apinotes
@@ -7,3 +7,7 @@ Tags:
SwiftImportAs: reference
SwiftReleaseOp: RCRelease
SwiftRetainOp: RCRetain
+- Name: NonCopyableType
+ SwiftCopyable: false
+- Name: CopyableType
+ SwiftCopyable: true
diff --git a/clang/test/APINotes/Inputs/Headers/SwiftImportAs.h b/clang/test/APINotes/Inputs/Headers/SwiftImportAs.h
index 82b8a6749c4f..a8f6d0248eae 100644
--- a/clang/test/APINotes/Inputs/Headers/SwiftImportAs.h
+++ b/clang/test/APINotes/Inputs/Headers/SwiftImportAs.h
@@ -4,3 +4,6 @@ struct RefCountedType { int value; };
inline void RCRetain(RefCountedType *x) { x->value++; }
inline void RCRelease(RefCountedType *x) { x->value--; }
+
+struct NonCopyableType { int value; };
+struct CopyableType { int value; };
diff --git a/clang/test/APINotes/swift-import-as.cpp b/clang/test/APINotes/swift-import-as.cpp
index 904857e58593..103cf02f431a 100644
--- a/clang/test/APINotes/swift-import-as.cpp
+++ b/clang/test/APINotes/swift-import-as.cpp
@@ -2,6 +2,8 @@
// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache -fdisable-module-hash -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers %s -x c++
// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache -fdisable-module-hash -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers %s -x c++ -ast-dump -ast-dump-filter ImmortalRefType | FileCheck -check-prefix=CHECK-IMMORTAL %s
// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache -fdisable-module-hash -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers %s -x c++ -ast-dump -ast-dump-filter RefCountedType | FileCheck -check-prefix=CHECK-REF-COUNTED %s
+// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache -fdisable-module-hash -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers %s -x c++ -ast-dump -ast-dump-filter NonCopyableType | FileCheck -check-prefix=CHECK-NON-COPYABLE %s
+// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache -fdisable-module-hash -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers %s -x c++ -ast-dump -ast-dump-filter CopyableType | FileCheck -check-prefix=CHECK-COPYABLE %s
#include <SwiftImportAs.h>
@@ -14,3 +16,11 @@
// CHECK-REF-COUNTED: SwiftAttrAttr {{.+}} <<invalid sloc>> "import_reference"
// CHECK-REF-COUNTED: SwiftAttrAttr {{.+}} <<invalid sloc>> "retain:RCRetain"
// CHECK-REF-COUNTED: SwiftAttrAttr {{.+}} <<invalid sloc>> "release:RCRelease"
+
+// CHECK-NON-COPYABLE: Dumping NonCopyableType:
+// CHECK-NON-COPYABLE-NEXT: CXXRecordDecl {{.+}} imported in SwiftImportAs {{.+}} struct NonCopyableType
+// CHECK-NON-COPYABLE: SwiftAttrAttr {{.+}} <<invalid sloc>> "~Copyable"
+
+// CHECK-COPYABLE: Dumping CopyableType:
+// CHECK-COPYABLE-NEXT: CXXRecordDecl {{.+}} imported in SwiftImportAs {{.+}} struct CopyableType
+// CHECK-COPYABLE-NOT: SwiftAttrAttr
diff --git a/clang/test/AST/HLSL/this-reference-template.hlsl b/clang/test/AST/HLSL/this-reference-template.hlsl
index d427e73044b7..60e057986ebf 100644
--- a/clang/test/AST/HLSL/this-reference-template.hlsl
+++ b/clang/test/AST/HLSL/this-reference-template.hlsl
@@ -24,7 +24,7 @@ void main() {
// CHECK: -CXXMethodDecl 0x{{[0-9A-Fa-f]+}} <line:8:3, line:10:3> line:8:5 getFirst 'K ()' implicit-inline
// CHECK-NEXT:-CompoundStmt 0x{{[0-9A-Fa-f]+}} <col:16, line:10:3>
// CHECK-NEXT:-ReturnStmt 0x{{[0-9A-Fa-f]+}} <line:9:4, col:16>
-// CHECK-NEXT:-MemberExpr 0x{{[0-9A-Fa-f]+}} <col:11, col:16> 'K' lvalue .First 0x{{[0-9A-Fa-f]+}}
+// CHECK-NEXT:-CXXDependentScopeMemberExpr 0x{{[0-9A-Fa-f]+}} <col:11, col:16> '<dependent type>' lvalue .First
// CHECK-NEXT:-CXXThisExpr 0x{{[0-9A-Fa-f]+}} <col:11> 'Pair<K, V>' lvalue this
// CHECK-NEXT:-CXXMethodDecl 0x{{[0-9A-Fa-f]+}} <line:12:3, line:14:3> line:12:5 getSecond 'V ()' implicit-inline
// CHECK-NEXT:-CompoundStmt 0x{{[0-9A-Fa-f]+}} <col:17, line:14:3>
diff --git a/clang/test/AST/Interp/c.c b/clang/test/AST/Interp/c.c
index a5951158ed0e..207da5fe8126 100644
--- a/clang/test/AST/Interp/c.c
+++ b/clang/test/AST/Interp/c.c
@@ -263,3 +263,10 @@ const int *p = &b;
const __int128 K = (__int128)(int*)0;
const unsigned __int128 KU = (unsigned __int128)(int*)0;
#endif
+
+
+int test3(void) {
+ int a[2];
+ a[0] = test3; // all-error {{incompatible pointer to integer conversion assigning to 'int' from 'int (void)'}}
+ return 0;
+}
diff --git a/clang/test/AST/Interp/cxx23.cpp b/clang/test/AST/Interp/cxx23.cpp
index f0325eef6d87..13cc9f43febc 100644
--- a/clang/test/AST/Interp/cxx23.cpp
+++ b/clang/test/AST/Interp/cxx23.cpp
@@ -141,3 +141,19 @@ struct check_ice {
};
};
static_assert(check_ice<42>::x == 42);
+
+
+namespace VirtualBases {
+ namespace One {
+ struct U { int n; };
+ struct V : U { int n; };
+ struct A : virtual V { int n; };
+ struct Aa { int n; };
+ struct B : virtual A, Aa {};
+ struct C : virtual A, Aa {};
+ struct D : B, C {};
+
+ /// Calls the constructor of D.
+ D d;
+ }
+}
diff --git a/clang/test/AST/Interp/functions.cpp b/clang/test/AST/Interp/functions.cpp
index f9bb5d53634e..a5bb9f1a19aa 100644
--- a/clang/test/AST/Interp/functions.cpp
+++ b/clang/test/AST/Interp/functions.cpp
@@ -601,3 +601,19 @@ namespace FromIntegral {
// both-warning {{variable length arrays}}
#endif
}
+
+namespace {
+ template <typename T> using id = T;
+ template <typename T>
+ constexpr void g() {
+ constexpr id<void (T)> f;
+ }
+
+ static_assert((g<int>(), true), "");
+}
+
+namespace {
+ /// The InitListExpr here is of void type.
+ void bir [[clang::annotate("B", {1, 2, 3, 4})]] (); // both-error {{'annotate' attribute requires parameter 1 to be a constant expression}} \
+ // both-note {{subexpression not valid in a constant expression}}
+}
diff --git a/clang/test/AST/Interp/opencl.cl b/clang/test/AST/Interp/opencl.cl
new file mode 100644
index 000000000000..b9ba4f8b9b55
--- /dev/null
+++ b/clang/test/AST/Interp/opencl.cl
@@ -0,0 +1,32 @@
+// RUN: %clang_cc1 -fsyntax-only -verify=ref,both %s
+// RUN: %clang_cc1 -fsyntax-only -verify=expected,both %s -fexperimental-new-constant-interpreter
+
+// both-no-diagnostics
+
+typedef int int2 __attribute__((ext_vector_type(2)));
+typedef int int3 __attribute__((ext_vector_type(3)));
+typedef int int4 __attribute__((ext_vector_type(4)));
+typedef int int8 __attribute__((ext_vector_type(8)));
+typedef int int16 __attribute__((ext_vector_type(16)));
+
+void foo(int3 arg1, int8 arg2) {
+ int4 auto1;
+ int16 *auto2;
+ int auto3;
+ int2 auto4;
+ struct S *incomplete1;
+
+ int res1[vec_step(arg1) == 4 ? 1 : -1];
+ int res2[vec_step(arg2) == 8 ? 1 : -1];
+ int res3[vec_step(auto1) == 4 ? 1 : -1];
+ int res4[vec_step(*auto2) == 16 ? 1 : -1];
+ int res5[vec_step(auto3) == 1 ? 1 : -1];
+ int res6[vec_step(auto4) == 2 ? 1 : -1];
+ int res7[vec_step(int2) == 2 ? 1 : -1];
+ int res8[vec_step(int3) == 4 ? 1 : -1];
+ int res9[vec_step(int4) == 4 ? 1 : -1];
+ int res10[vec_step(int8) == 8 ? 1 : -1];
+ int res11[vec_step(int16) == 16 ? 1 : -1];
+ int res12[vec_step(void) == 1 ? 1 : -1];
+}
+
diff --git a/clang/test/AST/Interp/records.cpp b/clang/test/AST/Interp/records.cpp
index 3e52354a4a10..771e5adfca34 100644
--- a/clang/test/AST/Interp/records.cpp
+++ b/clang/test/AST/Interp/records.cpp
@@ -90,8 +90,7 @@ struct Ints2 {
int a = 10;
int b;
};
-constexpr Ints2 ints22; // both-error {{without a user-provided default constructor}} \
- // expected-error {{must be initialized by a constant expression}}
+constexpr Ints2 ints22; // both-error {{without a user-provided default constructor}}
constexpr Ints2 I2 = Ints2{12, 25};
static_assert(I2.a == 12, "");
@@ -1031,6 +1030,12 @@ namespace ParenInit {
// both-note {{required by 'constinit' specifier}} \
// both-note {{reference to temporary is not a constant expression}} \
// both-note {{temporary created here}}
+
+
+ /// Initializing an array.
+ constexpr void bar(int i, int j) {
+ int arr[4](i, j);
+ }
}
#endif
@@ -1330,3 +1335,108 @@ namespace UnnamedBitFields {
static_assert(a.f == 1.0, "");
static_assert(a.c == 'a', "");
}
+
+/// FIXME: This still doesn't work in the new interpreter because
+/// we lack type information for dummy pointers.
+namespace VirtualBases {
+ /// This used to crash.
+ namespace One {
+ class A {
+ protected:
+ int x;
+ };
+ class B : public virtual A {
+ public:
+ int getX() { return x; } // ref-note {{declared here}}
+ };
+
+ class DV : virtual public B{};
+
+ void foo() {
+ DV b;
+ int a[b.getX()]; // both-warning {{variable length arrays}} \
+ // ref-note {{non-constexpr function 'getX' cannot be used}}
+ }
+ }
+
+ namespace Two {
+ struct U { int n; };
+ struct A : virtual U { int n; };
+ struct B : A {};
+ B a;
+ static_assert((U*)(A*)(&a) == (U*)(&a), "");
+
+ struct C : virtual A {};
+ struct D : B, C {};
+ D d;
+ constexpr B *p = &d;
+ constexpr C *q = &d;
+ static_assert((A*)p == (A*)q, ""); // both-error {{failed}}
+ }
+
+ namespace Three {
+ struct U { int n; };
+ struct V : U { int n; };
+ struct A : virtual V { int n; };
+ struct Aa { int n; };
+ struct B : virtual A, Aa {};
+
+ struct C : virtual A, Aa {};
+
+ struct D : B, C {};
+
+ D d;
+
+ constexpr B *p = &d;
+ constexpr C *q = &d;
+
+ static_assert((void*)p != (void*)q, "");
+ static_assert((A*)p == (A*)q, "");
+ static_assert((Aa*)p != (Aa*)q, "");
+
+ constexpr V *v = p;
+ constexpr V *w = q;
+ constexpr V *x = (A*)p;
+ static_assert(v == w, "");
+ static_assert(v == x, "");
+
+ static_assert((U*)&d == p, "");
+ static_assert((U*)&d == q, "");
+ static_assert((U*)&d == v, "");
+ static_assert((U*)&d == w, "");
+ static_assert((U*)&d == x, "");
+
+ struct X {};
+ struct Y1 : virtual X {};
+ struct Y2 : X {};
+ struct Z : Y1, Y2 {};
+ Z z;
+ static_assert((X*)(Y1*)&z != (X*)(Y2*)&z, "");
+ }
+}
+
+namespace ZeroInit {
+ struct S3 {
+ S3() = default;
+ S3(const S3&) = default;
+ S3(S3&&) = default;
+ constexpr S3(int n) : n(n) {}
+ int n;
+ };
+ constexpr S3 s3d; // both-error {{default initialization of an object of const type 'const S3' without a user-provided default constructor}}
+ static_assert(s3d.n == 0, "");
+}
+
+namespace {
+#if __cplusplus >= 202002L
+ struct C {
+ template <unsigned N> constexpr C(const char (&)[N]) : n(N) {}
+ unsigned n;
+ };
+ template <C c>
+ constexpr auto operator""_c() { return c.n; }
+
+ constexpr auto waldo = "abc"_c;
+ static_assert(waldo == 4, "");
+#endif
+}
diff --git a/clang/test/AST/ast-dump-macro-json.c b/clang/test/AST/ast-dump-macro-json.c
index 96f4be6fec3d..fb9b4118b4f1 100644
--- a/clang/test/AST/ast-dump-macro-json.c
+++ b/clang/test/AST/ast-dump-macro-json.c
@@ -132,7 +132,7 @@ void BLAP(foo, __COUNTER__)(void);
// CHECK-NEXT: "spellingLoc": {
// CHECK-NEXT: "offset": {{[0-9]+}},
// CHECK-NEXT: "file": "<scratch space>",
-// CHECK-NEXT: "line": 3,
+// CHECK-NEXT: "line": 5,
// CHECK-NEXT: "col": 1,
// CHECK-NEXT: "tokLen": 4
// CHECK-NEXT: },
@@ -169,7 +169,7 @@ void BLAP(foo, __COUNTER__)(void);
// CHECK-NEXT: "spellingLoc": {
// CHECK-NEXT: "offset": {{[0-9]+}},
// CHECK-NEXT: "file": "<scratch space>",
-// CHECK-NEXT: "line": 5,
+// CHECK-NEXT: "line": 7,
// CHECK-NEXT: "col": 1,
// CHECK-NEXT: "tokLen": 4
// CHECK-NEXT: },
diff --git a/clang/test/AST/ast-dump-pragma-unroll.cpp b/clang/test/AST/ast-dump-pragma-unroll.cpp
new file mode 100644
index 000000000000..f9c254b803ff
--- /dev/null
+++ b/clang/test/AST/ast-dump-pragma-unroll.cpp
@@ -0,0 +1,31 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -ast-dump %s | FileCheck %s
+
+using size_t = unsigned long long;
+
+// CHECK: LoopHintAttr {{.*}} Implicit unroll UnrollCount Numeric
+// CHECK: LoopHintAttr {{.*}} Implicit unroll UnrollCount Numeric
+// CHECK: LoopHintAttr {{.*}} Implicit unroll Unroll Disable
+// CHECK: LoopHintAttr {{.*}} Implicit unroll Unroll Disable
+template <bool Flag>
+int value_dependent(int n) {
+ constexpr int N = 100;
+ auto init = [=]() { return Flag ? n : 0UL; };
+ auto cond = [=](size_t ix) { return Flag ? ix != 0 : ix < 10; };
+ auto iter = [=](size_t ix) {
+ return Flag ? ix & ~(1ULL << __builtin_clzll(ix)) : ix + 1;
+ };
+
+#pragma unroll Flag ? 1 : N
+ for (size_t ix = init(); cond(ix); ix = iter(ix)) {
+ n *= n;
+ }
+#pragma unroll Flag ? 0 : N
+ for (size_t ix = init(); cond(ix); ix = iter(ix)) {
+ n *= n;
+ }
+ return n;
+}
+
+void test_value_dependent(int n) {
+ value_dependent<true>(n);
+}
diff --git a/clang/test/AST/ast-dump-template-json-win32-mangler-crash.cpp b/clang/test/AST/ast-dump-template-json-win32-mangler-crash.cpp
index cf740516db6f..5ac55d269dce 100644
--- a/clang/test/AST/ast-dump-template-json-win32-mangler-crash.cpp
+++ b/clang/test/AST/ast-dump-template-json-win32-mangler-crash.cpp
@@ -1846,6 +1846,42 @@ int main()
// CHECK-NEXT: "kind": "VarTemplateDecl",
// CHECK-NEXT: "name": "is_const_v"
// CHECK-NEXT: }
+// CHECK-NEXT: ],
+// CHECK-NEXT: "inner": [
+// CHECK-NEXT: {
+// CHECK-NEXT: "kind": "TemplateArgument",
+// CHECK-NEXT: "type": {
+// CHECK-NEXT: "qualType": "const _Ty"
+// CHECK-NEXT: },
+// CHECK-NEXT: "inner": [
+// CHECK-NEXT: {
+// CHECK-NEXT: "id": "0x{{.*}}",
+// CHECK-NEXT: "kind": "QualType",
+// CHECK-NEXT: "type": {
+// CHECK-NEXT: "qualType": "const _Ty"
+// CHECK-NEXT: },
+// CHECK-NEXT: "qualifiers": "const",
+// CHECK-NEXT: "inner": [
+// CHECK-NEXT: {
+// CHECK-NEXT: "id": "0x{{.*}}",
+// CHECK-NEXT: "kind": "TemplateTypeParmType",
+// CHECK-NEXT: "type": {
+// CHECK-NEXT: "qualType": "_Ty"
+// CHECK-NEXT: },
+// CHECK-NEXT: "isDependent": true,
+// CHECK-NEXT: "isInstantiationDependent": true,
+// CHECK-NEXT: "depth": 0,
+// CHECK-NEXT: "index": 0,
+// CHECK-NEXT: "decl": {
+// CHECK-NEXT: "id": "0x{{.*}}",
+// CHECK-NEXT: "kind": "TemplateTypeParmDecl",
+// CHECK-NEXT: "name": "_Ty"
+// CHECK-NEXT: }
+// CHECK-NEXT: }
+// CHECK-NEXT: ]
+// CHECK-NEXT: }
+// CHECK-NEXT: ]
+// CHECK-NEXT: }
// CHECK-NEXT: ]
// CHECK-NEXT: }
// CHECK-NEXT: ]
@@ -1900,6 +1936,32 @@ int main()
// CHECK-NEXT: "kind": "VarTemplateDecl",
// CHECK-NEXT: "name": "is_reference_v"
// CHECK-NEXT: }
+// CHECK-NEXT: ],
+// CHECK-NEXT: "inner": [
+// CHECK-NEXT: {
+// CHECK-NEXT: "kind": "TemplateArgument",
+// CHECK-NEXT: "type": {
+// CHECK-NEXT: "qualType": "_Ty"
+// CHECK-NEXT: },
+// CHECK-NEXT: "inner": [
+// CHECK-NEXT: {
+// CHECK-NEXT: "id": "0x{{.*}}",
+// CHECK-NEXT: "kind": "TemplateTypeParmType",
+// CHECK-NEXT: "type": {
+// CHECK-NEXT: "qualType": "_Ty"
+// CHECK-NEXT: },
+// CHECK-NEXT: "isDependent": true,
+// CHECK-NEXT: "isInstantiationDependent": true,
+// CHECK-NEXT: "depth": 0,
+// CHECK-NEXT: "index": 0,
+// CHECK-NEXT: "decl": {
+// CHECK-NEXT: "id": "0x{{.*}}",
+// CHECK-NEXT: "kind": "TemplateTypeParmDecl",
+// CHECK-NEXT: "name": "_Ty"
+// CHECK-NEXT: }
+// CHECK-NEXT: }
+// CHECK-NEXT: ]
+// CHECK-NEXT: }
// CHECK-NEXT: ]
// CHECK-NEXT: }
// CHECK-NEXT: ]
@@ -2565,6 +2627,32 @@ int main()
// CHECK-NEXT: "kind": "VarTemplateDecl",
// CHECK-NEXT: "name": "is_function_v"
// CHECK-NEXT: }
+// CHECK-NEXT: ],
+// CHECK-NEXT: "inner": [
+// CHECK-NEXT: {
+// CHECK-NEXT: "kind": "TemplateArgument",
+// CHECK-NEXT: "type": {
+// CHECK-NEXT: "qualType": "_Ty1"
+// CHECK-NEXT: },
+// CHECK-NEXT: "inner": [
+// CHECK-NEXT: {
+// CHECK-NEXT: "id": "0x{{.*}}",
+// CHECK-NEXT: "kind": "TemplateTypeParmType",
+// CHECK-NEXT: "type": {
+// CHECK-NEXT: "qualType": "_Ty1"
+// CHECK-NEXT: },
+// CHECK-NEXT: "isDependent": true,
+// CHECK-NEXT: "isInstantiationDependent": true,
+// CHECK-NEXT: "depth": 0,
+// CHECK-NEXT: "index": 0,
+// CHECK-NEXT: "decl": {
+// CHECK-NEXT: "id": "0x{{.*}}",
+// CHECK-NEXT: "kind": "TemplateTypeParmDecl",
+// CHECK-NEXT: "name": "_Ty1"
+// CHECK-NEXT: }
+// CHECK-NEXT: }
+// CHECK-NEXT: ]
+// CHECK-NEXT: }
// CHECK-NEXT: ]
// CHECK-NEXT: }
// CHECK-NEXT: ]
diff --git a/clang/test/AST/ast-dump-templates.cpp b/clang/test/AST/ast-dump-templates.cpp
index d25ef36dd4d3..9fcafbcbcc46 100644
--- a/clang/test/AST/ast-dump-templates.cpp
+++ b/clang/test/AST/ast-dump-templates.cpp
@@ -104,3 +104,17 @@ void (*q)() = f<>;
// CHECK1: template<> void f<0L>()
// CHECK1: template<> void f<0U>()
}
+
+namespace test6 {
+template <class D>
+constexpr bool C = true;
+
+template <class Key>
+void func() {
+ C<Key>;
+// DUMP: UnresolvedLookupExpr {{.*}} '<dependent type>' lvalue (no ADL) = 'C'
+// DUMP-NEXT: `-TemplateArgument type 'Key'
+// DUMP-NEXT: `-TemplateTypeParmType {{.*}} 'Key' dependent depth 0 index 0
+// DUMP-NEXT: `-TemplateTypeParm {{.*}} 'Key'
+}
+}
diff --git a/clang/test/Analysis/Checkers/WebKit/call-args-regression-traverse-decl-crash.cpp b/clang/test/Analysis/Checkers/WebKit/call-args-regression-traverse-decl-crash.cpp
new file mode 100644
index 000000000000..3d8e822025f6
--- /dev/null
+++ b/clang/test/Analysis/Checkers/WebKit/call-args-regression-traverse-decl-crash.cpp
@@ -0,0 +1,7 @@
+// RUN: %clang_analyze_cc1 -analyzer-checker=alpha.webkit.UncountedCallArgsChecker -verify %s
+// expected-no-diagnostics
+
+template <class Class> struct T;
+template <template <class> class Class, class Type>
+struct T<Class<Type>>
+{ };
diff --git a/clang/test/Analysis/Checkers/WebKit/call-args.cpp b/clang/test/Analysis/Checkers/WebKit/call-args.cpp
index f2e1f9bc5a24..2a4b6bb1f106 100644
--- a/clang/test/Analysis/Checkers/WebKit/call-args.cpp
+++ b/clang/test/Analysis/Checkers/WebKit/call-args.cpp
@@ -32,7 +32,7 @@ namespace ref_counted {
void consume_ref_counted(Ref<RefCountable>) {}
void foo() {
- consume_refcntbl(provide_ref_counted().get());
+ consume_refcntbl(provide_ref_counted().ptr());
// no warning
}
}
diff --git a/clang/test/Analysis/Checkers/WebKit/mock-types.h b/clang/test/Analysis/Checkers/WebKit/mock-types.h
index aab99197dfa4..c27ea9baaf3b 100644
--- a/clang/test/Analysis/Checkers/WebKit/mock-types.h
+++ b/clang/test/Analysis/Checkers/WebKit/mock-types.h
@@ -1,24 +1,61 @@
#ifndef mock_types_1103988513531
#define mock_types_1103988513531
-template <typename T> struct Ref {
- T *t;
+template<typename T>
+struct RawPtrTraits {
+ using StorageType = T*;
- Ref() : t{} {};
- Ref(T &t)
- : t(t) {
- if (t)
- t->ref();
+ template<typename U>
+ static T* exchange(StorageType& ptr, U&& newValue)
+ {
+ StorageType oldValue = static_cast<StorageType&&>(ptr);
+ ptr = static_cast<U&&>(newValue);
+ return oldValue;
}
- ~Ref() {
- if (t)
- t->deref();
+
+ static void swap(StorageType& a, StorageType& b)
+ {
+ StorageType temp = static_cast<StorageType&&>(a);
+ a = static_cast<StorageType&&>(b);
+ b = static_cast<StorageType&&>(temp);
}
- T *get() { return t; }
- T *ptr() { return t; }
- T *operator->() { return t; }
- operator const T &() const { return *t; }
- operator T &() { return *t; }
+ static T* unwrap(const StorageType& ptr) { return ptr; }
+};
+
+template<typename T> struct DefaultRefDerefTraits {
+ static T* refIfNotNull(T* ptr)
+ {
+ if (ptr)
+ ptr->ref();
+ return ptr;
+ }
+
+ static T& ref(T& ref)
+ {
+ ref.ref();
+ return ref;
+ }
+
+ static void derefIfNotNull(T* ptr)
+ {
+ if (ptr)
+ ptr->deref();
+ }
+};
+
+template <typename T, typename PtrTraits = RawPtrTraits<T>, typename RefDerefTraits = DefaultRefDerefTraits<T>> struct Ref {
+ typename PtrTraits::StorageType t;
+
+ Ref() : t{} {};
+  Ref(T &t) : t(&RefDerefTraits::ref(t)) { }
+ Ref(const Ref& o) : t(RefDerefTraits::refIfNotNull(PtrTraits::unwrap(o.t))) { }
+ ~Ref() { RefDerefTraits::derefIfNotNull(PtrTraits::exchange(t, nullptr)); }
+ T &get() { return *PtrTraits::unwrap(t); }
+ T *ptr() { return PtrTraits::unwrap(t); }
+ T *operator->() { return PtrTraits::unwrap(t); }
+ operator const T &() const { return *PtrTraits::unwrap(t); }
+ operator T &() { return *PtrTraits::unwrap(t); }
+  T* leakRef() { return PtrTraits::exchange(t, nullptr); }
};
template <typename T> struct RefPtr {
diff --git a/clang/test/Analysis/html_diagnostics/relevant_lines/multifile.c b/clang/test/Analysis/html_diagnostics/relevant_lines/multifile.c
index 3abffd609b5b..1998c9383d9d 100644
--- a/clang/test/Analysis/html_diagnostics/relevant_lines/multifile.c
+++ b/clang/test/Analysis/html_diagnostics/relevant_lines/multifile.c
@@ -11,4 +11,4 @@ int f(int coin) {
// RUN: rm -rf %t.output
// RUN: %clang_analyze_cc1 -analyze -analyzer-checker=core -analyzer-output html -o %t.output %s
// RUN: cat %t.output/* | FileCheck %s --match-full-lines
-// CHECK: var relevant_lines = {"1": {"3": 1, "4": 1, "5": 1, "6": 1}, "3": {"3": 1, "4": 1, "5": 1, "6": 1, "7": 1}};
+// CHECK: var relevant_lines = {"1": {"3": 1, "4": 1, "5": 1, "6": 1}, "4": {"3": 1, "4": 1, "5": 1, "6": 1, "7": 1}};
diff --git a/clang/test/CXX/drs/cwg2149.cpp b/clang/test/CXX/drs/cwg2149.cpp
new file mode 100644
index 000000000000..8416e42cbd69
--- /dev/null
+++ b/clang/test/CXX/drs/cwg2149.cpp
@@ -0,0 +1,77 @@
+// RUN: %clang_cc1 -std=c++98 -triple x86_64-unknown-unknown %s -verify=expected,cxx98 -fexceptions -fcxx-exceptions -pedantic-errors -ast-dump | FileCheck %s --check-prefixes CXX98
+// RUN: %clang_cc1 -std=c++11 -triple x86_64-unknown-unknown %s -verify=expected,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++14 -triple x86_64-unknown-unknown %s -verify=expected,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++17 -triple x86_64-unknown-unknown %s -verify=expected,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++20 -triple x86_64-unknown-unknown %s -verify=expected,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++23 -triple x86_64-unknown-unknown %s -verify=expected,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++2c -triple x86_64-unknown-unknown %s -verify=expected,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors
+
+#if __cplusplus == 199711L
+#define static_assert(...) __extension__ _Static_assert(__VA_ARGS__)
+// cxx98-error@-1 {{variadic macros are a C99 feature}}
+#endif
+
+namespace cwg2149 { // cwg2149: 3.1
+#if __cplusplus <= 201103L
+struct X { int i, j, k; };
+#else
+struct X { int i, j, k = 42; };
+#endif
+
+template<int N>
+void f1(const X(&)[N]); // #cwg2149-f1
+
+template<int N>
+void f2(const X(&)[N][2]); // #cwg2149-f2
+
+void f() {
+ X a[] = { 1, 2, 3, 4, 5, 6 };
+ static_assert(sizeof(a) / sizeof(X) == 2, "");
+ X b[2] = { { 1, 2, 3 }, { 4, 5, 6 } };
+ X c[][2] = { 1, 2, 3, 4, 5, 6 };
+ static_assert(sizeof(c) / sizeof(X[2]) == 1, "");
+
+ #if __cplusplus >= 201103L
+ constexpr X ca[] = { 1, 2, 3, 4, 5, 6 };
+ constexpr X cb[2] = { { 1, 2, 3 }, { 4, 5, 6 } };
+ static_assert(ca[0].i == cb[0].i, "");
+ static_assert(ca[0].j == cb[0].j, "");
+ static_assert(ca[0].k == cb[0].k, "");
+ static_assert(ca[1].i == cb[1].i, "");
+ static_assert(ca[1].j == cb[1].j, "");
+ static_assert(ca[1].k == cb[1].k, "");
+
+ f1({ 1, 2, 3, 4, 5, 6 });
+ // since-cxx11-error@-1 {{no matching function for call to 'f1'}}
+ // since-cxx11-note@#cwg2149-f1 {{candidate function [with N = 6] not viable: no known conversion from 'int' to 'const X' for 1st argument}}
+ f2({ 1, 2, 3, 4, 5, 6 });
+ // since-cxx11-error@-1 {{no matching function for call to 'f2'}}
+ // since-cxx11-note@#cwg2149-f2 {{candidate function [with N = 6] not viable: no known conversion from 'int' to 'const X[2]' for 1st argument}}
+ #endif
+}
+} // namespace cwg2149
+
+// Constant evaluation is not powerful enough in 98 mode to check for equality
+// via static_assert, even with constant folding enabled.
+
+// CXX98: VarDecl {{.+}} a 'X[2]'
+// CXX98-NEXT: `-InitListExpr {{.+}} 'X[2]'
+// CXX98-NEXT: |-InitListExpr {{.+}} 'X':'cwg2149::X'
+// CXX98-NEXT: | |-IntegerLiteral {{.+}} 'int' 1
+// CXX98-NEXT: | |-IntegerLiteral {{.+}} 'int' 2
+// CXX98-NEXT: | `-IntegerLiteral {{.+}} 'int' 3
+// CXX98-NEXT: `-InitListExpr {{.+}} 'X':'cwg2149::X'
+// CXX98-NEXT: |-IntegerLiteral {{.+}} 'int' 4
+// CXX98-NEXT: |-IntegerLiteral {{.+}} 'int' 5
+// CXX98-NEXT: `-IntegerLiteral {{.+}} 'int' 6
+
+// CXX98: VarDecl {{.+}} b 'X[2]'
+// CXX98-NEXT: `-InitListExpr {{.+}} 'X[2]'
+// CXX98-NEXT: |-InitListExpr {{.+}} 'X':'cwg2149::X'
+// CXX98-NEXT: | |-IntegerLiteral {{.+}} 'int' 1
+// CXX98-NEXT: | |-IntegerLiteral {{.+}} 'int' 2
+// CXX98-NEXT: | `-IntegerLiteral {{.+}} 'int' 3
+// CXX98-NEXT: `-InitListExpr {{.+}} 'X':'cwg2149::X'
+// CXX98-NEXT: |-IntegerLiteral {{.+}} 'int' 4
+// CXX98-NEXT: |-IntegerLiteral {{.+}} 'int' 5
+// CXX98-NEXT: `-IntegerLiteral {{.+}} 'int' 6
diff --git a/clang/test/CXX/drs/cwg650.cpp b/clang/test/CXX/drs/cwg650.cpp
index dcb844095b05..33ea179986e3 100644
--- a/clang/test/CXX/drs/cwg650.cpp
+++ b/clang/test/CXX/drs/cwg650.cpp
@@ -4,7 +4,7 @@
// RUN: %clang_cc1 -std=c++17 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK
// RUN: %clang_cc1 -std=c++20 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK
// RUN: %clang_cc1 -std=c++23 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK
-// RUN: %clang_cc1 -std=c++2c %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK
+// We aren't testing this since C++26 because of P2748R5 "Disallow Binding a Returned Glvalue to a Temporary".
#if __cplusplus == 199711L
#define NOTHROW throw()
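For context, P2748R5 makes a return statement ill-formed when the returned glvalue would bind to a temporary, which is why the C++2c RUN line above was dropped. A minimal sketch of the now-rejected pattern (the new stmt.return/p6.cpp test added later in this patch exercises it directly):

// Ill-formed since C++26 under P2748R5; previously well-formed but dangling.
const int &f() { return 42; } // the materialized temporary dies at the end of the return statement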
diff --git a/clang/test/CXX/drs/dr20xx.cpp b/clang/test/CXX/drs/dr20xx.cpp
index 291a77e0cc71..9797097acce7 100644
--- a/clang/test/CXX/drs/dr20xx.cpp
+++ b/clang/test/CXX/drs/dr20xx.cpp
@@ -90,7 +90,7 @@ namespace cwg2026 { // cwg2026: 11
}
}
-namespace cwg2049 { // cwg2049: 18 drafting P2308R1
+namespace cwg2049 { // cwg2049: 18
#if __cplusplus >= 202302L
template <int* x = {}> struct X {};
X<> a;
diff --git a/clang/test/CXX/drs/dr21xx.cpp b/clang/test/CXX/drs/dr21xx.cpp
index 4fab10c279aa..082deb42e4fa 100644
--- a/clang/test/CXX/drs/dr21xx.cpp
+++ b/clang/test/CXX/drs/dr21xx.cpp
@@ -175,6 +175,8 @@ void foo() {
}
}
+// cwg2149 is in cwg2149.cpp
+
namespace cwg2157 { // cwg2157: 11
#if __cplusplus >= 201103L
enum E : int;
diff --git a/clang/test/CXX/drs/dr24xx.cpp b/clang/test/CXX/drs/dr24xx.cpp
index 5ffaebda68c1..9f876cd87083 100644
--- a/clang/test/CXX/drs/dr24xx.cpp
+++ b/clang/test/CXX/drs/dr24xx.cpp
@@ -45,7 +45,7 @@ void fallthrough(int n) {
#endif
}
-namespace cwg2450 { // cwg2450: 18 review P2308R1
+namespace cwg2450 { // cwg2450: 18
#if __cplusplus >= 202302L
struct S {int a;};
template <S s>
@@ -59,7 +59,7 @@ f<{.a= 0}>();
#endif
}
-namespace cwg2459 { // cwg2459: 18 drafting P2308R1
+namespace cwg2459 { // cwg2459: 18
#if __cplusplus >= 202302L
struct A {
constexpr A(float) {}
diff --git a/clang/test/CXX/drs/dr25xx.cpp b/clang/test/CXX/drs/dr25xx.cpp
index 62b2a0a088cc..8bca58f44944 100644
--- a/clang/test/CXX/drs/dr25xx.cpp
+++ b/clang/test/CXX/drs/dr25xx.cpp
@@ -130,12 +130,14 @@ struct D3 : B {
#endif
#if __cplusplus >= 202302L
-namespace cwg2561 { // cwg2561: 18 review 2023-11-09
+namespace cwg2561 { // cwg2561: no tentatively ready 2024-03-18
struct C {
constexpr C(auto) { }
};
void foo() {
constexpr auto b = [](this C) { return 1; };
+  // FIXME: The closure type shouldn't have a conversion function to a function
+  // pointer, because an explicit object parameter is present.
constexpr int (*fp)(C) = b;
static_assert(fp(1) == 1);
static_assert((&decltype(b)::operator())(1) == 1);
diff --git a/clang/test/CXX/drs/dr28xx.cpp b/clang/test/CXX/drs/dr28xx.cpp
index 4d9b0c76758d..be35d366bdd6 100644
--- a/clang/test/CXX/drs/dr28xx.cpp
+++ b/clang/test/CXX/drs/dr28xx.cpp
@@ -10,7 +10,15 @@
// expected-no-diagnostics
#endif
-namespace cwg2847 { // cwg2847: 19
+namespace cwg2819 { // cwg2819: 19 tentatively ready 2023-12-01
+#if __cpp_constexpr >= 202306L
+ constexpr void* p = nullptr;
+ constexpr int* q = static_cast<int*>(p);
+ static_assert(q == nullptr);
+#endif
+}
+
+namespace cwg2847 { // cwg2847: 19 review 2024-03-01
#if __cplusplus >= 202002L
@@ -59,7 +67,7 @@ void B<int>::g() requires true;
} // namespace cwg2847
-namespace cwg2858 { // cwg2858: 19
+namespace cwg2858 { // cwg2858: 19 tentatively ready 2024-04-05
#if __cplusplus > 202302L
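The cwg2819 test above covers the null case: casting a constexpr null void* to int* yields a null pointer during constant evaluation. More generally, P2738R1 (guarded by __cpp_constexpr >= 202306L, the same macro the test checks) also permits casting a void* that really points to an object of the target type; a minimal standalone sketch:

constexpr int i = 7;
constexpr const void *vp = &i;
constexpr const int *ip = static_cast<const int *>(vp); // OK in C++26 constant evaluation
static_assert(*ip == 7);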
diff --git a/clang/test/CXX/drs/dr2xx.cpp b/clang/test/CXX/drs/dr2xx.cpp
index 2b3131be3305..5d3e8ce4bea3 100644
--- a/clang/test/CXX/drs/dr2xx.cpp
+++ b/clang/test/CXX/drs/dr2xx.cpp
@@ -561,9 +561,9 @@ namespace cwg244 { // cwg244: 11
B_ptr->B_alias::~B();
B_ptr->B_alias::~B_alias();
B_ptr->cwg244::~B();
- // expected-error@-1 {{no member named '~B' in namespace 'cwg244'}}
+ // expected-error@-1 {{qualified member access refers to a member in namespace 'cwg244'}}
B_ptr->cwg244::~B_alias();
- // expected-error@-1 {{no member named '~B' in namespace 'cwg244'}}
+ // expected-error@-1 {{qualified member access refers to a member in namespace 'cwg244'}}
}
template<typename T, typename U>
@@ -836,7 +836,7 @@ namespace cwg258 { // cwg258: 2.8
namespace cwg259 { // cwg259: 4
template<typename T> struct A {};
- template struct A<int>; // #cwg259-A-int
+ template struct A<int>; // #cwg259-A-int
template struct A<int>;
// expected-error@-1 {{duplicate explicit instantiation of 'A<int>'}}
// expected-note@#cwg259-A-int {{previous explicit instantiation is here}}
@@ -997,7 +997,7 @@ namespace cwg275 { // cwg275: no
// expected-error@-1 {{no function template matches function template specialization 'f'}}
}
- template <class T> void g(T) {} // #cwg275-g
+ template <class T> void g(T) {} // #cwg275-g
template <> void N::f(char) {}
template <> void f(int) {}
@@ -1164,7 +1164,7 @@ namespace cwg285 { // cwg285: yes
namespace cwg286 { // cwg286: 2.8
template<class T> struct A {
class C {
- template<class T2> struct B {}; // #cwg286-B
+ template<class T2> struct B {}; // #cwg286-B
};
};
diff --git a/clang/test/CXX/drs/dr3xx.cpp b/clang/test/CXX/drs/dr3xx.cpp
index 94227dc031c6..3e9228fe21fb 100644
--- a/clang/test/CXX/drs/dr3xx.cpp
+++ b/clang/test/CXX/drs/dr3xx.cpp
@@ -34,7 +34,7 @@ namespace cwg301 { // cwg301: 3.5
bool b = (void(*)(S, S))operator- < (void(*)(S, S))operator-;
// cxx98-17-warning@-1 {{ordered comparison of function pointers ('void (*)(S, S)' and 'void (*)(S, S)')}}
// cxx20-23-error@-2 {{expected '>'}}
- // cxx20-23-note@-3 {{to match this '<'}}
+ // cxx20-23-note@-3 {{to match this '<'}}
bool c = (void(*)(S, S))operator+ < (void(*)(S, S))operator-;
// expected-error@-1 {{expected '>'}}
// expected-note@-2 {{to match this '<'}}
@@ -642,7 +642,7 @@ namespace cwg339 { // cwg339: 2.8
char xxx(int);
char (&xxx(float))[2];
- template<class T> A<sizeof(xxx((T)0))> f(T) {} // #cwg339-f
+ template<class T> A<sizeof(xxx((T)0))> f(T) {} // #cwg339-f
void test() {
A<1> a = f(0);
@@ -828,7 +828,7 @@ namespace cwg352 { // cwg352: 2.8
void g(A::E e) {
foo(e, &arg);
// expected-error@-1 {{no matching function for call to 'foo'}}
- // expected-note@#cwg352-foo {{candidate template ignored: couldn't infer template argument 'R'}}
+ // expected-note@#cwg352-foo {{candidate template ignored: couldn't infer template argument 'R'}}
using A::foo;
foo<int, int>(e, &arg); // ok, uses non-template
@@ -929,7 +929,7 @@ namespace cwg352 { // cwg352: 2.8
namespace example5 {
template<int I> class A {};
- template<int I> void g(A<I+1>); // #cwg352-g
+ template<int I> void g(A<I+1>); // #cwg352-g
template<int I> void f(A<I>, A<I+1>);
void h(A<1> a1, A<2> a2) {
g(a1);
@@ -1256,7 +1256,7 @@ namespace cwg373 { // cwg373: 5
}
};
- struct A { struct B {}; }; // #cwg373-A
+ struct A { struct B {}; }; // #cwg373-A
namespace X = A::B;
// expected-error@-1 {{expected namespace name}}
// expected-note@#cwg373-A {{'A' declared here}}
@@ -1608,7 +1608,7 @@ namespace cwg395 { // cwg395: 3.0
// expected-error@-2 {{conversion function cannot have any parameters}}
// expected-error@-3 {{cannot specify any part of a return type in the declaration of a conversion function}}
// expected-error@-4 {{conversion function cannot convert to a function type}}
-
+
};
struct null1_t {
@@ -1721,9 +1721,9 @@ namespace cwg399 { // cwg399: 11
B_ptr->B_alias::~B();
B_ptr->B_alias::~B_alias();
B_ptr->cwg399::~B();
- // expected-error@-1 {{no member named '~B' in namespace 'cwg399'}}
+ // expected-error@-1 {{qualified member access refers to a member in namespace 'cwg399'}}
B_ptr->cwg399::~B_alias();
- // expected-error@-1 {{no member named '~B' in namespace 'cwg399'}}
+ // expected-error@-1 {{qualified member access refers to a member in namespace 'cwg399'}}
}
template<typename T, typename U>
diff --git a/clang/test/CXX/expr/expr.const/p5-26.cpp b/clang/test/CXX/expr/expr.const/p5-26.cpp
index 3624b1e5a3e3..7513b11c09aa 100644
--- a/clang/test/CXX/expr/expr.const/p5-26.cpp
+++ b/clang/test/CXX/expr/expr.const/p5-26.cpp
@@ -37,3 +37,10 @@ void err() {
// cxx23-note {{cast from 'void *' is not allowed in a constant expression in C++ standards before C++2c}} \
// cxx26-note {{cast from 'void *' is not allowed in a constant expression because the pointed object type 'T' is not similar to the target type 'S'}}
}
+
+int* p;
+constexpr int** pp = &p;
+constexpr void* vp = pp;
+constexpr auto cvp = static_cast<const int* volatile*>(vp);
+// cxx23-error@-1 {{constant expression}}
+// cxx23-note@-2 {{cast from 'void *' is not allowed in a constant expression}}
diff --git a/clang/test/CXX/stmt.stmt/stmt.return/p6.cpp b/clang/test/CXX/stmt.stmt/stmt.return/p6.cpp
new file mode 100644
index 000000000000..c192b0c8112a
--- /dev/null
+++ b/clang/test/CXX/stmt.stmt/stmt.return/p6.cpp
@@ -0,0 +1,25 @@
+// RUN: %clang_cc1 -std=c++26 -fsyntax-only -verify %s
+
+auto&& f1() {
+ return 42; // expected-error{{returning reference to local temporary object}}
+}
+const double& f2() {
+ static int x = 42;
+ return x; // expected-error{{returning reference to local temporary object}}
+}
+auto&& id(auto&& r) {
+ return static_cast<decltype(r)&&>(r);
+}
+auto&& f3() {
+  return id(42); // OK: not diagnosable here, but the returned reference dangles
+}
+
+void unevaluated() {
+ using a = decltype ([] () -> const int & {
+ const int &i = 0; // expected-note {{binding reference variable 'i' here}}
+ return i; // expected-error{{returning reference to local temporary object}}
+} ());
+}
+
+static_assert(__is_convertible(int, const int &));
+static_assert(__is_nothrow_convertible(int, const int &));
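When code trips the diagnostic this file exercises, the usual fix is to return by value, or to return a reference to an object that outlives the call with no intervening conversion; a minimal sketch of both:

int g1() { return 42; }    // return by value: nothing to dangle
const int &g2() {
  static const int x = 42; // static storage outlives the call
  return x;                // same type, so no temporary is materialized
}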
diff --git a/clang/test/CXX/temp/temp.res/temp.dep/temp.dep.type/p4.cpp b/clang/test/CXX/temp/temp.res/temp.dep/temp.dep.type/p4.cpp
deleted file mode 100644
index b1d2859be863..000000000000
--- a/clang/test/CXX/temp/temp.res/temp.dep/temp.dep.type/p4.cpp
+++ /dev/null
@@ -1,456 +0,0 @@
-// RUN: %clang_cc1 -Wno-unused-value -verify %s
-
-namespace N0 {
- struct A {
- int x0;
- static int y0;
- int x1;
- static int y1;
-
- void f0();
- static void g0();
- void f1();
- static void g1();
-
- using M0 = int;
- using M1 = int;
-
- struct C0 { };
- struct C1 { };
- };
-
- template<typename T>
- struct B : A {
- int x2;
- static int y2;
-
- void f2();
- static void g2();
-
- using M2 = int;
-
- struct C2 { };
-
- using A::x1;
- using A::y1;
- using A::f1;
- using A::g1;
- using A::M1;
- using A::C1;
-
- using T::x3;
- using T::y3;
- using T::f3;
- using T::g3;
- using typename T::M3;
- using typename T::C3;
-
- void not_instantiated(B *a, B &b) {
- // All of the following should be found in the current instantiation.
-
- new M0;
- new B::M0;
- new A::M0;
- new B::A::M0;
- new C0;
- new B::C0;
- new A::C0;
- new B::A::C0;
- new M1;
- new B::M1;
- new A::M1;
- new B::A::M1;
- new C1;
- new B::C1;
- new A::C1;
- new B::A::C1;
- new M2;
- new B::M2;
- new C2;
- new B::C2;
- new M3;
- new B::M3;
- new C3;
- new B::C3;
-
- x0;
- B::x0;
- A::x0;
- B::A::x0;
- y0;
- B::y0;
- A::y0;
- B::A::y0;
- x1;
- B::x1;
- A::x1;
- B::A::x1;
- y1;
- B::y1;
- A::y1;
- B::A::y1;
- x2;
- B::x2;
- y2;
- B::y2;
- x3;
- B::x3;
- y3;
- B::y3;
-
- f0();
- B::f0();
- A::f0();
- B::A::f0();
- g0();
- B::g0();
- A::g0();
- B::A::g0();
- f1();
- B::f1();
- A::f1();
- B::A::f1();
- g1();
- B::g1();
- A::g1();
- B::A::g1();
- f2();
- B::f2();
- g2();
- B::g2();
- f3();
- B::f3();
- g3();
- B::g3();
-
- this->x0;
- this->B::x0;
- this->A::x0;
- this->B::A::x0;
- this->y0;
- this->B::y0;
- this->A::y0;
- this->B::A::y0;
- this->x1;
- this->B::x1;
- this->A::x1;
- this->B::A::x1;
- this->y1;
- this->B::y1;
- this->A::y1;
- this->B::A::y1;
- this->x2;
- this->B::x2;
- this->y2;
- this->B::y2;
- this->x3;
- this->B::x3;
- this->y3;
- this->B::y3;
-
- this->f0();
- this->B::f0();
- this->A::f0();
- this->B::A::f0();
- this->g0();
- this->B::g0();
- this->A::g0();
- this->B::A::g0();
- this->f1();
- this->B::f1();
- this->A::f1();
- this->B::A::f1();
- this->g1();
- this->B::g1();
- this->A::g1();
- this->B::A::g1();
- this->f2();
- this->B::f2();
- this->g2();
- this->B::g2();
- this->f3();
- this->B::f3();
- this->g3();
- this->B::g3();
-
- a->x0;
- a->B::x0;
- a->A::x0;
- a->B::A::x0;
- a->y0;
- a->B::y0;
- a->A::y0;
- a->B::A::y0;
- a->x1;
- a->B::x1;
- a->A::x1;
- a->B::A::x1;
- a->y1;
- a->B::y1;
- a->A::y1;
- a->B::A::y1;
- a->x2;
- a->B::x2;
- a->y2;
- a->B::y2;
- a->x3;
- a->B::x3;
- a->y3;
- a->B::y3;
-
- a->f0();
- a->B::f0();
- a->A::f0();
- a->B::A::f0();
- a->g0();
- a->B::g0();
- a->A::g0();
- a->B::A::g0();
- a->f1();
- a->B::f1();
- a->A::f1();
- a->B::A::f1();
- a->g1();
- a->B::g1();
- a->A::g1();
- a->B::A::g1();
- a->f2();
- a->B::f2();
- a->g2();
- a->B::g2();
- a->f3();
- a->B::f3();
- a->g3();
- a->B::g3();
-
- (*this).x0;
- (*this).B::x0;
- (*this).A::x0;
- (*this).B::A::x0;
- (*this).y0;
- (*this).B::y0;
- (*this).A::y0;
- (*this).B::A::y0;
- (*this).x1;
- (*this).B::x1;
- (*this).A::x1;
- (*this).B::A::x1;
- (*this).y1;
- (*this).B::y1;
- (*this).A::y1;
- (*this).B::A::y1;
- (*this).x2;
- (*this).B::x2;
- (*this).y2;
- (*this).B::y2;
- (*this).x3;
- (*this).B::x3;
- (*this).y3;
- (*this).B::y3;
-
- (*this).f0();
- (*this).B::f0();
- (*this).A::f0();
- (*this).B::A::f0();
- (*this).g0();
- (*this).B::g0();
- (*this).A::g0();
- (*this).B::A::g0();
- (*this).f1();
- (*this).B::f1();
- (*this).A::f1();
- (*this).B::A::f1();
- (*this).g1();
- (*this).B::g1();
- (*this).A::g1();
- (*this).B::A::g1();
- (*this).f2();
- (*this).B::f2();
- (*this).g2();
- (*this).B::g2();
- (*this).f3();
- (*this).B::f3();
- (*this).g3();
- (*this).B::g3();
-
- b.x0;
- b.B::x0;
- b.A::x0;
- b.B::A::x0;
- b.y0;
- b.B::y0;
- b.A::y0;
- b.B::A::y0;
- b.x1;
- b.B::x1;
- b.A::x1;
- b.B::A::x1;
- b.y1;
- b.B::y1;
- b.A::y1;
- b.B::A::y1;
- b.x2;
- b.B::x2;
- b.y2;
- b.B::y2;
- b.x3;
- b.B::x3;
- b.y3;
- b.B::y3;
-
- b.f0();
- b.B::f0();
- b.A::f0();
- b.B::A::f0();
- b.g0();
- b.B::g0();
- b.A::g0();
- b.B::A::g0();
- b.f1();
- b.B::f1();
- b.A::f1();
- b.B::A::f1();
- b.g1();
- b.B::g1();
- b.A::g1();
- b.B::A::g1();
- b.f2();
- b.B::f2();
- b.g2();
- b.B::g2();
- b.f3();
- b.B::f3();
- b.g3();
- b.B::g3();
-
- // None of the following should be found in the current instantiation.
-
- new M4; // expected-error{{unknown type name 'M4'}}
- new B::M4; // expected-error{{no type named 'M4' in 'B<T>'}}
- new A::M4; // expected-error{{no type named 'M4' in 'N0::A'}}
- new B::A::M4; // expected-error{{no type named 'M4' in 'N0::A'}}
-
- x4; // expected-error{{use of undeclared identifier 'x4'}}
- B::x4; // expected-error{{no member named 'x4' in 'B<T>'}}
- A::x4; // expected-error{{no member named 'x4' in 'N0::A'}}
- B::A::x4; // expected-error{{no member named 'x4' in 'N0::A'}}
- f4(); // expected-error{{use of undeclared identifier 'f4'}}
- B::f4(); // expected-error{{no member named 'f4' in 'B<T>'}}
- A::f4(); // expected-error{{no member named 'f4' in 'N0::A'}}
- B::A::f4(); // expected-error{{no member named 'f4' in 'N0::A'}}
-
- this->x4; // expected-error{{no member named 'x4' in 'B<T>'}}
- this->B::x4; // expected-error{{no member named 'x4' in 'B<T>'}}
- this->A::x4; // expected-error{{no member named 'x4' in 'N0::A'}}
- this->B::A::x4; // expected-error{{no member named 'x4' in 'N0::A'}}
- this->f4(); // expected-error{{no member named 'f4' in 'B<T>'}}
- this->B::f4(); // expected-error{{no member named 'f4' in 'B<T>'}}
- this->A::f4(); // expected-error{{no member named 'f4' in 'N0::A'}}
- this->B::A::f4(); // expected-error{{no member named 'f4' in 'N0::A'}}
-
- a->x4; // expected-error{{no member named 'x4' in 'B<T>'}}
- a->B::x4; // expected-error{{no member named 'x4' in 'B<T>'}}
- a->A::x4; // expected-error{{no member named 'x4' in 'N0::A'}}
- a->B::A::x4; // expected-error{{no member named 'x4' in 'N0::A'}}
- a->f4(); // expected-error{{no member named 'f4' in 'B<T>'}}
- a->B::f4(); // expected-error{{no member named 'f4' in 'B<T>'}}
- a->A::f4(); // expected-error{{no member named 'f4' in 'N0::A'}}
- a->B::A::f4(); // expected-error{{no member named 'f4' in 'N0::A'}}
-
- // FIXME: An overloaded unary 'operator*' is built for these
- // even though the operand is a pointer (to a dependent type).
- // Type::isOverloadableType should return false for such cases.
- (*this).x4;
- (*this).B::x4;
- (*this).A::x4;
- (*this).B::A::x4;
- (*this).f4();
- (*this).B::f4();
- (*this).A::f4();
- (*this).B::A::f4();
-
- b.x4; // expected-error{{no member named 'x4' in 'B<T>'}}
- b.B::x4; // expected-error{{no member named 'x4' in 'B<T>'}}
- b.A::x4; // expected-error{{no member named 'x4' in 'N0::A'}}
- b.B::A::x4; // expected-error{{no member named 'x4' in 'N0::A'}}
- b.f4(); // expected-error{{no member named 'f4' in 'B<T>'}}
- b.B::f4(); // expected-error{{no member named 'f4' in 'B<T>'}}
- b.A::f4(); // expected-error{{no member named 'f4' in 'N0::A'}}
- b.B::A::f4(); // expected-error{{no member named 'f4' in 'N0::A'}}
- }
- };
-} // namespace N0
-
-namespace N1 {
- struct A {
- template<int I>
- void f();
- };
-
- template<typename T>
- struct B {
- template<int I>
- void f();
-
- A x;
- A g();
-
- void not_instantiated(B *a, B &b) {
- f<0>();
- this->f<0>();
- a->f<0>();
- // FIXME: This should not require 'template'!
- (*this).f<0>(); // expected-error{{missing 'template' keyword prior to dependent template name 'f'}}
- b.f<0>();
-
- x.f<0>();
- this->x.f<0>();
- a->x.f<0>();
- // FIXME: This should not require 'template'!
- (*this).x.f<0>(); // expected-error{{missing 'template' keyword prior to dependent template name 'f'}}
- b.x.f<0>();
-
- // FIXME: None of these should require 'template'!
- g().f<0>(); // expected-error{{missing 'template' keyword prior to dependent template name 'f'}}
- this->g().f<0>(); // expected-error{{missing 'template' keyword prior to dependent template name 'f'}}
- a->g().f<0>(); // expected-error{{missing 'template' keyword prior to dependent template name 'f'}}
- (*this).g().f<0>(); // expected-error{{missing 'template' keyword prior to dependent template name 'f'}}
- b.g().f<0>(); // expected-error{{missing 'template' keyword prior to dependent template name 'f'}}
- }
- };
-} // namespace N1
-
-namespace N2 {
- template<typename T>
- struct A {
- struct B {
- using C = A;
-
- void not_instantiated(A *a, B *b) {
- b->x; // expected-error{{no member named 'x' in 'N2::A::B'}}
- b->B::x; // expected-error{{no member named 'x' in 'N2::A::B'}}
- a->B::C::x; // expected-error{{no member named 'x' in 'A<T>'}}
- }
- };
-
- void not_instantiated(A *a, B *b) {
- b->x;
- b->B::x;
- a->B::C::x;
- }
- };
-}
-
-namespace N3 {
- struct A { };
-
- template<typename T>
- struct B : A {
- void not_instantiated() {
- // Dependent, lookup context is the current instantiation.
- this->operator=(*this);
- // Not dependent, the lookup context is A (not the current instantiation).
- this->A::operator=(*this);
- }
- };
-}
diff --git a/clang/test/CXX/temp/temp.res/temp.local/p3.cpp b/clang/test/CXX/temp/temp.res/temp.local/p3.cpp
index b9b29d22736e..87589e1e5bcd 100644
--- a/clang/test/CXX/temp/temp.res/temp.local/p3.cpp
+++ b/clang/test/CXX/temp/temp.res/temp.local/p3.cpp
@@ -16,7 +16,8 @@ template <class T> struct Derived: Base<int>, Base<char> {
void g(X0 *t) {
t->Derived::Base<T>::f();
t->Base<T>::f();
- t->Base::f(); // expected-error{{member 'Base' found in multiple base classes of different types}}
+ t->Base::f(); // expected-error{{member 'Base' found in multiple base classes of different types}} \
+ // expected-error{{no member named 'f' in 'X0'}}
}
};
diff --git a/clang/test/CodeGen/X86/ms-x86-intrinsics.c b/clang/test/CodeGen/X86/ms-x86-intrinsics.c
index a1c90d71c8eb..aa557c8e19a8 100644
--- a/clang/test/CodeGen/X86/ms-x86-intrinsics.c
+++ b/clang/test/CodeGen/X86/ms-x86-intrinsics.c
@@ -48,7 +48,7 @@ long long test__readfsqword(unsigned long Offset) {
__int64 test__emul(int a, int b) {
return __emul(a, b);
}
-// CHECK-LABEL: define dso_local i64 @test__emul(i32 noundef %a, i32 noundef %b)
+// CHECK-LABEL: define dso_local range(i64 -4611686016279904256, 4611686018427387905) i64 @test__emul(i32 noundef %a, i32 noundef %b)
// CHECK: [[X:%[0-9]+]] = sext i32 %a to i64
// CHECK: [[Y:%[0-9]+]] = sext i32 %b to i64
// CHECK: [[RES:%[0-9]+]] = mul nsw i64 [[Y]], [[X]]
@@ -57,7 +57,7 @@ __int64 test__emul(int a, int b) {
unsigned __int64 test__emulu(unsigned int a, unsigned int b) {
return __emulu(a, b);
}
-// CHECK-LABEL: define dso_local i64 @test__emulu(i32 noundef %a, i32 noundef %b)
+// CHECK-LABEL: define dso_local range(i64 0, -8589934590) i64 @test__emulu(i32 noundef %a, i32 noundef %b)
// CHECK: [[X:%[0-9]+]] = zext i32 %a to i64
// CHECK: [[Y:%[0-9]+]] = zext i32 %b to i64
// CHECK: [[RES:%[0-9]+]] = mul nuw i64 [[Y]], [[X]]
@@ -108,13 +108,13 @@ long long test__readgsqword(unsigned long Offset) {
__int64 test__mulh(__int64 a, __int64 b) {
return __mulh(a, b);
}
-// CHECK-X64-LABEL: define dso_local i64 @test__mulh(i64 noundef %a, i64 noundef %b)
+// CHECK-X64-LABEL: define dso_local range(i64 -4611686018427387904, 4611686018427387905) i64 @test__mulh(i64 noundef %a, i64 noundef %b)
// CHECK-X64: = mul nsw i128 %
unsigned __int64 test__umulh(unsigned __int64 a, unsigned __int64 b) {
return __umulh(a, b);
}
-// CHECK-X64-LABEL: define dso_local i64 @test__umulh(i64 noundef %a, i64 noundef %b)
+// CHECK-X64-LABEL: define dso_local range(i64 0, -1) i64 @test__umulh(i64 noundef %a, i64 noundef %b)
// CHECK-X64: = mul nuw i128 %
__int64 test_mul128(__int64 Multiplier,
diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_fp_reduce.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_fp_reduce.c
index e58cf4e49a37..9d5ffdafe866 100644
--- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_fp_reduce.c
+++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_fp_reduce.c
@@ -20,13 +20,13 @@
// CHECK-LABEL: @test_svaddqv_f16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x half> @llvm.aarch64.sve.addqv.v8f16.nxv8f16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x half> [[OP:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x half> @llvm.aarch64.sve.faddqv.v8f16.nxv8f16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x half> [[OP:%.*]])
// CHECK-NEXT: ret <8 x half> [[TMP1]]
//
// CPP-CHECK-LABEL: @_Z16test_svaddqv_f16u10__SVBool_tu13__SVFloat16_t(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x half> @llvm.aarch64.sve.addqv.v8f16.nxv8f16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x half> [[OP:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x half> @llvm.aarch64.sve.faddqv.v8f16.nxv8f16(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x half> [[OP:%.*]])
// CPP-CHECK-NEXT: ret <8 x half> [[TMP1]]
//
float16x8_t test_svaddqv_f16(svbool_t pg, svfloat16_t op)
@@ -37,13 +37,13 @@ float16x8_t test_svaddqv_f16(svbool_t pg, svfloat16_t op)
// CHECK-LABEL: @test_svaddqv_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.aarch64.sve.addqv.v4f32.nxv4f32(<vscale x 4 x i1> [[TMP0]], <vscale x 4 x float> [[OP:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.aarch64.sve.faddqv.v4f32.nxv4f32(<vscale x 4 x i1> [[TMP0]], <vscale x 4 x float> [[OP:%.*]])
// CHECK-NEXT: ret <4 x float> [[TMP1]]
//
// CPP-CHECK-LABEL: @_Z16test_svaddqv_f32u10__SVBool_tu13__SVFloat32_t(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.aarch64.sve.addqv.v4f32.nxv4f32(<vscale x 4 x i1> [[TMP0]], <vscale x 4 x float> [[OP:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.aarch64.sve.faddqv.v4f32.nxv4f32(<vscale x 4 x i1> [[TMP0]], <vscale x 4 x float> [[OP:%.*]])
// CPP-CHECK-NEXT: ret <4 x float> [[TMP1]]
//
float32x4_t test_svaddqv_f32(svbool_t pg, svfloat32_t op)
@@ -54,13 +54,13 @@ float32x4_t test_svaddqv_f32(svbool_t pg, svfloat32_t op)
// CHECK-LABEL: @test_svaddqv_f64(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.aarch64.sve.addqv.v2f64.nxv2f64(<vscale x 2 x i1> [[TMP0]], <vscale x 2 x double> [[OP:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.aarch64.sve.faddqv.v2f64.nxv2f64(<vscale x 2 x i1> [[TMP0]], <vscale x 2 x double> [[OP:%.*]])
// CHECK-NEXT: ret <2 x double> [[TMP1]]
//
// CPP-CHECK-LABEL: @_Z16test_svaddqv_f64u10__SVBool_tu13__SVFloat64_t(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.aarch64.sve.addqv.v2f64.nxv2f64(<vscale x 2 x i1> [[TMP0]], <vscale x 2 x double> [[OP:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.aarch64.sve.faddqv.v2f64.nxv2f64(<vscale x 2 x i1> [[TMP0]], <vscale x 2 x double> [[OP:%.*]])
// CPP-CHECK-NEXT: ret <2 x double> [[TMP1]]
//
float64x2_t test_svaddqv_f64(svbool_t pg, svfloat64_t op)
diff --git a/clang/test/CodeGen/arm-bf16-convert-intrinsics.c b/clang/test/CodeGen/arm-bf16-convert-intrinsics.c
index f50eaf371028..0f2c5b2546fa 100644
--- a/clang/test/CodeGen/arm-bf16-convert-intrinsics.c
+++ b/clang/test/CodeGen/arm-bf16-convert-intrinsics.c
@@ -426,11 +426,12 @@ bfloat16_t test_vcvth_bf16_f32(float32_t a) {
// CHECK-NEXT: [[__REINT_I:%.*]] = alloca bfloat, align 2
// CHECK-NEXT: [[__REINT1_I:%.*]] = alloca i32, align 4
// CHECK-NEXT: store bfloat [[A:%.*]], ptr [[__REINT_I]], align 2
-// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[__REINT_I]], align 2
-// CHECK-NEXT: [[SHL_I:%.*]] = shl i32 [[TMP1]], 16
+// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[__REINT_I]], align 2
+// CHECK-NEXT: [[CONV_I:%.*]] = sext i16 [[TMP0]] to i32
+// CHECK-NEXT: [[SHL_I:%.*]] = shl i32 [[CONV_I]], 16
// CHECK-NEXT: store i32 [[SHL_I]], ptr [[__REINT1_I]], align 4
-// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[__REINT1_I]], align 4
-// CHECK-NEXT: ret float [[TMP3]]
+// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[__REINT1_I]], align 4
+// CHECK-NEXT: ret float [[TMP1]]
//
float32_t test_vcvtah_f32_bf16(bfloat16_t a) {
return vcvtah_f32_bf16(a);
diff --git a/clang/test/CodeGen/attr-counted-by.c b/clang/test/CodeGen/attr-counted-by.c
index 1fb39f9a3466..de30a00138ac 100644
--- a/clang/test/CodeGen/attr-counted-by.c
+++ b/clang/test/CodeGen/attr-counted-by.c
@@ -66,7 +66,7 @@ struct anon_struct {
// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META2]]
// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label [[CONT3:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3:![0-9]+]], !nosanitize [[META2]]
// SANITIZE-WITH-ATTR: handler.out_of_bounds:
-// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB2:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR10:[0-9]+]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB1:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR10:[0-9]+]], !nosanitize [[META2]]
// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]]
// SANITIZE-WITH-ATTR: cont3:
// SANITIZE-WITH-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 12
@@ -114,7 +114,7 @@ void test1(struct annotated *p, int index, int val) {
// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ugt i64 [[TMP0]], [[INDEX]], !nosanitize [[META2]]
// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label [[CONT3:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]]
// SANITIZE-WITH-ATTR: handler.out_of_bounds:
-// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB4:[0-9]+]], i64 [[INDEX]]) #[[ATTR10]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB3:[0-9]+]], i64 [[INDEX]]) #[[ATTR10]], !nosanitize [[META2]]
// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]]
// SANITIZE-WITH-ATTR: cont3:
// SANITIZE-WITH-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 12
@@ -158,7 +158,7 @@ void test2(struct annotated *p, size_t index) {
p->array[index] = __builtin_dynamic_object_size(p->array, 1);
}
-// SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test2_bdos(
+// SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 -8589934592, 8589934589) i64 @test2_bdos(
// SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readonly [[P:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] {
// SANITIZE-WITH-ATTR-NEXT: entry:
// SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8
@@ -169,7 +169,7 @@ void test2(struct annotated *p, size_t index) {
// SANITIZE-WITH-ATTR-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i64 [[TMP1]], i64 0
// SANITIZE-WITH-ATTR-NEXT: ret i64 [[TMP3]]
//
-// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test2_bdos(
+// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 -8589934592, 8589934589) i64 @test2_bdos(
// NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readonly [[P:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] {
// NO-SANITIZE-WITH-ATTR-NEXT: entry:
// NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8
@@ -203,7 +203,7 @@ size_t test2_bdos(struct annotated *p) {
// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ugt i64 [[TMP0]], [[INDEX]], !nosanitize [[META2]]
// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label [[CONT3:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]]
// SANITIZE-WITH-ATTR: handler.out_of_bounds:
-// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB5:[0-9]+]], i64 [[INDEX]]) #[[ATTR10]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB4:[0-9]+]], i64 [[INDEX]]) #[[ATTR10]], !nosanitize [[META2]]
// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]]
// SANITIZE-WITH-ATTR: cont3:
// SANITIZE-WITH-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 12
@@ -257,7 +257,7 @@ void test3(struct annotated *p, size_t index) {
p->array[index] = __builtin_dynamic_object_size(p, 1);
}
-// SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test3_bdos(
+// SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 0, 8589934601) i64 @test3_bdos(
// SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readonly [[P:%.*]]) local_unnamed_addr #[[ATTR2]] {
// SANITIZE-WITH-ATTR-NEXT: entry:
// SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8
@@ -270,7 +270,7 @@ void test3(struct annotated *p, size_t index) {
// SANITIZE-WITH-ATTR-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i64 [[TMP3]], i64 0
// SANITIZE-WITH-ATTR-NEXT: ret i64 [[TMP5]]
//
-// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test3_bdos(
+// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 0, 8589934601) i64 @test3_bdos(
// NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readonly [[P:%.*]]) local_unnamed_addr #[[ATTR2]] {
// NO-SANITIZE-WITH-ATTR-NEXT: entry:
// NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8
@@ -308,7 +308,7 @@ size_t test3_bdos(struct annotated *p) {
// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META2]]
// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label [[CONT4:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]]
// SANITIZE-WITH-ATTR: handler.out_of_bounds:
-// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB6:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR10]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB5:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR10]], !nosanitize [[META2]]
// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]]
// SANITIZE-WITH-ATTR: cont4:
// SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = icmp sgt i32 [[DOT_COUNTED_BY_LOAD]], 2
@@ -325,7 +325,7 @@ size_t test3_bdos(struct annotated *p) {
// SANITIZE-WITH-ATTR-NEXT: [[TMP7:%.*]] = icmp ult i64 [[IDXPROM13]], [[TMP6]], !nosanitize [[META2]]
// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP7]], label [[CONT20:%.*]], label [[HANDLER_OUT_OF_BOUNDS16:%.*]], !prof [[PROF3]], !nosanitize [[META2]]
// SANITIZE-WITH-ATTR: handler.out_of_bounds16:
-// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB7:[0-9]+]], i64 [[IDXPROM13]]) #[[ATTR10]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB6:[0-9]+]], i64 [[IDXPROM13]]) #[[ATTR10]], !nosanitize [[META2]]
// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]]
// SANITIZE-WITH-ATTR: cont20:
// SANITIZE-WITH-ATTR-NEXT: [[TMP8:%.*]] = icmp sgt i32 [[DOT_COUNTED_BY_LOAD7]], 3
@@ -342,7 +342,7 @@ size_t test3_bdos(struct annotated *p) {
// SANITIZE-WITH-ATTR-NEXT: [[TMP13:%.*]] = icmp ult i64 [[IDXPROM30]], [[TMP12]], !nosanitize [[META2]]
// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP13]], label [[CONT37:%.*]], label [[HANDLER_OUT_OF_BOUNDS33:%.*]], !prof [[PROF3]], !nosanitize [[META2]]
// SANITIZE-WITH-ATTR: handler.out_of_bounds33:
-// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB8:[0-9]+]], i64 [[IDXPROM30]]) #[[ATTR10]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB7:[0-9]+]], i64 [[IDXPROM30]]) #[[ATTR10]], !nosanitize [[META2]]
// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]]
// SANITIZE-WITH-ATTR: cont37:
// SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX35:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARRAY]], i64 0, i64 [[IDXPROM30]]
@@ -441,7 +441,7 @@ void test4(struct annotated *p, int index, int fam_idx) {
p->array[index + 2] = (unsigned char)__builtin_dynamic_object_size(&(p->array[fam_idx]), 1);
}
-// SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test4_bdos(
+// SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 -17179869180, 17179869181) i64 @test4_bdos(
// SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readonly [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR2]] {
// SANITIZE-WITH-ATTR-NEXT: entry:
// SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8
@@ -456,7 +456,7 @@ void test4(struct annotated *p, int index, int fam_idx) {
// SANITIZE-WITH-ATTR-NEXT: [[TMP7:%.*]] = select i1 [[TMP6]], i64 [[TMP3]], i64 0
// SANITIZE-WITH-ATTR-NEXT: ret i64 [[TMP7]]
//
-// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test4_bdos(
+// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 -17179869180, 17179869181) i64 @test4_bdos(
// NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readonly [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR2]] {
// NO-SANITIZE-WITH-ATTR-NEXT: entry:
// NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8
@@ -494,7 +494,7 @@ size_t test4_bdos(struct annotated *p, int index) {
// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = icmp ugt i64 [[DOT_COUNTED_BY_LOAD]], [[IDXPROM]], !nosanitize [[META2]]
// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP0]], label [[CONT3:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]]
// SANITIZE-WITH-ATTR: handler.out_of_bounds:
-// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB9:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR10]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB8:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR10]], !nosanitize [[META2]]
// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]]
// SANITIZE-WITH-ATTR: cont3:
// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 16
@@ -545,7 +545,7 @@ void test5(struct anon_struct *p, int index) {
p->array[index] = __builtin_dynamic_object_size(p, 1);
}
-// SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test5_bdos(
+// SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 16, 1) i64 @test5_bdos(
// SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readonly [[P:%.*]]) local_unnamed_addr #[[ATTR2]] {
// SANITIZE-WITH-ATTR-NEXT: entry:
// SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8
@@ -556,7 +556,7 @@ void test5(struct anon_struct *p, int index) {
// SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = select i1 [[DOTINV]], i64 0, i64 [[TMP1]]
// SANITIZE-WITH-ATTR-NEXT: ret i64 [[TMP2]]
//
-// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test5_bdos(
+// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 16, 1) i64 @test5_bdos(
// NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readonly [[P:%.*]]) local_unnamed_addr #[[ATTR2]] {
// NO-SANITIZE-WITH-ATTR-NEXT: entry:
// NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8
@@ -590,7 +590,7 @@ size_t test5_bdos(struct anon_struct *p) {
// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = icmp ugt i64 [[DOT_COUNTED_BY_LOAD]], [[IDXPROM]], !nosanitize [[META2]]
// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP0]], label [[CONT3:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]]
// SANITIZE-WITH-ATTR: handler.out_of_bounds:
-// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB10:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR10]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB9:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR10]], !nosanitize [[META2]]
// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]]
// SANITIZE-WITH-ATTR: cont3:
// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 16
@@ -683,7 +683,7 @@ size_t test6_bdos(struct anon_struct *p) {
// SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP1]], !nosanitize [[META2]]
// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP2]], label [[CONT7:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]]
// SANITIZE-WITH-ATTR: handler.out_of_bounds:
-// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB12:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR10]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB11:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR10]], !nosanitize [[META2]]
// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]]
// SANITIZE-WITH-ATTR: cont7:
// SANITIZE-WITH-ATTR-NEXT: [[INTS:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 9
@@ -756,7 +756,7 @@ size_t test7_bdos(struct union_of_fams *p) {
// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META2]]
// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label [[CONT9:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]]
// SANITIZE-WITH-ATTR: handler.out_of_bounds:
-// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB13:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR10]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB12:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR10]], !nosanitize [[META2]]
// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]]
// SANITIZE-WITH-ATTR: cont9:
// SANITIZE-WITH-ATTR-NEXT: [[INTS:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 9
@@ -797,7 +797,7 @@ void test8(struct union_of_fams *p, int index) {
p->ints[index] = __builtin_dynamic_object_size(p->ints, 1);
}
-// SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test8_bdos(
+// SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 0, 256) i64 @test8_bdos(
// SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readonly [[P:%.*]]) local_unnamed_addr #[[ATTR2]] {
// SANITIZE-WITH-ATTR-NEXT: entry:
// SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8
@@ -805,7 +805,7 @@ void test8(struct union_of_fams *p, int index) {
// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i8 [[DOT_COUNTED_BY_LOAD]] to i64
// SANITIZE-WITH-ATTR-NEXT: ret i64 [[TMP0]]
//
-// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test8_bdos(
+// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 0, 256) i64 @test8_bdos(
// NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readonly [[P:%.*]]) local_unnamed_addr #[[ATTR2]] {
// NO-SANITIZE-WITH-ATTR-NEXT: entry:
// NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8
@@ -955,7 +955,7 @@ void test10(struct union_of_fams *p, int index) {
p->bytes[index] = (unsigned char)__builtin_dynamic_object_size(p->bytes, 1);
}
-// SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test10_bdos(
+// SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 -2147483648, 2147483648) i64 @test10_bdos(
// SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readonly [[P:%.*]]) local_unnamed_addr #[[ATTR2]] {
// SANITIZE-WITH-ATTR-NEXT: entry:
// SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8
@@ -964,7 +964,7 @@ void test10(struct union_of_fams *p, int index) {
// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext nneg i32 [[NARROW]] to i64
// SANITIZE-WITH-ATTR-NEXT: ret i64 [[TMP0]]
//
-// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test10_bdos(
+// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 -2147483648, 2147483648) i64 @test10_bdos(
// NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readonly [[P:%.*]]) local_unnamed_addr #[[ATTR2]] {
// NO-SANITIZE-WITH-ATTR-NEXT: entry:
// NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8
@@ -1095,10 +1095,10 @@ int test12_a, test12_b;
// SANITIZE-WITH-ATTR-NEXT: [[DOTNOT:%.*]] = icmp eq i32 [[DOTCOUNTED_BY_LOAD]], 0
// SANITIZE-WITH-ATTR-NEXT: br i1 [[DOTNOT]], label [[HANDLER_OUT_OF_BOUNDS4:%.*]], label [[HANDLER_TYPE_MISMATCH6:%.*]], !prof [[PROF10:![0-9]+]], !nosanitize [[META2]]
// SANITIZE-WITH-ATTR: handler.out_of_bounds4:
-// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB19:[0-9]+]], i64 0) #[[ATTR10]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB20:[0-9]+]], i64 0) #[[ATTR10]], !nosanitize [[META2]]
// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]]
// SANITIZE-WITH-ATTR: handler.type_mismatch6:
-// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_type_mismatch_v1_abort(ptr nonnull @[[GLOB20:[0-9]+]], i64 ptrtoint (ptr getelementptr inbounds ([[STRUCT_ANON_5:%.*]], ptr @test12_foo, i64 1, i32 0, i32 0, i32 0) to i64)) #[[ATTR10]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_type_mismatch_v1_abort(ptr nonnull @[[GLOB21:[0-9]+]], i64 ptrtoint (ptr getelementptr inbounds ([[STRUCT_ANON_5:%.*]], ptr @test12_foo, i64 1, i32 0, i32 0, i32 0) to i64)) #[[ATTR10]], !nosanitize [[META2]]
// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]]
//
// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local noundef i32 @test12(
@@ -1188,7 +1188,7 @@ struct test13_bar {
// SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = icmp ugt i64 [[TMP1]], [[INDEX]], !nosanitize [[META2]]
// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP2]], label [[CONT5:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]]
// SANITIZE-WITH-ATTR: handler.out_of_bounds:
-// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB23:[0-9]+]], i64 [[INDEX]]) #[[ATTR10]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB24:[0-9]+]], i64 [[INDEX]]) #[[ATTR10]], !nosanitize [[META2]]
// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]]
// SANITIZE-WITH-ATTR: cont5:
// SANITIZE-WITH-ATTR-NEXT: [[REVMAP:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 16
@@ -1249,7 +1249,7 @@ struct test14_foo {
// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP0]], label [[TRAP:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]]
// SANITIZE-WITH-ATTR: handler.out_of_bounds:
// SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[IDX]] to i64
-// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB24:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR10]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB25:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR10]], !nosanitize [[META2]]
// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]]
// SANITIZE-WITH-ATTR: trap:
// SANITIZE-WITH-ATTR-NEXT: tail call void @llvm.trap() #[[ATTR10]]
@@ -1305,7 +1305,7 @@ int test14(int idx) {
// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP0]], label [[TRAP:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]]
// SANITIZE-WITH-ATTR: handler.out_of_bounds:
// SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[IDX]] to i64
-// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB25:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR10]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB27:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR10]], !nosanitize [[META2]]
// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]]
// SANITIZE-WITH-ATTR: trap:
// SANITIZE-WITH-ATTR-NEXT: tail call void @llvm.trap() #[[ATTR10]]
@@ -1326,7 +1326,7 @@ int test14(int idx) {
// SANITIZE-WITHOUT-ATTR-NEXT: br i1 [[TMP0]], label [[TRAP:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF8]], !nosanitize [[META9]]
// SANITIZE-WITHOUT-ATTR: handler.out_of_bounds:
// SANITIZE-WITHOUT-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[IDX]] to i64
-// SANITIZE-WITHOUT-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB10:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR8]], !nosanitize [[META9]]
+// SANITIZE-WITHOUT-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB11:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR8]], !nosanitize [[META9]]
// SANITIZE-WITHOUT-ATTR-NEXT: unreachable, !nosanitize [[META9]]
// SANITIZE-WITHOUT-ATTR: trap:
// SANITIZE-WITHOUT-ATTR-NEXT: tail call void @llvm.trap() #[[ATTR8]]
@@ -1487,7 +1487,7 @@ struct tests_foo {
// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = icmp ugt i32 [[DOTCOUNTED_BY_LOAD]], 10
// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP0]], label [[CONT4:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]]
// SANITIZE-WITH-ATTR: handler.out_of_bounds:
-// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB26:[0-9]+]], i64 10) #[[ATTR10]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB28:[0-9]+]], i64 10) #[[ATTR10]], !nosanitize [[META2]]
// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]]
// SANITIZE-WITH-ATTR: cont4:
// SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[VAR]], i64 84
@@ -1528,7 +1528,7 @@ int test24(int c, struct tests_foo *var) {
// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ugt i32 [[DOTCOUNTED_BY_LOAD]], 10
// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label [[CONT5:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]]
// SANITIZE-WITH-ATTR: handler.out_of_bounds:
-// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB27:[0-9]+]], i64 10) #[[ATTR10]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB29:[0-9]+]], i64 10) #[[ATTR10]], !nosanitize [[META2]]
// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]]
// SANITIZE-WITH-ATTR: cont5:
// SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 44
@@ -1580,7 +1580,7 @@ struct test26_foo {
// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META2]]
// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label [[CONT5:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]]
// SANITIZE-WITH-ATTR: handler.out_of_bounds:
-// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB28:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR10]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB30:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR10]], !nosanitize [[META2]]
// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]]
// SANITIZE-WITH-ATTR: cont5:
// SANITIZE-WITH-ATTR-NEXT: [[ARR:%.*]] = getelementptr inbounds i8, ptr [[FOO]], i64 8
@@ -1651,7 +1651,7 @@ struct test27_foo {
// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META2]]
// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label [[CONT3:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]]
// SANITIZE-WITH-ATTR: handler.out_of_bounds:
-// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB30:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR10]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB32:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR10]], !nosanitize [[META2]]
// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]]
// SANITIZE-WITH-ATTR: cont3:
// SANITIZE-WITH-ATTR-NEXT: [[ENTRIES:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 24
@@ -1717,7 +1717,7 @@ struct test28_foo {
// SANITIZE-WITH-ATTR-NEXT: [[TMP4:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP3]], !nosanitize [[META2]]
// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP4]], label [[CONT17:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]]
// SANITIZE-WITH-ATTR: handler.out_of_bounds:
-// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB31:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR10]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB34:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR10]], !nosanitize [[META2]]
// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]]
// SANITIZE-WITH-ATTR: cont17:
// SANITIZE-WITH-ATTR-NEXT: [[ARR:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 12
@@ -1779,7 +1779,7 @@ struct annotated_struct_array {
// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = zext i32 [[IDX1]] to i64
// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP0]], label [[CONT3:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]]
// SANITIZE-WITH-ATTR: handler.out_of_bounds:
-// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB33:[0-9]+]], i64 [[TMP1]]) #[[ATTR10]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB36:[0-9]+]], i64 [[TMP1]]) #[[ATTR10]], !nosanitize [[META2]]
// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]]
// SANITIZE-WITH-ATTR: cont3:
// SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x ptr], ptr [[ANN]], i64 0, i64 [[TMP1]]
@@ -1791,7 +1791,7 @@ struct annotated_struct_array {
// SANITIZE-WITH-ATTR-NEXT: [[TMP4:%.*]] = icmp ult i64 [[IDXPROM15]], [[TMP3]], !nosanitize [[META2]]
// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP4]], label [[CONT20:%.*]], label [[HANDLER_OUT_OF_BOUNDS16:%.*]], !prof [[PROF3]], !nosanitize [[META2]]
// SANITIZE-WITH-ATTR: handler.out_of_bounds16:
-// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB34:[0-9]+]], i64 [[IDXPROM15]]) #[[ATTR10]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB37:[0-9]+]], i64 [[IDXPROM15]]) #[[ATTR10]], !nosanitize [[META2]]
// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]]
// SANITIZE-WITH-ATTR: cont20:
// SANITIZE-WITH-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 12
@@ -1826,7 +1826,7 @@ struct annotated_struct_array {
// SANITIZE-WITHOUT-ATTR-NEXT: [[TMP1:%.*]] = zext i32 [[IDX1]] to i64
// SANITIZE-WITHOUT-ATTR-NEXT: br i1 [[TMP0]], label [[CONT21:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF8]], !nosanitize [[META9]]
// SANITIZE-WITHOUT-ATTR: handler.out_of_bounds:
-// SANITIZE-WITHOUT-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB12:[0-9]+]], i64 [[TMP1]]) #[[ATTR8]], !nosanitize [[META9]]
+// SANITIZE-WITHOUT-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB13:[0-9]+]], i64 [[TMP1]]) #[[ATTR8]], !nosanitize [[META9]]
// SANITIZE-WITHOUT-ATTR-NEXT: unreachable, !nosanitize [[META9]]
// SANITIZE-WITHOUT-ATTR: cont21:
// SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x ptr], ptr [[ANN]], i64 0, i64 [[TMP1]]
diff --git a/clang/test/CodeGen/builtins-reduction-math.c b/clang/test/CodeGen/builtins-reduction-math.c
index 34f39cea5265..acafe9222d59 100644
--- a/clang/test/CodeGen/builtins-reduction-math.c
+++ b/clang/test/CodeGen/builtins-reduction-math.c
@@ -1,5 +1,8 @@
// RUN: %clang_cc1 -triple x86_64-apple-darwin %s -emit-llvm -disable-llvm-passes -o - | FileCheck %s
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -O1 -triple aarch64 -target-feature +sve %s -emit-llvm -disable-llvm-passes -o - | FileCheck --check-prefixes=SVE %s
+
typedef float float4 __attribute__((ext_vector_type(4)));
typedef short int si8 __attribute__((ext_vector_type(8)));
typedef unsigned int u4 __attribute__((ext_vector_type(4)));
@@ -134,3 +137,53 @@ void test_builtin_reduce_and(si8 vi1, u4 vu1) {
// CHECK-NEXT: call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[VU1]])
unsigned r3 = __builtin_reduce_and(vu1);
}
+
+#if defined(__ARM_FEATURE_SVE)
+#include <arm_sve.h>
+
+void test_builtin_reduce_SVE(int a, unsigned long long b, short c, float d) {
+ // SVE-LABEL: void @test_builtin_reduce_SVE(
+
+ svint32_t vec_a = svdup_s32(a);
+ svuint64_t vec_b = svdup_u64(b);
+ svint16_t vec_c1 = svdup_s16(c);
+ svuint16_t vec_c2 = svdup_u16(c);
+ svfloat32_t vec_d = svdup_f32(d);
+
+ // SVE: [[VF1:%.+]] = load <vscale x 4 x i32>, ptr %vec_a
+ // SVE-NEXT: call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[VF1]])
+ int r1 = __builtin_reduce_add(vec_a);
+
+ // SVE: [[VF2:%.+]] = load <vscale x 4 x i32>, ptr %vec_a
+ // SVE-NEXT: call i32 @llvm.vector.reduce.mul.nxv4i32(<vscale x 4 x i32> [[VF2]])
+ int r2 = __builtin_reduce_mul(vec_a);
+
+ // SVE: [[VF3:%.+]] = load <vscale x 2 x i64>, ptr %vec_b
+ // SVE-NEXT: call i64 @llvm.vector.reduce.xor.nxv2i64(<vscale x 2 x i64> [[VF3]])
+ long long r3 = __builtin_reduce_xor(vec_b);
+
+ // SVE: [[VF4:%.+]] = load <vscale x 2 x i64>, ptr %vec_b
+ // SVE-NEXT: call i64 @llvm.vector.reduce.or.nxv2i64(<vscale x 2 x i64> [[VF4]])
+ long long r4 = __builtin_reduce_or(vec_b);
+
+ // SVE: [[VF5:%.+]] = load <vscale x 2 x i64>, ptr %vec_b
+ // SVE-NEXT: call i64 @llvm.vector.reduce.and.nxv2i64(<vscale x 2 x i64> [[VF5]])
+ long long r5 = __builtin_reduce_and(vec_b);
+
+ // SVE: [[VF6:%.+]] = load <vscale x 8 x i16>, ptr %vec_c1
+ // SVE-NEXT: call i16 @llvm.vector.reduce.smax.nxv8i16(<vscale x 8 x i16> [[VF6]])
+ short r6 = __builtin_reduce_max(vec_c1);
+
+ // SVE: [[VF7:%.+]] = load <vscale x 8 x i16>, ptr %vec_c2
+ // SVE-NEXT: call i16 @llvm.vector.reduce.umin.nxv8i16(<vscale x 8 x i16> [[VF7]])
+ unsigned short r7 = __builtin_reduce_min(vec_c2);
+
+ // SVE: [[VF8:%.+]] = load <vscale x 4 x float>, ptr %vec_d
+ // SVE-NEXT: call float @llvm.vector.reduce.fmax.nxv4f32(<vscale x 4 x float> [[VF8]])
+ float r8 = __builtin_reduce_max(vec_d);
+
+ // SVE: [[VF9:%.+]] = load <vscale x 4 x float>, ptr %vec_d
+ // SVE-NEXT: call float @llvm.vector.reduce.fmin.nxv4f32(<vscale x 4 x float> [[VF9]])
+ float r9 = __builtin_reduce_min(vec_d);
+}
+#endif
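The new SVE cases mirror the fixed-width behavior already checked in this file: each __builtin_reduce_* call lowers to the matching llvm.vector.reduce.* intrinsic, for scalable vectors just as for fixed-width ones. A minimal fixed-width sketch for comparison:

typedef int v4si __attribute__((ext_vector_type(4)));

int sum4(v4si v) {
  // The emitted IR contains a call to @llvm.vector.reduce.add.v4i32.
  return __builtin_reduce_add(v);
}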
diff --git a/clang/test/CodeGen/ms-mixed-ptr-sizes.c b/clang/test/CodeGen/ms-mixed-ptr-sizes.c
index 89d05fd30b72..51bea60eb39d 100644
--- a/clang/test/CodeGen/ms-mixed-ptr-sizes.c
+++ b/clang/test/CodeGen/ms-mixed-ptr-sizes.c
@@ -49,7 +49,7 @@ void test_other(struct Foo *f, __attribute__((address_space(10))) int *i) {
}
int test_compare1(int *__ptr32 __uptr i, int *__ptr64 j) {
- // ALL-LABEL: define dso_local noundef i32 @test_compare1
+ // ALL-LABEL: define dso_local range(i32 0, 2) i32 @test_compare1
// X64: %{{.+}} = addrspacecast ptr %j to ptr addrspace(271)
// X64: %cmp = icmp eq ptr addrspace(271) %{{.+}}, %i
// X86: %{{.+}} = addrspacecast ptr addrspace(272) %j to ptr addrspace(271)
@@ -58,7 +58,7 @@ int test_compare1(int *__ptr32 __uptr i, int *__ptr64 j) {
}
int test_compare2(int *__ptr32 __sptr i, int *__ptr64 j) {
- // ALL-LABEL: define dso_local noundef i32 @test_compare2
+ // ALL-LABEL: define dso_local range(i32 0, 2) i32 @test_compare2
// X64: %{{.+}} = addrspacecast ptr %j to ptr addrspace(270)
// X64: %cmp = icmp eq ptr addrspace(270) %{{.+}}, %i
// X86: %{{.+}} = addrspacecast ptr addrspace(272) %j to ptr
@@ -67,7 +67,7 @@ int test_compare2(int *__ptr32 __sptr i, int *__ptr64 j) {
}
int test_compare3(int *__ptr32 __uptr i, int *__ptr64 j) {
- // ALL-LABEL: define dso_local noundef i32 @test_compare3
+ // ALL-LABEL: define dso_local range(i32 0, 2) i32 @test_compare3
// X64: %{{.+}} = addrspacecast ptr addrspace(271) %i to ptr
// X64: %cmp = icmp eq ptr %{{.+}}, %j
// X86: %{{.+}} = addrspacecast ptr addrspace(271) %i to ptr addrspace(272)
@@ -76,7 +76,7 @@ int test_compare3(int *__ptr32 __uptr i, int *__ptr64 j) {
}
int test_compare4(int *__ptr32 __sptr i, int *__ptr64 j) {
- // ALL-LABEL: define dso_local noundef i32 @test_compare4
+ // ALL-LABEL: define dso_local range(i32 0, 2) i32 @test_compare4
// X64: %{{.+}} = addrspacecast ptr addrspace(270) %i to ptr
// X64: %cmp = icmp eq ptr %{{.+}}, %j
// X86: %{{.+}} = addrspacecast ptr %i to ptr addrspace(272)
diff --git a/clang/test/CodeGenCXX/blocks.cpp b/clang/test/CodeGenCXX/blocks.cpp
index eaab1890dfc4..afe078890553 100644
--- a/clang/test/CodeGenCXX/blocks.cpp
+++ b/clang/test/CodeGenCXX/blocks.cpp
@@ -149,8 +149,8 @@ namespace test5 {
// CHECK-NEXT: [[X:%.*]] = alloca [[A:%.*]], align 4
// CHECK-NEXT: [[B:%.*]] = alloca ptr, align 8
// CHECK-NEXT: [[BLOCK:%.*]] = alloca [[BLOCK_T:.*]], align 8
- // CHECK-NEXT: [[CLEANUP_ACTIVE:%.*]] = alloca i1
// CHECK-NEXT: [[COND_CLEANUP_SAVE:%.*]] = alloca ptr, align 8
+ // CHECK-NEXT: [[CLEANUP_ACTIVE:%.*]] = alloca i1
// CHECK-NEXT: [[T0:%.*]] = zext i1
// CHECK-NEXT: store i8 [[T0]], ptr [[COND]], align 1
// CHECK-NEXT: call void @_ZN5test51AC1Ev(ptr {{[^,]*}} [[X]])
@@ -162,8 +162,8 @@ namespace test5 {
// CHECK-NOT: br
// CHECK: [[CAPTURE:%.*]] = getelementptr inbounds [[BLOCK_T]], ptr [[BLOCK]], i32 0, i32 5
// CHECK-NEXT: call void @_ZN5test51AC1ERKS0_(ptr {{[^,]*}} [[CAPTURE]], ptr noundef nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) [[X]])
- // CHECK-NEXT: store i1 true, ptr [[CLEANUP_ACTIVE]]
// CHECK-NEXT: store ptr [[CAPTURE]], ptr [[COND_CLEANUP_SAVE]], align 8
+ // CHECK-NEXT: store i1 true, ptr [[CLEANUP_ACTIVE]]
// CHECK-NEXT: br label
// CHECK: br label
// CHECK: phi
diff --git a/clang/test/CodeGenCXX/control-flow-in-stmt-expr.cpp b/clang/test/CodeGenCXX/control-flow-in-stmt-expr.cpp
new file mode 100644
index 000000000000..ac466ee5bba4
--- /dev/null
+++ b/clang/test/CodeGenCXX/control-flow-in-stmt-expr.cpp
@@ -0,0 +1,522 @@
+// RUN: %clang_cc1 --std=c++20 -fexceptions -triple x86_64-linux-gnu -emit-llvm %s -o - | FileCheck -check-prefixes=EH %s
+// RUN: %clang_cc1 --std=c++20 -triple x86_64-linux-gnu -emit-llvm %s -o - | FileCheck -check-prefixes=NOEH,CHECK %s
+
+struct Printy {
+ Printy(const char *name) : name(name) {}
+ ~Printy() {}
+ const char *name;
+};
+
+int foo() { return 2; }
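+
+// Note: the `({ ... })` initializers throughout this file are GNU statement
+// expressions; they yield their final expression as the initializer value and
+// may contain control flow (return, break, goto, continue) that exits the
+// enclosing initialization early, which is what these tests exercise.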
+
+struct Printies {
+ Printy a;
+ Printy b;
+ Printy c;
+};
+
+void ParenInit() {
+ // CHECK-LABEL: define dso_local void @_Z9ParenInitv()
+ // CHECK: [[CLEANUP_DEST:%.+]] = alloca i32, align 4
+ Printies ps(Printy("a"),
+ // CHECK: call void @_ZN6PrintyC1EPKc
+ ({
+ if (foo()) return;
+ // CHECK: if.then:
+ // CHECK-NEXT: store i32 1, ptr [[CLEANUP_DEST]], align 4
+ // CHECK-NEXT: br label %cleanup
+ Printy("b");
+ // CHECK: if.end:
+ // CHECK-NEXT: call void @_ZN6PrintyC1EPKc
+ }),
+ ({
+ if (foo()) return;
+ // CHECK: if.then{{.*}}:
+ // CHECK-NEXT: store i32 1, ptr [[CLEANUP_DEST]], align 4
+ // CHECK-NEXT: call void @_ZN6PrintyD1Ev
+ // CHECK-NEXT: br label %cleanup
+ Printy("c");
+ // CHECK: if.end{{.*}}:
+ // CHECK-NEXT: call void @_ZN6PrintyC1EPKc
+ // CHECK-NEXT: call void @_ZN8PrintiesD1Ev
+ // CHECK-NEXT: br label %return
+ }));
+ // CHECK: cleanup:
+ // CHECK-NEXT: call void @_ZN6PrintyD1Ev
+ // CHECK-NEXT: br label %return
+}
+
+void break_in_stmt_expr() {
+ // Verify that the "break" in "if.then" calls the dtor before jumping to "for.end".
+
+ // CHECK-LABEL: define dso_local void @_Z18break_in_stmt_exprv()
+ Printies p{Printy("a"),
+ // CHECK: call void @_ZN6PrintyC1EPKc
+ ({
+ for (;;) {
+ Printies ps{
+ Printy("b"),
+ // CHECK: for.cond:
+ // CHECK: call void @_ZN6PrintyC1EPKc
+ ({
+ if (foo()) {
+ break;
+ // CHECK: if.then:
+ // CHECK-NEXT: call void @_ZN6PrintyD1Ev
+ // CHECK-NEXT: br label %for.end
+ }
+ Printy("c");
+ // CHECK: if.end:
+ // CHECK-NEXT: call void @_ZN6PrintyC1EPKc
+ }),
+ Printy("d")};
+ // CHECK: call void @_ZN6PrintyC1EPKc
+ // CHECK-NEXT: call void @_ZN8PrintiesD1Ev
+ // CHECK-NEXT: br label %for.cond
+ }
+ Printy("e");
+ // CHECK: for.end:
+ // CHECK-NEXT: call void @_ZN6PrintyC1EPKc
+ }),
+ Printy("f")};
+ // CHECK: call void @_ZN6PrintyC1EPKc
+ // CHECK-NEXT: call void @_ZN8PrintiesD1Ev
+}
+
+void goto_in_stmt_expr() {
+ // Verify that branch fixups for deactivated normal cleanups are generated
+ // correctly.
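+ // (cleanup.dest.slot records which goto target to resume once the shared
+ // cleanup code has run; the switch instructions checked below dispatch on it.)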
+
+ // CHECK-LABEL: define dso_local void @_Z17goto_in_stmt_exprv()
+ // CHECK: [[CLEANUP_DEST_SLOT:%cleanup.dest.slot.*]] = alloca i32, align 4
+ {
+ Printies p1{Printy("a"), // CHECK: call void @_ZN6PrintyC1EPKc
+ ({
+ {
+ Printies p2{Printy("b"),
+ // CHECK: call void @_ZN6PrintyC1EPKc
+ ({
+ if (foo() == 1) {
+ goto in;
+ // CHECK: if.then:
+ // CHECK-NEXT: store i32 2, ptr [[CLEANUP_DEST_SLOT]], align 4
+ // CHECK-NEXT: br label %[[CLEANUP1:.+]]
+ }
+ if (foo() == 2) {
+ goto out;
+ // CHECK: if.then{{.*}}:
+ // CHECK-NEXT: store i32 3, ptr [[CLEANUP_DEST_SLOT]], align 4
+ // CHECK-NEXT: br label %[[CLEANUP1]]
+ }
+ Printy("c");
+ // CHECK: if.end{{.*}}:
+ // CHECK-NEXT: call void @_ZN6PrintyC1EPKc
+ }),
+ Printy("d")};
+ // CHECK: call void @_ZN6PrintyC1EPKc
+ // CHECK-NEXT: call void @_ZN8PrintiesD1Ev
+ // CHECK-NEXT: br label %in
+
+ }
+ in:
+ Printy("e");
+ // CHECK: in: ; preds = %if.end{{.*}}, %[[CLEANUP1]]
+ // CHECK-NEXT: call void @_ZN6PrintyC1EPKc
+ }),
+ Printy("f")};
+ // CHECK: call void @_ZN6PrintyC1EPKc
+ // CHECK-NEXT: call void @_ZN8PrintiesD1Ev
+ // CHECK-NEXT: br label %out
+ }
+out:
+ return;
+ // CHECK: out:
+ // CHECK-NEXT: ret void
+
+ // CHECK: [[CLEANUP1]]: ; preds = %if.then{{.*}}, %if.then
+ // CHECK-NEXT: call void @_ZN6PrintyD1Ev
+ // CHECK-NEXT: %cleanup.dest = load i32, ptr [[CLEANUP_DEST_SLOT]], align 4
+ // CHECK-NEXT: switch i32 %cleanup.dest, label %[[CLEANUP2:.+]] [
+ // CHECK-NEXT: i32 2, label %in
+ // CHECK-NEXT: ]
+
+ // CHECK: [[CLEANUP2]]: ; preds = %[[CLEANUP1]]
+ // CHECK-NEXT: call void @_ZN6PrintyD1Ev
+ // CHECK-NEXT: %cleanup.dest{{.*}} = load i32, ptr [[CLEANUP_DEST_SLOT]], align 4
+ // CHECK-NEXT: switch i32 %cleanup.dest{{.*}}, label %unreachable [
+ // CHECK-NEXT: i32 3, label %out
+ // CHECK-NEXT: ]
+}
+
+void ArrayInit() {
+ // Printy arr[4] = {ctorA, ctorB, stmt-exprC, stmt-exprD};
+ // Verify that:
+ // - We do the necessary stores for array cleanups (endOfInit and last constructed element).
+ // - We update the array init element correctly for ctorA, ctorB and stmt-exprC.
+ // - stmt-exprC and stmt-exprD share the array body dtor code (see %cleanup).
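+ // (arrayinit.endOfInit tracks the end of the constructed prefix, so a jump out
+ // of the initializer destroys only the elements constructed so far.)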
+
+ // CHECK-LABEL: define dso_local void @_Z9ArrayInitv()
+ // CHECK: %arrayinit.endOfInit = alloca ptr, align 8
+ // CHECK: %cleanup.dest.slot = alloca i32, align 4
+ // CHECK: %arrayinit.begin = getelementptr inbounds [4 x %struct.Printy], ptr %arr, i64 0, i64 0
+ // CHECK: store ptr %arrayinit.begin, ptr %arrayinit.endOfInit, align 8
+ Printy arr[4] = {
+ Printy("a"),
+ // CHECK: call void @_ZN6PrintyC1EPKc(ptr noundef nonnull align 8 dereferenceable(8) %arrayinit.begin, ptr noundef @.str)
+ // CHECK: [[ARRAYINIT_ELEMENT1:%.+]] = getelementptr inbounds %struct.Printy, ptr %arrayinit.begin, i64 1
+ // CHECK: store ptr [[ARRAYINIT_ELEMENT1]], ptr %arrayinit.endOfInit, align 8
+ Printy("b"),
+ // CHECK: call void @_ZN6PrintyC1EPKc(ptr noundef nonnull align 8 dereferenceable(8) [[ARRAYINIT_ELEMENT1]], ptr noundef @.str.1)
+ // CHECK: [[ARRAYINIT_ELEMENT2:%.+]] = getelementptr inbounds %struct.Printy, ptr [[ARRAYINIT_ELEMENT1]], i64 1
+ // CHECK: store ptr [[ARRAYINIT_ELEMENT2]], ptr %arrayinit.endOfInit, align 8
+ ({
+ // CHECK: br i1 {{.*}}, label %if.then, label %if.end
+ if (foo()) {
+ return;
+ // CHECK: if.then:
+ // CHECK-NEXT: store i32 1, ptr %cleanup.dest.slot, align 4
+ // CHECK-NEXT: br label %cleanup
+ }
+ // CHECK: if.end:
+ Printy("c");
+ // CHECK-NEXT: call void @_ZN6PrintyC1EPKc
+ // CHECK-NEXT: %arrayinit.element2 = getelementptr inbounds %struct.Printy, ptr %arrayinit.element1, i64 1
+ // CHECK-NEXT: store ptr %arrayinit.element2, ptr %arrayinit.endOfInit, align 8
+ }),
+ ({
+ // CHECK: br i1 {{%.+}} label %[[IF_THEN2:.+]], label %[[IF_END2:.+]]
+ if (foo()) {
+ return;
+ // CHECK: [[IF_THEN2]]:
+ // CHECK-NEXT: store i32 1, ptr %cleanup.dest.slot, align 4
+ // CHECK-NEXT: br label %cleanup
+ }
+ // CHECK: [[IF_END2]]:
+ Printy("d");
+ // CHECK-NEXT: call void @_ZN6PrintyC1EPKc
+ // CHECK-NEXT: %array.begin = getelementptr inbounds [4 x %struct.Printy], ptr %arr, i32 0, i32 0
+ // CHECK-NEXT: %0 = getelementptr inbounds %struct.Printy, ptr %array.begin, i64 4
+ // CHECK-NEXT: br label %[[ARRAY_DESTROY_BODY1:.+]]
+ }),
+ };
+
+ // CHECK: [[ARRAY_DESTROY_BODY1]]:
+ // CHECK-NEXT: %arraydestroy.elementPast{{.*}} = phi ptr [ %0, %[[IF_END2]] ], [ %arraydestroy.element{{.*}}, %[[ARRAY_DESTROY_BODY1]] ]
+ // CHECK-NEXT: %arraydestroy.element{{.*}} = getelementptr inbounds %struct.Printy, ptr %arraydestroy.elementPast{{.*}}, i64 -1
+ // CHECK-NEXT: call void @_ZN6PrintyD1Ev
+ // CHECK-NEXT: %arraydestroy.done{{.*}} = icmp eq ptr %arraydestroy.element{{.*}}, %array.begin
+ // CHECK-NEXT: br i1 %arraydestroy.done{{.*}}, label %[[ARRAY_DESTROY_DONE1:.+]], label %[[ARRAY_DESTROY_BODY1]]
+
+ // CHECK: [[ARRAY_DESTROY_DONE1]]:
+ // CHECK-NEXT: ret void
+
+ // CHECK: cleanup:
+ // CHECK-NEXT: %1 = load ptr, ptr %arrayinit.endOfInit, align 8
+ // CHECK-NEXT: %arraydestroy.isempty = icmp eq ptr %arrayinit.begin, %1
+ // CHECK-NEXT: br i1 %arraydestroy.isempty, label %[[ARRAY_DESTROY_DONE2:.+]], label %[[ARRAY_DESTROY_BODY2:.+]]
+
+ // CHECK: [[ARRAY_DESTROY_BODY2]]:
+ // CHECK-NEXT: %arraydestroy.elementPast = phi ptr [ %1, %cleanup ], [ %arraydestroy.element, %[[ARRAY_DESTROY_BODY2]] ]
+ // CHECK-NEXT: %arraydestroy.element = getelementptr inbounds %struct.Printy, ptr %arraydestroy.elementPast, i64 -1
+ // CHECK-NEXT: call void @_ZN6PrintyD1Ev(ptr noundef nonnull align 8 dereferenceable(8) %arraydestroy.element)
+ // CHECK-NEXT: %arraydestroy.done = icmp eq ptr %arraydestroy.element, %arrayinit.begin
+ // CHECK-NEXT: br i1 %arraydestroy.done, label %[[ARRAY_DESTROY_DONE2]], label %[[ARRAY_DESTROY_BODY2]]
+
+ // CHECK: [[ARRAY_DESTROY_DONE2]]:
+ // CHECK-NEXT: br label %[[ARRAY_DESTROY_DONE1]]
+}
+
+void ArraySubobjects() {
+ struct S {
+ Printy arr1[2];
+ Printy arr2[2];
+ Printy p;
+ };
+ // CHECK-LABEL: define dso_local void @_Z15ArraySubobjectsv()
+ // CHECK: %arrayinit.endOfInit = alloca ptr, align 8
+ S s{{Printy("a"), Printy("b")},
+ // CHECK: call void @_ZN6PrintyC1EPKc
+ // CHECK: call void @_ZN6PrintyC1EPKc
+ {Printy("a"),
+ // CHECK: [[ARRAYINIT_BEGIN:%.+]] = getelementptr inbounds [2 x %struct.Printy]
+ // CHECK: store ptr [[ARRAYINIT_BEGIN]], ptr %arrayinit.endOfInit, align 8
+ // CHECK: call void @_ZN6PrintyC1EPKc
+ // CHECK: [[ARRAYINIT_ELEMENT:%.+]] = getelementptr inbounds %struct.Printy
+ // CHECK: store ptr [[ARRAYINIT_ELEMENT]], ptr %arrayinit.endOfInit, align 8
+ ({
+ if (foo()) {
+ return;
+ // CHECK: if.then:
+ // CHECK-NEXT: [[V0:%.+]] = load ptr, ptr %arrayinit.endOfInit, align 8
+ // CHECK-NEXT: %arraydestroy.isempty = icmp eq ptr [[ARRAYINIT_BEGIN]], [[V0]]
+ // CHECK-NEXT: br i1 %arraydestroy.isempty, label %[[ARRAY_DESTROY_DONE:.+]], label %[[ARRAY_DESTROY_BODY:.+]]
+ }
+ Printy("b");
+ })
+ },
+ Printy("c")
+ // CHECK: if.end:
+ // CHECK-NEXT: call void @_ZN6PrintyC1EPKc
+ // CHECK: call void @_ZN6PrintyC1EPKc
+ // CHECK-NEXT: call void @_ZZ15ArraySubobjectsvEN1SD1Ev
+ // CHECK-NEXT: br label %return
+ };
+ // CHECK: return:
+ // CHECK-NEXT: ret void
+
+ // CHECK: [[ARRAY_DESTROY_BODY]]:
+ // CHECK-NEXT: %arraydestroy.elementPast = phi ptr [ %0, %if.then ], [ %arraydestroy.element, %[[ARRAY_DESTROY_BODY]] ]
+ // CHECK-NEXT: %arraydestroy.element = getelementptr inbounds %struct.Printy, ptr %arraydestroy.elementPast, i64 -1
+ // CHECK-NEXT: call void @_ZN6PrintyD1Ev(ptr noundef nonnull align 8 dereferenceable(8) %arraydestroy.element)
+ // CHECK-NEXT: %arraydestroy.done = icmp eq ptr %arraydestroy.element, [[ARRAYINIT_BEGIN]]
+ // CHECK-NEXT: br i1 %arraydestroy.done, label %[[ARRAY_DESTROY_DONE]], label %[[ARRAY_DESTROY_BODY]]
+
+ // CHECK: [[ARRAY_DESTROY_DONE]]
+ // CHECK-NEXT: [[ARRAY_BEGIN:%.+]] = getelementptr inbounds [2 x %struct.Printy], ptr %arr1, i32 0, i32 0
+ // CHECK-NEXT: [[V1:%.+]] = getelementptr inbounds %struct.Printy, ptr [[ARRAY_BEGIN]], i64 2
+ // CHECK-NEXT: br label %[[ARRAY_DESTROY_BODY2:.+]]
+
+ // CHECK: [[ARRAY_DESTROY_BODY2]]:
+ // CHECK-NEXT: %arraydestroy.elementPast5 = phi ptr [ %1, %[[ARRAY_DESTROY_DONE]] ], [ %arraydestroy.element6, %[[ARRAY_DESTROY_BODY2]] ]
+ // CHECK-NEXT: %arraydestroy.element6 = getelementptr inbounds %struct.Printy, ptr %arraydestroy.elementPast5, i64 -1
+ // CHECK-NEXT: call void @_ZN6PrintyD1Ev(ptr noundef nonnull align 8 dereferenceable(8) %arraydestroy.element6)
+ // CHECK-NEXT: %arraydestroy.done7 = icmp eq ptr %arraydestroy.element6, [[ARRAY_BEGIN]]
+ // CHECK-NEXT: br i1 %arraydestroy.done7, label %[[ARRAY_DESTROY_DONE2:.+]], label %[[ARRAY_DESTROY_BODY2]]
+
+
+ // CHECK: [[ARRAY_DESTROY_DONE2]]:
+ // CHECK-NEXT: br label %return
+}
+
+void LambdaInit() {
+ // CHECK-LABEL: define dso_local void @_Z10LambdaInitv()
+ auto S = [a = Printy("a"), b = ({
+ if (foo()) {
+ return;
+ // CHECK: if.then:
+ // CHECK-NEXT: call void @_ZN6PrintyD1Ev
+ // CHECK-NEXT: br label %return
+ }
+ Printy("b");
+ })]() { return a; };
+}
+
+struct PrintyRefBind {
+ const Printy &a;
+ const Printy &b;
+};
+
+struct Temp {
+ Temp();
+ ~Temp();
+};
+Temp CreateTemp();
+Printy CreatePrinty();
+Printy CreatePrinty(const Temp&);
+
+void LifetimeExtended() {
+ // CHECK-LABEL: define dso_local void @_Z16LifetimeExtendedv
+ PrintyRefBind ps = {Printy("a"), ({
+ if (foo()) {
+ return;
+ // CHECK: if.then:
+ // CHECK-NEXT: call void @_ZN6PrintyD1Ev
+ // CHECK-NEXT: br label %return
+ }
+ Printy("b");
+ })};
+}
+
+void ConditionalLifetimeExtended() {
+ // CHECK-LABEL: @_Z27ConditionalLifetimeExtendedv()
+
+ // Verify that we create two cleanup flags.
+ // 1. First for the cleanup which is deactivated after the full-expression.
+ // 2. Second for the lifetime-extension cleanup which is activated only if the
+ //    branch is taken.
+
+ // Note: We use `CreateTemp()` to ensure that the lifetime-extension destroy
+ // cleanup is not at the top of the EHStack on deactivation, which forces the
+ // use of active cleanup flags.
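+ //
+ // A rough sketch of the flag lifecycle checked below (not verbatim): both
+ // flags start false; taking the first branch sets both to true; at the end of
+ // the full-expression the deferred flag is reset to false, while the
+ // lifetime-extension flag remains set so the extended temporary is destroyed
+ // at scope exit.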
+
+ Printy* p1 = nullptr;
+ // CHECK: store i1 false, ptr [[BRANCH1_DEFERRED:%cleanup.cond]], align 1
+ // CHECK-NEXT: store i1 false, ptr [[BRANCH1_LIFEEXT:%cleanup.cond.*]], align 1
+ PrintyRefBind ps = {
+ p1 != nullptr ? static_cast<const Printy&>(CreatePrinty())
+ // CHECK: cond.true:
+ // CHECK-NEXT: call void @_Z12CreatePrintyv
+ // CHECK-NEXT: store i1 true, ptr [[BRANCH1_DEFERRED]], align 1
+ // CHECK-NEXT: store i1 true, ptr [[BRANCH1_LIFEEXT]], align 1
+ // CHECK-NEXT: br label %{{.*}}
+ : foo() ? static_cast<const Printy&>(CreatePrinty(CreateTemp()))
+ : *p1,
+ ({
+ if (foo()) return;
+ Printy("c");
+ // CHECK: if.end:
+ // CHECK-NEXT: call void @_ZN6PrintyC1EPKc
+ // CHECK-NEXT: store ptr
+ })};
+ // CHECK-NEXT: store i1 false, ptr [[BRANCH1_DEFERRED]], align 1
+ // CHECK-NEXT: store i32 0, ptr %cleanup.dest.slot, align 4
+ // CHECK-NEXT: br label %cleanup
+
+}
+
+void NewArrayInit() {
+ // CHECK-LABEL: define dso_local void @_Z12NewArrayInitv()
+ // CHECK: %array.init.end = alloca ptr, align 8
+ // CHECK: store ptr %0, ptr %array.init.end, align 8
+ Printy *array = new Printy[3]{
+ "a",
+ // CHECK: call void @_ZN6PrintyC1EPKc
+ // CHECK: store ptr %array.exp.next, ptr %array.init.end, align 8
+ "b",
+ // CHECK: call void @_ZN6PrintyC1EPKc
+ // CHECK: store ptr %array.exp.next1, ptr %array.init.end, align 8
+ ({
+ if (foo()) {
+ return;
+ // CHECK: if.then:
+ // CHECK: br i1 %arraydestroy.isempty, label %arraydestroy.done{{.*}}, label %arraydestroy.body
+ }
+ "b";
+ // CHECK: if.end:
+ // CHECK: call void @_ZN6PrintyC1EPKc
+ })};
+ // CHECK: arraydestroy.body:
+ // CHECK-NEXT: %arraydestroy.elementPast = phi ptr [ %{{.*}}, %if.then ], [ %arraydestroy.element, %arraydestroy.body ]
+ // CHECK-NEXT: %arraydestroy.element = getelementptr inbounds %struct.Printy, ptr %arraydestroy.elementPast, i64 -1
+ // CHECK-NEXT: call void @_ZN6PrintyD1Ev(ptr noundef nonnull align 8 dereferenceable(8) %arraydestroy.element)
+ // CHECK-NEXT: %arraydestroy.done = icmp eq ptr %arraydestroy.element, %0
+ // CHECK-NEXT: br i1 %arraydestroy.done, label %arraydestroy.done{{.*}}, label %arraydestroy.body
+
+ // CHECK: arraydestroy.done{{.*}}: ; preds = %arraydestroy.body, %if.then
+ // CHECK-NEXT: br label %return
+}
+
+void DestroyInConditionalCleanup() {
+ // EH-LABEL: DestroyInConditionalCleanupv()
+ // NOEH-LABEL: DestroyInConditionalCleanupv()
+ struct A {
+ A() {}
+ ~A() {}
+ };
+
+ struct Value {
+ Value(A) {}
+ ~Value() {}
+ };
+
+ struct V2 {
+ Value K;
+ Value V;
+ };
+ // Verify we use conditional cleanups.
+ (void)(foo() ? V2{A(), A()} : V2{A(), A()});
+ // NOEH: cond.true:
+ // NOEH: call void @_ZZ27DestroyInConditionalCleanupvEN1AC1Ev
+ // NOEH: store ptr %{{.*}}, ptr %cond-cleanup.save
+
+ // EH: cond.true:
+ // EH: invoke void @_ZZ27DestroyInConditionalCleanupvEN1AC1Ev
+ // EH: store ptr %{{.*}}, ptr %cond-cleanup.save
+}
+
+void ArrayInitWithContinue() {
+ // CHECK-LABEL: @_Z21ArrayInitWithContinuev
+ // Verify that we start to emit the array destructor.
+ // CHECK: %arrayinit.endOfInit = alloca ptr, align 8
+ for (int i = 0; i < 1; ++i) {
+ Printy arr[2] = {"a", ({
+ if (foo()) {
+ continue;
+ }
+ "b";
+ })};
+ }
+}
+
+struct [[clang::trivial_abi]] HasTrivialABI {
+ HasTrivialABI();
+ ~HasTrivialABI();
+};
+void AcceptTrivialABI(HasTrivialABI, int);
+void TrivialABI() {
+ // CHECK-LABEL: define dso_local void @_Z10TrivialABIv()
+ AcceptTrivialABI(HasTrivialABI(), ({
+ if (foo()) return;
+ // CHECK: if.then:
+ // CHECK-NEXT: call void @_ZN13HasTrivialABID1Ev
+ // CHECK-NEXT: br label %return
+ 0;
+ }));
+}
+
+namespace CleanupFlag {
+struct A {
+ A() {}
+ ~A() {}
+};
+
+struct B {
+ B(const A&) {}
+ B() {}
+ ~B() {}
+};
+
+struct S {
+ A a;
+ B b;
+};
+
+int AcceptS(S s);
+
+void Accept2(int x, int y);
+
+void InactiveNormalCleanup() {
+ // CHECK-LABEL: define {{.*}}InactiveNormalCleanupEv()
+
+ // The first A{} below is an inactive normal cleanup which
+ // is not popped from EHStack on deactivation. This needs an
+ // "active" cleanup flag.
+
+ // CHECK: [[ACTIVE:%cleanup.isactive.*]] = alloca i1, align 1
+ // CHECK: call void [[A_CTOR:@.*AC1Ev]]
+ // CHECK: store i1 true, ptr [[ACTIVE]], align 1
+ // CHECK: call void [[A_CTOR]]
+ // CHECK: call void [[B_CTOR:@.*BC1ERKNS_1AE]]
+ // CHECK: store i1 false, ptr [[ACTIVE]], align 1
+ // CHECK: call noundef i32 [[ACCEPTS:@.*AcceptSENS_1SE]]
+ Accept2(AcceptS({.a = A{}, .b = A{}}), ({
+ if (foo()) return;
+ // CHECK: if.then:
+ // CHECK: br label %cleanup
+ 0;
+ // CHECK: if.end:
+ // CHECK: call void [[ACCEPT2:@.*Accept2Eii]]
+ // CHECK: br label %cleanup
+ }));
+ // CHECK: cleanup:
+ // CHECK: call void [[S_DTOR:@.*SD1Ev]]
+ // CHECK: call void [[A_DTOR:@.*AD1Ev]]
+ // CHECK: %cleanup.is_active = load i1, ptr [[ACTIVE]]
+ // CHECK: br i1 %cleanup.is_active, label %cleanup.action, label %cleanup.done
+
+ // CHECK: cleanup.action:
+ // CHECK: call void [[A_DTOR]]
+
+ // The "active" cleanup flag is not required for unused cleanups.
+ Accept2(AcceptS({.a = A{}, .b = A{}}), 0);
+ // CHECK: cleanup.cont:
+ // CHECK: call void [[A_CTOR]]
+ // CHECK-NOT: store i1 true
+ // CHECK: call void [[A_CTOR]]
+ // CHECK: call void [[B_CTOR]]
+ // CHECK-NOT: store i1 false
+ // CHECK: call noundef i32 [[ACCEPTS]]
+ // CHECK: call void [[ACCEPT2]]
+ // CHECK: call void [[S_DTOR]]
+ // CHECK: call void [[A_DTOR]]
+ // CHECK: br label %return
+}
+} // namespace CleanupFlag
diff --git a/clang/test/CodeGenCXX/mangle.cpp b/clang/test/CodeGenCXX/mangle.cpp
index d0800af55c87..31467d943840 100644
--- a/clang/test/CodeGenCXX/mangle.cpp
+++ b/clang/test/CodeGenCXX/mangle.cpp
@@ -1032,6 +1032,10 @@ namespace test51 {
template <typename T>
decltype(S1<T>().~S1<T>(), S1<T>().~S1<T>()) fun4() {};
template <typename T>
+ decltype(S1<int>().~S1<T>()) fun5(){};
+ template <template <typename T> class U>
+ decltype(S1<int>().~U<int>()) fun6(){};
+ template <typename T>
decltype(E().E::~T()) fun7() {}
template <template <typename> class U>
decltype(X<int>::Y().U<int>::Y::~Y()) fun8() {}
@@ -1043,6 +1047,10 @@ namespace test51 {
// CHECK-LABEL: @_ZN6test514fun3I2S1IiEiEEDTcldtcvS1_IT0_E_EdnT_EEv
template void fun4<int>();
// CHECK-LABEL: @_ZN6test514fun4IiEEDTcmcldtcv2S1IT_E_Edn2S1IS2_EEcldtcvS3__Edn2S1IS2_EEEv
+ template void fun5<int>();
+ // CHECK-LABEL: @_ZN6test514fun5IiEEDTcldtcv2S1IiE_Edn2S1IT_EEEv
+ template void fun6<S1>();
+ // CHECK-LABEL: @_ZN6test514fun6I2S1EEDTcldtcvS1_IiE_EdnT_IiEEEv
template void fun7<E>();
// CHECK-LABEL: @_ZN6test514fun7INS_1EEEEDTcldtcvS1__Esr1EEdnT_EEv
template void fun8<X>();
diff --git a/clang/test/CodeGenCXX/pragma-gcc-unroll.cpp b/clang/test/CodeGenCXX/pragma-gcc-unroll.cpp
index 8a94a5cc91e2..85f10fcdff14 100644
--- a/clang/test/CodeGenCXX/pragma-gcc-unroll.cpp
+++ b/clang/test/CodeGenCXX/pragma-gcc-unroll.cpp
@@ -116,6 +116,34 @@ void while_unroll_zero_test(int *List, int Length) {
}
}
+using size_t = unsigned long long;
+
+template <bool Flag>
+int value_dependent(int n) {
+ // CHECK: define {{.*}} @_Z15value_dependentILb1EEii
+ constexpr int N = 100;
+ auto init = [=]() { return Flag ? n : 0UL; };
+ auto cond = [=](size_t ix) { return Flag ? ix != 0 : ix < 10; };
+ auto iter = [=](size_t ix) {
+ return Flag ? ix & ~(1ULL << __builtin_clzll(ix)) : ix + 1;
+ };
+#pragma GCC unroll Flag ? 1 : N
+ for (size_t ix = init(); cond(ix); ix = iter(ix)) {
+ // CHECK: br label {{.*}}, !llvm.loop ![[LOOP_16:.*]]
+ n *= n;
+ }
+#pragma GCC unroll Flag ? 0 : N
+ for (size_t ix = init(); cond(ix); ix = iter(ix)) {
+ // CHECK: br label {{.*}}, !llvm.loop ![[LOOP_17:.*]]
+ n *= n;
+ }
+ return n;
+}
+
+void test_value_dependent(int n) {
+ value_dependent<true>(n);
+}
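+
+// Note: for the Flag == true instantiation checked here, the pragma arguments
+// fold to unroll counts of 1 and 0 respectively; both disable unrolling, which
+// is why LOOP_16 and LOOP_17 carry llvm.loop.unroll.disable metadata below.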
+
// CHECK: ![[LOOP_1]] = distinct !{![[LOOP_1]], [[MP:![0-9]+]], ![[UNROLL_ENABLE:.*]]}
// CHECK: ![[UNROLL_ENABLE]] = !{!"llvm.loop.unroll.enable"}
// CHECK: ![[LOOP_2]] = distinct !{![[LOOP_2:.*]], ![[UNROLL_DISABLE:.*]]}
@@ -129,3 +157,5 @@ void while_unroll_zero_test(int *List, int Length) {
// CHECK: ![[LOOP_7]] = distinct !{![[LOOP_7]], ![[UNROLL_8:.*]]}
// CHECK: ![[LOOP_14]] = distinct !{![[LOOP_14]], [[MP]], ![[UNROLL_DISABLE:.*]]}
// CHECK: ![[LOOP_15]] = distinct !{![[LOOP_15]], [[MP]], ![[UNROLL_DISABLE:.*]]}
+// CHECK: ![[LOOP_16]] = distinct !{![[LOOP_16]], [[MP]], ![[UNROLL_DISABLE:.*]]}
+// CHECK: ![[LOOP_17]] = distinct !{![[LOOP_17]], [[MP]], ![[UNROLL_DISABLE:.*]]}
diff --git a/clang/test/CodeGenCXX/pragma-unroll.cpp b/clang/test/CodeGenCXX/pragma-unroll.cpp
index 02d9bad7148d..6754788b7243 100644
--- a/clang/test/CodeGenCXX/pragma-unroll.cpp
+++ b/clang/test/CodeGenCXX/pragma-unroll.cpp
@@ -96,6 +96,54 @@ void template_test(double *List, int Length) {
for_template_define_test<double>(List, Length, Value);
}
+void for_unroll_zero_test(int *List, int Length) {
+ // CHECK: define {{.*}} @_Z20for_unroll_zero_testPii
+ #pragma unroll 0
+ for (int i = 0; i < Length; i++) {
+ // CHECK: br label {{.*}}, !llvm.loop ![[LOOP_14:.*]]
+ List[i] = i * 2;
+ }
+}
+
+void while_unroll_zero_test(int *List, int Length) {
+ // CHECK: define {{.*}} @_Z22while_unroll_zero_testPii
+ int i = 0;
+#pragma unroll(0)
+ while (i < Length) {
+ // CHECK: br label {{.*}}, !llvm.loop ![[LOOP_15:.*]]
+ List[i] = i * 2;
+ i++;
+ }
+}
+
+using size_t = unsigned long long;
+
+template <bool Flag>
+int value_dependent(int n) {
+ // CHECK: define {{.*}} @_Z15value_dependentILb1EEii
+ constexpr int N = 100;
+ auto init = [=]() { return Flag ? n : 0UL; };
+ auto cond = [=](size_t ix) { return Flag ? ix != 0 : ix < 10; };
+ auto iter = [=](size_t ix) {
+ return Flag ? ix & ~(1ULL << __builtin_clzll(ix)) : ix + 1;
+ };
+#pragma unroll Flag ? 1 : N
+ for (size_t ix = init(); cond(ix); ix = iter(ix)) {
+ // CHECK: br label {{.*}}, !llvm.loop ![[LOOP_16:.*]]
+ n *= n;
+ }
+#pragma unroll Flag ? 0 : N
+ for (size_t ix = init(); cond(ix); ix = iter(ix)) {
+ // CHECK: br label {{.*}}, !llvm.loop ![[LOOP_17:.*]]
+ n *= n;
+ }
+ return n;
+}
+
+void test_value_dependent(int n) {
+ value_dependent<true>(n);
+}
+
// CHECK: ![[LOOP_1]] = distinct !{![[LOOP_1]], [[MP:![0-9]+]], ![[UNROLL_ENABLE:.*]]}
// CHECK: ![[UNROLL_ENABLE]] = !{!"llvm.loop.unroll.enable"}
// CHECK: ![[LOOP_2]] = distinct !{![[LOOP_2:.*]], ![[UNROLL_DISABLE:.*]]}
@@ -107,3 +155,7 @@ void template_test(double *List, int Length) {
// CHECK: ![[LOOP_5]] = distinct !{![[LOOP_5]], ![[UNROLL_8:.*]]}
// CHECK: ![[LOOP_6]] = distinct !{![[LOOP_6]], ![[UNROLL_8:.*]]}
// CHECK: ![[LOOP_7]] = distinct !{![[LOOP_7]], ![[UNROLL_8:.*]]}
+// CHECK: ![[LOOP_14]] = distinct !{![[LOOP_14]], [[MP]], ![[UNROLL_DISABLE:.*]]}
+// CHECK: ![[LOOP_15]] = distinct !{![[LOOP_15]], [[MP]], ![[UNROLL_DISABLE:.*]]}
+// CHECK: ![[LOOP_16]] = distinct !{![[LOOP_16]], [[MP]], ![[UNROLL_DISABLE:.*]]}
+// CHECK: ![[LOOP_17]] = distinct !{![[LOOP_17]], [[MP]], ![[UNROLL_DISABLE:.*]]}
diff --git a/clang/test/CodeGenCoroutines/coro-elide-thinlto.cpp b/clang/test/CodeGenCoroutines/coro-elide-thinlto.cpp
new file mode 100644
index 000000000000..293aef678167
--- /dev/null
+++ b/clang/test/CodeGenCoroutines/coro-elide-thinlto.cpp
@@ -0,0 +1,77 @@
+// This tests that the coroutine elide optimization can happen successfully with ThinLTO.
+// This test is adapted from coro-elide.cpp and splits functions into two files.
+//
+// RUN: split-file %s %t
+// RUN: %clang --target=x86_64-linux -std=c++20 -O2 -flto=thin -I %S -c %t/coro-elide-callee.cpp -o coro-elide-callee.o
+// RUN: %clang --target=x86_64-linux -std=c++20 -O2 -flto=thin -I %S -c %t/coro-elide-caller.cpp -o coro-elide-caller.o
+// RUN: llvm-lto -thinlto coro-elide-callee.o coro-elide-caller.o -o summary
+// RUN: %clang_cc1 -O2 -x ir coro-elide-caller.o -fthinlto-index=summary.thinlto.bc -emit-llvm -o - | FileCheck %s
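+//
+// The expectation, informally: once ThinLTO importing makes task0's body
+// visible in task1, the coroutine frame allocation for the awaited task0 can
+// be elided into task1's frame, so no call to operator new (_Znwm) should
+// remain in _Z5task1v.resume; see the CHECK-NOT at the end of this file.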
+
+//--- coro-elide-task.h
+#pragma once
+#include "Inputs/coroutine.h"
+
+struct Task {
+ struct promise_type {
+ struct FinalAwaiter {
+ bool await_ready() const noexcept { return false; }
+ template <typename PromiseType>
+ std::coroutine_handle<> await_suspend(std::coroutine_handle<PromiseType> h) noexcept {
+ if (!h)
+ return std::noop_coroutine();
+ return h.promise().continuation;
+ }
+ void await_resume() noexcept {}
+ };
+ Task get_return_object() noexcept {
+ return std::coroutine_handle<promise_type>::from_promise(*this);
+ }
+ std::suspend_always initial_suspend() noexcept { return {}; }
+ FinalAwaiter final_suspend() noexcept { return {}; }
+ void unhandled_exception() noexcept {}
+ void return_value(int x) noexcept {
+ _value = x;
+ }
+ std::coroutine_handle<> continuation;
+ int _value;
+ };
+
+ Task(std::coroutine_handle<promise_type> handle) : handle(handle) {}
+ ~Task() {
+ if (handle)
+ handle.destroy();
+ }
+
+ struct Awaiter {
+ bool await_ready() const noexcept { return false; }
+ void await_suspend(std::coroutine_handle<void> continuation) noexcept {}
+ int await_resume() noexcept {
+ return 43;
+ }
+ };
+
+ auto operator co_await() {
+ return Awaiter{};
+ }
+
+private:
+ std::coroutine_handle<promise_type> handle;
+};
+
+//--- coro-elide-callee.cpp
+#include "coro-elide-task.h"
+Task task0() {
+ co_return 43;
+}
+
+//--- coro-elide-caller.cpp
+#include "coro-elide-task.h"
+
+Task task0();
+
+Task task1() {
+ co_return co_await task0();
+}
+
+// CHECK-LABEL: define{{.*}} void @_Z5task1v.resume
+// CHECK-NOT: {{.*}}_Znwm
diff --git a/clang/test/CodeGenCoroutines/coro-suspend-cleanups.cpp b/clang/test/CodeGenCoroutines/coro-suspend-cleanups.cpp
new file mode 100644
index 000000000000..06cc2069dbe9
--- /dev/null
+++ b/clang/test/CodeGenCoroutines/coro-suspend-cleanups.cpp
@@ -0,0 +1,93 @@
+// RUN: %clang_cc1 --std=c++20 -triple x86_64-linux-gnu -emit-llvm %s -o - | FileCheck %s
+
+#include "Inputs/coroutine.h"
+
+struct Printy {
+ Printy(const char *name) : name(name) {}
+ ~Printy() {}
+ const char *name;
+};
+
+struct coroutine {
+ struct promise_type;
+ std::coroutine_handle<promise_type> handle;
+ ~coroutine() {
+ if (handle) handle.destroy();
+ }
+};
+
+struct coroutine::promise_type {
+ coroutine get_return_object() {
+ return {std::coroutine_handle<promise_type>::from_promise(*this)};
+ }
+ std::suspend_never initial_suspend() noexcept { return {}; }
+ std::suspend_always final_suspend() noexcept { return {}; }
+ void return_void() {}
+ void unhandled_exception() {}
+};
+
+struct Awaiter : std::suspend_always {
+ Printy await_resume() { return {"awaited"}; }
+};
+
+int foo() { return 2; }
+
+coroutine ArrayInitCoro() {
+ // Verify that:
+ // - We do the necessary stores for array cleanups.
+ // - Array cleanups are called by await.cleanup.
+ // - We activate the cleanup after the first element and deactivate it in await.ready (see cleanup.isactive).
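+ // (Values that live across the co_await, e.g. arrayinit.begin/element and the
+ // active flag, are spilled to the coroutine frame, hence the .spill.addr and
+ // .reload.addr traffic in the checks below.)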
+
+ // CHECK-LABEL: define dso_local void @_Z13ArrayInitCorov
+ // CHECK: %arrayinit.endOfInit = alloca ptr, align 8
+ // CHECK: %cleanup.isactive = alloca i1, align 1
+ Printy arr[2] = {
+ Printy("a"),
+ // CHECK: %arrayinit.begin = getelementptr inbounds [2 x %struct.Printy], ptr %arr.reload.addr, i64 0, i64 0
+ // CHECK-NEXT: %arrayinit.begin.spill.addr = getelementptr inbounds %_Z13ArrayInitCorov.Frame, ptr %0, i32 0, i32 10
+ // CHECK-NEXT: store ptr %arrayinit.begin, ptr %arrayinit.begin.spill.addr, align 8
+ // CHECK-NEXT: store i1 true, ptr %cleanup.isactive.reload.addr, align 1
+ // CHECK-NEXT: store ptr %arrayinit.begin, ptr %arrayinit.endOfInit.reload.addr, align 8
+ // CHECK-NEXT: call void @_ZN6PrintyC1EPKc(ptr noundef nonnull align 8 dereferenceable(8) %arrayinit.begin, ptr noundef @.str)
+ // CHECK-NEXT: %arrayinit.element = getelementptr inbounds %struct.Printy, ptr %arrayinit.begin, i64 1
+ // CHECK-NEXT: %arrayinit.element.spill.addr = getelementptr inbounds %_Z13ArrayInitCorov.Frame, ptr %0, i32 0, i32 11
+ // CHECK-NEXT: store ptr %arrayinit.element, ptr %arrayinit.element.spill.addr, align 8
+ // CHECK-NEXT: store ptr %arrayinit.element, ptr %arrayinit.endOfInit.reload.addr, align 8
+ co_await Awaiter{}
+ // CHECK-NEXT: @_ZNSt14suspend_always11await_readyEv
+ // CHECK-NEXT: br i1 %{{.+}}, label %await.ready, label %CoroSave30
+ };
+ // CHECK: await.cleanup: ; preds = %AfterCoroSuspend{{.*}}
+ // CHECK-NEXT: br label %cleanup{{.*}}.from.await.cleanup
+
+ // CHECK: cleanup{{.*}}.from.await.cleanup: ; preds = %await.cleanup
+ // CHECK: br label %cleanup{{.*}}
+
+ // CHECK: await.ready:
+ // CHECK-NEXT: %arrayinit.element.reload.addr = getelementptr inbounds %_Z13ArrayInitCorov.Frame, ptr %0, i32 0, i32 11
+ // CHECK-NEXT: %arrayinit.element.reload = load ptr, ptr %arrayinit.element.reload.addr, align 8
+ // CHECK-NEXT: call void @_ZN7Awaiter12await_resumeEv
+ // CHECK-NEXT: store i1 false, ptr %cleanup.isactive.reload.addr, align 1
+ // CHECK-NEXT: br label %cleanup{{.*}}.from.await.ready
+
+ // CHECK: cleanup{{.*}}: ; preds = %cleanup{{.*}}.from.await.ready, %cleanup{{.*}}.from.await.cleanup
+ // CHECK: %cleanup.is_active = load i1, ptr %cleanup.isactive.reload.addr, align 1
+ // CHECK-NEXT: br i1 %cleanup.is_active, label %cleanup.action, label %cleanup.done
+
+ // CHECK: cleanup.action:
+ // CHECK: %arraydestroy.isempty = icmp eq ptr %arrayinit.begin.reload{{.*}}, %{{.*}}
+ // CHECK-NEXT: br i1 %arraydestroy.isempty, label %arraydestroy.done{{.*}}, label %arraydestroy.body.from.cleanup.action
+ // Ignore rest of the array cleanup.
+}
+
+coroutine ArrayInitWithCoReturn() {
+ // CHECK-LABEL: define dso_local void @_Z21ArrayInitWithCoReturnv
+ // Verify that we start to emit the array destructor.
+ // CHECK: %arrayinit.endOfInit = alloca ptr, align 8
+ Printy arr[2] = {"a", ({
+ if (foo()) {
+ co_return;
+ }
+ "b";
+ })};
+}
diff --git a/clang/test/CodeGenObjC/arc-blocks-exceptions.m b/clang/test/CodeGenObjC/arc-blocks-exceptions.m
index 821b818d4027..54b043d8ea07 100644
--- a/clang/test/CodeGenObjC/arc-blocks-exceptions.m
+++ b/clang/test/CodeGenObjC/arc-blocks-exceptions.m
@@ -5,17 +5,22 @@ void test1(_Bool c) {
__weak id weakId = 0;
test1_fn(c ? ^{ (void)weakId; } : 0);
- // CHECK: [[CLEANUP_COND:%.*]] = alloca i1
- // CHECK-NEXT: [[CLEANUP_SAVE:%.*]] = alloca ptr
+ // CHECK: [[CLEANUP_SAVE:%cond-cleanup.save.*]] = alloca ptr
+ // CHECK-NEXT: [[CLEANUP_COND:%.*]] = alloca i1
+ // CHECK-NEXT: [[CLEANUP_COND1:%.*]] = alloca i1
- // CHECK: store i1 true, ptr [[CLEANUP_COND]]
- // CHECK-NEXT: store ptr {{.*}}, ptr [[CLEANUP_SAVE]]
+ // CHECK: store i1 false, ptr [[CLEANUP_COND]]
+ // CHECK-NEXT: store i1 false, ptr [[CLEANUP_COND1]]
+
+ // CHECK: store ptr {{.*}}, ptr [[CLEANUP_SAVE]]
+ // CHECK-NEXT: store i1 true, ptr [[CLEANUP_COND]]
+ // CHECK-NEXT: store i1 true, ptr [[CLEANUP_COND1]]
// CHECK: invoke void @test1_fn(
// CHECK-NEXT: to label %[[INVOKE_CONT:.*]] unwind label %[[LANDING_PAD_LAB:.*]]
// CHECK: [[INVOKE_CONT]]:
- // CHECK-NEXT: [[LOAD:%.*]] = load i1, ptr [[CLEANUP_COND]]
+ // CHECK-NEXT: [[LOAD:%.*]] = load i1, ptr [[CLEANUP_COND1]]
// CHECK-NEXT: br i1 [[LOAD]], label %[[END_OF_SCOPE_LAB:.*]], label
// CHECK: [[END_OF_SCOPE_LAB]]:
diff --git a/clang/test/CodeGenObjC/arc-blocks.m b/clang/test/CodeGenObjC/arc-blocks.m
index 105a72b4af1e..f718e8bbf9a6 100644
--- a/clang/test/CodeGenObjC/arc-blocks.m
+++ b/clang/test/CodeGenObjC/arc-blocks.m
@@ -445,8 +445,8 @@ void test13(id x) {
// CHECK: [[X:%.*]] = alloca ptr, align 8
// CHECK-NEXT: [[B:%.*]] = alloca ptr, align 8
// CHECK-NEXT: [[BLOCK:%.*]] = alloca [[BLOCK_T:.*]], align 8
- // CHECK-NEXT: [[CLEANUP_ACTIVE:%.*]] = alloca i1
// CHECK-NEXT: [[COND_CLEANUP_SAVE:%.*]] = alloca ptr,
+ // CHECK-NEXT: [[CLEANUP_ACTIVE:%.*]] = alloca i1
// CHECK-NEXT: [[T0:%.*]] = call ptr @llvm.objc.retain(ptr {{%.*}})
// CHECK-NEXT: store ptr [[T0]], ptr [[X]], align 8
// CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr [[B]])
@@ -460,8 +460,8 @@ void test13(id x) {
// CHECK-NEXT: [[T0:%.*]] = load ptr, ptr [[X]], align 8
// CHECK-NEXT: [[T1:%.*]] = call ptr @llvm.objc.retain(ptr [[T0]])
// CHECK-NEXT: store ptr [[T1]], ptr [[CAPTURE]], align 8
- // CHECK-NEXT: store i1 true, ptr [[CLEANUP_ACTIVE]]
// CHECK-NEXT: store ptr [[CAPTURE]], ptr [[COND_CLEANUP_SAVE]], align 8
+ // CHECK-NEXT: store i1 true, ptr [[CLEANUP_ACTIVE]]
// CHECK-NEXT: br label
// CHECK: br label
// CHECK: [[T0:%.*]] = phi ptr
diff --git a/clang/test/Driver/aarch64-mcpu.c b/clang/test/Driver/aarch64-mcpu.c
index 77ba43122b24..ad4a5f9ac6fb 100644
--- a/clang/test/Driver/aarch64-mcpu.c
+++ b/clang/test/Driver/aarch64-mcpu.c
@@ -64,10 +64,16 @@
// NEOVERSE-V1: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "neoverse-v1"
// RUN: %clang --target=aarch64 -mcpu=neoverse-v2 -### -c %s 2>&1 | FileCheck -check-prefix=NEOVERSE-V2 %s
// NEOVERSE-V2: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "neoverse-v2"
+// RUN: %clang --target=aarch64 -mcpu=neoverse-v3 -### -c %s 2>&1 | FileCheck -check-prefix=NEOVERSE-V3 %s
+// NEOVERSE-V3: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "neoverse-v3"
+// RUN: %clang --target=aarch64 -mcpu=neoverse-v3ae -### -c %s 2>&1 | FileCheck -check-prefix=NEOVERSE-V3AE %s
+// NEOVERSE-V3AE: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "neoverse-v3ae"
// RUN: %clang --target=aarch64 -mcpu=neoverse-n1 -### -c %s 2>&1 | FileCheck -check-prefix=NEOVERSE-N1 %s
// NEOVERSE-N1: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "neoverse-n1"
// RUN: %clang --target=aarch64 -mcpu=neoverse-n2 -### -c %s 2>&1 | FileCheck -check-prefix=NEOVERSE-N2 %s
// NEOVERSE-N2: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "neoverse-n2"
+// RUN: %clang --target=aarch64 -mcpu=neoverse-n3 -### -c %s 2>&1 | FileCheck -check-prefix=NEOVERSE-N3 %s
+// NEOVERSE-N3: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "neoverse-n3"
// RUN: %clang --target=aarch64 -mcpu=neoverse-512tvb -### -c %s 2>&1 | FileCheck -check-prefix=NEOVERSE-512TVB %s
// NEOVERSE-512TVB: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "neoverse-512tvb"
// RUN: %clang --target=aarch64 -mcpu=cortex-a520 -### -c %s 2>&1 | FileCheck -check-prefix=CORTEX-A520 %s
diff --git a/clang/test/Driver/amdgpu-toolchain.c b/clang/test/Driver/amdgpu-toolchain.c
index faaff05004f6..8ab6a0713147 100644
--- a/clang/test/Driver/amdgpu-toolchain.c
+++ b/clang/test/Driver/amdgpu-toolchain.c
@@ -27,4 +27,4 @@
// RUN: %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx906 -nogpulib \
// RUN: -fuse-ld=ld %s 2>&1 | FileCheck -check-prefixes=LD %s
-// LD: ld.lld"
+// LD: ld.lld
diff --git a/clang/test/Driver/claim-unused.c b/clang/test/Driver/claim-unused.c
deleted file mode 100644
index c7b798934b3c..000000000000
--- a/clang/test/Driver/claim-unused.c
+++ /dev/null
@@ -1,3 +0,0 @@
-// RUN: touch %t.o
-// RUN: %clang --param ssp-buffer-size=1 %t.o -### 2>&1 | FileCheck %s
-// CHECK-NOT: warning: argument unused during compilation: '--param ssp-buffer-size=1'
diff --git a/clang/test/Driver/fp-model.c b/clang/test/Driver/fp-model.c
index 74b7de7a275a..a464729edb45 100644
--- a/clang/test/Driver/fp-model.c
+++ b/clang/test/Driver/fp-model.c
@@ -73,9 +73,8 @@
// RUN: %clang -### -Ofast -ffp-model=strict -c %s 2>&1 | FileCheck \
// RUN: --check-prefix=WARN12 %s
-// RUN: %clang -### -ffast-math -ffp-model=strict -c %s 2>&1 | FileCheck \
-// RUN: --check-prefix=WARN12 %s
-// WARN12-NOT: warning: overriding '-ffp-model=strict' option with '-ffp-model=strict' [-Woverriding-option]
+// RUN: %clang -### -Werror -ffast-math -ffp-model=strict -c %s
+// WARN12: warning: overriding '-ffp-model=strict' option with '-Ofast'
// RUN: %clang -### -ffp-model=strict -fapprox-func -c %s 2>&1 \
// RUN: | FileCheck --check-prefix=WARN13 %s
diff --git a/clang/test/Driver/gcc-param.c b/clang/test/Driver/gcc-param.c
new file mode 100644
index 000000000000..4672e1156ce7
--- /dev/null
+++ b/clang/test/Driver/gcc-param.c
@@ -0,0 +1,2 @@
+// RUN: touch %t.o
+// RUN: %clang -Werror --param ssp-buffer-size=1 %t.o -###
diff --git a/clang/test/Driver/hlsl-lang-targets-spirv.hlsl b/clang/test/Driver/hlsl-lang-targets-spirv.hlsl
index b86c2e01f8d8..61b10e1648c5 100644
--- a/clang/test/Driver/hlsl-lang-targets-spirv.hlsl
+++ b/clang/test/Driver/hlsl-lang-targets-spirv.hlsl
@@ -1,4 +1,5 @@
// REQUIRES: spirv-registered-target
+// REQUIRES: directx-registered-target
// Supported targets
//
diff --git a/clang/test/Driver/linux-ld.c b/clang/test/Driver/linux-ld.c
index 958e682b6c3c..e2043ab22afc 100644
--- a/clang/test/Driver/linux-ld.c
+++ b/clang/test/Driver/linux-ld.c
@@ -2,11 +2,10 @@
// General tests that ld invocations on Linux targets are sane. Note that we use
// sysroot to make these tests independent of the host system.
//
-// RUN: %clang -### %s -no-pie 2>&1 \
+// RUN: %clang -### -Werror %s -no-pie 2>&1 \
// RUN: --target=i386-unknown-linux -rtlib=platform --unwindlib=platform \
// RUN: --sysroot=%S/Inputs/basic_linux_tree \
// RUN: | FileCheck --check-prefix=CHECK-LD-32 %s
-// CHECK-LD-32-NOT: warning:
// CHECK-LD-32: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]"
// CHECK-LD-32: "{{.*}}/usr/lib/gcc/i386-unknown-linux/10.2.0{{/|\\\\}}crtbegin.o"
// CHECK-LD-32: "-L[[SYSROOT]]/usr/lib/gcc/i386-unknown-linux/10.2.0"
@@ -14,11 +13,10 @@
// CHECK-LD-32: "-L[[SYSROOT]]/lib"
// CHECK-LD-32: "-L[[SYSROOT]]/usr/lib"
//
-// RUN: %clang -### %s -no-pie 2>&1 \
+// RUN: %clang -### %s -Werror -no-pie 2>&1 \
// RUN: --target=x86_64-unknown-linux -rtlib=platform --unwindlib=platform \
// RUN: --sysroot=%S/Inputs/basic_linux_tree \
// RUN: | FileCheck --check-prefix=CHECK-LD-64 %s
-// CHECK-LD-64-NOT: warning:
// CHECK-LD-64: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]"
// CHECK-LD-64: "--eh-frame-hdr"
// CHECK-LD-64: "-m" "elf_x86_64"
@@ -32,11 +30,10 @@
// CHECK-LD-64: "-lc"
// CHECK-LD-64: "-lgcc" "--as-needed" "-lgcc_s" "--no-as-needed"
//
-// RUN: %clang -### %s -no-pie 2>&1 \
+// RUN: %clang -### %s -Werror -no-pie 2>&1 \
// RUN: --target=x86_64-unknown-linux-gnux32 -rtlib=platform --unwindlib=platform \
// RUN: --sysroot=%S/Inputs/basic_linux_tree \
// RUN: | FileCheck --check-prefix=CHECK-LD-X32 %s
-// CHECK-LD-X32-NOT: warning:
// CHECK-LD-X32: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]"
// CHECK-LD-X32: "--eh-frame-hdr"
// CHECK-LD-X32: "-m" "elf32_x86_64"
@@ -45,13 +42,12 @@
// CHECK-LD-X32: "-lc"
// CHECK-LD-X32: "-lgcc" "--as-needed" "-lgcc_s" "--no-as-needed"
//
-// RUN: %clang -### %s -no-pie 2>&1 \
+// RUN: %clang -### %s -Werror -no-pie 2>&1 \
// RUN: --target=x86_64-unknown-linux \
// RUN: -resource-dir=%S/Inputs/resource_dir \
// RUN: --sysroot=%S/Inputs/basic_linux_tree \
// RUN: --rtlib=compiler-rt \
// RUN: | FileCheck --check-prefix=CHECK-LD-RT %s
-// CHECK-LD-RT-NOT: warning:
// CHECK-LD-RT: "-resource-dir" "[[RESDIR:[^"]*]]"
// CHECK-LD-RT: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]"
// CHECK-LD-RT: "--eh-frame-hdr"
@@ -67,13 +63,12 @@
// CHECK-LD-RT: libclang_rt.builtins.a"
// CHECK-LD-RT: "[[RESDIR]]{{/|\\\\}}lib{{/|\\\\}}x86_64-unknown-linux{{/|\\\\}}clang_rt.crtend.o"
//
-// RUN: %clang -### %s -no-pie 2>&1 \
+// RUN: %clang -### %s -Werror -no-pie 2>&1 \
// RUN: --target=i686-unknown-linux \
// RUN: -resource-dir=%S/Inputs/resource_dir \
// RUN: --sysroot=%S/Inputs/basic_linux_tree \
// RUN: --rtlib=compiler-rt \
// RUN: | FileCheck --check-prefix=CHECK-LD-RT-I686 %s
-// CHECK-LD-RT-I686-NOT: warning:
// CHECK-LD-RT-I686: "-resource-dir" "[[RESDIR:[^"]*]]"
// CHECK-LD-RT-I686: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]"
// CHECK-LD-RT-I686: "--eh-frame-hdr"
@@ -89,13 +84,12 @@
// CHECK-LD-RT-I686: libclang_rt.builtins.a"
// CHECK-LD-RT-I686: "[[RESDIR]]{{/|\\\\}}lib{{/|\\\\}}i686-unknown-linux{{/|\\\\}}clang_rt.crtend.o"
//
-// RUN: %clang -### %s -no-pie 2>&1 \
+// RUN: %clang -### %s -Werror -no-pie 2>&1 \
// RUN: --target=arm-linux-androideabi \
// RUN: --sysroot=%S/Inputs/basic_android_tree/sysroot \
// RUN: -resource-dir=%S/Inputs/resource_dir \
// RUN: --rtlib=compiler-rt \
// RUN: | FileCheck --check-prefix=CHECK-LD-RT-ANDROID %s
-// CHECK-LD-RT-ANDROID-NOT: warning:
// CHECK-LD-RT-ANDROID: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]"
// CHECK-LD-RT-ANDROID: "--eh-frame-hdr"
// CHECK-LD-RT-ANDROID: "-m" "armelf_linux_eabi"
@@ -104,11 +98,10 @@
// CHECK-LD-RT-ANDROID: "-lc"
// CHECK-LD-RT-ANDROID: libclang_rt.builtins.a"
//
-// RUN: %clang -### %s -no-pie 2>&1 \
+// RUN: %clang -### %s -Werror -no-pie 2>&1 \
// RUN: --target=x86_64-unknown-linux -rtlib=platform --unwindlib=platform \
// RUN: --sysroot=%S/Inputs/basic_linux_tree \
// RUN: | FileCheck --check-prefix=CHECK-LD-GCC %s
-// CHECK-LD-GCC-NOT: warning:
// CHECK-LD-GCC: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]"
// CHECK-LD-GCC: "--eh-frame-hdr"
// CHECK-LD-GCC: "-m" "elf_x86_64"
@@ -122,12 +115,11 @@
// CHECK-LD-GCC: "-lc"
// CHECK-LD-GCC: "-lgcc" "--as-needed" "-lgcc_s" "--no-as-needed"
//
-// RUN: %clang -### %s -no-pie 2>&1 \
+// RUN: %clang -### %s -Werror -no-pie 2>&1 \
// RUN: --target=x86_64-unknown-linux -rtlib=platform --unwindlib=platform \
// RUN: -static-libgcc \
// RUN: --sysroot=%S/Inputs/basic_linux_tree \
// RUN: | FileCheck --check-prefix=CHECK-LD-64-STATIC-LIBGCC %s
-// CHECK-LD-64-STATIC-LIBGCC-NOT: warning:
// CHECK-LD-64-STATIC-LIBGCC: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]"
// CHECK-LD-64-STATIC-LIBGCC: "--eh-frame-hdr"
// CHECK-LD-64-STATIC-LIBGCC: "-m" "elf_x86_64"
@@ -268,12 +260,10 @@
// CHECK-CLANG-ANDROID-STATIC: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]"
// CHECK-CLANG-ANDROID-STATIC: "--start-group" "{{[^"]*}}{{/|\\\\}}libclang_rt.builtins.a" "-l:libunwind.a" "-lc" "--end-group"
//
-// RUN: %clang -### %s 2>&1 \
-// RUN: --target=x86_64-unknown-linux -rtlib=platform --unwindlib=platform \
+// RUN: %clang -### %s -Werror --target=x86_64-unknown-linux -rtlib=platform --unwindlib=platform \
// RUN: -static \
-// RUN: --sysroot=%S/Inputs/basic_linux_tree \
+// RUN: --sysroot=%S/Inputs/basic_linux_tree 2>&1 \
// RUN: | FileCheck --check-prefix=CHECK-LD-64-STATIC %s
-// CHECK-LD-64-STATIC-NOT: warning:
// CHECK-LD-64-STATIC: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]"
// CHECK-LD-64-STATIC: "--eh-frame-hdr"
// CHECK-LD-64-STATIC: "-m" "elf_x86_64"
@@ -486,13 +476,12 @@
//
// Test that we can use -stdlib=libc++ in a build system even when it
// occasionally links C code instead of C++ code.
-// RUN: %clang -x c -### %s -no-pie 2>&1 \
+// RUN: %clang -x c -### %s -Werror -no-pie 2>&1 \
// RUN: --target=x86_64-unknown-linux-gnu \
// RUN: -stdlib=libc++ \
// RUN: -ccc-install-dir %S/Inputs/basic_linux_libcxx_tree/usr/bin \
// RUN: --sysroot=%S/Inputs/basic_linux_libcxx_tree \
// RUN: | FileCheck --check-prefix=CHECK-BASIC-LIBCXX-C-LINK %s
-// CHECK-BASIC-LIBCXX-C-LINK-NOT: warning:
// CHECK-BASIC-LIBCXX-C-LINK: "-cc1"
// CHECK-BASIC-LIBCXX-C-LINK: "-isysroot" "[[SYSROOT:[^"]+]]"
// CHECK-BASIC-LIBCXX-C-LINK-NOT: "-internal-isystem" "[[SYSROOT]]/usr/bin/../include/c++/v1"
@@ -1661,11 +1650,10 @@
// CHECK-MUSL-AARCH64_BE: "-dynamic-linker" "/lib/ld-musl-aarch64_be.so.1"
// Check whether multilib gcc install works fine on Gentoo with gcc-config
-// RUN: %clang -### %s -no-pie 2>&1 \
+// RUN: %clang -### %s -Werror -no-pie 2>&1 \
// RUN: --target=x86_64-unknown-linux-gnu -rtlib=platform --unwindlib=platform \
// RUN: --sysroot=%S/Inputs/gentoo_linux_gcc_multi_version_tree \
// RUN: | FileCheck --check-prefix=CHECK-LD-GENTOO %s
-// CHECK-LD-GENTOO-NOT: warning:
// CHECK-LD-GENTOO: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]"
// CHECK-LD-GENTOO: "--eh-frame-hdr"
// CHECK-LD-GENTOO: "-m" "elf_x86_64"
@@ -1676,11 +1664,10 @@
// CHECK-LD-GENTOO: "-lgcc" "--as-needed" "-lgcc_s" "--no-as-needed"
// CHECK-LD-GENTOO: "-lc"
// CHECK-LD-GENTOO: "-lgcc" "--as-needed" "-lgcc_s" "--no-as-needed"
-// RUN: %clang -### %s -no-pie 2>&1 \
+// RUN: %clang -### %s -Werror -no-pie 2>&1 \
// RUN: --target=i686-unknown-linux-gnu -rtlib=platform --unwindlib=platform \
// RUN: --sysroot=%S/Inputs/gentoo_linux_gcc_multi_version_tree \
// RUN: | FileCheck --check-prefix=CHECK-LD-GENTOO-32 %s
-// CHECK-LD-GENTOO-32-NOT: warning:
// CHECK-LD-GENTOO-32: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]"
// CHECK-LD-GENTOO-32: "--eh-frame-hdr"
// CHECK-LD-GENTOO-32: "-m" "elf_i386"
@@ -1691,11 +1678,10 @@
// CHECK-LD-GENTOO-32: "-lgcc" "--as-needed" "-lgcc_s" "--no-as-needed"
// CHECK-LD-GENTOO-32: "-lc"
// CHECK-LD-GENTOO-32: "-lgcc" "--as-needed" "-lgcc_s" "--no-as-needed"
-// RUN: %clang -### %s -no-pie 2>&1 \
+// RUN: %clang -### %s -Werror -no-pie 2>&1 \
// RUN: --target=x86_64-unknown-linux-gnux32 -rtlib=platform --unwindlib=platform \
// RUN: --sysroot=%S/Inputs/gentoo_linux_gcc_multi_version_tree \
// RUN: | FileCheck --check-prefix=CHECK-LD-GENTOO-X32 %s
-// CHECK-LD-GENTOO-X32-NOT: warning:
// CHECK-LD-GENTOO-X32: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]"
// CHECK-LD-GENTOO-X32: "--eh-frame-hdr"
// CHECK-LD-GENTOO-X32: "-m" "elf32_x86_64"
@@ -1717,11 +1703,10 @@
// CHECK-LD-RHEL7-DTS: [[SYSROOT]]/usr/lib/gcc/x86_64-redhat-linux/7/../../../../bin/ld
// Check whether gcc7 install works fine on Amazon Linux AMI
-// RUN: %clang -### %s -no-pie 2>&1 \
+// RUN: %clang -### %s -Werror -no-pie 2>&1 \
// RUN: --target=x86_64-amazon-linux -rtlib=libgcc --unwindlib=platform \
// RUN: --sysroot=%S/Inputs/ami_linux_tree \
// RUN: | FileCheck --check-prefix=CHECK-LD-AMI %s
-// CHECK-LD-AMI-NOT: warning:
// CHECK-LD-AMI: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]"
// CHECK-LD-AMI: "--eh-frame-hdr"
// CHECK-LD-AMI: "-m" "elf_x86_64"
diff --git a/clang/test/Driver/riscv-arch.c b/clang/test/Driver/riscv-arch.c
index 8399b4e97f86..abbe8612b378 100644
--- a/clang/test/Driver/riscv-arch.c
+++ b/clang/test/Driver/riscv-arch.c
@@ -209,7 +209,7 @@
// RUN: not %clang --target=riscv32-unknown-elf -march=rv32q -### %s \
// RUN: -fsyntax-only 2>&1 | FileCheck -check-prefix=RV32-LETTER %s
// RV32-LETTER: error: invalid arch name 'rv32q',
-// RV32-LETTER: first letter should be 'e', 'i' or 'g'
+// RV32-LETTER: first letter after 'rv32' should be 'e', 'i' or 'g'
// RUN: not %clang --target=riscv32-unknown-elf -march=rv32imcq -### %s \
// RUN: -fsyntax-only 2>&1 | FileCheck -check-prefix=RV32-ORDER %s
@@ -239,12 +239,12 @@
// RUN: not %clang --target=riscv32-unknown-elf -march=rv32xabc -### %s \
// RUN: -fsyntax-only 2>&1 | FileCheck -check-prefix=RV32X %s
// RV32X: error: invalid arch name 'rv32xabc',
-// RV32X: first letter should be 'e', 'i' or 'g'
+// RV32X: first letter after 'rv32' should be 'e', 'i' or 'g'
// RUN: not %clang --target=riscv32-unknown-elf -march=rv32sabc -### %s \
// RUN: -fsyntax-only 2>&1 | FileCheck -check-prefix=RV32S %s
// RV32S: error: invalid arch name 'rv32sabc',
-// RV32S: first letter should be 'e', 'i' or 'g'
+// RV32S: first letter after 'rv32' should be 'e', 'i' or 'g'
// RUN: not %clang --target=riscv32-unknown-elf -march=rv32ix -### %s \
// RUN: -fsyntax-only 2>&1 | FileCheck -check-prefix=RV32X-NAME %s
diff --git a/clang/test/Driver/wasm-features.c b/clang/test/Driver/wasm-features.c
index 5dae5dbc89b9..1f7fb2134982 100644
--- a/clang/test/Driver/wasm-features.c
+++ b/clang/test/Driver/wasm-features.c
@@ -77,6 +77,12 @@
// RELAXED-SIMD: "-target-feature" "+relaxed-simd"
// NO-RELAXED-SIMD: "-target-feature" "-relaxed-simd"
+// RUN: %clang --target=wasm32-unknown-unknown -### %s -mhalf-precision 2>&1 | FileCheck %s -check-prefix=HALF-PRECISION
+// RUN: %clang --target=wasm32-unknown-unknown -### %s -mno-half-precision 2>&1 | FileCheck %s -check-prefix=NO-HALF-PRECISION
+
+// HALF-PRECISION: "-target-feature" "+half-precision"
+// NO-HALF-PRECISION: "-target-feature" "-half-precision"
+
// RUN: %clang --target=wasm32-unknown-unknown -### %s -mexception-handling 2>&1 | FileCheck %s -check-prefix=EXCEPTION-HANDLING
// RUN: %clang --target=wasm32-unknown-unknown -### %s -mno-exception-handling 2>&1 | FileCheck %s -check-prefix=NO-EXCEPTION-HANDLING
diff --git a/clang/test/Index/annotate-nested-name-specifier.cpp b/clang/test/Index/annotate-nested-name-specifier.cpp
index 318149725840..a7338db6b05b 100644
--- a/clang/test/Index/annotate-nested-name-specifier.cpp
+++ b/clang/test/Index/annotate-nested-name-specifier.cpp
@@ -132,7 +132,7 @@ struct X8 {
struct X9 : X8 {
typedef X8 inherited;
- void f() {
+ void f() {
inherited::f();
}
};
@@ -299,7 +299,7 @@ struct X9 : X8 {
// CHECK: Identifier: "type" [77:16 - 77:20] TypeRef=X4::type:70:13
// CHECK: Punctuation: ">" [77:20 - 77:21] MemberRefExpr=
// CHECK: Punctuation: "::" [77:21 - 77:23] MemberRefExpr=
-// CHECK: Identifier: "g" [77:23 - 77:24] OverloadedDeclRef=
+// CHECK: Identifier: "g" [77:23 - 77:24] MemberRefExpr=
// CHECK: Punctuation: "(" [77:24 - 77:25] CallExpr=
// CHECK: Identifier: "t" [77:25 - 77:26] DeclRefExpr=t:74:12
// CHECK: Punctuation: ")" [77:26 - 77:27] CallExpr=
diff --git a/clang/test/Lexer/cxx-features.cpp b/clang/test/Lexer/cxx-features.cpp
index baaa9d4434e9..4a08eb61cd39 100644
--- a/clang/test/Lexer/cxx-features.cpp
+++ b/clang/test/Lexer/cxx-features.cpp
@@ -222,7 +222,7 @@
#error "wrong value for __cpp_aggregate_bases"
#endif
-#if check(structured_bindings, 0, 0, 0, 201606, 201606, 201606, 201606)
+#if check(structured_bindings, 0, 0, 0, 202403L, 202403L, 202403L, 202403L)
#error "wrong value for __cpp_structured_bindings"
#endif
diff --git a/clang/test/Lexer/update_consecutive_macro_address_space.c b/clang/test/Lexer/update_consecutive_macro_address_space.c
index 80ef4557591c..6f74709556c0 100644
--- a/clang/test/Lexer/update_consecutive_macro_address_space.c
+++ b/clang/test/Lexer/update_consecutive_macro_address_space.c
@@ -1,14 +1,15 @@
// RUN: %clang -cc1 -print-stats %s 2>&1 | FileCheck %s
-// CHECK: 6 local SLocEntries allocated
+// CHECK: 7 local SLocEntries allocated
//
-// Verify that the macro arg expansion is split to two file ids, we have 6 file
-// ids rather than 5:
+// Verify that the macro arg expansion is split into two file ids, so we have 7
+// file ids rather than 6:
// 0: invalid file id
// 1: main file
// 2: builtin file
-// 3: macro expansion for X
-// 4: macro arg expansions for 1
-// 5: macro arg expansions for == 2
+// 3: scratch space for __GCC_[CON|DE]STRUCTIVE_SIZE macros
+// 4: macro expansion for X
+// 5: macro arg expansions for 1
+// 6: macro arg expansions for == 2
#define X(x) (int)(x);
void func() {
X(1
diff --git a/clang/test/Misc/cc1as-relax-all.s b/clang/test/Misc/cc1as-relax-all.s
new file mode 100644
index 000000000000..e76fc6f61bab
--- /dev/null
+++ b/clang/test/Misc/cc1as-relax-all.s
@@ -0,0 +1,13 @@
+// REQUIRES: x86-registered-target
+// RUN: %clang -cc1as -triple x86_64 -filetype obj -mrelax-all %s -o %t.o
+// RUN: llvm-objdump -d %t.o | FileCheck %s
+
+// CHECK: <.text>:
+// CHECK-NEXT: 0: e9 06 00 00 00 jmp 0xb <foo>
+// CHECK-NEXT: 5: 0f 84 00 00 00 00 je 0xb <foo>
+// CHECK-EMPTY:
+
+jmp foo
+je foo
+
+foo: ret
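+
+// With -mrelax-all the assembler emits the relaxed long encodings up front:
+// jmp becomes the 5-byte "e9 rel32" form and je the 6-byte "0f 84 rel32" form
+// instead of their 2-byte short forms, as the CHECK lines above verify.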
diff --git a/clang/test/Misc/target-invalid-cpu-note.c b/clang/test/Misc/target-invalid-cpu-note.c
index 9c91c4157cd6..21d80b713450 100644
--- a/clang/test/Misc/target-invalid-cpu-note.c
+++ b/clang/test/Misc/target-invalid-cpu-note.c
@@ -5,11 +5,11 @@
// RUN: not %clang_cc1 -triple arm64--- -target-cpu not-a-cpu -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix AARCH64
// AARCH64: error: unknown target CPU 'not-a-cpu'
-// AARCH64-NEXT: note: valid target CPU values are: cortex-a34, cortex-a35, cortex-a53, cortex-a55, cortex-a510, cortex-a520, cortex-a520ae, cortex-a57, cortex-a65, cortex-a65ae, cortex-a72, cortex-a73, cortex-a75, cortex-a76, cortex-a76ae, cortex-a77, cortex-a78, cortex-a78ae, cortex-a78c, cortex-a710, cortex-a715, cortex-a720, cortex-a720ae, cortex-r82, cortex-x1, cortex-x1c, cortex-x2, cortex-x3, cortex-x4, neoverse-e1, neoverse-n1, neoverse-n2, neoverse-512tvb, neoverse-v1, neoverse-v2, cyclone, apple-a7, apple-a8, apple-a9, apple-a10, apple-a11, apple-a12, apple-a13, apple-a14, apple-a15, apple-a16, apple-a17, apple-m1, apple-m2, apple-m3, apple-s4, apple-s5, exynos-m3, exynos-m4, exynos-m5, falkor, saphira, kryo, thunderx2t99, thunderx3t110, thunderx, thunderxt88, thunderxt81, thunderxt83, tsv110, a64fx, carmel, ampere1, ampere1a, ampere1b, cobalt-100, grace{{$}}
+// AARCH64-NEXT: note: valid target CPU values are: cortex-a34, cortex-a35, cortex-a53, cortex-a55, cortex-a510, cortex-a520, cortex-a520ae, cortex-a57, cortex-a65, cortex-a65ae, cortex-a72, cortex-a73, cortex-a75, cortex-a76, cortex-a76ae, cortex-a77, cortex-a78, cortex-a78ae, cortex-a78c, cortex-a710, cortex-a715, cortex-a720, cortex-a720ae, cortex-r82, cortex-x1, cortex-x1c, cortex-x2, cortex-x3, cortex-x4, neoverse-e1, neoverse-n1, neoverse-n2, neoverse-n3, neoverse-512tvb, neoverse-v1, neoverse-v2, neoverse-v3, neoverse-v3ae, cyclone, apple-a7, apple-a8, apple-a9, apple-a10, apple-a11, apple-a12, apple-a13, apple-a14, apple-a15, apple-a16, apple-a17, apple-m1, apple-m2, apple-m3, apple-s4, apple-s5, exynos-m3, exynos-m4, exynos-m5, falkor, saphira, kryo, thunderx2t99, thunderx3t110, thunderx, thunderxt88, thunderxt81, thunderxt83, tsv110, a64fx, carmel, ampere1, ampere1a, ampere1b, cobalt-100, grace{{$}}
// RUN: not %clang_cc1 -triple arm64--- -tune-cpu not-a-cpu -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix TUNE_AARCH64
// TUNE_AARCH64: error: unknown target CPU 'not-a-cpu'
-// TUNE_AARCH64-NEXT: note: valid target CPU values are: cortex-a34, cortex-a35, cortex-a53, cortex-a55, cortex-a510, cortex-a520, cortex-a520ae, cortex-a57, cortex-a65, cortex-a65ae, cortex-a72, cortex-a73, cortex-a75, cortex-a76, cortex-a76ae, cortex-a77, cortex-a78, cortex-a78ae, cortex-a78c, cortex-a710, cortex-a715, cortex-a720, cortex-a720ae, cortex-r82, cortex-x1, cortex-x1c, cortex-x2, cortex-x3, cortex-x4, neoverse-e1, neoverse-n1, neoverse-n2, neoverse-512tvb, neoverse-v1, neoverse-v2, cyclone, apple-a7, apple-a8, apple-a9, apple-a10, apple-a11, apple-a12, apple-a13, apple-a14, apple-a15, apple-a16, apple-a17, apple-m1, apple-m2, apple-m3, apple-s4, apple-s5, exynos-m3, exynos-m4, exynos-m5, falkor, saphira, kryo, thunderx2t99, thunderx3t110, thunderx, thunderxt88, thunderxt81, thunderxt83, tsv110, a64fx, carmel, ampere1, ampere1a, ampere1b, cobalt-100, grace{{$}}
+// TUNE_AARCH64-NEXT: note: valid target CPU values are: cortex-a34, cortex-a35, cortex-a53, cortex-a55, cortex-a510, cortex-a520, cortex-a520ae, cortex-a57, cortex-a65, cortex-a65ae, cortex-a72, cortex-a73, cortex-a75, cortex-a76, cortex-a76ae, cortex-a77, cortex-a78, cortex-a78ae, cortex-a78c, cortex-a710, cortex-a715, cortex-a720, cortex-a720ae, cortex-r82, cortex-x1, cortex-x1c, cortex-x2, cortex-x3, cortex-x4, neoverse-e1, neoverse-n1, neoverse-n2, neoverse-n3, neoverse-512tvb, neoverse-v1, neoverse-v2, neoverse-v3, neoverse-v3ae, cyclone, apple-a7, apple-a8, apple-a9, apple-a10, apple-a11, apple-a12, apple-a13, apple-a14, apple-a15, apple-a16, apple-a17, apple-m1, apple-m2, apple-m3, apple-s4, apple-s5, exynos-m3, exynos-m4, exynos-m5, falkor, saphira, kryo, thunderx2t99, thunderx3t110, thunderx, thunderxt88, thunderxt81, thunderxt83, tsv110, a64fx, carmel, ampere1, ampere1a, ampere1b, cobalt-100, grace{{$}}
// RUN: not %clang_cc1 -triple i386--- -target-cpu not-a-cpu -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix X86
// X86: error: unknown target CPU 'not-a-cpu'
diff --git a/clang/test/Modules/pr88400.cppm b/clang/test/Modules/pr88400.cppm
new file mode 100644
index 000000000000..ff69137a0b90
--- /dev/null
+++ b/clang/test/Modules/pr88400.cppm
@@ -0,0 +1,61 @@
+// RUN: rm -rf %t
+// RUN: mkdir -p %t
+// RUN: split-file %s %t
+//
+// RUN: %clang_cc1 -std=c++20 %t/bar.cppm -emit-module-interface -o %t/bar.pcm
+// RUN: %clang_cc1 -std=c++20 %t/foo.cc -fmodule-file=bar=%t/bar.pcm -fsyntax-only -verify
+// RUN: %clang_cc1 -std=c++20 %t/bar.cc -fmodule-file=bar=%t/bar.pcm -fsyntax-only -verify
+//
+// RUN: %clang_cc1 -std=c++20 %t/bar.cppm -emit-reduced-module-interface -o %t/bar.pcm
+// RUN: %clang_cc1 -std=c++20 %t/foo.cc -fmodule-file=bar=%t/bar.pcm -fsyntax-only -verify
+// RUN: %clang_cc1 -std=c++20 %t/bar.cc -fmodule-file=bar=%t/bar.pcm -fsyntax-only -verify
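+//
+// The same header is seen both textually and through the module, in either
+// order; the friend declaration inside N::Y must merge with the declaration
+// coming from the other path without diagnostics.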
+
+//--- header.h
+#pragma once
+
+namespace N {
+ template<typename T>
+ concept X = true;
+
+ template<X T>
+ class Y {
+ public:
+ template<X U>
+ friend class Y;
+ };
+
+ inline Y<int> x;
+}
+
+//--- bar.cppm
+module;
+
+#include "header.h"
+
+export module bar;
+
+namespace N {
+ // To make sure N::Y won't get elided.
+ using N::x;
+}
+
+//--- foo.cc
+// expected-no-diagnostics
+#include "header.h"
+
+import bar;
+
+void y() {
+ N::Y<int> y{};
+};
+
+//--- bar.cc
+// expected-no-diagnostics
+import bar;
+
+#include "header.h"
+
+void y() {
+ N::Y<int> y{};
+};
+
diff --git a/clang/test/Parser/cxx1z-decomposition.cpp b/clang/test/Parser/cxx1z-decomposition.cpp
index 90d60df2e47f..4b17f72effb0 100644
--- a/clang/test/Parser/cxx1z-decomposition.cpp
+++ b/clang/test/Parser/cxx1z-decomposition.cpp
@@ -1,6 +1,7 @@
-// RUN: %clang_cc1 -std=c++17 %s -verify=expected,cxx17 -fcxx-exceptions
-// RUN: %clang_cc1 -std=c++2b %s -verify=expected,cxx2b -fcxx-exceptions
-// RUN: not %clang_cc1 -std=c++17 %s -emit-llvm-only -fcxx-exceptions
+// RUN: %clang_cc1 -std=c++17 %s -triple x86_64-unknown-linux-gnu -verify=expected,cxx17,pre2c -fcxx-exceptions
+// RUN: %clang_cc1 -std=c++2b %s -triple x86_64-unknown-linux-gnu -verify=expected,cxx2b,pre2c,post2b -fcxx-exceptions
+// RUN: %clang_cc1 -std=c++2c %s -triple x86_64-unknown-linux-gnu -verify=expected,cxx2c,post2b -fcxx-exceptions
+// RUN: not %clang_cc1 -std=c++17 %s -triple x86_64-unknown-linux-gnu -emit-llvm-only -fcxx-exceptions
struct S { int a, b, c; };
@@ -58,7 +59,7 @@ namespace OtherDecl {
namespace GoodSpecifiers {
void f() {
int n[1];
- const volatile auto &[a] = n; // cxx2b-warning {{volatile qualifier in structured binding declaration is deprecated}}
+ const volatile auto &[a] = n; // post2b-warning {{volatile qualifier in structured binding declaration is deprecated}}
}
}
@@ -97,8 +98,8 @@ namespace BadSpecifiers {
S [a] = s; // expected-error {{cannot be declared with type 'S'}}
decltype(auto) [b] = s; // expected-error {{cannot be declared with type 'decltype(auto)'}}
auto ([c2]) = s; // cxx17-error {{decomposition declaration cannot be declared with parenthese}} \
- // cxx2b-error {{use of undeclared identifier 'c2'}} \
- // cxx2b-error {{expected body of lambda expression}} \
+ // post2b-error {{use of undeclared identifier 'c2'}} \
+ // post2b-error {{expected body of lambda expression}} \
// FIXME: This error is not very good.
auto [d]() = s; // expected-error {{expected ';'}} expected-error {{expected expression}}
@@ -119,9 +120,6 @@ namespace BadSpecifiers {
[[]] auto [ok_3] = s;
alignas(S) auto [ok_4] = s;
- // ... but not after the identifier or declarator.
- // FIXME: These errors are not very good.
- auto [bad_attr_1 [[]]] = s; // expected-error {{attribute list cannot appear here}} expected-error 2{{}}
auto [bad_attr_2] [[]] = s; // expected-error {{expected ';'}} expected-error {{}}
}
}
@@ -156,3 +154,50 @@ namespace Init {
S [goodish4] { 4 }; // expected-error {{cannot be declared with type 'S'}}
}
}
+
+
+namespace attributes {
+
+struct S{
+ int a;
+ int b = 0;
+};
+
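+// C++2c (P0609) allows an attribute-specifier-seq after each structured
+// binding identifier; an attribute list anywhere else in the declarator is
+// still rejected, as the cases below check.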
+void err() {
+ auto [[]] = S{0}; // expected-error {{expected unqualified-id}}
+ auto [ alignas(42) a, foo ] = S{0}; // expected-error {{an attribute list cannot appear here}}
+ auto [ c, [[]] d ] = S{0}; // expected-error {{an attribute list cannot appear here}}
+ auto [ e, alignas(42) f ] = S{0}; // expected-error {{an attribute list cannot appear here}}
+}
+
+void ok() {
+ auto [ a alignas(42) [[]], b alignas(42) [[]]] = S{0}; // expected-error 2{{'alignas' attribute only applies to variables, data members and tag types}} \
+ // pre2c-warning 2{{an attribute specifier sequence attached to a structured binding declaration is a C++2c extension}}
+ auto [ c [[]] alignas(42), d [[]] alignas(42) [[]]] = S{0}; // expected-error 2{{'alignas' attribute only applies to variables, data members and tag types}} \
+ // pre2c-warning 2{{an attribute specifier sequence attached to a structured binding declaration is a C++2c extension}}
+}
+
+
+auto [G1 [[deprecated]], G2 [[deprecated]]] = S{42}; // #deprecated-here
+// pre2c-warning@-1 2{{an attribute specifier sequence attached to a structured binding declaration is a C++2c extension}}
+
+int test() {
+ return G1 + G2; // expected-warning {{'G1' is deprecated}} expected-note@#deprecated-here {{here}} \
+ // expected-warning {{'G2' is deprecated}} expected-note@#deprecated-here {{here}}
+}
+
+void invalid_attributes() {
+ // pre2c-warning@+1 {{an attribute specifier sequence attached to a structured binding declaration is a C++2c extension}}
+ auto [a alignas(42) // expected-error {{'alignas' attribute only applies to variables, data members and tag types}}
+ [[assume(true), // expected-error {{'assume' attribute cannot be applied to a declaration}}
+ carries_dependency, // expected-error {{'carries_dependency' attribute only applies to parameters, Objective-C methods, and functions}}
+ fallthrough, // expected-error {{'fallthrough' attribute cannot be applied to a declaration}}
+ likely, // expected-error {{'likely' attribute cannot be applied to a declaration}}
+ unlikely, // expected-error {{'unlikely' attribute cannot be applied to a declaration}}
+ nodiscard, // expected-warning {{'nodiscard' attribute only applies to Objective-C methods, enums, structs, unions, classes, functions, function pointers, and typedefs}}
+ noreturn, // expected-error {{'noreturn' attribute only applies to functions}}
+ no_unique_address]], // expected-error {{'no_unique_address' attribute only applies to non-bit-field non-static data members}}
+ b] = S{0};
+}
+
+}
diff --git a/clang/test/Parser/pragma-unroll.cpp b/clang/test/Parser/pragma-unroll.cpp
index f41bd7a18d5a..19066acddcef 100644
--- a/clang/test/Parser/pragma-unroll.cpp
+++ b/clang/test/Parser/pragma-unroll.cpp
@@ -124,3 +124,32 @@ void test(int *List, int Length) {
#pragma unroll
/* expected-error {{expected statement}} */ }
+
+using size_t = unsigned long long;
+
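+// The unroll count below is value-dependent: with Flag == true it evaluates
+// to 0 at instantiation time, and the pragma must accept that value rather
+// than reject the program (hence the function name).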
+template <bool Flag>
+int FailToBuild(int n) {
+ constexpr int N = 100;
+ auto init = [=]() { return Flag ? n : 0UL; };
+ auto cond = [=](size_t ix) { return Flag ? ix != 0 : ix < 10; };
+ auto iter = [=](size_t ix) {
+ return Flag ? ix & ~(1ULL << __builtin_clzll(ix)) : ix + 1;
+ };
+#pragma unroll Flag ? 0 : N // Ok, allow 0.
+ for (size_t ix = init(); cond(ix); ix = iter(ix)) {
+ n *= n;
+ }
+#pragma GCC unroll Flag ? 0 : N // Ok, allow 0.
+ for (size_t ix = init(); cond(ix); ix = iter(ix)) {
+ n *= n;
+ }
+ return n;
+}
+
+int foo(int n) {
+ return FailToBuild<true>(n);
+}
+
+int bar(int n) {
+ return FailToBuild<false>(n);
+}
diff --git a/clang/test/ParserOpenACC/parse-cache-construct.cpp b/clang/test/ParserOpenACC/parse-cache-construct.cpp
index 1ab2153a68be..f1c71e8b5847 100644
--- a/clang/test/ParserOpenACC/parse-cache-construct.cpp
+++ b/clang/test/ParserOpenACC/parse-cache-construct.cpp
@@ -72,8 +72,6 @@ void use() {
#pragma acc cache(Arrs.MemArr[3].array[1:4])
}
for (int i = 0; i < 10; ++i) {
- // FIXME: Once we have a new array-section type to represent OpenACC as
- // well, change this error message.
// expected-error@+2{{OpenACC sub-array is not allowed here}}
// expected-warning@+1{{OpenACC construct 'cache' not yet implemented, pragma ignored}}
#pragma acc cache(Arrs.MemArr[3:4].array[1:4])
diff --git a/clang/test/Preprocessor/hardware_interference.cpp b/clang/test/Preprocessor/hardware_interference.cpp
new file mode 100644
index 000000000000..f3727aadd32a
--- /dev/null
+++ b/clang/test/Preprocessor/hardware_interference.cpp
@@ -0,0 +1,17 @@
+// RUN: %clang_cc1 -E -dM -D__GCC_CONSTRUCTIVE_SIZE=1000 -D__GCC_DESTRUCTIVE_SIZE=1001 %s -verify -Weverything | FileCheck %s
+// RUN: %clang_cc1 -D__GCC_CONSTRUCTIVE_SIZE=1000 -D__GCC_DESTRUCTIVE_SIZE=1001 %s -verify -Weverything
+// RUN: %clang_cc1 -E -dM -U__GCC_CONSTRUCTIVE_SIZE -U__GCC_DESTRUCTIVE_SIZE %s -verify -Weverything | FileCheck --check-prefix DISABLED %s
+// expected-no-diagnostics
+
+// Validate that we can set a new value on the command line and that we can
+// disable the macro on the command line, in both cases without issuing any
+// diagnostics.
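+//
+// These are the GCC-compatible macros that standard library implementations
+// may consult to define std::hardware_constructive_interference_size and
+// std::hardware_destructive_interference_size in <new>.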
+
+// CHECK: #define __GCC_CONSTRUCTIVE_SIZE 1000
+// CHECK: #define __GCC_DESTRUCTIVE_SIZE 1001
+// DISABLED-NOT: __GCC_CONSTRUCTIVE_SIZE
+// DISABLED-NOT: __GCC_DESTRUCTIVE_SIZE
+
+int main() {
+ return 0;
+}
diff --git a/clang/test/Preprocessor/init-aarch64.c b/clang/test/Preprocessor/init-aarch64.c
index cf96870b27ac..f0845985c9ef 100644
--- a/clang/test/Preprocessor/init-aarch64.c
+++ b/clang/test/Preprocessor/init-aarch64.c
@@ -119,6 +119,8 @@
// AARCH64-NEXT: #define __FP_FAST_FMA 1
// AARCH64-NEXT: #define __FP_FAST_FMAF 1
// AARCH64-NEXT: #define __GCC_ASM_FLAG_OUTPUTS__ 1
+// AARCH64-NEXT: #define __GCC_CONSTRUCTIVE_SIZE {{.+}}
+// AARCH64-NEXT: #define __GCC_DESTRUCTIVE_SIZE {{.+}}
// AARCH64-NEXT: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_1 1
// AARCH64-NEXT: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_16 1
// AARCH64-NEXT: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_2 1
@@ -220,11 +222,11 @@
// AARCH64-NEXT: #define __LONG_MAX__ 9223372036854775807L
// AARCH64-NEXT: #define __LONG_WIDTH__ 64
// AARCH64-NEXT: #define __LP64__ 1
-// AARCH64-NEXT: #define __MEMORY_SCOPE_DEVICE 1
-// AARCH64-NEXT: #define __MEMORY_SCOPE_SINGLE 4
-// AARCH64-NEXT: #define __MEMORY_SCOPE_SYSTEM 0
-// AARCH64-NEXT: #define __MEMORY_SCOPE_WRKGRP 2
-// AARCH64-NEXT: #define __MEMORY_SCOPE_WVFRNT 3
+// AARCH64-NEXT: #define __MEMORY_SCOPE_DEVICE 1
+// AARCH64-NEXT: #define __MEMORY_SCOPE_SINGLE 4
+// AARCH64-NEXT: #define __MEMORY_SCOPE_SYSTEM 0
+// AARCH64-NEXT: #define __MEMORY_SCOPE_WRKGRP 2
+// AARCH64-NEXT: #define __MEMORY_SCOPE_WVFRNT 3
// AARCH64-NEXT: #define __NO_INLINE__ 1
// AARCH64-NEXT: #define __NO_MATH_ERRNO__ 1
// AARCH64-NEXT: #define __OBJC_BOOL_IS_BOOL 0
diff --git a/clang/test/Preprocessor/init.c b/clang/test/Preprocessor/init.c
index c4a55efca6f7..2641fee94023 100644
--- a/clang/test/Preprocessor/init.c
+++ b/clang/test/Preprocessor/init.c
@@ -1,3 +1,10 @@
+// RUN: %clang_cc1 -E -dM < /dev/null | FileCheck -match-full-lines -check-prefix INTERFERENCE %s
+//
+// We purposefully do not test the values produced, only that the macros are
+// predefined to some value.
+// INTERFERENCE:#define __GCC_CONSTRUCTIVE_SIZE {{.+}}
+// INTERFERENCE:#define __GCC_DESTRUCTIVE_SIZE {{.+}}
+
// RUN: %clang_cc1 -E -dM -x assembler-with-cpp < /dev/null | FileCheck -match-full-lines -check-prefix ASM %s
//
// ASM:#define __ASSEMBLER__ 1
@@ -1697,6 +1704,8 @@
// WEBASSEMBLY-NEXT:#define __GCC_ATOMIC_SHORT_LOCK_FREE 2
// WEBASSEMBLY-NEXT:#define __GCC_ATOMIC_TEST_AND_SET_TRUEVAL 1
// WEBASSEMBLY-NEXT:#define __GCC_ATOMIC_WCHAR_T_LOCK_FREE 2
+// WEBASSEMBLY-NEXT:#define __GCC_CONSTRUCTIVE_SIZE {{.+}}
+// WEBASSEMBLY-NEXT:#define __GCC_DESTRUCTIVE_SIZE {{.+}}
// WEBASSEMBLY-NEXT:#define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_1 1
// WEBASSEMBLY-NEXT:#define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_2 1
// WEBASSEMBLY-NEXT:#define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_4 1
@@ -1806,11 +1815,11 @@
// WEBASSEMBLY64-NEXT:#define __LONG_MAX__ 9223372036854775807L
// WEBASSEMBLY64-NEXT:#define __LONG_WIDTH__ 64
// WEBASSEMBLY64-NEXT:#define __LP64__ 1
-// WEBASSEMBLY-NEXT:#define __MEMORY_SCOPE_DEVICE 1
-// WEBASSEMBLY-NEXT:#define __MEMORY_SCOPE_SINGLE 4
-// WEBASSEMBLY-NEXT:#define __MEMORY_SCOPE_SYSTEM 0
-// WEBASSEMBLY-NEXT:#define __MEMORY_SCOPE_WRKGRP 2
-// WEBASSEMBLY-NEXT:#define __MEMORY_SCOPE_WVFRNT 3
+// WEBASSEMBLY-NEXT:#define __MEMORY_SCOPE_DEVICE 1
+// WEBASSEMBLY-NEXT:#define __MEMORY_SCOPE_SINGLE 4
+// WEBASSEMBLY-NEXT:#define __MEMORY_SCOPE_SYSTEM 0
+// WEBASSEMBLY-NEXT:#define __MEMORY_SCOPE_WRKGRP 2
+// WEBASSEMBLY-NEXT:#define __MEMORY_SCOPE_WVFRNT 3
// WEBASSEMBLY-NEXT:#define __NO_INLINE__ 1
// WEBASSEMBLY-NEXT:#define __NO_MATH_ERRNO__ 1
// WEBASSEMBLY-NEXT:#define __OBJC_BOOL_IS_BOOL 0
@@ -2126,11 +2135,11 @@
// AVR:#define __LDBL_MIN__ 1.17549435e-38L
// AVR:#define __LONG_LONG_MAX__ 9223372036854775807LL
// AVR:#define __LONG_MAX__ 2147483647L
-// AVR:#define __MEMORY_SCOPE_DEVICE 1
-// AVR:#define __MEMORY_SCOPE_SINGLE 4
-// AVR:#define __MEMORY_SCOPE_SYSTEM 0
-// AVR:#define __MEMORY_SCOPE_WRKGRP 2
-// AVR:#define __MEMORY_SCOPE_WVFRNT 3
+// AVR:#define __MEMORY_SCOPE_DEVICE 1
+// AVR:#define __MEMORY_SCOPE_SINGLE 4
+// AVR:#define __MEMORY_SCOPE_SYSTEM 0
+// AVR:#define __MEMORY_SCOPE_WRKGRP 2
+// AVR:#define __MEMORY_SCOPE_WVFRNT 3
// AVR:#define __NO_INLINE__ 1
// AVR:#define __ORDER_BIG_ENDIAN__ 4321
// AVR:#define __ORDER_LITTLE_ENDIAN__ 1234
@@ -2422,11 +2431,11 @@
// RISCV32: #define __LITTLE_ENDIAN__ 1
// RISCV32: #define __LONG_LONG_MAX__ 9223372036854775807LL
// RISCV32: #define __LONG_MAX__ 2147483647L
-// RISCV32: #define __MEMORY_SCOPE_DEVICE 1
-// RISCV32: #define __MEMORY_SCOPE_SINGLE 4
-// RISCV32: #define __MEMORY_SCOPE_SYSTEM 0
-// RISCV32: #define __MEMORY_SCOPE_WRKGRP 2
-// RISCV32: #define __MEMORY_SCOPE_WVFRNT 3
+// RISCV32: #define __MEMORY_SCOPE_DEVICE 1
+// RISCV32: #define __MEMORY_SCOPE_SINGLE 4
+// RISCV32: #define __MEMORY_SCOPE_SYSTEM 0
+// RISCV32: #define __MEMORY_SCOPE_WRKGRP 2
+// RISCV32: #define __MEMORY_SCOPE_WVFRNT 3
// RISCV32: #define __NO_INLINE__ 1
// RISCV32: #define __POINTER_WIDTH__ 32
// RISCV32: #define __PRAGMA_REDEFINE_EXTNAME 1
@@ -2634,11 +2643,11 @@
// RISCV64: #define __LONG_LONG_MAX__ 9223372036854775807LL
// RISCV64: #define __LONG_MAX__ 9223372036854775807L
// RISCV64: #define __LP64__ 1
-// RISCV64: #define __MEMORY_SCOPE_DEVICE 1
-// RISCV64: #define __MEMORY_SCOPE_SINGLE 4
-// RISCV64: #define __MEMORY_SCOPE_SYSTEM 0
-// RISCV64: #define __MEMORY_SCOPE_WRKGRP 2
-// RISCV64: #define __MEMORY_SCOPE_WVFRNT 3
+// RISCV64: #define __MEMORY_SCOPE_DEVICE 1
+// RISCV64: #define __MEMORY_SCOPE_SINGLE 4
+// RISCV64: #define __MEMORY_SCOPE_SYSTEM 0
+// RISCV64: #define __MEMORY_SCOPE_WRKGRP 2
+// RISCV64: #define __MEMORY_SCOPE_WVFRNT 3
// RISCV64: #define __NO_INLINE__ 1
// RISCV64: #define __POINTER_WIDTH__ 64
// RISCV64: #define __PRAGMA_REDEFINE_EXTNAME 1
diff --git a/clang/test/Preprocessor/predefined-win-macros.c b/clang/test/Preprocessor/predefined-win-macros.c
index b830dc39d477..14e2f584bd09 100644
--- a/clang/test/Preprocessor/predefined-win-macros.c
+++ b/clang/test/Preprocessor/predefined-win-macros.c
@@ -3,7 +3,7 @@
// RUN: %clang_cc1 %s -x c++ -E -dM -triple x86_64-pc-win32 -fms-extensions -fms-compatibility \
// RUN: -fms-compatibility-version=19.00 -std=c++14 -o - | FileCheck -match-full-lines %s --check-prefix=CHECK-MS64
// RUN: %clang_cc1 %s -x c++ -E -dM -triple x86_64-pc-win32 -fms-extensions -fms-compatibility \
-// RUN: -fms-compatibility-version=19.00 -std=c++14 -o - | grep GCC | count 5
+// RUN: -fms-compatibility-version=19.00 -std=c++14 -o - | grep GCC | count 7
// CHECK-MS64: #define _INTEGRAL_MAX_BITS 64
// CHECK-MS64: #define _ISO_VOLATILE 1
// CHECK-MS64: #define _MSC_EXTENSIONS 1
@@ -26,7 +26,7 @@
// RUN: %clang_cc1 %s -x c++ -E -dM -triple i686-pc-win32 -fms-extensions -fms-compatibility \
// RUN: -fms-compatibility-version=19.00 -std=c++17 -o - | FileCheck -match-full-lines %s --check-prefix=CHECK-MS
// RUN: %clang_cc1 %s -x c++ -E -dM -triple i686-pc-win32 -fms-extensions -fms-compatibility \
-// RUN: -fms-compatibility-version=19.00 -std=c++17 -o - | grep GCC | count 5
+// RUN: -fms-compatibility-version=19.00 -std=c++17 -o - | grep GCC | count 7
// CHECK-MS: #define _INTEGRAL_MAX_BITS 64
// CHECK-MS: #define _ISO_VOLATILE 1
// CHECK-MS: #define _MSC_EXTENSIONS 1
@@ -39,6 +39,8 @@
// CHECK-MS-NOT: GNU
// CHECK-MS-NOT: GXX
// CHECK-MS: #define __GCC_ASM_FLAG_OUTPUTS__ 1
+// CHECK-MS: #define __GCC_CONSTRUCTIVE_SIZE {{.+}}
+// CHECK-MS: #define __GCC_DESTRUCTIVE_SIZE {{.+}}
// CHECK-MS: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_1 1
// CHECK-MS: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_2 1
// CHECK-MS: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_4 1
diff --git a/clang/test/Sema/constant_builtins_vector.cpp b/clang/test/Sema/constant_builtins_vector.cpp
new file mode 100644
index 000000000000..68620d436fc4
--- /dev/null
+++ b/clang/test/Sema/constant_builtins_vector.cpp
@@ -0,0 +1,723 @@
+// RUN: %clang_cc1 -verify -std=c++2a -fsyntax-only -Wno-bit-int-extension %s
+
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+#define LITTLE_END 1
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+#define LITTLE_END 0
+#else
+#error "huh?"
+#endif
+
+// We also support _BitInt as long as its width is at least 8 bits and a
+// power of 2.
+typedef _BitInt(8) BitInt8;
+typedef _BitInt(32) BitInt32;
+typedef _BitInt(128) BitInt128;
+
+typedef double vector4double __attribute__((__vector_size__(32)));
+typedef float vector4float __attribute__((__vector_size__(16)));
+typedef long long vector4long __attribute__((__vector_size__(32)));
+typedef int vector4int __attribute__((__vector_size__(16)));
+typedef short vector4short __attribute__((__vector_size__(8)));
+typedef char vector4char __attribute__((__vector_size__(4)));
+typedef BitInt8 vector4BitInt8 __attribute__((__vector_size__(4)));
+typedef BitInt32 vector4BitInt32 __attribute__((__vector_size__(16)));
+typedef BitInt128 vector4BitInt128 __attribute__((__vector_size__(64)));
+typedef double vector8double __attribute__((__vector_size__(64)));
+typedef float vector8float __attribute__((__vector_size__(32)));
+typedef long long vector8long __attribute__((__vector_size__(64)));
+typedef int vector8int __attribute__((__vector_size__(32)));
+typedef short vector8short __attribute__((__vector_size__(16)));
+typedef char vector8char __attribute__((__vector_size__(8)));
+typedef BitInt8 vector8BitInt8 __attribute__((__vector_size__(8)));
+typedef BitInt32 vector8BitInt32 __attribute__((__vector_size__(32)));
+typedef BitInt128 vector8BitInt128 __attribute__((__vector_size__(128)));
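+
+// Note that __vector_size__ takes the total size in bytes, so, e.g.,
+// vector4long is 4 x 8 bytes = 32 and vector8BitInt128 is 8 x 16 bytes = 128.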
+
+#define CHECK_NUM(__size, __typeFrom, __typeTo, ...) \
+ constexpr vector##__size##__typeTo \
+ from_##vector##__size##__typeFrom##_to_##vector##__size##__typeTo##_var = \
+ __builtin_convertvector((vector##__size##__typeFrom){__VA_ARGS__}, \
+ vector##__size##__typeTo);
+#define CHECK_TO_ALL_TYPES(__size, __typeFrom, ...) \
+ CHECK_NUM(__size, __typeFrom, double, __VA_ARGS__) \
+ CHECK_NUM(__size, __typeFrom, float, __VA_ARGS__) \
+ CHECK_NUM(__size, __typeFrom, long, __VA_ARGS__) \
+ CHECK_NUM(__size, __typeFrom, int, __VA_ARGS__) \
+ CHECK_NUM(__size, __typeFrom, short, __VA_ARGS__) \
+ CHECK_NUM(__size, __typeFrom, char, __VA_ARGS__) \
+ CHECK_NUM(__size, __typeFrom, BitInt8, __VA_ARGS__) \
+ CHECK_NUM(__size, __typeFrom, BitInt32, __VA_ARGS__) \
+ CHECK_NUM(__size, __typeFrom, BitInt128, __VA_ARGS__) \
+ static_assert( \
+ __builtin_bit_cast( \
+ unsigned, \
+ __builtin_shufflevector( \
+ from_vector##__size##__typeFrom##_to_vector##__size##char_var, \
+ from_vector##__size##__typeFrom##_to_vector##__size##char_var, \
+ 0, 1, 2, 3)) == (LITTLE_END ? 0x03020100 : 0x00010203)); \
+ static_assert( \
+ __builtin_bit_cast( \
+ unsigned long long, \
+ __builtin_shufflevector( \
+ from_vector##__size##__typeFrom##_to_vector##__size##short_var, \
+ from_vector##__size##__typeFrom##_to_vector##__size##short_var, \
+ 0, 1, 2, 3)) == \
+ (LITTLE_END ? 0x0003000200010000 : 0x0000000100020003));
+
+#define CHECK_ALL_COMBINATIONS(__size, ...) \
+ CHECK_TO_ALL_TYPES(__size, double, __VA_ARGS__) \
+ CHECK_TO_ALL_TYPES(__size, float, __VA_ARGS__) \
+ CHECK_TO_ALL_TYPES(__size, long, __VA_ARGS__) \
+ CHECK_TO_ALL_TYPES(__size, int, __VA_ARGS__) \
+ CHECK_TO_ALL_TYPES(__size, short, __VA_ARGS__) \
+ CHECK_TO_ALL_TYPES(__size, char, __VA_ARGS__) \
+ CHECK_TO_ALL_TYPES(__size, BitInt8, __VA_ARGS__) \
+ CHECK_TO_ALL_TYPES(__size, BitInt32, __VA_ARGS__) \
+ CHECK_TO_ALL_TYPES(__size, BitInt128, __VA_ARGS__)
+
+// The code below is the expansion of these macros. Use them to regenerate the
+// test cases below.
+// CHECK_ALL_COMBINATIONS(4, 0, 1, 2, 3);
+// CHECK_ALL_COMBINATIONS(8, 0, 1, 2, 3, 4, 5, 6, 7);
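+//
+// Each expansion converts {0, 1, ...} from one element type to every other,
+// then bit_casts a shuffle of the resulting char and short vectors to an
+// integer and compares it against the endianness-appropriate byte pattern.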
+
+constexpr vector4double from_vector4double_to_vector4double_var =
+ __builtin_convertvector((vector4double){0, 1, 2, 3}, vector4double);
+constexpr vector4float from_vector4double_to_vector4float_var =
+ __builtin_convertvector((vector4double){0, 1, 2, 3}, vector4float);
+constexpr vector4long from_vector4double_to_vector4long_var =
+ __builtin_convertvector((vector4double){0, 1, 2, 3}, vector4long);
+constexpr vector4int from_vector4double_to_vector4int_var =
+ __builtin_convertvector((vector4double){0, 1, 2, 3}, vector4int);
+constexpr vector4short from_vector4double_to_vector4short_var =
+ __builtin_convertvector((vector4double){0, 1, 2, 3}, vector4short);
+constexpr vector4char from_vector4double_to_vector4char_var =
+ __builtin_convertvector((vector4double){0, 1, 2, 3}, vector4char);
+constexpr vector4BitInt8 from_vector4double_to_vector4BitInt8_var =
+ __builtin_convertvector((vector4double){0, 1, 2, 3}, vector4BitInt8);
+constexpr vector4BitInt32 from_vector4double_to_vector4BitInt32_var =
+ __builtin_convertvector((vector4double){0, 1, 2, 3}, vector4BitInt32);
+constexpr vector4BitInt128 from_vector4double_to_vector4BitInt128_var =
+ __builtin_convertvector((vector4double){0, 1, 2, 3}, vector4BitInt128);
+static_assert(__builtin_bit_cast(
+ unsigned,
+ __builtin_shufflevector(from_vector4double_to_vector4char_var,
+ from_vector4double_to_vector4char_var,
+ 0, 1, 2, 3)) ==
+                  (LITTLE_END ? 0x03020100 : 0x00010203));
+static_assert(__builtin_bit_cast(unsigned long long,
+ __builtin_shufflevector(
+ from_vector4double_to_vector4short_var,
+ from_vector4double_to_vector4short_var, 0,
+ 1, 2, 3)) ==
+                  (LITTLE_END ? 0x0003000200010000 : 0x0000000100020003));
+constexpr vector4double from_vector4float_to_vector4double_var =
+ __builtin_convertvector((vector4float){0, 1, 2, 3}, vector4double);
+constexpr vector4float from_vector4float_to_vector4float_var =
+ __builtin_convertvector((vector4float){0, 1, 2, 3}, vector4float);
+constexpr vector4long from_vector4float_to_vector4long_var =
+ __builtin_convertvector((vector4float){0, 1, 2, 3}, vector4long);
+constexpr vector4int from_vector4float_to_vector4int_var =
+ __builtin_convertvector((vector4float){0, 1, 2, 3}, vector4int);
+constexpr vector4short from_vector4float_to_vector4short_var =
+ __builtin_convertvector((vector4float){0, 1, 2, 3}, vector4short);
+constexpr vector4char from_vector4float_to_vector4char_var =
+ __builtin_convertvector((vector4float){0, 1, 2, 3}, vector4char);
+constexpr vector4BitInt8 from_vector4float_to_vector4BitInt8_var =
+ __builtin_convertvector((vector4float){0, 1, 2, 3}, vector4BitInt8);
+constexpr vector4BitInt32 from_vector4float_to_vector4BitInt32_var =
+ __builtin_convertvector((vector4float){0, 1, 2, 3}, vector4BitInt32);
+constexpr vector4BitInt128 from_vector4float_to_vector4BitInt128_var =
+ __builtin_convertvector((vector4float){0, 1, 2, 3}, vector4BitInt128);
+static_assert(__builtin_bit_cast(unsigned,
+ __builtin_shufflevector(
+ from_vector4float_to_vector4char_var,
+ from_vector4float_to_vector4char_var, 0, 1,
+                                          2, 3)) == (LITTLE_END ? 0x03020100 : 0x00010203));
+static_assert(__builtin_bit_cast(
+ unsigned long long,
+ __builtin_shufflevector(from_vector4float_to_vector4short_var,
+ from_vector4float_to_vector4short_var,
+ 0, 1, 2, 3)) ==
+                  (LITTLE_END ? 0x0003000200010000 : 0x0000000100020003));
+constexpr vector4double from_vector4long_to_vector4double_var =
+ __builtin_convertvector((vector4long){0, 1, 2, 3}, vector4double);
+constexpr vector4float from_vector4long_to_vector4float_var =
+ __builtin_convertvector((vector4long){0, 1, 2, 3}, vector4float);
+constexpr vector4long from_vector4long_to_vector4long_var =
+ __builtin_convertvector((vector4long){0, 1, 2, 3}, vector4long);
+constexpr vector4int from_vector4long_to_vector4int_var =
+ __builtin_convertvector((vector4long){0, 1, 2, 3}, vector4int);
+constexpr vector4short from_vector4long_to_vector4short_var =
+ __builtin_convertvector((vector4long){0, 1, 2, 3}, vector4short);
+constexpr vector4char from_vector4long_to_vector4char_var =
+ __builtin_convertvector((vector4long){0, 1, 2, 3}, vector4char);
+constexpr vector4BitInt8 from_vector4long_to_vector4BitInt8_var =
+ __builtin_convertvector((vector4long){0, 1, 2, 3}, vector4BitInt8);
+constexpr vector4BitInt32 from_vector4long_to_vector4BitInt32_var =
+ __builtin_convertvector((vector4long){0, 1, 2, 3}, vector4BitInt32);
+constexpr vector4BitInt128 from_vector4long_to_vector4BitInt128_var =
+ __builtin_convertvector((vector4long){0, 1, 2, 3}, vector4BitInt128);
+static_assert(__builtin_bit_cast(unsigned,
+ __builtin_shufflevector(
+ from_vector4long_to_vector4char_var,
+ from_vector4long_to_vector4char_var, 0, 1,
+                                          2, 3)) == (LITTLE_END ? 0x03020100 : 0x00010203));
+static_assert(__builtin_bit_cast(
+ unsigned long long,
+ __builtin_shufflevector(from_vector4long_to_vector4short_var,
+ from_vector4long_to_vector4short_var,
+ 0, 1, 2, 3)) ==
+                  (LITTLE_END ? 0x0003000200010000 : 0x0000000100020003));
+constexpr vector4double from_vector4int_to_vector4double_var =
+ __builtin_convertvector((vector4int){0, 1, 2, 3}, vector4double);
+constexpr vector4float from_vector4int_to_vector4float_var =
+ __builtin_convertvector((vector4int){0, 1, 2, 3}, vector4float);
+constexpr vector4long from_vector4int_to_vector4long_var =
+ __builtin_convertvector((vector4int){0, 1, 2, 3}, vector4long);
+constexpr vector4int from_vector4int_to_vector4int_var =
+ __builtin_convertvector((vector4int){0, 1, 2, 3}, vector4int);
+constexpr vector4short from_vector4int_to_vector4short_var =
+ __builtin_convertvector((vector4int){0, 1, 2, 3}, vector4short);
+constexpr vector4char from_vector4int_to_vector4char_var =
+ __builtin_convertvector((vector4int){0, 1, 2, 3}, vector4char);
+constexpr vector4BitInt8 from_vector4int_to_vector4BitInt8_var =
+ __builtin_convertvector((vector4int){0, 1, 2, 3}, vector4BitInt8);
+constexpr vector4BitInt32 from_vector4int_to_vector4BitInt32_var =
+ __builtin_convertvector((vector4int){0, 1, 2, 3}, vector4BitInt32);
+constexpr vector4BitInt128 from_vector4int_to_vector4BitInt128_var =
+ __builtin_convertvector((vector4int){0, 1, 2, 3}, vector4BitInt128);
+static_assert(__builtin_bit_cast(unsigned,
+ __builtin_shufflevector(
+ from_vector4int_to_vector4char_var,
+ from_vector4int_to_vector4char_var, 0, 1,
+                                          2, 3)) == (LITTLE_END ? 0x03020100 : 0x00010203));
+static_assert(__builtin_bit_cast(
+ unsigned long long,
+ __builtin_shufflevector(from_vector4int_to_vector4short_var,
+ from_vector4int_to_vector4short_var,
+ 0, 1, 2, 3)) ==
+                  (LITTLE_END ? 0x0003000200010000 : 0x0000000100020003));
+constexpr vector4double from_vector4short_to_vector4double_var =
+ __builtin_convertvector((vector4short){0, 1, 2, 3}, vector4double);
+constexpr vector4float from_vector4short_to_vector4float_var =
+ __builtin_convertvector((vector4short){0, 1, 2, 3}, vector4float);
+constexpr vector4long from_vector4short_to_vector4long_var =
+ __builtin_convertvector((vector4short){0, 1, 2, 3}, vector4long);
+constexpr vector4int from_vector4short_to_vector4int_var =
+ __builtin_convertvector((vector4short){0, 1, 2, 3}, vector4int);
+constexpr vector4short from_vector4short_to_vector4short_var =
+ __builtin_convertvector((vector4short){0, 1, 2, 3}, vector4short);
+constexpr vector4char from_vector4short_to_vector4char_var =
+ __builtin_convertvector((vector4short){0, 1, 2, 3}, vector4char);
+constexpr vector4BitInt8 from_vector4short_to_vector4BitInt8_var =
+ __builtin_convertvector((vector4short){0, 1, 2, 3}, vector4BitInt8);
+constexpr vector4BitInt32 from_vector4short_to_vector4BitInt32_var =
+ __builtin_convertvector((vector4short){0, 1, 2, 3}, vector4BitInt32);
+constexpr vector4BitInt128 from_vector4short_to_vector4BitInt128_var =
+ __builtin_convertvector((vector4short){0, 1, 2, 3}, vector4BitInt128);
+static_assert(__builtin_bit_cast(unsigned,
+ __builtin_shufflevector(
+ from_vector4short_to_vector4char_var,
+ from_vector4short_to_vector4char_var, 0, 1,
+                                          2, 3)) == (LITTLE_END ? 0x03020100 : 0x00010203));
+static_assert(__builtin_bit_cast(
+ unsigned long long,
+ __builtin_shufflevector(from_vector4short_to_vector4short_var,
+ from_vector4short_to_vector4short_var,
+ 0, 1, 2, 3)) ==
+                  (LITTLE_END ? 0x0003000200010000 : 0x0000000100020003));
+constexpr vector4double from_vector4char_to_vector4double_var =
+ __builtin_convertvector((vector4char){0, 1, 2, 3}, vector4double);
+constexpr vector4float from_vector4char_to_vector4float_var =
+ __builtin_convertvector((vector4char){0, 1, 2, 3}, vector4float);
+constexpr vector4long from_vector4char_to_vector4long_var =
+ __builtin_convertvector((vector4char){0, 1, 2, 3}, vector4long);
+constexpr vector4int from_vector4char_to_vector4int_var =
+ __builtin_convertvector((vector4char){0, 1, 2, 3}, vector4int);
+constexpr vector4short from_vector4char_to_vector4short_var =
+ __builtin_convertvector((vector4char){0, 1, 2, 3}, vector4short);
+constexpr vector4char from_vector4char_to_vector4char_var =
+ __builtin_convertvector((vector4char){0, 1, 2, 3}, vector4char);
+constexpr vector4BitInt8 from_vector4char_to_vector4BitInt8_var =
+ __builtin_convertvector((vector4char){0, 1, 2, 3}, vector4BitInt8);
+constexpr vector4BitInt32 from_vector4char_to_vector4BitInt32_var =
+ __builtin_convertvector((vector4char){0, 1, 2, 3}, vector4BitInt32);
+constexpr vector4BitInt128 from_vector4char_to_vector4BitInt128_var =
+ __builtin_convertvector((vector4char){0, 1, 2, 3}, vector4BitInt128);
+static_assert(__builtin_bit_cast(unsigned,
+ __builtin_shufflevector(
+ from_vector4char_to_vector4char_var,
+ from_vector4char_to_vector4char_var, 0, 1,
+                                          2, 3)) == (LITTLE_END ? 0x03020100 : 0x00010203));
+static_assert(__builtin_bit_cast(
+ unsigned long long,
+ __builtin_shufflevector(from_vector4char_to_vector4short_var,
+ from_vector4char_to_vector4short_var,
+ 0, 1, 2, 3)) ==
+                  (LITTLE_END ? 0x0003000200010000 : 0x0000000100020003));
+constexpr vector4double from_vector4BitInt8_to_vector4double_var =
+ __builtin_convertvector((vector4BitInt8){0, 1, 2, 3}, vector4double);
+constexpr vector4float from_vector4BitInt8_to_vector4float_var =
+ __builtin_convertvector((vector4BitInt8){0, 1, 2, 3}, vector4float);
+constexpr vector4long from_vector4BitInt8_to_vector4long_var =
+ __builtin_convertvector((vector4BitInt8){0, 1, 2, 3}, vector4long);
+constexpr vector4int from_vector4BitInt8_to_vector4int_var =
+ __builtin_convertvector((vector4BitInt8){0, 1, 2, 3}, vector4int);
+constexpr vector4short from_vector4BitInt8_to_vector4short_var =
+ __builtin_convertvector((vector4BitInt8){0, 1, 2, 3}, vector4short);
+constexpr vector4char from_vector4BitInt8_to_vector4char_var =
+ __builtin_convertvector((vector4BitInt8){0, 1, 2, 3}, vector4char);
+constexpr vector4BitInt8 from_vector4BitInt8_to_vector4BitInt8_var =
+ __builtin_convertvector((vector4BitInt8){0, 1, 2, 3}, vector4BitInt8);
+constexpr vector4BitInt32 from_vector4BitInt8_to_vector4BitInt32_var =
+ __builtin_convertvector((vector4BitInt8){0, 1, 2, 3}, vector4BitInt32);
+constexpr vector4BitInt128 from_vector4BitInt8_to_vector4BitInt128_var =
+ __builtin_convertvector((vector4BitInt8){0, 1, 2, 3}, vector4BitInt128);
+static_assert(__builtin_bit_cast(unsigned,
+ __builtin_shufflevector(
+ from_vector4BitInt8_to_vector4char_var,
+ from_vector4BitInt8_to_vector4char_var, 0,
+ 1, 2, 3)) ==
+                  (LITTLE_END ? 0x03020100 : 0x00010203));
+static_assert(__builtin_bit_cast(unsigned long long,
+ __builtin_shufflevector(
+ from_vector4BitInt8_to_vector4short_var,
+ from_vector4BitInt8_to_vector4short_var, 0,
+ 1, 2, 3)) ==
+                  (LITTLE_END ? 0x0003000200010000 : 0x0000000100020003));
+constexpr vector4double from_vector4BitInt32_to_vector4double_var =
+ __builtin_convertvector((vector4BitInt32){0, 1, 2, 3}, vector4double);
+constexpr vector4float from_vector4BitInt32_to_vector4float_var =
+ __builtin_convertvector((vector4BitInt32){0, 1, 2, 3}, vector4float);
+constexpr vector4long from_vector4BitInt32_to_vector4long_var =
+ __builtin_convertvector((vector4BitInt32){0, 1, 2, 3}, vector4long);
+constexpr vector4int from_vector4BitInt32_to_vector4int_var =
+ __builtin_convertvector((vector4BitInt32){0, 1, 2, 3}, vector4int);
+constexpr vector4short from_vector4BitInt32_to_vector4short_var =
+ __builtin_convertvector((vector4BitInt32){0, 1, 2, 3}, vector4short);
+constexpr vector4char from_vector4BitInt32_to_vector4char_var =
+ __builtin_convertvector((vector4BitInt32){0, 1, 2, 3}, vector4char);
+constexpr vector4BitInt8 from_vector4BitInt32_to_vector4BitInt8_var =
+ __builtin_convertvector((vector4BitInt32){0, 1, 2, 3}, vector4BitInt8);
+constexpr vector4BitInt32 from_vector4BitInt32_to_vector4BitInt32_var =
+ __builtin_convertvector((vector4BitInt32){0, 1, 2, 3}, vector4BitInt32);
+constexpr vector4BitInt128 from_vector4BitInt32_to_vector4BitInt128_var =
+ __builtin_convertvector((vector4BitInt32){0, 1, 2, 3}, vector4BitInt128);
+static_assert(__builtin_bit_cast(unsigned,
+ __builtin_shufflevector(
+ from_vector4BitInt32_to_vector4char_var,
+ from_vector4BitInt32_to_vector4char_var, 0,
+ 1, 2, 3)) ==
+                  (LITTLE_END ? 0x03020100 : 0x00010203));
+static_assert(__builtin_bit_cast(unsigned long long,
+ __builtin_shufflevector(
+ from_vector4BitInt32_to_vector4short_var,
+ from_vector4BitInt32_to_vector4short_var,
+ 0, 1, 2, 3)) ==
+                  (LITTLE_END ? 0x0003000200010000 : 0x0000000100020003));
+constexpr vector4double from_vector4BitInt128_to_vector4double_var =
+ __builtin_convertvector((vector4BitInt128){0, 1, 2, 3}, vector4double);
+constexpr vector4float from_vector4BitInt128_to_vector4float_var =
+ __builtin_convertvector((vector4BitInt128){0, 1, 2, 3}, vector4float);
+constexpr vector4long from_vector4BitInt128_to_vector4long_var =
+ __builtin_convertvector((vector4BitInt128){0, 1, 2, 3}, vector4long);
+constexpr vector4int from_vector4BitInt128_to_vector4int_var =
+ __builtin_convertvector((vector4BitInt128){0, 1, 2, 3}, vector4int);
+constexpr vector4short from_vector4BitInt128_to_vector4short_var =
+ __builtin_convertvector((vector4BitInt128){0, 1, 2, 3}, vector4short);
+constexpr vector4char from_vector4BitInt128_to_vector4char_var =
+ __builtin_convertvector((vector4BitInt128){0, 1, 2, 3}, vector4char);
+constexpr vector4BitInt8 from_vector4BitInt128_to_vector4BitInt8_var =
+ __builtin_convertvector((vector4BitInt128){0, 1, 2, 3}, vector4BitInt8);
+constexpr vector4BitInt32 from_vector4BitInt128_to_vector4BitInt32_var =
+ __builtin_convertvector((vector4BitInt128){0, 1, 2, 3}, vector4BitInt32);
+constexpr vector4BitInt128 from_vector4BitInt128_to_vector4BitInt128_var =
+ __builtin_convertvector((vector4BitInt128){0, 1, 2, 3}, vector4BitInt128);
+static_assert(__builtin_bit_cast(unsigned,
+ __builtin_shufflevector(
+ from_vector4BitInt128_to_vector4char_var,
+ from_vector4BitInt128_to_vector4char_var,
+ 0, 1, 2, 3)) ==
+                  (LITTLE_END ? 0x03020100 : 0x00010203));
+static_assert(__builtin_bit_cast(unsigned long long,
+ __builtin_shufflevector(
+ from_vector4BitInt128_to_vector4short_var,
+ from_vector4BitInt128_to_vector4short_var,
+ 0, 1, 2, 3)) ==
+                  (LITTLE_END ? 0x0003000200010000 : 0x0000000100020003));
+;
+constexpr vector8double from_vector8double_to_vector8double_var =
+ __builtin_convertvector((vector8double){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8double);
+constexpr vector8float from_vector8double_to_vector8float_var =
+ __builtin_convertvector((vector8double){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8float);
+constexpr vector8long from_vector8double_to_vector8long_var =
+ __builtin_convertvector((vector8double){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8long);
+constexpr vector8int from_vector8double_to_vector8int_var =
+ __builtin_convertvector((vector8double){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8int);
+constexpr vector8short from_vector8double_to_vector8short_var =
+ __builtin_convertvector((vector8double){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8short);
+constexpr vector8char from_vector8double_to_vector8char_var =
+ __builtin_convertvector((vector8double){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8char);
+constexpr vector8BitInt8 from_vector8double_to_vector8BitInt8_var =
+ __builtin_convertvector((vector8double){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8BitInt8);
+constexpr vector8BitInt32 from_vector8double_to_vector8BitInt32_var =
+ __builtin_convertvector((vector8double){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8BitInt32);
+constexpr vector8BitInt128 from_vector8double_to_vector8BitInt128_var =
+ __builtin_convertvector((vector8double){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8BitInt128);
+static_assert(__builtin_bit_cast(
+ unsigned,
+ __builtin_shufflevector(from_vector8double_to_vector8char_var,
+ from_vector8double_to_vector8char_var,
+ 0, 1, 2, 3)) ==
+                  (LITTLE_END ? 0x03020100 : 0x00010203));
+static_assert(__builtin_bit_cast(unsigned long long,
+ __builtin_shufflevector(
+ from_vector8double_to_vector8short_var,
+ from_vector8double_to_vector8short_var, 0,
+ 1, 2, 3)) ==
+                  (LITTLE_END ? 0x0003000200010000 : 0x0000000100020003));
+constexpr vector8double from_vector8float_to_vector8double_var =
+ __builtin_convertvector((vector8float){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8double);
+constexpr vector8float from_vector8float_to_vector8float_var =
+ __builtin_convertvector((vector8float){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8float);
+constexpr vector8long from_vector8float_to_vector8long_var =
+ __builtin_convertvector((vector8float){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8long);
+constexpr vector8int from_vector8float_to_vector8int_var =
+ __builtin_convertvector((vector8float){0, 1, 2, 3, 4, 5, 6, 7}, vector8int);
+constexpr vector8short from_vector8float_to_vector8short_var =
+ __builtin_convertvector((vector8float){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8short);
+constexpr vector8char from_vector8float_to_vector8char_var =
+ __builtin_convertvector((vector8float){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8char);
+constexpr vector8BitInt8 from_vector8float_to_vector8BitInt8_var =
+ __builtin_convertvector((vector8float){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8BitInt8);
+constexpr vector8BitInt32 from_vector8float_to_vector8BitInt32_var =
+ __builtin_convertvector((vector8float){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8BitInt32);
+constexpr vector8BitInt128 from_vector8float_to_vector8BitInt128_var =
+ __builtin_convertvector((vector8float){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8BitInt128);
+static_assert(__builtin_bit_cast(unsigned,
+ __builtin_shufflevector(
+ from_vector8float_to_vector8char_var,
+ from_vector8float_to_vector8char_var, 0, 1,
+                                          2, 3)) == (LITTLE_END ? 0x03020100 : 0x00010203));
+static_assert(__builtin_bit_cast(
+ unsigned long long,
+ __builtin_shufflevector(from_vector8float_to_vector8short_var,
+ from_vector8float_to_vector8short_var,
+ 0, 1, 2, 3)) ==
+                  (LITTLE_END ? 0x0003000200010000 : 0x0000000100020003));
+constexpr vector8double from_vector8long_to_vector8double_var =
+ __builtin_convertvector((vector8long){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8double);
+constexpr vector8float from_vector8long_to_vector8float_var =
+ __builtin_convertvector((vector8long){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8float);
+constexpr vector8long from_vector8long_to_vector8long_var =
+ __builtin_convertvector((vector8long){0, 1, 2, 3, 4, 5, 6, 7}, vector8long);
+constexpr vector8int from_vector8long_to_vector8int_var =
+ __builtin_convertvector((vector8long){0, 1, 2, 3, 4, 5, 6, 7}, vector8int);
+constexpr vector8short from_vector8long_to_vector8short_var =
+ __builtin_convertvector((vector8long){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8short);
+constexpr vector8char from_vector8long_to_vector8char_var =
+ __builtin_convertvector((vector8long){0, 1, 2, 3, 4, 5, 6, 7}, vector8char);
+constexpr vector8BitInt8 from_vector8long_to_vector8BitInt8_var =
+ __builtin_convertvector((vector8long){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8BitInt8);
+constexpr vector8BitInt32 from_vector8long_to_vector8BitInt32_var =
+ __builtin_convertvector((vector8long){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8BitInt32);
+constexpr vector8BitInt128 from_vector8long_to_vector8BitInt128_var =
+ __builtin_convertvector((vector8long){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8BitInt128);
+static_assert(__builtin_bit_cast(unsigned,
+ __builtin_shufflevector(
+ from_vector8long_to_vector8char_var,
+ from_vector8long_to_vector8char_var, 0, 1,
+                                          2, 3)) == (LITTLE_END ? 0x03020100 : 0x00010203));
+static_assert(__builtin_bit_cast(
+ unsigned long long,
+ __builtin_shufflevector(from_vector8long_to_vector8short_var,
+ from_vector8long_to_vector8short_var,
+ 0, 1, 2, 3)) ==
+                  (LITTLE_END ? 0x0003000200010000 : 0x0000000100020003));
+constexpr vector8double from_vector8int_to_vector8double_var =
+ __builtin_convertvector((vector8int){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8double);
+constexpr vector8float from_vector8int_to_vector8float_var =
+ __builtin_convertvector((vector8int){0, 1, 2, 3, 4, 5, 6, 7}, vector8float);
+constexpr vector8long from_vector8int_to_vector8long_var =
+ __builtin_convertvector((vector8int){0, 1, 2, 3, 4, 5, 6, 7}, vector8long);
+constexpr vector8int from_vector8int_to_vector8int_var =
+ __builtin_convertvector((vector8int){0, 1, 2, 3, 4, 5, 6, 7}, vector8int);
+constexpr vector8short from_vector8int_to_vector8short_var =
+ __builtin_convertvector((vector8int){0, 1, 2, 3, 4, 5, 6, 7}, vector8short);
+constexpr vector8char from_vector8int_to_vector8char_var =
+ __builtin_convertvector((vector8int){0, 1, 2, 3, 4, 5, 6, 7}, vector8char);
+constexpr vector8BitInt8 from_vector8int_to_vector8BitInt8_var =
+ __builtin_convertvector((vector8int){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8BitInt8);
+constexpr vector8BitInt32 from_vector8int_to_vector8BitInt32_var =
+ __builtin_convertvector((vector8int){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8BitInt32);
+constexpr vector8BitInt128 from_vector8int_to_vector8BitInt128_var =
+ __builtin_convertvector((vector8int){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8BitInt128);
+static_assert(__builtin_bit_cast(unsigned,
+ __builtin_shufflevector(
+ from_vector8int_to_vector8char_var,
+ from_vector8int_to_vector8char_var, 0, 1,
+                                          2, 3)) == (LITTLE_END ? 0x03020100 : 0x00010203));
+static_assert(__builtin_bit_cast(
+ unsigned long long,
+ __builtin_shufflevector(from_vector8int_to_vector8short_var,
+ from_vector8int_to_vector8short_var,
+ 0, 1, 2, 3)) ==
+                  (LITTLE_END ? 0x0003000200010000 : 0x0000000100020003));
+constexpr vector8double from_vector8short_to_vector8double_var =
+ __builtin_convertvector((vector8short){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8double);
+constexpr vector8float from_vector8short_to_vector8float_var =
+ __builtin_convertvector((vector8short){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8float);
+constexpr vector8long from_vector8short_to_vector8long_var =
+ __builtin_convertvector((vector8short){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8long);
+constexpr vector8int from_vector8short_to_vector8int_var =
+ __builtin_convertvector((vector8short){0, 1, 2, 3, 4, 5, 6, 7}, vector8int);
+constexpr vector8short from_vector8short_to_vector8short_var =
+ __builtin_convertvector((vector8short){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8short);
+constexpr vector8char from_vector8short_to_vector8char_var =
+ __builtin_convertvector((vector8short){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8char);
+constexpr vector8BitInt8 from_vector8short_to_vector8BitInt8_var =
+ __builtin_convertvector((vector8short){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8BitInt8);
+constexpr vector8BitInt32 from_vector8short_to_vector8BitInt32_var =
+ __builtin_convertvector((vector8short){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8BitInt32);
+constexpr vector8BitInt128 from_vector8short_to_vector8BitInt128_var =
+ __builtin_convertvector((vector8short){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8BitInt128);
+static_assert(__builtin_bit_cast(unsigned,
+ __builtin_shufflevector(
+ from_vector8short_to_vector8char_var,
+ from_vector8short_to_vector8char_var, 0, 1,
+                                          2, 3)) == (LITTLE_END ? 0x03020100 : 0x00010203));
+static_assert(__builtin_bit_cast(
+ unsigned long long,
+ __builtin_shufflevector(from_vector8short_to_vector8short_var,
+ from_vector8short_to_vector8short_var,
+ 0, 1, 2, 3)) ==
+                  (LITTLE_END ? 0x0003000200010000 : 0x0000000100020003));
+constexpr vector8double from_vector8char_to_vector8double_var =
+ __builtin_convertvector((vector8char){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8double);
+constexpr vector8float from_vector8char_to_vector8float_var =
+ __builtin_convertvector((vector8char){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8float);
+constexpr vector8long from_vector8char_to_vector8long_var =
+ __builtin_convertvector((vector8char){0, 1, 2, 3, 4, 5, 6, 7}, vector8long);
+constexpr vector8int from_vector8char_to_vector8int_var =
+ __builtin_convertvector((vector8char){0, 1, 2, 3, 4, 5, 6, 7}, vector8int);
+constexpr vector8short from_vector8char_to_vector8short_var =
+ __builtin_convertvector((vector8char){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8short);
+constexpr vector8char from_vector8char_to_vector8char_var =
+ __builtin_convertvector((vector8char){0, 1, 2, 3, 4, 5, 6, 7}, vector8char);
+constexpr vector8BitInt8 from_vector8char_to_vector8BitInt8_var =
+ __builtin_convertvector((vector8char){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8BitInt8);
+constexpr vector8BitInt32 from_vector8char_to_vector8BitInt32_var =
+ __builtin_convertvector((vector8char){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8BitInt32);
+constexpr vector8BitInt128 from_vector8char_to_vector8BitInt128_var =
+ __builtin_convertvector((vector8char){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8BitInt128);
+static_assert(__builtin_bit_cast(unsigned,
+ __builtin_shufflevector(
+ from_vector8char_to_vector8char_var,
+ from_vector8char_to_vector8char_var, 0, 1,
+                                          2, 3)) == (LITTLE_END ? 0x03020100 : 0x00010203));
+static_assert(__builtin_bit_cast(
+ unsigned long long,
+ __builtin_shufflevector(from_vector8char_to_vector8short_var,
+ from_vector8char_to_vector8short_var,
+ 0, 1, 2, 3)) ==
+                  (LITTLE_END ? 0x0003000200010000 : 0x0000000100020003));
+constexpr vector8double from_vector8BitInt8_to_vector8double_var =
+ __builtin_convertvector((vector8BitInt8){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8double);
+constexpr vector8float from_vector8BitInt8_to_vector8float_var =
+ __builtin_convertvector((vector8BitInt8){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8float);
+constexpr vector8long from_vector8BitInt8_to_vector8long_var =
+ __builtin_convertvector((vector8BitInt8){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8long);
+constexpr vector8int from_vector8BitInt8_to_vector8int_var =
+ __builtin_convertvector((vector8BitInt8){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8int);
+constexpr vector8short from_vector8BitInt8_to_vector8short_var =
+ __builtin_convertvector((vector8BitInt8){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8short);
+constexpr vector8char from_vector8BitInt8_to_vector8char_var =
+ __builtin_convertvector((vector8BitInt8){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8char);
+constexpr vector8BitInt8 from_vector8BitInt8_to_vector8BitInt8_var =
+ __builtin_convertvector((vector8BitInt8){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8BitInt8);
+constexpr vector8BitInt32 from_vector8BitInt8_to_vector8BitInt32_var =
+ __builtin_convertvector((vector8BitInt8){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8BitInt32);
+constexpr vector8BitInt128 from_vector8BitInt8_to_vector8BitInt128_var =
+ __builtin_convertvector((vector8BitInt8){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8BitInt128);
+static_assert(__builtin_bit_cast(unsigned,
+ __builtin_shufflevector(
+ from_vector8BitInt8_to_vector8char_var,
+ from_vector8BitInt8_to_vector8char_var, 0,
+ 1, 2, 3)) ==
+                  (LITTLE_END ? 0x03020100 : 0x00010203));
+static_assert(__builtin_bit_cast(unsigned long long,
+ __builtin_shufflevector(
+ from_vector8BitInt8_to_vector8short_var,
+ from_vector8BitInt8_to_vector8short_var, 0,
+ 1, 2, 3)) ==
+                  (LITTLE_END ? 0x0003000200010000 : 0x0000000100020003));
+constexpr vector8double from_vector8BitInt32_to_vector8double_var =
+ __builtin_convertvector((vector8BitInt32){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8double);
+constexpr vector8float from_vector8BitInt32_to_vector8float_var =
+ __builtin_convertvector((vector8BitInt32){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8float);
+constexpr vector8long from_vector8BitInt32_to_vector8long_var =
+ __builtin_convertvector((vector8BitInt32){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8long);
+constexpr vector8int from_vector8BitInt32_to_vector8int_var =
+ __builtin_convertvector((vector8BitInt32){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8int);
+constexpr vector8short from_vector8BitInt32_to_vector8short_var =
+ __builtin_convertvector((vector8BitInt32){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8short);
+constexpr vector8char from_vector8BitInt32_to_vector8char_var =
+ __builtin_convertvector((vector8BitInt32){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8char);
+constexpr vector8BitInt8 from_vector8BitInt32_to_vector8BitInt8_var =
+ __builtin_convertvector((vector8BitInt32){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8BitInt8);
+constexpr vector8BitInt32 from_vector8BitInt32_to_vector8BitInt32_var =
+ __builtin_convertvector((vector8BitInt32){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8BitInt32);
+constexpr vector8BitInt128 from_vector8BitInt32_to_vector8BitInt128_var =
+ __builtin_convertvector((vector8BitInt32){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8BitInt128);
+static_assert(__builtin_bit_cast(unsigned,
+ __builtin_shufflevector(
+ from_vector8BitInt32_to_vector8char_var,
+ from_vector8BitInt32_to_vector8char_var, 0,
+ 1, 2, 3)) ==
+                  (LITTLE_END ? 0x03020100 : 0x00010203));
+static_assert(__builtin_bit_cast(unsigned long long,
+ __builtin_shufflevector(
+ from_vector8BitInt32_to_vector8short_var,
+ from_vector8BitInt32_to_vector8short_var,
+ 0, 1, 2, 3)) ==
+                  (LITTLE_END ? 0x0003000200010000 : 0x0000000100020003));
+constexpr vector8double from_vector8BitInt128_to_vector8double_var =
+ __builtin_convertvector((vector8BitInt128){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8double);
+constexpr vector8float from_vector8BitInt128_to_vector8float_var =
+ __builtin_convertvector((vector8BitInt128){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8float);
+constexpr vector8long from_vector8BitInt128_to_vector8long_var =
+ __builtin_convertvector((vector8BitInt128){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8long);
+constexpr vector8int from_vector8BitInt128_to_vector8int_var =
+ __builtin_convertvector((vector8BitInt128){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8int);
+constexpr vector8short from_vector8BitInt128_to_vector8short_var =
+ __builtin_convertvector((vector8BitInt128){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8short);
+constexpr vector8char from_vector8BitInt128_to_vector8char_var =
+ __builtin_convertvector((vector8BitInt128){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8char);
+constexpr vector8BitInt8 from_vector8BitInt128_to_vector8BitInt8_var =
+ __builtin_convertvector((vector8BitInt128){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8BitInt8);
+constexpr vector8BitInt32 from_vector8BitInt128_to_vector8BitInt32_var =
+ __builtin_convertvector((vector8BitInt128){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8BitInt32);
+constexpr vector8BitInt128 from_vector8BitInt128_to_vector8BitInt128_var =
+ __builtin_convertvector((vector8BitInt128){0, 1, 2, 3, 4, 5, 6, 7},
+ vector8BitInt128);
+static_assert(__builtin_bit_cast(unsigned,
+ __builtin_shufflevector(
+ from_vector8BitInt128_to_vector8char_var,
+ from_vector8BitInt128_to_vector8char_var,
+ 0, 1, 2, 3)) ==
+                  (LITTLE_END ? 0x03020100 : 0x00010203));
+static_assert(__builtin_bit_cast(unsigned long long,
+ __builtin_shufflevector(
+ from_vector8BitInt128_to_vector8short_var,
+ from_vector8BitInt128_to_vector8short_var,
+ 0, 1, 2, 3)) ==
+                  (LITTLE_END ? 0x0003000200010000 : 0x0000000100020003));
+;
+#undef CHECK_ALL_COMBINATIONS
+#undef CHECK_TO_ALL_TYPES
+#undef CHECK_NUM
+
+// Shuffle vector
+constexpr vector4char vector4charConst1 = {0, 1, 2, 3};
+constexpr vector4char vector4charConst2 = {4, 5, 6, 7};
+constexpr vector8char vector8charConst = {8, 9, 10, 11, 12, 13, 14, 15};
+
+constexpr vector4char vectorShuffle1 =
+ __builtin_shufflevector(vector4charConst1, vector4charConst2, 0, 1, 2, 3);
+static_assert(__builtin_bit_cast(unsigned, vectorShuffle1) ==
+ (LITTLE_END ? 0x03020100 : 0x00010203));
+constexpr vector4char vectorShuffle2 =
+ __builtin_shufflevector(vector4charConst1, vector4charConst2, 4, 5, 6, 7);
+static_assert(__builtin_bit_cast(unsigned, vectorShuffle2) ==
+ (LITTLE_END ? 0x07060504 : 0x04050607));
+constexpr vector4char vectorShuffle3 =
+ __builtin_shufflevector(vector4charConst1, vector4charConst2, 0, 2, 4, 6);
+static_assert(__builtin_bit_cast(unsigned, vectorShuffle3) ==
+ (LITTLE_END ? 0x06040200 : 0x00020406));
+constexpr vector8char vectorShuffle4 = __builtin_shufflevector(
+    vector8charConst, vector8charConst, 0, 2, 4, 6, 8, 10, 12, 14);
+static_assert(__builtin_bit_cast(unsigned long long, vectorShuffle4) ==
+ (LITTLE_END ? 0x0E0C0A080E0C0A08 : 0x080A0C0E080A0C0E));
+constexpr vector4char vectorShuffle5 =
+    __builtin_shufflevector(vector8charConst, vector8charConst, 0, 2, 4, 6);
+static_assert(__builtin_bit_cast(unsigned, vectorShuffle5) ==
+ (LITTLE_END ? 0x0E0C0A08 : 0x080A0C0E));
+constexpr vector8char vectorShuffle6 = __builtin_shufflevector(
+ vector4charConst1, vector4charConst2, 0, 2, 4, 6, 1, 3, 5, 7);
+static_assert(__builtin_bit_cast(unsigned long long, vectorShuffle6) ==
+ (LITTLE_END ? 0x0705030106040200 : 0x0002040601030507));
+
+constexpr vector4char
+ vectorShuffleFail1 = // expected-error {{constexpr variable 'vectorShuffleFail1'\
+ must be initialized by a constant expression}}
+ __builtin_shufflevector( // expected-error {{index for __builtin_shufflevector \
+not within the bounds of the input vectors; index of -1 found at position 0 not \
+permitted in a constexpr context.}}
+ vector4charConst1,
+ vector4charConst2, -1, -1, -1, -1);
diff --git a/clang/test/Sema/convertvector.c b/clang/test/Sema/convertvector.c
index 8ae43c3ba3d4..1ff04af90981 100644
--- a/clang/test/Sema/convertvector.c
+++ b/clang/test/Sema/convertvector.c
@@ -15,3 +15,6 @@ vector8float foo3(double x) {
return __builtin_convertvector(x, vector8float); // expected-error {{must be a vector}}
}
+float foo4(float x) {
+ return __builtin_convertvector(x, float); // expected-error {{first argument to __builtin_convertvector must be a vector}}
+}
diff --git a/clang/test/SemaCXX/cxx1z-class-template-argument-deduction.cpp b/clang/test/SemaCXX/cxx1z-class-template-argument-deduction.cpp
index 2f067ea53a50..90404f115c75 100644
--- a/clang/test/SemaCXX/cxx1z-class-template-argument-deduction.cpp
+++ b/clang/test/SemaCXX/cxx1z-class-template-argument-deduction.cpp
@@ -12,14 +12,19 @@ namespace std {
size_t n;
initializer_list();
};
- // FIXME: This should probably not be necessary.
- template<typename T> initializer_list(initializer_list<T>) -> initializer_list<T>;
}
template<typename T> constexpr bool has_type(...) { return false; }
template<typename T> constexpr bool has_type(T&) { return true; }
-std::initializer_list il = {1, 2, 3, 4, 5};
+std::initializer_list il1 = {1, 2, 3, 4, 5};
+auto il2 = std::initializer_list{1, 2, 3, 4};
+auto il3 = std::initializer_list{il1};
+auto il4 = std::initializer_list{il1, il1, il1};
+static_assert(has_type<std::initializer_list<int>>(il1));
+static_assert(has_type<std::initializer_list<int>>(il2));
+static_assert(has_type<std::initializer_list<int>>(il3));
+static_assert(has_type<std::initializer_list<std::initializer_list<int>>>(il4));
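+// A braced list with a single initializer_list<int> argument deduces a copy
+// of that list, while several such arguments deduce a list of lists, as the
+// static_asserts above verify.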
template<typename T> struct vector {
template<typename Iter> vector(Iter, Iter);
diff --git a/clang/test/SemaCXX/member-expr.cpp b/clang/test/SemaCXX/member-expr.cpp
index 0596e40f6c2f..75c9ef0caa2e 100644
--- a/clang/test/SemaCXX/member-expr.cpp
+++ b/clang/test/SemaCXX/member-expr.cpp
@@ -40,8 +40,8 @@ namespace C {
}
void test2(X *xp) {
- xp->::i = 7; // expected-error{{'i' is not a member of class 'X'}}
- xp->C::i = 7; // expected-error{{'C::i' is not a member of class 'X'}}
+ xp->::i = 7; // expected-error{{qualified member access refers to a member in the global namespace}}
+ xp->C::i = 7; // expected-error{{qualified member access refers to a member in namespace 'C'}}
}
diff --git a/clang/test/SemaCXX/type-traits.cpp b/clang/test/SemaCXX/type-traits.cpp
index dee4a29bd2bf..01991887b284 100644
--- a/clang/test/SemaCXX/type-traits.cpp
+++ b/clang/test/SemaCXX/type-traits.cpp
@@ -2509,6 +2509,20 @@ void is_convertible()
static_assert(__is_convertible(FloatWrapper, IntWrapper));
static_assert(__is_convertible(FloatWrapper, float));
static_assert(__is_convertible(float, FloatWrapper));
+ static_assert(__is_convertible(IntWrapper, IntWrapper&&));
+ static_assert(__is_convertible(IntWrapper, const IntWrapper&));
+ static_assert(__is_convertible(IntWrapper, int&&));
+ static_assert(__is_convertible(IntWrapper, const int&));
+ static_assert(__is_convertible(int, IntWrapper&&));
+ static_assert(__is_convertible(int, const IntWrapper&));
+ static_assert(__is_convertible(IntWrapper, FloatWrapper&&));
+ static_assert(__is_convertible(IntWrapper, const FloatWrapper&));
+ static_assert(__is_convertible(FloatWrapper, IntWrapper&&));
+ static_assert(__is_convertible(FloatWrapper, const IntWrapper&&));
+ static_assert(__is_convertible(FloatWrapper, float&&));
+ static_assert(__is_convertible(FloatWrapper, const float&));
+ static_assert(__is_convertible(float, FloatWrapper&&));
+ static_assert(__is_convertible(float, const FloatWrapper&));
}
void is_nothrow_convertible()
@@ -2521,6 +2535,20 @@ void is_nothrow_convertible()
static_assert(!__is_nothrow_convertible(FloatWrapper, IntWrapper));
static_assert(!__is_nothrow_convertible(FloatWrapper, float));
static_assert(__is_nothrow_convertible(float, FloatWrapper));
+ static_assert(__is_nothrow_convertible(IntWrapper, IntWrapper&&));
+ static_assert(__is_nothrow_convertible(IntWrapper, const IntWrapper&));
+ static_assert(__is_nothrow_convertible(IntWrapper, int&&));
+ static_assert(__is_nothrow_convertible(IntWrapper, const int&));
+ static_assert(!__is_nothrow_convertible(int, IntWrapper&&));
+ static_assert(!__is_nothrow_convertible(int, const IntWrapper&));
+ static_assert(!__is_nothrow_convertible(IntWrapper, FloatWrapper&&));
+ static_assert(!__is_nothrow_convertible(IntWrapper, const FloatWrapper&));
+ static_assert(!__is_nothrow_convertible(FloatWrapper, IntWrapper&&));
+ static_assert(!__is_nothrow_convertible(FloatWrapper, const IntWrapper&));
+ static_assert(!__is_nothrow_convertible(FloatWrapper, float&&));
+ static_assert(!__is_nothrow_convertible(FloatWrapper, const float&));
+ static_assert(__is_nothrow_convertible(float, FloatWrapper&&));
+ static_assert(__is_nothrow_convertible(float, const FloatWrapper&));
}
struct FromInt { FromInt(int); };
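
These additions extend the builtins' coverage to reference destination types, matching the std::is_convertible model: the conversion succeeds if an imaginary return statement returning the destination type would be well-formed. The __is_nothrow_convertible lines differ only in additionally requiring the conversion to be noexcept, which is why the user-defined-conversion cases flip to false. The same shape of check expressed through the standard traits, with a hypothetical Wrapper standing in for the test's IntWrapper:

    #include <type_traits>
    struct Wrapper {
      Wrapper(int);          // converting constructor: int -> Wrapper
      operator int() const;  // conversion function: Wrapper -> int
    };
    static_assert(std::is_convertible_v<int, Wrapper&&>);       // rvalue ref binds to a Wrapper temporary
    static_assert(std::is_convertible_v<Wrapper, const int&>);  // const ref binds to the converted int
    static_assert(!std::is_convertible_v<int, int&>);           // a prvalue cannot bind to int&
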
diff --git a/clang/test/SemaCXX/unused.cpp b/clang/test/SemaCXX/unused.cpp
index 0af9e5b68b00..1f40c1b1ca90 100644
--- a/clang/test/SemaCXX/unused.cpp
+++ b/clang/test/SemaCXX/unused.cpp
@@ -102,11 +102,21 @@ namespace PR33839 {
for (auto [x] : a) { // expected-warning {{unused variable '[x]'}}
}
}
- void use() {
+ void use() {
f<int>(); // expected-note {{instantiation of}}
g<true>();
g<false>();
h<int>(); // expected-note {{instantiation of}}
}
}
+
+namespace maybe_unused_binding {
+
+void test() {
+ struct X { int a, b; } x;
+ auto [a [[maybe_unused]], b] = x; // expected-warning {{an attribute specifier sequence attached to a structured binding declaration is a C++2c extension}}
+}
+
+}
+
#endif
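
The new maybe_unused_binding namespace covers the C++2c extension that lets an attribute-specifier-seq attach to an individual structured binding, and checks that earlier language modes diagnose it as an extension. A sketch of the accepted form under -std=c++2c, with hypothetical names:

    struct Pair { int first, second; };
    void demo(Pair p) {
      // [[maybe_unused]] attaches to 'value' only, not to the whole declaration.
      auto [value [[maybe_unused]], other] = p;
      (void)other;
    }
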
diff --git a/clang/test/SemaObjC/format-strings-oslog.m b/clang/test/SemaObjC/format-strings-oslog.m
index 20fec93b653b..af5aef3d6179 100644
--- a/clang/test/SemaObjC/format-strings-oslog.m
+++ b/clang/test/SemaObjC/format-strings-oslog.m
@@ -44,15 +44,18 @@ void test_os_log_format(const char *pc, int i, void *p, void *buf) {
}
// Test os_log_format primitive with ObjC string literal format argument.
-void test_objc(const char *pc, int i, void *p, void *buf, NSString *nss) {
+void test_objc(const char *pc, int i, void *p, void *buf, NSString *nss, id obj) {
__builtin_os_log_format(buf, @"");
__builtin_os_log_format(buf, @"%d"); // expected-warning {{more '%' conversions than data arguments}}
__builtin_os_log_format(buf, @"%d", i);
+
__builtin_os_log_format(buf, @"%P", p); // expected-warning {{using '%P' format specifier without precision}}
__builtin_os_log_format(buf, @"%.10P", p);
__builtin_os_log_format(buf, @"%.*P", p); // expected-warning {{field precision should have type 'int', but argument has type 'void *'}}
__builtin_os_log_format(buf, @"%.*P", i, p);
__builtin_os_log_format(buf, @"%.*P", i, i); // expected-warning {{format specifies type 'void *' but the argument has type 'int'}}
+ __builtin_os_log_format(buf, @"%.8P", nss); // expected-warning {{using '%P' format specifier with an Objective-C pointer results in dumping runtime object structure, not object value}}
+ __builtin_os_log_format(buf, @"%.*P", i, obj); // expected-warning {{using '%P' format specifier with an Objective-C pointer results in dumping runtime object structure, not object value}}
__builtin_os_log_format(buf, @"%{private}s", pc);
__builtin_os_log_format(buf, @"%@", nss);
diff --git a/clang/test/SemaOpenCL/vec_step.cl b/clang/test/SemaOpenCL/vec_step.cl
index afb6dc94d92e..c116f09b351f 100644
--- a/clang/test/SemaOpenCL/vec_step.cl
+++ b/clang/test/SemaOpenCL/vec_step.cl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsyntax-only -pedantic -verify %s
+// RUN: %clang_cc1 -fsyntax-only -pedantic -verify %s -fexperimental-new-constant-interpreter
typedef int int2 __attribute__((ext_vector_type(2)));
typedef int int3 __attribute__((ext_vector_type(3)));
diff --git a/clang/test/SemaTemplate/instantiate-function-1.cpp b/clang/test/SemaTemplate/instantiate-function-1.cpp
index a4967264c654..ceef27437748 100644
--- a/clang/test/SemaTemplate/instantiate-function-1.cpp
+++ b/clang/test/SemaTemplate/instantiate-function-1.cpp
@@ -1,7 +1,7 @@
// RUN: %clang_cc1 -fcxx-exceptions -fexceptions -fsyntax-only -verify %s
template<typename T, typename U>
struct X0 {
- void f(T x, U y) {
+ void f(T x, U y) {
(void)(x + y); // expected-error{{invalid operands}}
}
};
@@ -41,7 +41,7 @@ template <typename T> struct X4 {
T f() const {
return; // expected-error{{non-void function 'f' should return a value}}
}
-
+
T g() const {
return 1; // expected-error{{void function 'g' should not return a value}}
}
@@ -64,7 +64,7 @@ template<typename T, typename U, typename V> struct X6 {
// IfStmt
if (t > 0)
return u;
- else {
+ else {
if (t < 0)
return v; // expected-error{{cannot initialize return object of type}}
}
@@ -131,12 +131,12 @@ template<typename T> struct Member0 {
t;
t.f;
t->f;
-
+
T* tp;
tp.f; // expected-error{{member reference base type 'T *' is not a structure or union}}
tp->f;
- this->f; // expected-error{{reference to non-static member function must be called}}
+ this->f;
this.f; // expected-error{{member reference base type 'Member0<T> *' is not a structure or union}}
}
};
@@ -239,11 +239,11 @@ namespace PR9880 {
static yes_tag check(char[sizeof(&U::luaIndex)]);
enum { value = sizeof(check<T>(0)) == sizeof(yes_tag) };
};
-
+
class SomeClass {
public:
int luaIndex(lua_State* L);
};
-
+
int i = HasIndexMetamethod<SomeClass>::value;
}
diff --git a/clang/test/Unit/lit.cfg.py b/clang/test/Unit/lit.cfg.py
index 475069e630d7..37e91d0f8629 100644
--- a/clang/test/Unit/lit.cfg.py
+++ b/clang/test/Unit/lit.cfg.py
@@ -25,13 +25,9 @@ config.test_format = lit.formats.GoogleTest(config.llvm_build_mode, "Tests")
# Propagate the temp directory. Windows requires this because it uses \Windows\
# if none of these are present.
-if "TMP" in os.environ:
- config.environment["TMP"] = os.environ["TMP"]
-if "TEMP" in os.environ:
- config.environment["TEMP"] = os.environ["TEMP"]
-
-if "HOME" in os.environ:
- config.environment["HOME"] = os.environ["HOME"]
+for v in ["TMP", "TEMP", "HOME", "SystemDrive"]:
+ if v in os.environ:
+ config.environment[v] = os.environ[v]
# Propagate sanitizer options.
for var in [
diff --git a/clang/tools/driver/cc1as_main.cpp b/clang/tools/driver/cc1as_main.cpp
index 5498c3f9d4a2..86afe22fac24 100644
--- a/clang/tools/driver/cc1as_main.cpp
+++ b/clang/tools/driver/cc1as_main.cpp
@@ -426,6 +426,7 @@ static bool ExecuteAssemblerImpl(AssemblerInvocation &Opts,
assert(MRI && "Unable to create target register info!");
MCTargetOptions MCOptions;
+ MCOptions.MCRelaxAll = Opts.RelaxAll;
MCOptions.EmitDwarfUnwind = Opts.EmitDwarfUnwind;
MCOptions.EmitCompactUnwindNonCanonical = Opts.EmitCompactUnwindNonCanonical;
MCOptions.X86RelaxRelocations = Opts.RelaxELFRelocations;
diff --git a/clang/unittests/ASTMatchers/ASTMatchersNarrowingTest.cpp b/clang/unittests/ASTMatchers/ASTMatchersNarrowingTest.cpp
index c08deb903f12..87774b00956a 100644
--- a/clang/unittests/ASTMatchers/ASTMatchersNarrowingTest.cpp
+++ b/clang/unittests/ASTMatchers/ASTMatchersNarrowingTest.cpp
@@ -1569,9 +1569,8 @@ TEST_P(ASTMatchersTest, IsArrow_MatchesMemberVariablesViaArrow) {
matches("class Y { void x() { y; } int y; };", memberExpr(isArrow())));
EXPECT_TRUE(notMatches("class Y { void x() { (*this).y; } int y; };",
memberExpr(isArrow())));
- EXPECT_TRUE(
- matches("template <class T> class Y { void x() { this->m; } int m; };",
- memberExpr(isArrow())));
+ EXPECT_TRUE(matches("template <class T> class Y { void x() { this->m; } };",
+ cxxDependentScopeMemberExpr(isArrow())));
EXPECT_TRUE(
notMatches("template <class T> class Y { void x() { (*this).m; } };",
cxxDependentScopeMemberExpr(isArrow())));
diff --git a/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp b/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp
index d204700919d3..301bec32c0cf 100644
--- a/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp
+++ b/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp
@@ -5357,6 +5357,38 @@ TEST(TransferTest, ConditionalOperatorLocation) {
});
}
+TEST(TransferTest, ConditionalOperatorOnConstantExpr) {
+ // This is a regression test: We used to crash when a `ConstantExpr` was used
+ // in the branches of a conditional operator.
+ std::string Code = R"cc(
+ consteval bool identity(bool B) { return B; }
+ void target(bool Cond) {
+ bool JoinTrueTrue = Cond ? identity(true) : identity(true);
+ bool JoinTrueFalse = Cond ? identity(true) : identity(false);
+ // [[p]]
+ }
+ )cc";
+ runDataflow(
+ Code,
+ [](const llvm::StringMap<DataflowAnalysisState<NoopLattice>> &Results,
+ ASTContext &ASTCtx) {
+ Environment Env = getEnvironmentAtAnnotation(Results, "p").fork();
+
+ auto &JoinTrueTrue =
+ getValueForDecl<BoolValue>(ASTCtx, Env, "JoinTrueTrue");
+ // FIXME: This test documents the current behavior, namely that we
+ // don't actually use the constant result of the `ConstantExpr` and
+ // instead treat it like a normal function call.
+ EXPECT_EQ(JoinTrueTrue.formula().kind(), Formula::Kind::AtomRef);
+ // EXPECT_TRUE(JoinTrueTrue.formula().literal());
+
+ auto &JoinTrueFalse =
+ getValueForDecl<BoolValue>(ASTCtx, Env, "JoinTrueFalse");
+ EXPECT_EQ(JoinTrueFalse.formula().kind(), Formula::Kind::AtomRef);
+ },
+ LangStandard::lang_cxx20);
+}
+
TEST(TransferTest, IfStmtBranchExtendsFlowCondition) {
std::string Code = R"(
void target(bool Foo) {
diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp
index bc61b9c089e9..32ba6b6853c7 100644
--- a/clang/unittests/Format/FormatTest.cpp
+++ b/clang/unittests/Format/FormatTest.cpp
@@ -24507,16 +24507,25 @@ TEST_F(FormatTest, AlternativeOperators) {
verifyFormat("int a compl(5);");
verifyFormat("int a not(5);");
- /* FIXME handle alternate tokens
- * https://en.cppreference.com/w/cpp/language/operator_alternative
- // alternative tokens
- verifyFormat("compl foo();"); // ~foo();
- verifyFormat("foo() <%%>;"); // foo();
- verifyFormat("void foo() <%%>;"); // void foo(){}
- verifyFormat("int a <:1:>;"); // int a[1];[
+ verifyFormat("compl foo();"); // ~foo();
+ verifyFormat("foo() <%%>"); // foo() {}
+ verifyFormat("void foo() <%%>"); // void foo() {}
+ verifyFormat("int a<:1:>;"); // int a[1];
verifyFormat("%:define ABC abc"); // #define ABC abc
verifyFormat("%:%:"); // ##
- */
+
+ verifyFormat("a = v(not;);\n"
+ "b = v(not+);\n"
+ "c = v(not x);\n"
+ "d = v(not 1);\n"
+ "e = v(not 123.f);");
+
+ verifyNoChange("#define ASSEMBLER_INSTRUCTION_LIST(V) \\\n"
+ " V(and) \\\n"
+ " V(not) \\\n"
+ " V(not!) \\\n"
+ " V(other)",
+ getLLVMStyleWithColumns(40));
}
TEST_F(FormatTest, STLWhileNotDefineChed) {
@@ -27354,6 +27363,45 @@ TEST_F(FormatTest, BreakAdjacentStringLiterals) {
verifyFormat(Code, Style);
}
+TEST_F(FormatTest, AlignUTFCommentsAndStringLiterals) {
+ verifyFormat(
+ "int rus; // А теперь комментарии, например, на русском, 2-байта\n"
+ "int long_rus; // Верхний коммент еще не превысил границу в 80, однако\n"
+ " // уже отодвинут. Перенос, при этом, отрабатывает верно");
+
+ auto Style = getLLVMStyle();
+ Style.ColumnLimit = 15;
+ verifyNoChange("#define test \\\n"
+ " /* 测试 */ \\\n"
+ " \"aa\" \\\n"
+ " \"bb\"",
+ Style);
+
+ Style.ColumnLimit = 25;
+ verifyFormat("struct foo {\n"
+ " int iiiiii; ///< iiiiii\n"
+ " int b; ///< ыыы\n"
+ " int c; ///< ыыыы\n"
+ "};",
+ Style);
+
+ Style.ColumnLimit = 35;
+ verifyFormat("#define SENSOR_DESC_1 \\\n"
+ " \"{\" \\\n"
+ " \"unit_of_measurement: \\\"°C\\\",\" \\\n"
+ " \"}\"",
+ Style);
+
+ Style.ColumnLimit = 80;
+ Style.AlignArrayOfStructures = FormatStyle::AIAS_Left;
+ verifyFormat("Languages languages = {\n"
+ " Language{{'e', 'n'}, U\"Test English\" },\n"
+ " Language{{'l', 'v'}, U\"Test Latviešu\"},\n"
+ " Language{{'r', 'u'}, U\"Test Русский\" },\n"
+ "};",
+ Style);
+}
+
} // namespace
} // namespace test
} // namespace format
diff --git a/clang/unittests/Format/SortIncludesTest.cpp b/clang/unittests/Format/SortIncludesTest.cpp
index 772eb53806b4..824fa0078cd0 100644
--- a/clang/unittests/Format/SortIncludesTest.cpp
+++ b/clang/unittests/Format/SortIncludesTest.cpp
@@ -6,19 +6,19 @@
//
//===----------------------------------------------------------------------===//
-#include "FormatTestUtils.h"
+#include "FormatTestBase.h"
#include "clang/Format/Format.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Debug.h"
#include "gtest/gtest.h"
-#define DEBUG_TYPE "format-test"
+#define DEBUG_TYPE "sort-includes-test"
namespace clang {
namespace format {
namespace {
-class SortIncludesTest : public ::testing::Test {
+class SortIncludesTest : public test::FormatTestBase {
protected:
std::vector<tooling::Range> GetCodeRange(StringRef Code) {
return std::vector<tooling::Range>(1, tooling::Range(0, Code.size()));
@@ -821,6 +821,122 @@ TEST_F(SortIncludesTest, CalculatesCorrectCursorPositionWithRegrouping) {
EXPECT_EQ(27u, newCursor(Code, 28)); // Start of last line
}
+TEST_F(SortIncludesTest,
+ CalculatesCorrectCursorPositionWhenNoReplacementsWithRegroupingAndCRLF) {
+ Style.IncludeBlocks = Style.IBS_Regroup;
+ FmtStyle.LineEnding = FormatStyle::LE_CRLF;
+ Style.IncludeCategories = {
+ {"^\"a\"", 0, 0, false}, {"^\"b\"", 1, 1, false}, {".*", 2, 2, false}};
+ std::string Code = "#include \"a\"\r\n" // Start of line: 0
+ "\r\n" // Start of line: 14
+ "#include \"b\"\r\n" // Start of line: 16
+ "\r\n" // Start of line: 30
+ "#include \"c\"\r\n" // Start of line: 32
+ "\r\n" // Start of line: 46
+ "int i;"; // Start of line: 48
+ verifyNoChange(Code);
+ EXPECT_EQ(0u, newCursor(Code, 0));
+ EXPECT_EQ(14u, newCursor(Code, 14));
+ EXPECT_EQ(16u, newCursor(Code, 16));
+ EXPECT_EQ(30u, newCursor(Code, 30));
+ EXPECT_EQ(32u, newCursor(Code, 32));
+ EXPECT_EQ(46u, newCursor(Code, 46));
+ EXPECT_EQ(48u, newCursor(Code, 48));
+}
+
+TEST_F(
+ SortIncludesTest,
+ CalculatesCorrectCursorPositionWhenRemoveLinesReplacementsWithRegroupingAndCRLF) {
+ Style.IncludeBlocks = Style.IBS_Regroup;
+ FmtStyle.LineEnding = FormatStyle::LE_CRLF;
+ Style.IncludeCategories = {{".*", 0, 0, false}};
+ std::string Code = "#include \"a\"\r\n" // Start of line: 0
+ "\r\n" // Start of line: 14
+ "#include \"b\"\r\n" // Start of line: 16
+ "\r\n" // Start of line: 30
+ "#include \"c\"\r\n" // Start of line: 32
+ "\r\n" // Start of line: 46
+ "int i;"; // Start of line: 48
+ std::string Expected = "#include \"a\"\r\n" // Start of line: 0
+ "#include \"b\"\r\n" // Start of line: 14
+ "#include \"c\"\r\n" // Start of line: 28
+ "\r\n" // Start of line: 42
+ "int i;"; // Start of line: 44
+ EXPECT_EQ(Expected, sort(Code));
+ EXPECT_EQ(0u, newCursor(Code, 0));
+ EXPECT_EQ(
+ 14u,
+ newCursor(Code, 14)); // cursor on empty line in include block is ignored
+ EXPECT_EQ(14u, newCursor(Code, 16));
+ EXPECT_EQ(
+ 30u,
+ newCursor(Code, 30)); // cursor on empty line in include block is ignored
+ EXPECT_EQ(28u, newCursor(Code, 32));
+ EXPECT_EQ(42u, newCursor(Code, 46));
+ EXPECT_EQ(44u, newCursor(Code, 48));
+}
+
+// FIXME: the tests below should pass.
+#if 0
+TEST_F(
+ SortIncludesTest,
+ CalculatesCorrectCursorPositionWhenNewLineReplacementsWithRegroupingAndCRLF) {
+ Style.IncludeBlocks = Style.IBS_Regroup;
+ FmtStyle.LineEnding = FormatStyle::LE_CRLF;
+ Style.IncludeCategories = {
+ {"^\"a\"", 0, 0, false}, {"^\"b\"", 1, 1, false}, {".*", 2, 2, false}};
+ std::string Code = "#include \"a\"\r\n" // Start of line: 0
+ "#include \"b\"\r\n" // Start of line: 14
+ "#include \"c\"\r\n" // Start of line: 28
+ "\r\n" // Start of line: 42
+ "int i;"; // Start of line: 44
+ std::string Expected = "#include \"a\"\r\n" // Start of line: 0
+ "\r\n" // Start of line: 14
+ "#include \"b\"\r\n" // Start of line: 16
+ "\r\n" // Start of line: 30
+ "#include \"c\"\r\n" // Start of line: 32
+ "\r\n" // Start of line: 46
+ "int i;"; // Start of line: 48
+ EXPECT_EQ(Expected, sort(Code));
+ EXPECT_EQ(0u, newCursor(Code, 0));
+ EXPECT_EQ(15u, newCursor(Code, 16));
+ EXPECT_EQ(30u, newCursor(Code, 32));
+ EXPECT_EQ(44u, newCursor(Code, 46));
+ EXPECT_EQ(46u, newCursor(Code, 48));
+}
+
+TEST_F(
+ SortIncludesTest,
+ CalculatesCorrectCursorPositionWhenNoNewLineReplacementsWithRegroupingAndCRLF) {
+ Style.IncludeBlocks = Style.IBS_Regroup;
+ FmtStyle.LineEnding = FormatStyle::LE_CRLF;
+ Style.IncludeCategories = {
+ {"^\"a\"", 0, 0, false}, {"^\"b\"", 1, 1, false}, {".*", 2, 2, false}};
+ std::string Code = "#include \"a\"\r\n" // Start of line: 0
+ "\r\n" // Start of line: 14
+ "#include \"c\"\r\n" // Start of line: 16
+ "\r\n" // Start of line: 30
+ "#include \"b\"\r\n" // Start of line: 32
+ "\r\n" // Start of line: 46
+ "int i;"; // Start of line: 48
+ std::string Expected = "#include \"a\"\r\n" // Start of line: 0
+ "\r\n" // Start of line: 14
+ "#include \"b\"\r\n" // Start of line: 16
+ "\r\n" // Start of line: 30
+ "#include \"c\"\r\n" // Start of line: 32
+ "\r\n" // Start of line: 46
+ "int i;"; // Start of line: 48
+ EXPECT_EQ(Expected, sort(Code));
+ EXPECT_EQ(0u, newCursor(Code, 0));
+ EXPECT_EQ(14u, newCursor(Code, 14));
+ EXPECT_EQ(30u, newCursor(Code, 32));
+ EXPECT_EQ(30u, newCursor(Code, 30));
+ EXPECT_EQ(15u, newCursor(Code, 15));
+ EXPECT_EQ(44u, newCursor(Code, 46));
+ EXPECT_EQ(46u, newCursor(Code, 48));
+}
+#endif
+
TEST_F(SortIncludesTest, DeduplicateIncludes) {
EXPECT_EQ("#include <a>\n"
"#include <b>\n"
diff --git a/clang/www/cxx_dr_status.html b/clang/www/cxx_dr_status.html
index 83b71e7c122d..875521bd505d 100755
--- a/clang/www/cxx_dr_status.html
+++ b/clang/www/cxx_dr_status.html
@@ -1435,7 +1435,7 @@ accessible?</td>
</tr>
<tr class="open" id="233">
<td><a href="https://cplusplus.github.io/CWG/issues/233.html">233</a></td>
- <td>drafting</td>
+ <td>tentatively ready</td>
<td>References vs pointers in UDC overload resolution</td>
<td align="center">Not resolved</td>
</tr>
@@ -2756,7 +2756,7 @@ of class templates</td>
</tr>
<tr id="453">
<td><a href="https://cplusplus.github.io/CWG/issues/453.html">453</a></td>
- <td>tentatively ready</td>
+ <td>DR</td>
<td>References may only bind to &#8220;valid&#8221; objects</td>
<td class="unknown" align="center">Unknown</td>
</tr>
@@ -5812,7 +5812,7 @@ and <I>POD class</I></td>
</tr>
<tr class="open" id="1001">
<td><a href="https://cplusplus.github.io/CWG/issues/1001.html">1001</a></td>
- <td>drafting</td>
+ <td>review</td>
<td>Parameter type adjustment in dependent parameter types</td>
<td align="center">Not resolved</td>
</tr>
@@ -6034,7 +6034,7 @@ and <I>POD class</I></td>
</tr>
<tr id="1038">
<td><a href="https://cplusplus.github.io/CWG/issues/1038.html">1038</a></td>
- <td>DR</td>
+ <td>DRWP</td>
<td>Overload resolution of <TT>&amp;x.static_func</TT></td>
<td class="unknown" align="center">Unknown</td>
</tr>
@@ -9994,7 +9994,7 @@ and <I>POD class</I></td>
</tr>
<tr id="1698">
<td><a href="https://cplusplus.github.io/CWG/issues/1698.html">1698</a></td>
- <td>DR</td>
+ <td>DRWP</td>
<td>Files ending in <TT>\</TT></td>
<td class="unknown" align="center">Unknown</td>
</tr>
@@ -10132,7 +10132,7 @@ and <I>POD class</I></td>
</tr>
<tr class="open" id="1721">
<td><a href="https://cplusplus.github.io/CWG/issues/1721.html">1721</a></td>
- <td>drafting</td>
+ <td>review</td>
<td>Diagnosing ODR violations for static data members</td>
<td align="center">Not resolved</td>
</tr>
@@ -11312,11 +11312,11 @@ and <I>POD class</I></td>
<td>decltype-qualified enumeration names</td>
<td class="unknown" align="center">Unknown</td>
</tr>
- <tr class="open" id="1918">
+ <tr id="1918">
<td><a href="https://cplusplus.github.io/CWG/issues/1918.html">1918</a></td>
- <td>open</td>
+ <td>CD5</td>
<td><TT>friend</TT> templates with dependent scopes</td>
- <td align="center">Not resolved</td>
+ <td class="unknown" align="center">Unknown</td>
</tr>
<tr class="open" id="1919">
<td><a href="https://cplusplus.github.io/CWG/issues/1919.html">1919</a></td>
@@ -11474,11 +11474,11 @@ and <I>POD class</I></td>
<td>New C incompatibilities</td>
<td align="center">Not resolved</td>
</tr>
- <tr class="open" id="1945">
+ <tr id="1945">
<td><a href="https://cplusplus.github.io/CWG/issues/1945.html">1945</a></td>
- <td>open</td>
+ <td>CD5</td>
<td>Friend declarations naming members of class templates in non-templates</td>
- <td align="center">Not resolved</td>
+ <td class="unknown" align="center">Unknown</td>
</tr>
<tr id="1946">
<td><a href="https://cplusplus.github.io/CWG/issues/1946.html">1946</a></td>
@@ -11530,7 +11530,7 @@ and <I>POD class</I></td>
</tr>
<tr id="1954">
<td><a href="https://cplusplus.github.io/CWG/issues/1954.html">1954</a></td>
- <td>tentatively ready</td>
+ <td>DR</td>
<td><TT>typeid</TT> null dereference check in subexpressions</td>
<td class="unknown" align="center">Unknown</td>
</tr>
@@ -12098,11 +12098,11 @@ and <I>POD class</I></td>
<td>C-style casts that cast away constness vs <TT>static_cast</TT></td>
<td align="center">Not resolved</td>
</tr>
- <tr class="open" id="2049">
+ <tr id="2049">
<td><a href="https://cplusplus.github.io/CWG/issues/2049.html">2049</a></td>
- <td>drafting</td>
+ <td>DRWP</td>
<td>List initializer in non-type template default argument</td>
- <td title="Clang 18 implements P2308R1 resolution" align="center">Not Resolved*</td>
+ <td class="full" align="center">Clang 18</td>
</tr>
<tr id="2050">
<td><a href="https://cplusplus.github.io/CWG/issues/2050.html">2050</a></td>
@@ -12130,7 +12130,7 @@ and <I>POD class</I></td>
</tr>
<tr id="2054">
<td><a href="https://cplusplus.github.io/CWG/issues/2054.html">2054</a></td>
- <td>DR</td>
+ <td>DRWP</td>
<td>Missing description of class SFINAE</td>
<td class="unknown" align="center">Unknown</td>
</tr>
@@ -12418,7 +12418,7 @@ and <I>POD class</I></td>
</tr>
<tr id="2102">
<td><a href="https://cplusplus.github.io/CWG/issues/2102.html">2102</a></td>
- <td>DR</td>
+ <td>DRWP</td>
<td>Constructor checking in <I>new-expression</I></td>
<td class="unknown" align="center">Unknown</td>
</tr>
@@ -12698,11 +12698,11 @@ and <I>POD class</I></td>
<td>Thread storage duration and order of initialization</td>
<td align="center">Not resolved</td>
</tr>
- <tr class="open" id="2149">
+ <tr id="2149">
<td><a href="https://cplusplus.github.io/CWG/issues/2149.html">2149</a></td>
- <td>drafting</td>
+ <td>DR</td>
<td>Brace elision and array length deduction</td>
- <td align="center">Not resolved</td>
+ <td class="full" align="center">Clang 3.1</td>
</tr>
<tr id="2150">
<td><a href="https://cplusplus.github.io/CWG/issues/2150.html">2150</a></td>
@@ -13318,7 +13318,7 @@ and <I>POD class</I></td>
</tr>
<tr id="2252">
<td><a href="https://cplusplus.github.io/CWG/issues/2252.html">2252</a></td>
- <td>DR</td>
+ <td>DRWP</td>
<td>Enumeration list-initialization from the same type</td>
<td class="unknown" align="center">Unknown</td>
</tr>
@@ -14410,7 +14410,7 @@ and <I>POD class</I></td>
</tr>
<tr class="open" id="2434">
<td><a href="https://cplusplus.github.io/CWG/issues/2434.html">2434</a></td>
- <td>open</td>
+ <td>review</td>
<td>Mandatory copy elision vs non-class objects</td>
<td align="center">Not resolved</td>
</tr>
@@ -14504,11 +14504,11 @@ and <I>POD class</I></td>
<td>Thunks as an implementation technique for pointers to virtual functions</td>
<td align="center">Extension</td>
</tr>
- <tr class="open" id="2450">
+ <tr id="2450">
<td><a href="https://cplusplus.github.io/CWG/issues/2450.html">2450</a></td>
- <td>review</td>
+ <td>DRWP</td>
<td><I>braced-init-list</I> as a <I>template-argument</I></td>
- <td title="Clang 18 implements P2308R1 resolution" align="center">Not Resolved*</td>
+ <td class="full" align="center">Clang 18</td>
</tr>
<tr id="2451">
<td><a href="https://cplusplus.github.io/CWG/issues/2451.html">2451</a></td>
@@ -14558,11 +14558,11 @@ and <I>POD class</I></td>
<td>Value category of expressions denoting non-static member functions</td>
<td class="unknown" align="center">Unknown</td>
</tr>
- <tr class="open" id="2459">
+ <tr id="2459">
<td><a href="https://cplusplus.github.io/CWG/issues/2459.html">2459</a></td>
- <td>drafting</td>
+ <td>DRWP</td>
<td>Template parameter initialization</td>
- <td title="Clang 18 implements P2308R1 resolution" align="center">Not Resolved*</td>
+ <td class="full" align="center">Clang 18</td>
</tr>
<tr id="2460">
<td><a href="https://cplusplus.github.io/CWG/issues/2460.html">2460</a></td>
@@ -14662,7 +14662,7 @@ and <I>POD class</I></td>
</tr>
<tr id="2476">
<td><a href="https://cplusplus.github.io/CWG/issues/2476.html">2476</a></td>
- <td>tentatively ready</td>
+ <td>DR</td>
<td><I>placeholder-type-specifier</I>s and function declarators</td>
<td class="unknown" align="center">Unknown</td>
</tr>
@@ -14830,7 +14830,7 @@ and <I>POD class</I></td>
</tr>
<tr id="2504">
<td><a href="https://cplusplus.github.io/CWG/issues/2504.html">2504</a></td>
- <td>DR</td>
+ <td>DRWP</td>
<td>Inheriting constructors from virtual base classes</td>
<td class="none" align="center">No</td>
</tr>
@@ -14992,7 +14992,7 @@ and <I>POD class</I></td>
</tr>
<tr id="2531">
<td><a href="https://cplusplus.github.io/CWG/issues/2531.html">2531</a></td>
- <td>DR</td>
+ <td>DRWP</td>
<td>Static data members redeclared as constexpr</td>
<td class="unknown" align="center">Unknown</td>
</tr>
@@ -15002,11 +15002,11 @@ and <I>POD class</I></td>
<td>Kind of pointer value returned by <TT>new T[0]</TT></td>
<td align="center">Not resolved</td>
</tr>
- <tr class="open" id="2533">
+ <tr id="2533">
<td><a href="https://cplusplus.github.io/CWG/issues/2533.html">2533</a></td>
- <td>review</td>
+ <td>DR</td>
<td>Storage duration of implicitly created objects</td>
- <td align="center">Not resolved</td>
+ <td class="unknown" align="center">Unknown</td>
</tr>
<tr id="2534">
<td><a href="https://cplusplus.github.io/CWG/issues/2534.html">2534</a></td>
@@ -15082,13 +15082,13 @@ and <I>POD class</I></td>
</tr>
<tr id="2546">
<td><a href="https://cplusplus.github.io/CWG/issues/2546.html">2546</a></td>
- <td>tentatively ready</td>
+ <td>DR</td>
<td>Defaulted secondary comparison operators defined as deleted</td>
<td class="unknown" align="center">Unknown</td>
</tr>
<tr id="2547">
<td><a href="https://cplusplus.github.io/CWG/issues/2547.html">2547</a></td>
- <td>tentatively ready</td>
+ <td>DR</td>
<td>Defaulted comparison operator function for non-classes</td>
<td class="unknown" align="center">Unknown</td>
</tr>
@@ -15142,7 +15142,7 @@ and <I>POD class</I></td>
</tr>
<tr id="2556">
<td><a href="https://cplusplus.github.io/CWG/issues/2556.html">2556</a></td>
- <td>DR</td>
+ <td>DRWP</td>
<td>Unusable <TT>promise::return_void</TT></td>
<td class="unknown" align="center">Unknown</td>
</tr>
@@ -15166,15 +15166,15 @@ and <I>POD class</I></td>
</tr>
<tr id="2560">
<td><a href="https://cplusplus.github.io/CWG/issues/2560.html">2560</a></td>
- <td>tentatively ready</td>
+ <td>DR</td>
<td>Parameter type determination in a <I>requirement-parameter-list</I></td>
<td class="unknown" align="center">Unknown</td>
</tr>
<tr class="open" id="2561">
<td><a href="https://cplusplus.github.io/CWG/issues/2561.html">2561</a></td>
- <td>review</td>
+ <td>tentatively ready</td>
<td>Conversion to function pointer for lambda with explicit object parameter</td>
- <td title="Clang 18 implements 2023-11-09 resolution" align="center">Not Resolved*</td>
+ <td title="Clang does not implement 2024-03-18 resolution" align="center">Not Resolved*</td>
</tr>
<tr class="open" id="2562">
<td><a href="https://cplusplus.github.io/CWG/issues/2562.html">2562</a></td>
@@ -15214,7 +15214,7 @@ and <I>POD class</I></td>
</tr>
<tr id="2568">
<td><a href="https://cplusplus.github.io/CWG/issues/2568.html">2568</a></td>
- <td>tentatively ready</td>
+ <td>DR</td>
<td>Access checking during synthesis of defaulted comparison operator</td>
<td class="unknown" align="center">Unknown</td>
</tr>
@@ -15226,7 +15226,7 @@ and <I>POD class</I></td>
</tr>
<tr id="2570">
<td><a href="https://cplusplus.github.io/CWG/issues/2570.html">2570</a></td>
- <td>DR</td>
+ <td>DRWP</td>
<td>Clarify constexpr for defaulted functions</td>
<td class="unknown" align="center">Unknown</td>
</tr>
@@ -15334,7 +15334,7 @@ and <I>POD class</I></td>
</tr>
<tr class="open" id="2588">
<td><a href="https://cplusplus.github.io/CWG/issues/2588.html">2588</a></td>
- <td>drafting</td>
+ <td>tentatively ready</td>
<td>friend declarations and module linkage</td>
<td align="center">Not resolved</td>
</tr>
@@ -15352,7 +15352,7 @@ and <I>POD class</I></td>
</tr>
<tr id="2591">
<td><a href="https://cplusplus.github.io/CWG/issues/2591.html">2591</a></td>
- <td>DR</td>
+ <td>DRWP</td>
<td>Implicit change of active union member for anonymous union in union</td>
<td class="unknown" align="center">Unknown</td>
</tr>
@@ -15376,7 +15376,7 @@ and <I>POD class</I></td>
</tr>
<tr id="2595">
<td><a href="https://cplusplus.github.io/CWG/issues/2595.html">2595</a></td>
- <td>DR</td>
+ <td>DRWP</td>
<td>"More constrained" for eligible special member functions</td>
<td class="unknown" align="center">Unknown</td>
</tr>
@@ -15406,7 +15406,7 @@ and <I>POD class</I></td>
</tr>
<tr id="2600">
<td><a href="https://cplusplus.github.io/CWG/issues/2600.html">2600</a></td>
- <td>DR</td>
+ <td>DRWP</td>
<td>Type dependency of placeholder types</td>
<td class="unknown" align="center">Unknown</td>
</tr>
@@ -15574,7 +15574,7 @@ and <I>POD class</I></td>
</tr>
<tr id="2628">
<td><a href="https://cplusplus.github.io/CWG/issues/2628.html">2628</a></td>
- <td>DR</td>
+ <td>DRWP</td>
<td>Implicit deduction guides should propagate constraints</td>
<td class="none" align="center">No</td>
</tr>
@@ -15610,7 +15610,7 @@ and <I>POD class</I></td>
</tr>
<tr id="2634">
<td><a href="https://cplusplus.github.io/CWG/issues/2634.html">2634</a></td>
- <td>tentatively ready</td>
+ <td>DR</td>
<td>Avoid circularity in specification of scope for friend class declarations</td>
<td class="unknown" align="center">Unknown</td>
</tr>
@@ -15628,13 +15628,13 @@ and <I>POD class</I></td>
</tr>
<tr id="2637">
<td><a href="https://cplusplus.github.io/CWG/issues/2637.html">2637</a></td>
- <td>tentatively ready</td>
+ <td>DR</td>
<td>Injected-class-name as a <I>simple-template-id</I></td>
<td class="unknown" align="center">Unknown</td>
</tr>
<tr id="2638">
<td><a href="https://cplusplus.github.io/CWG/issues/2638.html">2638</a></td>
- <td>tentatively ready</td>
+ <td>DR</td>
<td>Improve the example for initializing by initializer list</td>
<td class="unknown" align="center">Unknown</td>
</tr>
@@ -15748,7 +15748,7 @@ and <I>POD class</I></td>
</tr>
<tr id="2657">
<td><a href="https://cplusplus.github.io/CWG/issues/2657.html">2657</a></td>
- <td>tentatively ready</td>
+ <td>DR</td>
<td>Cv-qualification adjustment when binding reference to temporary</td>
<td class="unknown" align="center">Unknown</td>
</tr>
@@ -15770,11 +15770,11 @@ and <I>POD class</I></td>
<td>Confusing term "this parameter"</td>
<td align="center">Not resolved</td>
</tr>
- <tr class="open" id="2661">
+ <tr id="2661">
<td><a href="https://cplusplus.github.io/CWG/issues/2661.html">2661</a></td>
- <td>open</td>
+ <td>DR</td>
<td>Missing disambiguation rule for <I>pure-specifier</I> vs. <I>brace-or-equal-initializer</I></td>
- <td align="center">Not resolved</td>
+ <td class="unknown" align="center">Unknown</td>
</tr>
<tr id="2662">
<td><a href="https://cplusplus.github.io/CWG/issues/2662.html">2662</a></td>
@@ -15814,7 +15814,7 @@ and <I>POD class</I></td>
</tr>
<tr id="2668">
<td><a href="https://cplusplus.github.io/CWG/issues/2668.html">2668</a></td>
- <td>tentatively ready</td>
+ <td>DR</td>
<td><TT>co_await</TT> in a <I>lambda-expression</I></td>
<td class="unknown" align="center">Unknown</td>
</tr>
@@ -15838,7 +15838,7 @@ and <I>POD class</I></td>
</tr>
<tr id="2672">
<td><a href="https://cplusplus.github.io/CWG/issues/2672.html">2672</a></td>
- <td>DR</td>
+ <td>DRWP</td>
<td>Lambda body SFINAE is still required, contrary to intent and note</td>
<td class="full" align="center">Clang 18</td>
</tr>
@@ -15940,7 +15940,7 @@ and <I>POD class</I></td>
</tr>
<tr id="2689">
<td><a href="https://cplusplus.github.io/CWG/issues/2689.html">2689</a></td>
- <td>tentatively ready</td>
+ <td>DR</td>
<td>Are cv-qualified <TT>std::nullptr_t</TT> fundamental types?</td>
<td class="unknown" align="center">Unknown</td>
</tr>
@@ -16004,11 +16004,11 @@ and <I>POD class</I></td>
<td>Inconsistency of <I>throw-expression</I> specification</td>
<td class="unknown" align="center">Unknown</td>
</tr>
- <tr class="open" id="2700">
+ <tr id="2700">
<td><a href="https://cplusplus.github.io/CWG/issues/2700.html">2700</a></td>
- <td>review</td>
+ <td>DR</td>
<td><TT>#error</TT> disallows existing implementation practice</td>
- <td align="center">Not resolved</td>
+ <td class="unknown" align="center">Unknown</td>
</tr>
<tr class="open" id="2701">
<td><a href="https://cplusplus.github.io/CWG/issues/2701.html">2701</a></td>
@@ -16048,7 +16048,7 @@ and <I>POD class</I></td>
</tr>
<tr id="2707">
<td><a href="https://cplusplus.github.io/CWG/issues/2707.html">2707</a></td>
- <td>tentatively ready</td>
+ <td>DR</td>
<td>Deduction guides cannot have a trailing <I>requires-clause</I></td>
<td class="unknown" align="center">Unknown</td>
</tr>
@@ -16088,11 +16088,11 @@ and <I>POD class</I></td>
<td>Initialization of reference-to-aggregate from designated initializer list</td>
<td class="unknown" align="center">Unknown</td>
</tr>
- <tr class="open" id="2714">
+ <tr id="2714">
<td><a href="https://cplusplus.github.io/CWG/issues/2714.html">2714</a></td>
- <td>open</td>
+ <td>DR</td>
<td>Implicit deduction guides omit properties from the parameter-declaration-clause of a constructor</td>
- <td align="center">Not resolved</td>
+ <td class="unknown" align="center">Unknown</td>
</tr>
<tr id="2715">
<td><a href="https://cplusplus.github.io/CWG/issues/2715.html">2715</a></td>
@@ -16156,7 +16156,7 @@ and <I>POD class</I></td>
</tr>
<tr id="2725">
<td><a href="https://cplusplus.github.io/CWG/issues/2725.html">2725</a></td>
- <td>DR</td>
+ <td>DRWP</td>
<td>Overload resolution for non-call of class member access</td>
<td class="unknown" align="center">Unknown</td>
</tr>
@@ -16174,7 +16174,7 @@ and <I>POD class</I></td>
</tr>
<tr class="open" id="2728">
<td><a href="https://cplusplus.github.io/CWG/issues/2728.html">2728</a></td>
- <td>open</td>
+ <td>tentatively ready</td>
<td>Evaluation of conversions in a <I>delete-expression</I></td>
<td align="center">Not resolved</td>
</tr>
@@ -16204,7 +16204,7 @@ and <I>POD class</I></td>
</tr>
<tr id="2733">
<td><a href="https://cplusplus.github.io/CWG/issues/2733.html">2733</a></td>
- <td>DR</td>
+ <td>DRWP</td>
<td>Applying <TT>[[maybe_unused]]</TT> to a label</td>
<td class="unknown" align="center">Unknown</td>
</tr>
@@ -16228,7 +16228,7 @@ and <I>POD class</I></td>
</tr>
<tr class="open" id="2737">
<td><a href="https://cplusplus.github.io/CWG/issues/2737.html">2737</a></td>
- <td>open</td>
+ <td>review</td>
<td>Temporary lifetime extension for reference init-captures</td>
<td align="center">Not resolved</td>
</tr>
@@ -16258,7 +16258,7 @@ and <I>POD class</I></td>
</tr>
<tr class="open" id="2742">
<td><a href="https://cplusplus.github.io/CWG/issues/2742.html">2742</a></td>
- <td>open</td>
+ <td>drafting</td>
<td>Guaranteed copy elision for brace-initialization from prvalue</td>
<td align="center">Not resolved</td>
</tr>
@@ -16274,33 +16274,33 @@ and <I>POD class</I></td>
<td>Multiple objects of the same type at the same address</td>
<td align="center">Not resolved</td>
</tr>
- <tr class="open" id="2745">
+ <tr id="2745">
<td><a href="https://cplusplus.github.io/CWG/issues/2745.html">2745</a></td>
- <td>open</td>
+ <td>DR</td>
<td>Dependent odr-use in generic lambdas</td>
- <td align="center">Not resolved</td>
+ <td class="unknown" align="center">Unknown</td>
</tr>
- <tr class="open" id="2746">
+ <tr id="2746">
<td><a href="https://cplusplus.github.io/CWG/issues/2746.html">2746</a></td>
- <td>open</td>
+ <td>DR</td>
<td>Checking of default template arguments</td>
- <td align="center">Not resolved</td>
+ <td class="unknown" align="center">Unknown</td>
</tr>
<tr id="2747">
<td><a href="https://cplusplus.github.io/CWG/issues/2747.html">2747</a></td>
- <td>DR</td>
+ <td>DRWP</td>
<td>Cannot depend on an already-deleted splice</td>
<td class="unknown" align="center">Unknown</td>
</tr>
<tr id="2748">
<td><a href="https://cplusplus.github.io/CWG/issues/2748.html">2748</a></td>
- <td>tentatively ready</td>
+ <td>DR</td>
<td>Accessing static data members via null pointer</td>
<td class="unknown" align="center">Unknown</td>
</tr>
<tr id="2749">
<td><a href="https://cplusplus.github.io/CWG/issues/2749.html">2749</a></td>
- <td>DR</td>
+ <td>DRWP</td>
<td>Treatment of "pointer to void" for relational comparisons</td>
<td class="unknown" align="center">Unknown</td>
</tr>
@@ -16324,19 +16324,19 @@ and <I>POD class</I></td>
</tr>
<tr id="2753">
<td><a href="https://cplusplus.github.io/CWG/issues/2753.html">2753</a></td>
- <td>DR</td>
+ <td>DRWP</td>
<td>Storage reuse for string literal objects and backing arrays</td>
<td class="unknown" align="center">Unknown</td>
</tr>
<tr id="2754">
<td><a href="https://cplusplus.github.io/CWG/issues/2754.html">2754</a></td>
- <td>DR</td>
+ <td>DRWP</td>
<td>Using *this in explicit object member functions that are coroutines</td>
<td class="unknown" align="center">Unknown</td>
</tr>
<tr id="2755">
<td><a href="https://cplusplus.github.io/CWG/issues/2755.html">2755</a></td>
- <td>DR</td>
+ <td>DRWP</td>
<td>Incorrect wording applied by P2738R1</td>
<td class="unknown" align="center">Unknown</td>
</tr>
@@ -16354,43 +16354,43 @@ and <I>POD class</I></td>
</tr>
<tr id="2758">
<td><a href="https://cplusplus.github.io/CWG/issues/2758.html">2758</a></td>
- <td>DR</td>
+ <td>DRWP</td>
<td>What is "access and ambiguity control"?</td>
<td class="unknown" align="center">Unknown</td>
</tr>
<tr id="2759">
<td><a href="https://cplusplus.github.io/CWG/issues/2759.html">2759</a></td>
- <td>DR</td>
+ <td>DRWP</td>
<td>[[no_unique_address] and common initial sequence</td>
<td class="unreleased" align="center">Clang 19</td>
</tr>
<tr id="2760">
<td><a href="https://cplusplus.github.io/CWG/issues/2760.html">2760</a></td>
- <td>DR</td>
+ <td>DRWP</td>
<td>Defaulted constructor that is an immediate function</td>
<td class="unknown" align="center">Unknown</td>
</tr>
<tr id="2761">
<td><a href="https://cplusplus.github.io/CWG/issues/2761.html">2761</a></td>
- <td>DR</td>
+ <td>DRWP</td>
<td>Implicitly invoking the deleted destructor of an anonymous union member</td>
<td class="unknown" align="center">Unknown</td>
</tr>
<tr id="2762">
<td><a href="https://cplusplus.github.io/CWG/issues/2762.html">2762</a></td>
- <td>DR</td>
+ <td>DRWP</td>
<td>Type of implicit object parameter</td>
<td class="unknown" align="center">Unknown</td>
</tr>
<tr id="2763">
<td><a href="https://cplusplus.github.io/CWG/issues/2763.html">2763</a></td>
- <td>DR</td>
+ <td>DRWP</td>
<td>Ignorability of [[noreturn]] during constant evaluation</td>
<td class="unknown" align="center">Unknown</td>
</tr>
<tr id="2764">
<td><a href="https://cplusplus.github.io/CWG/issues/2764.html">2764</a></td>
- <td>DR</td>
+ <td>DRWP</td>
<td>Use of placeholders affecting name mangling</td>
<td class="unknown" align="center">Unknown</td>
</tr>
@@ -16403,7 +16403,8 @@ and <I>POD class</I></td>
<tr class="open" id="2766">
<td><a href="https://cplusplus.github.io/CWG/issues/2766.html">2766</a></td>
<td>open</td>
- <td>Repeated evaluation of a <I>string-literal</I> may yield different objects</td>
+ <td>Repeated evaluation of a <I>string-literal</I> may yield different
+objects</td>
<td align="center">Not resolved</td>
</tr>
<tr class="open" id="2767">
@@ -16414,7 +16415,7 @@ and <I>POD class</I></td>
</tr>
<tr id="2768">
<td><a href="https://cplusplus.github.io/CWG/issues/2768.html">2768</a></td>
- <td>DR</td>
+ <td>DRWP</td>
<td>Assignment to enumeration variable with a <I>braced-init-list</I></td>
<td class="unknown" align="center">Unknown</td>
</tr>
@@ -16430,15 +16431,15 @@ and <I>POD class</I></td>
<td>Trailing <I>requires-clause</I> can refer to function parameters before they are substituted into</td>
<td align="center">Not resolved</td>
</tr>
- <tr class="open" id="2771">
+ <tr id="2771">
<td><a href="https://cplusplus.github.io/CWG/issues/2771.html">2771</a></td>
- <td>open</td>
+ <td>DR</td>
<td>Transformation for <I>unqualified-id</I>s in address operator</td>
- <td align="center">Not resolved</td>
+ <td class="unknown" align="center">Unknown</td>
</tr>
<tr id="2772">
<td><a href="https://cplusplus.github.io/CWG/issues/2772.html">2772</a></td>
- <td>DR</td>
+ <td>DRWP</td>
<td>Missing Annex C entry for linkage effects of <I>linkage-specification</I></td>
<td class="unknown" align="center">Unknown</td>
</tr>
@@ -16456,7 +16457,7 @@ and <I>POD class</I></td>
</tr>
<tr id="2775">
<td><a href="https://cplusplus.github.io/CWG/issues/2775.html">2775</a></td>
- <td>tentatively ready</td>
+ <td>DR</td>
<td>Unclear argument type for copy of exception object</td>
<td class="unknown" align="center">Unknown</td>
</tr>
@@ -16466,15 +16467,15 @@ and <I>POD class</I></td>
<td>Substitution failure and implementation limits</td>
<td align="center">Not resolved</td>
</tr>
- <tr class="open" id="2777">
+ <tr id="2777">
<td><a href="https://cplusplus.github.io/CWG/issues/2777.html">2777</a></td>
- <td>open</td>
+ <td>DR</td>
<td>Type of <I>id-expression</I> denoting a template parameter object</td>
- <td align="center">Not resolved</td>
+ <td class="unknown" align="center">Unknown</td>
</tr>
<tr class="open" id="2778">
<td><a href="https://cplusplus.github.io/CWG/issues/2778.html">2778</a></td>
- <td>open</td>
+ <td>review</td>
<td>Trivial destructor does not imply constant destruction</td>
<td align="center">Not resolved</td>
</tr>
@@ -16486,7 +16487,7 @@ and <I>POD class</I></td>
</tr>
<tr id="2780">
<td><a href="https://cplusplus.github.io/CWG/issues/2780.html">2780</a></td>
- <td>DR</td>
+ <td>DRWP</td>
<td><TT>reinterpret_cast</TT> to reference to function types</td>
<td class="unknown" align="center">Unknown</td>
</tr>
@@ -16504,7 +16505,7 @@ and <I>POD class</I></td>
</tr>
<tr id="2783">
<td><a href="https://cplusplus.github.io/CWG/issues/2783.html">2783</a></td>
- <td>DR</td>
+ <td>DRWP</td>
<td>Handling of deduction guides in <I>global-module-fragment</I></td>
<td class="unknown" align="center">Unknown</td>
</tr>
@@ -16516,7 +16517,7 @@ and <I>POD class</I></td>
</tr>
<tr id="2785">
<td><a href="https://cplusplus.github.io/CWG/issues/2785.html">2785</a></td>
- <td>DR</td>
+ <td>DRWP</td>
<td>Type-dependence of <I>requires-expression</I></td>
<td class="unknown" align="center">Unknown</td>
</tr>
@@ -16540,7 +16541,7 @@ and <I>POD class</I></td>
</tr>
<tr id="2789">
<td><a href="https://cplusplus.github.io/CWG/issues/2789.html">2789</a></td>
- <td>DR</td>
+ <td>DRWP</td>
<td>Overload resolution with implicit and explicit object member functions</td>
<td class="full" align="center">Clang 18</td>
</tr>
@@ -16552,19 +16553,19 @@ and <I>POD class</I></td>
</tr>
<tr id="2791">
<td><a href="https://cplusplus.github.io/CWG/issues/2791.html">2791</a></td>
- <td>DR</td>
+ <td>DRWP</td>
<td>Unclear phrasing about "returning to the caller"</td>
<td class="unknown" align="center">Unknown</td>
</tr>
<tr id="2792">
<td><a href="https://cplusplus.github.io/CWG/issues/2792.html">2792</a></td>
- <td>DR</td>
+ <td>DRWP</td>
<td>Clean up specification of <TT>noexcept</TT> operator</td>
<td class="unknown" align="center">Unknown</td>
</tr>
<tr id="2793">
<td><a href="https://cplusplus.github.io/CWG/issues/2793.html">2793</a></td>
- <td>DR</td>
+ <td>DRWP</td>
<td>Block-scope declaration conflicting with parameter name</td>
<td class="unknown" align="center">Unknown</td>
</tr>
@@ -16576,25 +16577,25 @@ and <I>POD class</I></td>
</tr>
<tr id="2795">
<td><a href="https://cplusplus.github.io/CWG/issues/2795.html">2795</a></td>
- <td>DR</td>
+ <td>DRWP</td>
<td>Overlapping empty subobjects with different cv-qualification</td>
<td class="unknown" align="center">Unknown</td>
</tr>
<tr id="2796">
<td><a href="https://cplusplus.github.io/CWG/issues/2796.html">2796</a></td>
- <td>DR</td>
+ <td>DRWP</td>
<td>Function pointer conversions for relational operators</td>
<td class="unknown" align="center">Unknown</td>
</tr>
<tr class="open" id="2797">
<td><a href="https://cplusplus.github.io/CWG/issues/2797.html">2797</a></td>
- <td>open</td>
+ <td>review</td>
<td>Meaning of "corresponds" for rewritten operator candidates</td>
<td align="center">Not resolved</td>
</tr>
<tr id="2798">
<td><a href="https://cplusplus.github.io/CWG/issues/2798.html">2798</a></td>
- <td>DR</td>
+ <td>DRWP</td>
<td>Manifestly constant evaluation of the <TT>static_assert</TT> message</td>
<td class="full" align="center">Clang 17</td>
</tr>
@@ -16612,7 +16613,7 @@ and <I>POD class</I></td>
</tr>
<tr id="2801">
<td><a href="https://cplusplus.github.io/CWG/issues/2801.html">2801</a></td>
- <td>DR</td>
+ <td>DRWP</td>
<td>Reference binding with reference-related types</td>
<td class="unknown" align="center">Unknown</td>
</tr>
@@ -16624,7 +16625,7 @@ and <I>POD class</I></td>
</tr>
<tr id="2803">
<td><a href="https://cplusplus.github.io/CWG/issues/2803.html">2803</a></td>
- <td>tentatively ready</td>
+ <td>DR</td>
<td>Overload resolution for reference binding of similar types</td>
<td class="unknown" align="center">Unknown</td>
</tr>
@@ -16642,13 +16643,13 @@ and <I>POD class</I></td>
</tr>
<tr id="2806">
<td><a href="https://cplusplus.github.io/CWG/issues/2806.html">2806</a></td>
- <td>DR</td>
+ <td>DRWP</td>
<td>Make a <I>type-requirement</I> a type-only context</td>
<td class="unknown" align="center">Unknown</td>
</tr>
<tr id="2807">
<td><a href="https://cplusplus.github.io/CWG/issues/2807.html">2807</a></td>
- <td>DR</td>
+ <td>DRWP</td>
<td>Destructors declared <TT>consteval</TT></td>
<td class="unknown" align="center">Unknown</td>
</tr>
@@ -16660,19 +16661,19 @@ and <I>POD class</I></td>
</tr>
<tr id="2809">
<td><a href="https://cplusplus.github.io/CWG/issues/2809.html">2809</a></td>
- <td>tentatively ready</td>
+ <td>DR</td>
<td>An implicit definition does not redeclare a function</td>
<td class="unknown" align="center">Unknown</td>
</tr>
<tr id="2810">
<td><a href="https://cplusplus.github.io/CWG/issues/2810.html">2810</a></td>
- <td>tentatively ready</td>
+ <td>DR</td>
<td>Requiring the absence of diagnostics for templates</td>
<td class="unknown" align="center">Unknown</td>
</tr>
<tr id="2811">
<td><a href="https://cplusplus.github.io/CWG/issues/2811.html">2811</a></td>
- <td>tentatively ready</td>
+ <td>DR</td>
<td>Clarify "use" of main</td>
<td class="unknown" align="center">Unknown</td>
</tr>
@@ -16682,11 +16683,11 @@ and <I>POD class</I></td>
<td>Allocation with explicit alignment</td>
<td align="center">Not resolved</td>
</tr>
- <tr class="open" id="2813">
+ <tr id="2813">
<td><a href="https://cplusplus.github.io/CWG/issues/2813.html">2813</a></td>
- <td>review</td>
+ <td>DR</td>
<td>Class member access with prvalues</td>
- <td align="center">Not resolved</td>
+ <td class="unknown" align="center">Unknown</td>
</tr>
<tr class="open" id="2814">
<td><a href="https://cplusplus.github.io/CWG/issues/2814.html">2814</a></td>
@@ -16714,57 +16715,57 @@ and <I>POD class</I></td>
</tr>
<tr class="open" id="2818">
<td><a href="https://cplusplus.github.io/CWG/issues/2818.html">2818</a></td>
- <td>review</td>
+ <td>tentatively ready</td>
<td>Use of predefined reserved identifiers</td>
<td align="center">Not resolved</td>
</tr>
<tr class="open" id="2819">
<td><a href="https://cplusplus.github.io/CWG/issues/2819.html">2819</a></td>
- <td>review</td>
+ <td>tentatively ready</td>
<td>Cast from null pointer value in a constant expression</td>
- <td align="center">Not resolved</td>
+ <td title="Clang 19 implements 2023-12-01 resolution" align="center">Not Resolved*</td>
</tr>
- <tr class="open" id="2820">
+ <tr id="2820">
<td><a href="https://cplusplus.github.io/CWG/issues/2820.html">2820</a></td>
- <td>open</td>
+ <td>DR</td>
<td>Value-initialization and default constructors</td>
- <td align="center">Not resolved</td>
+ <td class="unknown" align="center">Unknown</td>
</tr>
<tr class="open" id="2821">
<td><a href="https://cplusplus.github.io/CWG/issues/2821.html">2821</a></td>
- <td>open</td>
+ <td>review</td>
<td>Lifetime, zero-initialization, and dynamic initialization</td>
<td align="center">Not resolved</td>
</tr>
<tr id="2822">
<td><a href="https://cplusplus.github.io/CWG/issues/2822.html">2822</a></td>
- <td>tentatively ready</td>
+ <td>DR</td>
<td>Side-effect-free pointer zap</td>
<td class="unknown" align="center">Unknown</td>
</tr>
<tr id="2823">
<td><a href="https://cplusplus.github.io/CWG/issues/2823.html">2823</a></td>
- <td>DR</td>
+ <td>DRWP</td>
<td>Implicit undefined behavior when dereferencing pointers</td>
<td class="unknown" align="center">Unknown</td>
</tr>
<tr id="2824">
<td><a href="https://cplusplus.github.io/CWG/issues/2824.html">2824</a></td>
- <td>tentatively ready</td>
+ <td>DR</td>
<td>Copy-initialization of arrays</td>
<td class="unknown" align="center">Unknown</td>
</tr>
<tr id="2825">
<td><a href="https://cplusplus.github.io/CWG/issues/2825.html">2825</a></td>
- <td>tentatively ready</td>
+ <td>DR</td>
<td>Range-based for statement using a <I>braced-init-list</I></td>
<td class="unknown" align="center">Unknown</td>
</tr>
- <tr id="2826">
+ <tr class="open" id="2826">
<td><a href="https://cplusplus.github.io/CWG/issues/2826.html">2826</a></td>
- <td>tentatively ready</td>
+ <td>drafting</td>
<td>Missing definition of "temporary expression"</td>
- <td class="unknown" align="center">Unknown</td>
+ <td align="center">Not resolved</td>
</tr>
<tr class="open" id="2827">
<td><a href="https://cplusplus.github.io/CWG/issues/2827.html">2827</a></td>
@@ -16772,11 +16773,11 @@ and <I>POD class</I></td>
<td>Representation of unsigned integral types</td>
<td align="center">Not resolved</td>
</tr>
- <tr class="open" id="2828">
+ <tr id="2828">
<td><a href="https://cplusplus.github.io/CWG/issues/2828.html">2828</a></td>
- <td>review</td>
+ <td>DR</td>
<td>Ambiguous interpretation of C-style cast</td>
- <td align="center">Not resolved</td>
+ <td class="unknown" align="center">Unknown</td>
</tr>
<tr class="open" id="2829">
<td><a href="https://cplusplus.github.io/CWG/issues/2829.html">2829</a></td>
@@ -16784,17 +16785,17 @@ and <I>POD class</I></td>
<td>Redundant case in restricting user-defined conversion sequences</td>
<td align="center">Not resolved</td>
</tr>
- <tr class="open" id="2830">
+ <tr id="2830">
<td><a href="https://cplusplus.github.io/CWG/issues/2830.html">2830</a></td>
- <td>open</td>
+ <td>DR</td>
<td>Top-level cv-qualification should be ignored for list-initialization</td>
- <td align="center">Not resolved</td>
+ <td class="unknown" align="center">Unknown</td>
</tr>
- <tr class="open" id="2831">
+ <tr id="2831">
<td><a href="https://cplusplus.github.io/CWG/issues/2831.html">2831</a></td>
- <td>open</td>
+ <td>DR</td>
<td>Non-templated function definitions and <I>requires-clause</I>s</td>
- <td align="center">Not resolved</td>
+ <td class="unknown" align="center">Unknown</td>
</tr>
<tr class="open" id="2832">
<td><a href="https://cplusplus.github.io/CWG/issues/2832.html">2832</a></td>
@@ -16810,7 +16811,7 @@ and <I>POD class</I></td>
</tr>
<tr class="open" id="2834">
<td><a href="https://cplusplus.github.io/CWG/issues/2834.html">2834</a></td>
- <td>open</td>
+ <td>review</td>
<td>Partial ordering and explicit object parameters</td>
<td align="center">Not resolved</td>
</tr>
@@ -16822,7 +16823,7 @@ and <I>POD class</I></td>
</tr>
<tr class="open" id="2836">
<td><a href="https://cplusplus.github.io/CWG/issues/2836.html">2836</a></td>
- <td>open</td>
+ <td>review</td>
<td>Conversion rank of <TT>long double</TT> and extended floating-point types</td>
<td align="center">Not resolved</td>
</tr>
@@ -16855,6 +16856,276 @@ and <I>POD class</I></td>
<td>open</td>
<td>When do const objects start being const?</td>
<td align="center">Not resolved</td>
+ </tr>
+ <tr class="open" id="2842">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2842.html">2842</a></td>
+ <td>open</td>
+ <td>Preferring an <TT>initializer_list</TT> over a single value</td>
+ <td align="center">Not resolved</td>
+ </tr>
+ <tr class="open" id="2843">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2843.html">2843</a></td>
+ <td>review</td>
+ <td>Undated reference to Unicode makes C++ a moving target</td>
+ <td align="center">Not resolved</td>
+ </tr>
+ <tr class="open" id="2844">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2844.html">2844</a></td>
+ <td>open</td>
+ <td>Enumerating a finite set of built-in candidates</td>
+ <td align="center">Not resolved</td>
+ </tr>
+ <tr id="2845">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2845.html">2845</a></td>
+ <td>DR</td>
+ <td>Make the closure type of a captureless lambda a structural type</td>
+ <td class="unknown" align="center">Unknown</td>
+ </tr>
+ <tr id="2846">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2846.html">2846</a></td>
+ <td>DR</td>
+ <td>Out-of-class definitions of explicit object member functions</td>
+ <td class="unknown" align="center">Unknown</td>
+ </tr>
+ <tr class="open" id="2847">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2847.html">2847</a></td>
+ <td>review</td>
+ <td>Constrained explicit specializations of function templates at class scope</td>
+ <td title="Clang 19 implements 2024-03-01 resolution" align="center">Not Resolved*</td>
+ </tr>
+ <tr id="2848">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2848.html">2848</a></td>
+ <td>DR</td>
+ <td>Omitting an empty template argument list for explicit instantiation</td>
+ <td class="unknown" align="center">Unknown</td>
+ </tr>
+ <tr id="2849">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2849.html">2849</a></td>
+ <td>DR</td>
+ <td>Parameter objects are not temporary objects</td>
+ <td class="unknown" align="center">Unknown</td>
+ </tr>
+ <tr id="2850">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2850.html">2850</a></td>
+ <td>DR</td>
+ <td>Unclear storage duration for function parameter objects</td>
+ <td class="unknown" align="center">Unknown</td>
+ </tr>
+ <tr id="2851">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2851.html">2851</a></td>
+ <td>DR</td>
+ <td>Allow floating-point conversions in converted constant expressions</td>
+ <td class="unknown" align="center">Unknown</td>
+ </tr>
+ <tr class="open" id="2852">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2852.html">2852</a></td>
+ <td>open</td>
+ <td>Complete-class contexts and class-scope lambdas</td>
+ <td align="center">Not resolved</td>
+ </tr>
+ <tr id="2853">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2853.html">2853</a></td>
+ <td>DR</td>
+ <td>Pointer arithmetic with pointer to hypothetical element</td>
+ <td class="unknown" align="center">Unknown</td>
+ </tr>
+ <tr id="2854">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2854.html">2854</a></td>
+ <td>DR</td>
+ <td>Storage duration of exception objects</td>
+ <td class="unknown" align="center">Unknown</td>
+ </tr>
+ <tr id="2855">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2855.html">2855</a></td>
+ <td>DR</td>
+ <td>Undefined behavior in postfix increment</td>
+ <td class="unknown" align="center">Unknown</td>
+ </tr>
+ <tr id="2856">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2856.html">2856</a></td>
+ <td>DR</td>
+ <td>Copy-list-initialization with explicit default constructors</td>
+ <td class="unknown" align="center">Unknown</td>
+ </tr>
+ <tr id="2857">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2857.html">2857</a></td>
+ <td>DR</td>
+ <td>Argument-dependent lookup with incomplete class types</td>
+ <td class="unknown" align="center">Unknown</td>
+ </tr>
+ <tr class="open" id="2858">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2858.html">2858</a></td>
+ <td>tentatively ready</td>
+ <td>Declarative <I>nested-name-specifier</I>s and <I>pack-index-specifier</I>s</td>
+ <td title="Clang 19 implements 2024-04-05 resolution" align="center">Not Resolved*</td>
+ </tr>
+ <tr class="open" id="2859">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2859.html">2859</a></td>
+ <td>tentatively ready</td>
+ <td>Value-initialization with multiple default constructors</td>
+ <td align="center">Not resolved</td>
+ </tr>
+ <tr id="2860">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2860.html">2860</a></td>
+ <td>dup</td>
+ <td>Remove and fix the term "vacuous initialization"</td>
+ <td class="unknown" align="center">Unknown</td>
+ </tr>
+ <tr class="open" id="2861">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2861.html">2861</a></td>
+ <td>tentatively ready</td>
+ <td><TT>dynamic_cast</TT> on bad pointer value</td>
+ <td align="center">Not resolved</td>
+ </tr>
+ <tr class="open" id="2862">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2862.html">2862</a></td>
+ <td>tentatively ready</td>
+ <td>Unclear boundaries of template declarations</td>
+ <td align="center">Not resolved</td>
+ </tr>
+ <tr class="open" id="2863">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2863.html">2863</a></td>
+ <td>tentatively ready</td>
+ <td>Unclear synchronization requirements for object lifetime rules</td>
+ <td align="center">Not resolved</td>
+ </tr>
+ <tr class="open" id="2864">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2864.html">2864</a></td>
+ <td>tentatively ready</td>
+ <td>Narrowing floating-point conversions</td>
+ <td align="center">Not resolved</td>
+ </tr>
+ <tr class="open" id="2865">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2865.html">2865</a></td>
+ <td>open</td>
+ <td>Regression on result of conditional operator</td>
+ <td align="center">Not resolved</td>
+ </tr>
+ <tr class="open" id="2866">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2866.html">2866</a></td>
+ <td>open</td>
+ <td>Observing the effects of <TT>[[no_unique_address]]</TT></td>
+ <td align="center">Not resolved</td>
+ </tr>
+ <tr class="open" id="2867">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2867.html">2867</a></td>
+ <td>open</td>
+ <td>Order of initialization for structured bindings</td>
+ <td align="center">Not resolved</td>
+ </tr>
+ <tr class="open" id="2868">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2868.html">2868</a></td>
+ <td>open</td>
+ <td>Self-references in trivially copyable objects as function return values</td>
+ <td align="center">Not resolved</td>
+ </tr>
+ <tr class="open" id="2869">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2869.html">2869</a></td>
+ <td>open</td>
+ <td><TT>this</TT> in local classes</td>
+ <td align="center">Not resolved</td>
+ </tr>
+ <tr class="open" id="2870">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2870.html">2870</a></td>
+ <td>open</td>
+ <td>Combining absent <I>encoding-prefix</I>es</td>
+ <td align="center">Not resolved</td>
+ </tr>
+ <tr class="open" id="2871">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2871.html">2871</a></td>
+ <td>tentatively ready</td>
+ <td>User-declared constructor templates inhibiting default constructors</td>
+ <td align="center">Not resolved</td>
+ </tr>
+ <tr class="open" id="2872">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2872.html">2872</a></td>
+ <td>open</td>
+ <td>Linkage and unclear "can be referred to"</td>
+ <td align="center">Not resolved</td>
+ </tr>
+ <tr class="open" id="2873">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2873.html">2873</a></td>
+ <td>open</td>
+ <td>Taking the address of a function involving template argument deduction</td>
+ <td align="center">Not resolved</td>
+ </tr>
+ <tr class="open" id="2874">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2874.html">2874</a></td>
+ <td>open</td>
+ <td>Qualified declarations of partial specializations</td>
+ <td align="center">Not resolved</td>
+ </tr>
+ <tr class="open" id="2875">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2875.html">2875</a></td>
+ <td>open</td>
+ <td>Missing support for round-tripping nullptr through indirection/address operators</td>
+ <td align="center">Not resolved</td>
+ </tr>
+ <tr class="open" id="2876">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2876.html">2876</a></td>
+ <td>open</td>
+ <td>Disambiguation of <TT>T x = delete("text")</TT></td>
+ <td align="center">Not resolved</td>
+ </tr>
+ <tr class="open" id="2877">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2877.html">2877</a></td>
+ <td>open</td>
+ <td>Type-only lookup for <I>using-enum-declarator</I></td>
+ <td align="center">Not resolved</td>
+ </tr>
+ <tr class="open" id="2878">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2878.html">2878</a></td>
+ <td>open</td>
+ <td>C-style casts to reference types</td>
+ <td align="center">Not resolved</td>
+ </tr>
+ <tr class="open" id="2879">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2879.html">2879</a></td>
+ <td>open</td>
+ <td>Undesired outcomes with <TT>const_cast</TT></td>
+ <td align="center">Not resolved</td>
+ </tr>
+ <tr class="open" id="2880">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2880.html">2880</a></td>
+ <td>open</td>
+ <td>Accessibility check for destructor of incomplete class type</td>
+ <td align="center">Not resolved</td>
+ </tr>
+ <tr class="open" id="2881">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2881.html">2881</a></td>
+ <td>open</td>
+ <td>Type restrictions for the explicit object parameter of a lambda</td>
+ <td align="center">Not resolved</td>
+ </tr>
+ <tr class="open" id="2882">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2882.html">2882</a></td>
+ <td>open</td>
+ <td>Unclear treatment of conversion to <TT>void</TT></td>
+ <td align="center">Not resolved</td>
+ </tr>
+ <tr class="open" id="2883">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2883.html">2883</a></td>
+ <td>open</td>
+ <td>Definition of "odr-usable" ignores lambda scopes</td>
+ <td align="center">Not resolved</td>
+ </tr>
+ <tr class="open" id="2884">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2884.html">2884</a></td>
+ <td>open</td>
+ <td>Qualified declarations of partial specializations</td>
+ <td align="center">Not resolved</td>
+ </tr>
+ <tr class="open" id="2885">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2885.html">2885</a></td>
+ <td>open</td>
+ <td>Non-eligible trivial default constructors</td>
+ <td align="center">Not resolved</td>
+ </tr>
+ <tr class="open" id="2886">
+ <td><a href="https://cplusplus.github.io/CWG/issues/2886.html">2886</a></td>
+ <td>open</td>
+ <td>Temporaries and trivial potentially-throwing special member functions</td>
+ <td align="center">Not resolved</td>
</tr></table>
</div>
diff --git a/clang/www/cxx_status.html b/clang/www/cxx_status.html
index c233171e63c8..d58c35b72c22 100755
--- a/clang/www/cxx_status.html
+++ b/clang/www/cxx_status.html
@@ -167,7 +167,7 @@ C++23, informally referred to as C++26.</p>
<tr>
<td>Disallow Binding a Returned Glvalue to a Temporary</td>
<td><a href="https://wg21.link/P2748R5">P2748R5</a></td>
- <td class="none" align="center">No</td>
+ <td class="full" align="center">Clang 19</td>
</tr>
<tr>
<td>Clarifying rules for brace elision in aggregate initialization</td>
@@ -187,7 +187,7 @@ C++23, informally referred to as C++26.</p>
<tr>
<td>Trivial infinite loops are not Undefined Behavior</td>
<td><a href="https://wg21.link/P2809R3">P2809R3</a> (<a href="#dr">DR</a>)</td>
- <td class="none" align="center">No</td>
+ <td class="unreleased" align="center">Clang 19</td>
</tr>
<tr>
<td>Erroneous behaviour for uninitialized reads</td>
diff --git a/clang/www/make_cxx_dr_status b/clang/www/make_cxx_dr_status
index 7c0cf77a1524..47c8b3bae4a1 100755
--- a/clang/www/make_cxx_dr_status
+++ b/clang/www/make_cxx_dr_status
@@ -5,7 +5,7 @@ latest_release = 18
clang_www_dir = os.path.dirname(__file__)
default_issue_list_path = os.path.join(clang_www_dir, 'cwg_index.html')
-issue_list_url = "https://www.open-std.org/jtc1/sc22/wg21/docs/cwg_index.html"
+issue_list_url = "https://raw.githubusercontent.com/cplusplus/CWG/gh-pages/issues/cwg_index.html"
output = os.path.join(clang_www_dir, 'cxx_dr_status.html')
dr_test_dir = os.path.join(clang_www_dir, '../test/CXX/drs')
@@ -138,10 +138,10 @@ def availability(issue):
unresolved_status = ''
proposed_resolution = ''
- unresolved_status_match = re.search(r' (open|drafting|review)', status)
+ unresolved_status_match = re.search(r' (open|drafting|review|tentatively ready)', status)
if unresolved_status_match:
unresolved_status = unresolved_status_match.group(1)
- proposed_resolution_match = re.search(r' (open|drafting|review) (\d{4}-\d{2}(?:-\d{2})?|P\d{4}R\d+)$', status)
+ proposed_resolution_match = re.search(r' (open|drafting|review|tentatively ready) (\d{4}-\d{2}(?:-\d{2})?|P\d{4}R\d+)$', status)
if proposed_resolution_match is None:
raise AvailabilityError('Issue {}: \'{}\' status should be followed by a paper number (P1234R5) or proposed resolution in YYYY-MM-DD format'.format(dr.issue, unresolved_status))
proposed_resolution = proposed_resolution_match.group(2)
@@ -236,7 +236,7 @@ for dr in drs:
avail = 'Extension'
avail_style = ''
- elif dr.status in ('open', 'drafting', 'review'):
+ elif dr.status in ('open', 'drafting', 'review', 'tentatively ready'):
row_style = ' class="open"'
try:
avail, avail_style, unresolved_status = availability(dr.issue)
diff --git a/compiler-rt/lib/scudo/standalone/allocator_config.def b/compiler-rt/lib/scudo/standalone/allocator_config.def
index 9691a007eed5..dcd130ac449a 100644
--- a/compiler-rt/lib/scudo/standalone/allocator_config.def
+++ b/compiler-rt/lib/scudo/standalone/allocator_config.def
@@ -89,6 +89,7 @@ PRIMARY_REQUIRED(const s32, MaxReleaseToOsIntervalMs)
// Indicates support for offsetting the start of a region by a random number of
// pages. This is only used if `EnableContiguousRegions` is enabled.
PRIMARY_OPTIONAL(const bool, EnableRandomOffset, false)
+PRIMARY_OPTIONAL(const s32, DefaultReleaseToOsIntervalMs, INT32_MIN)
// When `EnableContiguousRegions` is true, all regions will be arranged in
// adjacency. This will reduce the fragmentation caused by region allocations
@@ -118,6 +119,7 @@ SECONDARY_CACHE_OPTIONAL(const u32, DefaultMaxEntriesCount, 0)
SECONDARY_CACHE_OPTIONAL(const uptr, DefaultMaxEntrySize, 0)
SECONDARY_CACHE_OPTIONAL(const s32, MinReleaseToOsIntervalMs, INT32_MIN)
SECONDARY_CACHE_OPTIONAL(const s32, MaxReleaseToOsIntervalMs, INT32_MAX)
+SECONDARY_CACHE_OPTIONAL(const s32, DefaultReleaseToOsIntervalMs, INT32_MIN)
#undef SECONDARY_CACHE_OPTIONAL
#undef SECONDARY_REQUIRED_TEMPLATE_TYPE
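
The two new `DefaultReleaseToOsIntervalMs` entries let a platform allocator config bake in its own release interval instead of inheriting it from the `release_to_os_interval_ms` flag; `INT32_MIN` means "unset". A minimal sketch of a config using them (structure assumed from scudo's existing config-struct pattern; all other parameters elided):

```
// Hypothetical config sketch; only the new knobs are shown.
struct ExampleConfig {
  struct Primary {
    // ... required primary parameters elided ...
    static const s32 DefaultReleaseToOsIntervalMs = 1000; // 1 second
  };
  struct Secondary {
    struct Cache {
      // Leaving this at INT32_MIN would keep the flag in charge.
      static const s32 DefaultReleaseToOsIntervalMs = 2000; // 2 seconds
    };
  };
};
```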
diff --git a/compiler-rt/lib/scudo/standalone/combined.h b/compiler-rt/lib/scudo/standalone/combined.h
index e7bc90cd0960..927513dea92d 100644
--- a/compiler-rt/lib/scudo/standalone/combined.h
+++ b/compiler-rt/lib/scudo/standalone/combined.h
@@ -173,6 +173,9 @@ public:
static_cast<u32>(getFlags()->quarantine_max_chunk_size);
Stats.init();
+ // TODO(chiahungduan): Given that we support setting the default value in
+  // the PrimaryConfig and CacheConfig, consider deprecating the use of the
+ // `release_to_os_interval_ms` flag.
const s32 ReleaseToOsIntervalMs = getFlags()->release_to_os_interval_ms;
Primary.init(ReleaseToOsIntervalMs);
Secondary.init(&Stats, ReleaseToOsIntervalMs);
diff --git a/compiler-rt/lib/scudo/standalone/flags.inc b/compiler-rt/lib/scudo/standalone/flags.inc
index f5a2bab5057a..ff0c28e1db7c 100644
--- a/compiler-rt/lib/scudo/standalone/flags.inc
+++ b/compiler-rt/lib/scudo/standalone/flags.inc
@@ -42,7 +42,7 @@ SCUDO_FLAG(bool, may_return_null, true,
"returning NULL in otherwise non-fatal error scenarios, eg: OOM, "
"invalid allocation alignments, etc.")
-SCUDO_FLAG(int, release_to_os_interval_ms, SCUDO_ANDROID ? INT32_MIN : 5000,
+SCUDO_FLAG(int, release_to_os_interval_ms, 5000,
"Interval (in milliseconds) at which to attempt release of unused "
"memory to the OS. Negative values disable the feature.")
diff --git a/compiler-rt/lib/scudo/standalone/primary32.h b/compiler-rt/lib/scudo/standalone/primary32.h
index 1d8a77b73e5c..ebfb8dfe0a31 100644
--- a/compiler-rt/lib/scudo/standalone/primary32.h
+++ b/compiler-rt/lib/scudo/standalone/primary32.h
@@ -88,6 +88,10 @@ public:
Sci->MinRegionIndex = NumRegions;
Sci->ReleaseInfo.LastReleaseAtNs = Time;
}
+
+ // The default value in the primary config has the higher priority.
+ if (Config::getDefaultReleaseToOsIntervalMs() != INT32_MIN)
+ ReleaseToOsInterval = Config::getDefaultReleaseToOsIntervalMs();
setOption(Option::ReleaseInterval, static_cast<sptr>(ReleaseToOsInterval));
}
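
The same three-line override appears in `primary32.h`, `primary64.h`, and the secondary cache, so the effective precedence (a sketch directly mirroring the hunks) is: config default if set, otherwise the flag value:

```
// Interval resolution as implemented in the three init paths.
s32 Interval = FlagProvidedMs; // from release_to_os_interval_ms
if (Config::getDefaultReleaseToOsIntervalMs() != INT32_MIN)
  Interval = Config::getDefaultReleaseToOsIntervalMs(); // config wins
setOption(Option::ReleaseInterval, static_cast<sptr>(Interval));
```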
diff --git a/compiler-rt/lib/scudo/standalone/primary64.h b/compiler-rt/lib/scudo/standalone/primary64.h
index 61d57976ae43..bed2ccb8b992 100644
--- a/compiler-rt/lib/scudo/standalone/primary64.h
+++ b/compiler-rt/lib/scudo/standalone/primary64.h
@@ -147,6 +147,9 @@ public:
for (uptr I = 0; I < NumClasses; I++)
getRegionInfo(I)->FLLockCV.bindTestOnly(getRegionInfo(I)->FLLock);
+ // The default value in the primary config has the higher priority.
+ if (Config::getDefaultReleaseToOsIntervalMs() != INT32_MIN)
+ ReleaseToOsInterval = Config::getDefaultReleaseToOsIntervalMs();
setOption(Option::ReleaseInterval, static_cast<sptr>(ReleaseToOsInterval));
}
@@ -884,9 +887,10 @@ private:
ScopedLock ML(Region->MMLock);
const bool RegionIsExhausted = Region->Exhausted;
- if (!RegionIsExhausted)
+ if (!RegionIsExhausted) {
PopCount = populateFreeListAndPopBlocks(C, ClassId, Region, ToArray,
MaxBlockCount);
+ }
ReportRegionExhausted = !RegionIsExhausted && Region->Exhausted;
{
@@ -1019,7 +1023,6 @@ private:
MAP_ALLOWNOMEM))) {
Printf("Can't reserve pages for size class %zu.\n",
getSizeByClassId(ClassId));
- Region->Exhausted = true;
return 0U;
}
initRegion(Region, ClassId,
diff --git a/compiler-rt/lib/scudo/standalone/secondary.h b/compiler-rt/lib/scudo/standalone/secondary.h
index 674af5071775..d8c9f5bcfcaf 100644
--- a/compiler-rt/lib/scudo/standalone/secondary.h
+++ b/compiler-rt/lib/scudo/standalone/secondary.h
@@ -209,6 +209,9 @@ public:
static_cast<sptr>(Config::getDefaultMaxEntriesCount()));
setOption(Option::MaxCacheEntrySize,
static_cast<sptr>(Config::getDefaultMaxEntrySize()));
+ // The default value in the cache config has the higher priority.
+ if (Config::getDefaultReleaseToOsIntervalMs() != INT32_MIN)
+ ReleaseToOsInterval = Config::getDefaultReleaseToOsIntervalMs();
setOption(Option::ReleaseInterval, static_cast<sptr>(ReleaseToOsInterval));
}
diff --git a/compiler-rt/lib/scudo/standalone/wrappers_c.inc b/compiler-rt/lib/scudo/standalone/wrappers_c.inc
index 21d5b7add512..59f3fb0962f8 100644
--- a/compiler-rt/lib/scudo/standalone/wrappers_c.inc
+++ b/compiler-rt/lib/scudo/standalone/wrappers_c.inc
@@ -252,13 +252,11 @@ INTERFACE WEAK int SCUDO_PREFIX(mallopt)(int param, int value) {
// introduced by interval transition.
SCUDO_ALLOCATOR.releaseToOS(scudo::ReleaseToOS::Force);
- if (value == 0) {
- // Will set the release values to their minimum values.
- value = INT32_MIN;
- } else {
- // Will set the release values to their maximum values.
+ // The values allowed on Android are {-1, 0, 1}. "1" means the longest
+ // interval.
+ CHECK(value >= -1 && value <= 1);
+ if (value == 1)
value = INT32_MAX;
- }
}
SCUDO_ALLOCATOR.setOption(scudo::Option::ReleaseInterval,
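
For the Android-facing wrapper, the observable contract changes as follows (a hedged caller-side sketch, assuming this is the `M_DECAY_TIME` branch of `mallopt`):

```
#include <malloc.h>

int main() {
  mallopt(M_DECAY_TIME, 1);  // rewritten to INT32_MAX: longest interval
  mallopt(M_DECAY_TIME, 0);  // kept as 0: release unused memory eagerly
  mallopt(M_DECAY_TIME, -1); // negative: disables periodic release
  // Any other value (e.g. 2) now aborts via CHECK(value >= -1 && value <= 1).
  return 0;
}
```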
diff --git a/compiler-rt/test/asan/TestCases/Darwin/odr-lto.cpp b/compiler-rt/test/asan/TestCases/Darwin/odr-lto.cpp
index 90c16776a63b..8d400800fe93 100644
--- a/compiler-rt/test/asan/TestCases/Darwin/odr-lto.cpp
+++ b/compiler-rt/test/asan/TestCases/Darwin/odr-lto.cpp
@@ -5,7 +5,7 @@
// RUN: %clangxx_asan -DPART=0 -c %s -o %t-1.o -flto -mllvm -asan-use-private-alias
// RUN: %clangxx_asan -DPART=1 -c %s -o %t-2.o -flto -mllvm -asan-use-private-alias
-// RUN: %clangxx_asan_lto %t-1.o %t-2.o -o %t -flto -mlinker-version=133
+// RUN: %clangxx_asan_lto %t-1.o %t-2.o -o %t -flto
// RUN: %run %t 2>&1 | FileCheck %s
#include <stdio.h>
diff --git a/flang/cmake/modules/AddFlangOffloadRuntime.cmake b/flang/cmake/modules/AddFlangOffloadRuntime.cmake
index e34d3851187a..0af12c8cfd54 100644
--- a/flang/cmake/modules/AddFlangOffloadRuntime.cmake
+++ b/flang/cmake/modules/AddFlangOffloadRuntime.cmake
@@ -2,6 +2,10 @@ option(FLANG_EXPERIMENTAL_CUDA_RUNTIME
"Compile Fortran runtime as CUDA sources (experimental)" OFF
)
+option(FLANG_CUDA_RUNTIME_PTX_WITHOUT_GLOBAL_VARS
+  "Do not compile global variables' definitions when producing the PTX library" OFF
+ )
+
set(FLANG_LIBCUDACXX_PATH "" CACHE PATH "Path to libcu++ package installation")
set(FLANG_EXPERIMENTAL_OMP_OFFLOAD_BUILD "off" CACHE STRING
@@ -56,6 +60,11 @@ macro(enable_cuda_compilation name files)
# Add an OBJECT library consisting of CUDA PTX.
llvm_add_library(${name}PTX OBJECT PARTIAL_SOURCES_INTENDED ${files})
set_property(TARGET obj.${name}PTX PROPERTY CUDA_PTX_COMPILATION ON)
+ if (FLANG_CUDA_RUNTIME_PTX_WITHOUT_GLOBAL_VARS)
+ target_compile_definitions(obj.${name}PTX
+ PRIVATE FLANG_RUNTIME_NO_GLOBAL_VAR_DEFS
+ )
+ endif()
endif()
endmacro()
diff --git a/flang/docs/FlangDriver.md b/flang/docs/FlangDriver.md
index ac120b4ff09b..351595ac0afd 100644
--- a/flang/docs/FlangDriver.md
+++ b/flang/docs/FlangDriver.md
@@ -179,46 +179,20 @@ like this:
```
$ flang -v -o example example.o
-"/usr/bin/ld" [...] example.o [...] "--whole-archive" "-lFortran_main"
-"--no-whole-archive" "-lFortranRuntime" "-lFortranDecimal" [...]
+"/usr/bin/ld" [...] example.o [...] "-lFortranRuntime" "-lFortranDecimal" [...]
```
The automatically added libraries are:
-* `Fortran_main`: Provides the main entry point `main` that then invokes
- `_QQmain` with the Fortran program unit. This library has a dependency to
- the `FortranRuntime` library.
* `FortranRuntime`: Provides most of the Flang runtime library.
* `FortranDecimal`: Provides operations for decimal numbers.
-The default is that, when using Flang as the linker, one of the Fortran
-translation units provides the program unit and therefore it is assumed that
-Fortran is the main code part (calling into C/C++ routines via `BIND (C)`
-interfaces). When composing the linker commandline, Flang uses
-`--whole-archive` and `--no-whole-archive` (Windows: `/WHOLEARCHIVE:`,
-Darwin & AIX: *not implemented yet*) to make sure that all for `Fortran_main`
-is processed by the linker. This is done to issue a proper error message when
-multiple definitions of `main` occur. This happens, for instance, when linking
-a code that has a Fortran program unit with a C/C++ code that also defines a
-`main` function. A user may be required to explicitly provide the C++ runtime
-libraries at link time (e.g., via `-lstdc++` for STL)
-
If the code is C/C++ based and invokes Fortran routines, one can either use Clang
or Flang as the linker driver. If Clang is used, it will automatically add all
required runtime libraries needed by C++ (e.g., for STL) to the linker invocation.
In this case, one has to explicitly provide the Fortran runtime libraries
-`FortranRuntime` and/or `FortranDecimal`. An alternative is to use Flang to link
-and use the `-fno-fortran-main` flag. This flag removes
-`Fortran_main` from the linker stage and hence requires one of the C/C++
-translation units to provide a definition of the `main` function. In this case,
-it may be required to explicitly supply C++ runtime libraries as mentioned above.
-
-When creating shared or static libraries using Flang with `-shared` or `-static`
-flag, Fortran_main is automatically removed from the linker stage (i.e.,
-`-fno-fortran-main` is on by default). It is assumed that when creating a
-static or shared library, the generated library does not need a `main`
-function, as a final link stage will occur that will provide the `Fortran_main`
-library when creating the final executable.
+`FortranRuntime` and/or `FortranDecimal`. An alternative is to use Flang to link.
+In this case, it may be required to explicitly supply C++ runtime libraries.
On Darwin, the logical root where the system libraries are located (sysroot)
must be specified. This can be done with the CMake build flag `DEFAULT_SYSROOT`
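
To make the mixed-language case above concrete (names hypothetical): a C++ `main` can call a `BIND(C)` Fortran routine and, now that `Fortran_main` is gone, only the two runtime libraries need to be supplied at link time:

```
// main.cpp -- hypothetical example. The Fortran side would provide:
//   subroutine do_work() bind(c, name="do_work")
extern "C" void do_work();

int main() {
  do_work(); // calls into Fortran; no Fortran_main required
  return 0;
}
// Link sketch: clang++ main.cpp work.o -lFortranRuntime -lFortranDecimal
```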
diff --git a/flang/include/flang/Common/visit.h b/flang/include/flang/Common/visit.h
index d672ed49b936..d867338be7e0 100644
--- a/flang/include/flang/Common/visit.h
+++ b/flang/include/flang/Common/visit.h
@@ -88,7 +88,7 @@ inline RT_API_ATTRS auto visit(VISITOR &&visitor, VARIANT &&...u)
// Some versions of clang have bugs that cause compilation to hang
// on these templates. MSVC and older GCC versions may work but are
// not well tested. So enable only for GCC 9 and better.
-#if __GNUC__ < 9
+#if __GNUC__ < 9 && !defined(__clang__)
#define FLANG_USE_STD_VISIT
#endif
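
The added `!defined(__clang__)` matters because clang also defines `__GNUC__` (by default as an old GCC version), so the previous guard silently enabled `FLANG_USE_STD_VISIT` under clang as well. A quick illustration, assuming clang's default `-fgnuc-version`:

```
#if defined(__clang__) && defined(__GNUC__)
// Holds under clang's defaults: clang reports itself as GCC < 9, which
// is exactly why the old "__GNUC__ < 9" test matched it.
static_assert(__GNUC__ < 9, "clang masquerades as an old GCC");
#endif
```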
diff --git a/flang/include/flang/Lower/Mangler.h b/flang/include/flang/Lower/Mangler.h
index 41939abe29e5..99da96b0d6ba 100644
--- a/flang/include/flang/Lower/Mangler.h
+++ b/flang/include/flang/Lower/Mangler.h
@@ -90,7 +90,7 @@ inline std::string mangleArrayLiteral(
return mangleArrayLiteral(x.values().size() * sizeof(x.values()[0]),
x.shape(), Fortran::common::TypeCategory::Derived,
/*kind=*/0, /*charLen=*/-1,
- eleTy.cast<fir::RecordType>().getName());
+ mlir::cast<fir::RecordType>(eleTy).getName());
}
/// Return the compiler-generated name of a static namelist variable descriptor.
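
This hunk is one of many mechanical migrations in this patch from the deprecated member-function casts on `mlir::Type`/`mlir::Attribute` to the free-function forms. The pattern, shown on a hypothetical `mlir::Type ty`:

```
// Old (deprecated)              ->  New
// ty.isa<T>()                   ->  mlir::isa<T>(ty)
// ty.cast<T>()                  ->  mlir::cast<T>(ty)
// ty.dyn_cast<T>()              ->  mlir::dyn_cast<T>(ty)
// ty.dyn_cast_or_null<T>()      ->  mlir::dyn_cast_or_null<T>(ty)
if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(ty))
  consume(seqTy); // hypothetical helper
```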
diff --git a/flang/include/flang/Optimizer/Analysis/TBAAForest.h b/flang/include/flang/Optimizer/Analysis/TBAAForest.h
index b69e50bbe05c..619ed4939c51 100644
--- a/flang/include/flang/Optimizer/Analysis/TBAAForest.h
+++ b/flang/include/flang/Optimizer/Analysis/TBAAForest.h
@@ -88,7 +88,7 @@ public:
// name must be used so that we add to the tbaa tree added in the FIR pass
mlir::Attribute attr = func->getAttr(getInternalFuncNameAttrName());
if (attr) {
- return getFuncTree(attr.cast<mlir::StringAttr>());
+ return getFuncTree(mlir::cast<mlir::StringAttr>(attr));
}
return getFuncTree(func.getSymNameAttr());
}
diff --git a/flang/include/flang/Optimizer/Builder/BoxValue.h b/flang/include/flang/Optimizer/Builder/BoxValue.h
index 2fed2d48a7a0..5c7e89dbc08f 100644
--- a/flang/include/flang/Optimizer/Builder/BoxValue.h
+++ b/flang/include/flang/Optimizer/Builder/BoxValue.h
@@ -78,7 +78,7 @@ class CharBoxValue : public AbstractBox {
public:
CharBoxValue(mlir::Value addr, mlir::Value len)
: AbstractBox{addr}, len{len} {
- if (addr && addr.getType().template isa<fir::BoxCharType>())
+ if (addr && mlir::isa<fir::BoxCharType>(addr.getType()))
fir::emitFatalError(addr.getLoc(),
"BoxChar should not be in CharBoxValue");
}
@@ -221,7 +221,7 @@ public:
auto type = getAddr().getType();
if (auto pointedTy = fir::dyn_cast_ptrEleTy(type))
type = pointedTy;
- return type.cast<fir::BaseBoxType>();
+ return mlir::cast<fir::BaseBoxType>(type);
}
/// Return the part of the address type after memory and box types. That is
/// the element type, maybe wrapped in a fir.array type.
@@ -243,22 +243,22 @@ public:
/// Get the scalar type related to the described entity
mlir::Type getEleTy() const {
auto type = getBaseTy();
- if (auto seqTy = type.dyn_cast<fir::SequenceType>())
+ if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(type))
return seqTy.getEleTy();
return type;
}
/// Is the entity an array or an assumed rank ?
- bool hasRank() const { return getBaseTy().isa<fir::SequenceType>(); }
+ bool hasRank() const { return mlir::isa<fir::SequenceType>(getBaseTy()); }
/// Is this an assumed rank ?
bool hasAssumedRank() const {
- auto seqTy = getBaseTy().dyn_cast<fir::SequenceType>();
+ auto seqTy = mlir::dyn_cast<fir::SequenceType>(getBaseTy());
return seqTy && seqTy.hasUnknownShape();
}
/// Returns the rank of the entity. Beware that zero will be returned for
/// both scalars and assumed rank.
unsigned rank() const {
- if (auto seqTy = getBaseTy().dyn_cast<fir::SequenceType>())
+ if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(getBaseTy()))
return seqTy.getDimension();
return 0;
}
@@ -267,7 +267,7 @@ public:
bool isCharacter() const { return fir::isa_char(getEleTy()); }
/// Is this a derived type entity ?
- bool isDerived() const { return getEleTy().isa<fir::RecordType>(); }
+ bool isDerived() const { return mlir::isa<fir::RecordType>(getEleTy()); }
bool isDerivedWithLenParameters() const {
return fir::isRecordWithTypeParameters(getEleTy());
@@ -377,11 +377,11 @@ public:
}
/// Is this a Fortran pointer ?
bool isPointer() const {
- return getBoxTy().getEleTy().isa<fir::PointerType>();
+ return mlir::isa<fir::PointerType>(getBoxTy().getEleTy());
}
/// Is this an allocatable ?
bool isAllocatable() const {
- return getBoxTy().getEleTy().isa<fir::HeapType>();
+ return mlir::isa<fir::HeapType>(getBoxTy().getEleTy());
}
// Replace the fir.ref<fir.box>, keeping any non-deferred parameters.
MutableBoxValue clone(mlir::Value newBox) const {
@@ -488,7 +488,7 @@ public:
if (const auto *b = getUnboxed()) {
if (*b) {
auto type = b->getType();
- if (type.template isa<fir::BoxCharType>())
+ if (mlir::isa<fir::BoxCharType>(type))
fir::emitFatalError(b->getLoc(), "BoxChar should be unboxed");
type = fir::unwrapSequenceType(fir::unwrapRefType(type));
if (fir::isa_char(type))
diff --git a/flang/include/flang/Optimizer/Builder/Factory.h b/flang/include/flang/Optimizer/Builder/Factory.h
index ec294d26ac96..4e5c52ac44e0 100644
--- a/flang/include/flang/Optimizer/Builder/Factory.h
+++ b/flang/include/flang/Optimizer/Builder/Factory.h
@@ -43,9 +43,9 @@ template <typename B>
void genCharacterCopy(mlir::Value src, mlir::Value srcLen, mlir::Value dst,
mlir::Value dstLen, B &builder, mlir::Location loc) {
auto srcTy =
- fir::dyn_cast_ptrEleTy(src.getType()).template cast<fir::CharacterType>();
+ mlir::cast<fir::CharacterType>(fir::dyn_cast_ptrEleTy(src.getType()));
auto dstTy =
- fir::dyn_cast_ptrEleTy(dst.getType()).template cast<fir::CharacterType>();
+ mlir::cast<fir::CharacterType>(fir::dyn_cast_ptrEleTy(dst.getType()));
if (!srcLen && !dstLen && srcTy.getFKind() == dstTy.getFKind() &&
srcTy.getLen() == dstTy.getLen()) {
// same size, so just use load and store
@@ -61,8 +61,8 @@ void genCharacterCopy(mlir::Value src, mlir::Value srcLen, mlir::Value dst,
fir::CharacterType::getSingleton(ty.getContext(), ty.getFKind())));
};
auto toEleTy = [&](fir::ReferenceType ty) {
- auto seqTy = ty.getEleTy().cast<fir::SequenceType>();
- return seqTy.getEleTy().cast<fir::CharacterType>();
+ auto seqTy = mlir::cast<fir::SequenceType>(ty.getEleTy());
+ return mlir::cast<fir::CharacterType>(seqTy.getEleTy());
};
auto toCoorTy = [&](fir::ReferenceType ty) {
return fir::ReferenceType::get(toEleTy(ty));
@@ -190,8 +190,8 @@ originateIndices(mlir::Location loc, B &builder, mlir::Type memTy,
if (origins.empty()) {
assert(!shapeVal || mlir::isa<fir::ShapeOp>(shapeVal.getDefiningOp()));
auto ty = fir::dyn_cast_ptrOrBoxEleTy(memTy);
- assert(ty && ty.isa<fir::SequenceType>());
- auto seqTy = ty.cast<fir::SequenceType>();
+ assert(ty && mlir::isa<fir::SequenceType>(ty));
+ auto seqTy = mlir::cast<fir::SequenceType>(ty);
auto one = builder.template create<mlir::arith::ConstantIndexOp>(loc, 1);
const auto dimension = seqTy.getDimension();
if (shapeVal) {
diff --git a/flang/include/flang/Optimizer/Builder/HLFIRTools.h b/flang/include/flang/Optimizer/Builder/HLFIRTools.h
index 035035601e2f..6c36f7e84db6 100644
--- a/flang/include/flang/Optimizer/Builder/HLFIRTools.h
+++ b/flang/include/flang/Optimizer/Builder/HLFIRTools.h
@@ -77,12 +77,12 @@ public:
/// Return the rank of this entity or -1 if it is an assumed rank.
int getRank() const {
mlir::Type type = fir::unwrapPassByRefType(fir::unwrapRefType(getType()));
- if (auto seqTy = type.dyn_cast<fir::SequenceType>()) {
+ if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(type)) {
if (seqTy.hasUnknownShape())
return -1;
return seqTy.getDimension();
}
- if (auto exprType = type.dyn_cast<hlfir::ExprType>())
+ if (auto exprType = mlir::dyn_cast<hlfir::ExprType>(type))
return exprType.getRank();
return 0;
}
@@ -99,17 +99,17 @@ public:
bool hasLengthParameters() const {
mlir::Type eleTy = getFortranElementType();
- return eleTy.isa<fir::CharacterType>() ||
+ return mlir::isa<fir::CharacterType>(eleTy) ||
fir::isRecordWithTypeParameters(eleTy);
}
bool isCharacter() const {
- return getFortranElementType().isa<fir::CharacterType>();
+ return mlir::isa<fir::CharacterType>(getFortranElementType());
}
bool hasIntrinsicType() const {
mlir::Type eleTy = getFortranElementType();
- return fir::isa_trivial(eleTy) || eleTy.isa<fir::CharacterType>();
+ return fir::isa_trivial(eleTy) || mlir::isa<fir::CharacterType>(eleTy);
}
bool isDerivedWithLengthParameters() const {
@@ -124,8 +124,8 @@ public:
if (auto varIface = getIfVariableInterface()) {
if (auto shape = varIface.getShape()) {
auto shapeTy = shape.getType();
- return shapeTy.isa<fir::ShiftType>() ||
- shapeTy.isa<fir::ShapeShiftType>();
+ return mlir::isa<fir::ShiftType>(shapeTy) ||
+ mlir::isa<fir::ShapeShiftType>(shapeTy);
}
return false;
}
diff --git a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
index 604f2bd969ee..b7d060926761 100644
--- a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
+++ b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
@@ -663,8 +663,8 @@ static inline mlir::FunctionType genFuncType(mlir::MLIRContext *context,
//===----------------------------------------------------------------------===//
static inline mlir::Type getConvertedElementType(mlir::MLIRContext *context,
mlir::Type eleTy) {
- if (eleTy.isa<mlir::IntegerType>() && !eleTy.isSignlessInteger()) {
- const auto intTy{eleTy.dyn_cast<mlir::IntegerType>()};
+ if (mlir::isa<mlir::IntegerType>(eleTy) && !eleTy.isSignlessInteger()) {
+ const auto intTy{mlir::dyn_cast<mlir::IntegerType>(eleTy)};
auto newEleTy{mlir::IntegerType::get(context, intTy.getWidth())};
return newEleTy;
}
diff --git a/flang/include/flang/Optimizer/Builder/PPCIntrinsicCall.h b/flang/include/flang/Optimizer/Builder/PPCIntrinsicCall.h
index 1e87bf0f6ad1..a7c4c075d818 100644
--- a/flang/include/flang/Optimizer/Builder/PPCIntrinsicCall.h
+++ b/flang/include/flang/Optimizer/Builder/PPCIntrinsicCall.h
@@ -180,10 +180,10 @@ struct VecTypeInfo {
// Returns a VecTypeInfo with element type and length of given fir vector type.
// Preserves signness of fir vector type if element type of integer.
static inline VecTypeInfo getVecTypeFromFirType(mlir::Type firTy) {
- assert(firTy.isa<fir::VectorType>());
+ assert(mlir::isa<fir::VectorType>(firTy));
VecTypeInfo vecTyInfo;
- vecTyInfo.eleTy = firTy.dyn_cast<fir::VectorType>().getEleTy();
- vecTyInfo.len = firTy.dyn_cast<fir::VectorType>().getLen();
+ vecTyInfo.eleTy = mlir::dyn_cast<fir::VectorType>(firTy).getEleTy();
+ vecTyInfo.len = mlir::dyn_cast<fir::VectorType>(firTy).getLen();
return vecTyInfo;
}
diff --git a/flang/include/flang/Optimizer/Builder/Runtime/EnvironmentDefaults.h b/flang/include/flang/Optimizer/Builder/Runtime/EnvironmentDefaults.h
index 18a24bad3960..216d3bcec137 100755
--- a/flang/include/flang/Optimizer/Builder/Runtime/EnvironmentDefaults.h
+++ b/flang/include/flang/Optimizer/Builder/Runtime/EnvironmentDefaults.h
@@ -22,6 +22,7 @@
namespace fir {
class FirOpBuilder;
+class GlobalOp;
} // namespace fir
namespace mlir {
@@ -37,7 +38,7 @@ namespace fir::runtime {
/// Create the list of environment variable defaults for the runtime to set. The
/// form of the generated list is defined in the runtime header file
/// environment-default-list.h
-void genEnvironmentDefaults(
+fir::GlobalOp genEnvironmentDefaults(
fir::FirOpBuilder &builder, mlir::Location loc,
const std::vector<Fortran::lower::EnvironmentDefault> &envDefaults);
diff --git a/flang/include/flang/Optimizer/Builder/Runtime/Main.h b/flang/include/flang/Optimizer/Builder/Runtime/Main.h
new file mode 100644
index 000000000000..62faf46e1fc7
--- /dev/null
+++ b/flang/include/flang/Optimizer/Builder/Runtime/Main.h
@@ -0,0 +1,28 @@
+//===-- Main.h - generate main runtime API calls ----------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef FORTRAN_OPTIMIZER_BUILDER_RUNTIME_MAIN_H
+#define FORTRAN_OPTIMIZER_BUILDER_RUNTIME_MAIN_H
+
+namespace mlir {
+class Location;
+} // namespace mlir
+
+namespace fir {
+class FirOpBuilder;
+class GlobalOp;
+} // namespace fir
+
+namespace fir::runtime {
+
+void genMain(fir::FirOpBuilder &builder, mlir::Location loc,
+ fir::GlobalOp &env);
+
+}
+
+#endif // FORTRAN_OPTIMIZER_BUILDER_RUNTIME_MAIN_H
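
A hedged sketch of how the changed `genEnvironmentDefaults` and the new `genMain` are meant to compose during lowering (`builder`, `loc`, and `envDefaults` assumed in scope):

```
// genEnvironmentDefaults now returns the fir.global it creates, which
// genMain consumes when emitting the main entry-point glue.
fir::GlobalOp env =
    fir::runtime::genEnvironmentDefaults(builder, loc, envDefaults);
fir::runtime::genMain(builder, loc, env);
```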
diff --git a/flang/include/flang/Optimizer/Dialect/CanonicalizationPatterns.td b/flang/include/flang/Optimizer/Dialect/CanonicalizationPatterns.td
index 544fc3cdf75e..0ef37a37ce94 100644
--- a/flang/include/flang/Optimizer/Dialect/CanonicalizationPatterns.td
+++ b/flang/include/flang/Optimizer/Dialect/CanonicalizationPatterns.td
@@ -21,17 +21,18 @@ include "flang/Optimizer/Dialect/FIROps.td"
def IdenticalTypePred : Constraint<CPred<"$0.getType() == $1.getType()">>;
def IntegerTypePred : Constraint<CPred<"fir::isa_integer($0.getType())">>;
-def IndexTypePred : Constraint<CPred<"$0.getType().isa<mlir::IndexType>()">>;
+def IndexTypePred : Constraint<CPred<
+ "mlir::isa<mlir::IndexType>($0.getType())">>;
// Widths are monotonic.
// $0.bits >= $1.bits >= $2.bits or $0.bits <= $1.bits <= $2.bits
def MonotonicTypePred
- : Constraint<CPred<"(($0.getType().isa<mlir::IntegerType>() && "
- " $1.getType().isa<mlir::IntegerType>() && "
- " $2.getType().isa<mlir::IntegerType>()) || "
- " ($0.getType().isa<mlir::FloatType>() && "
- " $1.getType().isa<mlir::FloatType>() && "
- " $2.getType().isa<mlir::FloatType>())) && "
+ : Constraint<CPred<"((mlir::isa<mlir::IntegerType>($0.getType()) && "
+ " mlir::isa<mlir::IntegerType>($1.getType()) && "
+ " mlir::isa<mlir::IntegerType>($2.getType())) || "
+ " (mlir::isa<mlir::FloatType>($0.getType()) && "
+ " mlir::isa<mlir::FloatType>($1.getType()) && "
+ " mlir::isa<mlir::FloatType>($2.getType()))) && "
"(($0.getType().getIntOrFloatBitWidth() <= "
" $1.getType().getIntOrFloatBitWidth() && "
" $1.getType().getIntOrFloatBitWidth() <= "
@@ -42,8 +43,8 @@ def MonotonicTypePred
" $2.getType().getIntOrFloatBitWidth()))">>;
def IntPred : Constraint<CPred<
- "$0.getType().isa<mlir::IntegerType>() && "
- "$1.getType().isa<mlir::IntegerType>()">>;
+ "mlir::isa<mlir::IntegerType>($0.getType()) && "
+ "mlir::isa<mlir::IntegerType>($1.getType())">>;
// If both are int type and the first is smaller than the second.
// $0.bits <= $1.bits
@@ -101,8 +102,8 @@ def CombineConvertTruncOptPattern
def createConstantOp
: NativeCodeCall<"$_builder.create<mlir::arith::ConstantOp>"
"($_loc, $_builder.getIndexType(), "
- "rewriter.getIndexAttr($1.dyn_cast<mlir::IntegerAttr>()"
- ".getInt()))">;
+ "rewriter.getIndexAttr("
+ "mlir::dyn_cast<mlir::IntegerAttr>($1).getInt()))">;
def ForwardConstantConvertPattern
: Pat<(fir_ConvertOp:$res (Arith_ConstantOp:$cnt $attr)),
diff --git a/flang/include/flang/Optimizer/Dialect/FIROps.td b/flang/include/flang/Optimizer/Dialect/FIROps.td
index 92790a691e47..496193e25cab 100644
--- a/flang/include/flang/Optimizer/Dialect/FIROps.td
+++ b/flang/include/flang/Optimizer/Dialect/FIROps.td
@@ -2708,14 +2708,14 @@ def fir_ConvertOp : fir_OneResultOp<"convert", [NoMemoryEffect]> {
let hasCanonicalizer = 1;
}
-def FortranTypeAttr : Attr<And<[CPred<"$_self.isa<mlir::TypeAttr>()">,
- Or<[CPred<"$_self.cast<mlir::TypeAttr>().getValue().isa<fir::CharacterType,"
- "fir::ComplexType, fir::IntegerType, fir::LogicalType,"
- "fir::RealType, fir::RecordType>()">]>]>,
- "Fortran surface type"> {
+def FortranTypeAttr : Attr<And<[CPred<"mlir::isa<mlir::TypeAttr>($_self)">,
+ Or<[CPred<"mlir::isa<fir::CharacterType, fir::ComplexType, "
+ "fir::IntegerType, fir::LogicalType, fir::RealType, "
+ "fir::RecordType>(mlir::cast<mlir::TypeAttr>($_self).getValue())"
+ >]>]>, "Fortran surface type"> {
let storageType = [{ ::mlir::TypeAttr }];
let returnType = "mlir::Type";
- let convertFromStorage = "$_self.getValue().cast<mlir::Type>()";
+ let convertFromStorage = "mlir::cast<mlir::Type>($_self.getValue())";
}
def fir_TypeDescOp : fir_OneResultOp<"type_desc", [NoMemoryEffect]> {
diff --git a/flang/include/flang/Optimizer/Dialect/FIRType.h b/flang/include/flang/Optimizer/Dialect/FIRType.h
index 7fcd9c1babf2..b4344435db9f 100644
--- a/flang/include/flang/Optimizer/Dialect/FIRType.h
+++ b/flang/include/flang/Optimizer/Dialect/FIRType.h
@@ -97,35 +97,36 @@ bool isa_fir_or_std_type(mlir::Type t);
/// Is `t` a FIR dialect type that implies a memory (de)reference?
inline bool isa_ref_type(mlir::Type t) {
- return t.isa<fir::ReferenceType, fir::PointerType, fir::HeapType,
- fir::LLVMPointerType>();
+ return mlir::isa<fir::ReferenceType, fir::PointerType, fir::HeapType,
+ fir::LLVMPointerType>(t);
}
/// Is `t` a boxed type?
inline bool isa_box_type(mlir::Type t) {
- return t.isa<fir::BaseBoxType, fir::BoxCharType, fir::BoxProcType>();
+ return mlir::isa<fir::BaseBoxType, fir::BoxCharType, fir::BoxProcType>(t);
}
/// Is `t` a type that is always trivially pass-by-reference? Specifically, this
/// is testing if `t` is a ReferenceType or any box type. Compare this to
/// conformsWithPassByRef(), which includes pointers and allocatables.
inline bool isa_passbyref_type(mlir::Type t) {
- return t.isa<fir::ReferenceType, mlir::FunctionType>() || isa_box_type(t);
+ return mlir::isa<fir::ReferenceType, mlir::FunctionType>(t) ||
+ isa_box_type(t);
}
/// Is `t` a type that can conform to be pass-by-reference? Depending on the
/// context, these types may simply demote to pass-by-reference or a reference
/// to them may have to be passed instead. Functions are always referent.
inline bool conformsWithPassByRef(mlir::Type t) {
- return isa_ref_type(t) || isa_box_type(t) || t.isa<mlir::FunctionType>();
+ return isa_ref_type(t) || isa_box_type(t) || mlir::isa<mlir::FunctionType>(t);
}
/// Is `t` a derived (record) type?
-inline bool isa_derived(mlir::Type t) { return t.isa<fir::RecordType>(); }
+inline bool isa_derived(mlir::Type t) { return mlir::isa<fir::RecordType>(t); }
/// Is `t` type(c_ptr) or type(c_funptr)?
inline bool isa_builtin_cptr_type(mlir::Type t) {
- if (auto recTy = t.dyn_cast_or_null<fir::RecordType>())
+ if (auto recTy = mlir::dyn_cast_or_null<fir::RecordType>(t))
return recTy.getName().ends_with("T__builtin_c_ptr") ||
recTy.getName().ends_with("T__builtin_c_funptr");
return false;
@@ -133,7 +134,7 @@ inline bool isa_builtin_cptr_type(mlir::Type t) {
/// Is `t` a FIR dialect aggregate type?
inline bool isa_aggregate(mlir::Type t) {
- return t.isa<SequenceType, mlir::TupleType>() || fir::isa_derived(t);
+ return mlir::isa<SequenceType, mlir::TupleType>(t) || fir::isa_derived(t);
}
/// Extract the `Type` pointed to from a FIR memory reference type. If `t` is
@@ -146,17 +147,17 @@ mlir::Type dyn_cast_ptrOrBoxEleTy(mlir::Type t);
/// Is `t` a FIR Real or MLIR Float type?
inline bool isa_real(mlir::Type t) {
- return t.isa<fir::RealType, mlir::FloatType>();
+ return mlir::isa<fir::RealType, mlir::FloatType>(t);
}
/// Is `t` an integral type?
inline bool isa_integer(mlir::Type t) {
- return t.isa<mlir::IndexType, mlir::IntegerType, fir::IntegerType>();
+ return mlir::isa<mlir::IndexType, mlir::IntegerType, fir::IntegerType>(t);
}
/// Is `t` a vector type?
inline bool isa_vector(mlir::Type t) {
- return t.isa<mlir::VectorType, fir::VectorType>();
+ return mlir::isa<mlir::VectorType, fir::VectorType>(t);
}
mlir::Type parseFirType(FIROpsDialect *, mlir::DialectAsmParser &parser);
@@ -169,22 +170,22 @@ void verifyIntegralType(mlir::Type type);
/// Is `t` a FIR or MLIR Complex type?
inline bool isa_complex(mlir::Type t) {
- return t.isa<fir::ComplexType, mlir::ComplexType>();
+ return mlir::isa<fir::ComplexType, mlir::ComplexType>(t);
}
/// Is `t` a CHARACTER type? Does not check the length.
-inline bool isa_char(mlir::Type t) { return t.isa<fir::CharacterType>(); }
+inline bool isa_char(mlir::Type t) { return mlir::isa<fir::CharacterType>(t); }
/// Is `t` a trivial intrinsic type? CHARACTER is <em>excluded</em> because it
/// is a dependent type.
inline bool isa_trivial(mlir::Type t) {
return isa_integer(t) || isa_real(t) || isa_complex(t) || isa_vector(t) ||
- t.isa<fir::LogicalType>();
+ mlir::isa<fir::LogicalType>(t);
}
/// Is `t` a CHARACTER type with a LEN other than 1?
inline bool isa_char_string(mlir::Type t) {
- if (auto ct = t.dyn_cast_or_null<fir::CharacterType>())
+ if (auto ct = mlir::dyn_cast_or_null<fir::CharacterType>(t))
return ct.getLen() != fir::CharacterType::singleton();
return false;
}
@@ -198,7 +199,7 @@ bool isa_unknown_size_box(mlir::Type t);
/// Returns true iff `t` is a fir.char type and has an unknown length.
inline bool characterWithDynamicLen(mlir::Type t) {
- if (auto charTy = t.dyn_cast<fir::CharacterType>())
+ if (auto charTy = mlir::dyn_cast<fir::CharacterType>(t))
return charTy.hasDynamicLen();
return false;
}
@@ -213,11 +214,11 @@ inline bool sequenceWithNonConstantShape(fir::SequenceType seqTy) {
bool hasDynamicSize(mlir::Type t);
inline unsigned getRankOfShapeType(mlir::Type t) {
- if (auto shTy = t.dyn_cast<fir::ShapeType>())
+ if (auto shTy = mlir::dyn_cast<fir::ShapeType>(t))
return shTy.getRank();
- if (auto shTy = t.dyn_cast<fir::ShapeShiftType>())
+ if (auto shTy = mlir::dyn_cast<fir::ShapeShiftType>(t))
return shTy.getRank();
- if (auto shTy = t.dyn_cast<fir::ShiftType>())
+ if (auto shTy = mlir::dyn_cast<fir::ShiftType>(t))
return shTy.getRank();
return 0;
}
@@ -225,14 +226,14 @@ inline unsigned getRankOfShapeType(mlir::Type t) {
/// Get the memory reference type of the data pointer from the box type,
inline mlir::Type boxMemRefType(fir::BaseBoxType t) {
auto eleTy = t.getEleTy();
- if (!eleTy.isa<fir::PointerType, fir::HeapType>())
+ if (!mlir::isa<fir::PointerType, fir::HeapType>(eleTy))
eleTy = fir::ReferenceType::get(t);
return eleTy;
}
/// If `t` is a SequenceType return its element type, otherwise return `t`.
inline mlir::Type unwrapSequenceType(mlir::Type t) {
- if (auto seqTy = t.dyn_cast<fir::SequenceType>())
+ if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(t))
return seqTy.getEleTy();
return t;
}
@@ -278,7 +279,7 @@ inline fir::SequenceType unwrapUntilSeqType(mlir::Type t) {
t = ty;
continue;
}
- if (auto seqTy = t.dyn_cast<fir::SequenceType>())
+ if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(t))
return seqTy;
return {};
}
@@ -287,8 +288,8 @@ inline fir::SequenceType unwrapUntilSeqType(mlir::Type t) {
/// Unwrap the referential and sequential outer types (if any). Returns
/// the element type if it is a fir::RecordType.
inline fir::RecordType unwrapIfDerived(fir::BaseBoxType boxTy) {
- return fir::unwrapSequenceType(fir::unwrapRefType(boxTy.getEleTy()))
- .template dyn_cast<fir::RecordType>();
+ return mlir::dyn_cast<fir::RecordType>(
+ fir::unwrapSequenceType(fir::unwrapRefType(boxTy.getEleTy())));
}
/// Return true iff `boxTy` wraps a fir::RecordType with length parameters
@@ -377,7 +378,7 @@ bool isRecordWithDescriptorMember(mlir::Type ty);
/// Return true iff `ty` is a RecordType with type parameters.
inline bool isRecordWithTypeParameters(mlir::Type ty) {
- if (auto recTy = ty.dyn_cast_or_null<fir::RecordType>())
+ if (auto recTy = mlir::dyn_cast_or_null<fir::RecordType>(ty))
return recTy.isDependentType();
return false;
}
@@ -401,14 +402,14 @@ mlir::Type fromRealTypeID(mlir::MLIRContext *context, llvm::Type::TypeID typeID,
int getTypeCode(mlir::Type ty, const KindMapping &kindMap);
inline bool BaseBoxType::classof(mlir::Type type) {
- return type.isa<fir::BoxType, fir::ClassType>();
+ return mlir::isa<fir::BoxType, fir::ClassType>(type);
}
/// Return true iff `ty` is none or fir.array<none>.
inline bool isNoneOrSeqNone(mlir::Type type) {
- if (auto seqTy = type.dyn_cast<fir::SequenceType>())
- return seqTy.getEleTy().isa<mlir::NoneType>();
- return type.isa<mlir::NoneType>();
+ if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(type))
+ return mlir::isa<mlir::NoneType>(seqTy.getEleTy());
+ return mlir::isa<mlir::NoneType>(type);
}
/// Return a fir.box<T> or fir.class<T> if the type is polymorphic. If the type
@@ -428,16 +429,16 @@ inline mlir::Type wrapInClassOrBoxType(mlir::Type eleTy,
/// !fir.array<2xf32> -> !fir.array<2xnone>
/// !fir.heap<!fir.array<2xf32>> -> !fir.heap<!fir.array<2xnone>>
inline mlir::Type updateTypeForUnlimitedPolymorphic(mlir::Type ty) {
- if (auto seqTy = ty.dyn_cast<fir::SequenceType>())
+ if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(ty))
return fir::SequenceType::get(
seqTy.getShape(), updateTypeForUnlimitedPolymorphic(seqTy.getEleTy()));
- if (auto heapTy = ty.dyn_cast<fir::HeapType>())
+ if (auto heapTy = mlir::dyn_cast<fir::HeapType>(ty))
return fir::HeapType::get(
updateTypeForUnlimitedPolymorphic(heapTy.getEleTy()));
- if (auto pointerTy = ty.dyn_cast<fir::PointerType>())
+ if (auto pointerTy = mlir::dyn_cast<fir::PointerType>(ty))
return fir::PointerType::get(
updateTypeForUnlimitedPolymorphic(pointerTy.getEleTy()));
- if (!ty.isa<mlir::NoneType, fir::RecordType>())
+ if (!mlir::isa<mlir::NoneType, fir::RecordType>(ty))
return mlir::NoneType::get(ty.getContext());
return ty;
}
@@ -451,18 +452,19 @@ mlir::Type changeElementType(mlir::Type type, mlir::Type newElementType,
/// Is `t` an address to fir.box or class type?
inline bool isBoxAddress(mlir::Type t) {
- return fir::isa_ref_type(t) && fir::unwrapRefType(t).isa<fir::BaseBoxType>();
+ return fir::isa_ref_type(t) &&
+ mlir::isa<fir::BaseBoxType>(fir::unwrapRefType(t));
}
/// Is `t` a fir.box or class address or value type?
inline bool isBoxAddressOrValue(mlir::Type t) {
- return fir::unwrapRefType(t).isa<fir::BaseBoxType>();
+ return mlir::isa<fir::BaseBoxType>(fir::unwrapRefType(t));
}
/// Is this a fir.boxproc address type?
inline bool isBoxProcAddressType(mlir::Type t) {
t = fir::dyn_cast_ptrEleTy(t);
- return t && t.isa<fir::BoxProcType>();
+ return t && mlir::isa<fir::BoxProcType>(t);
}
/// Return a string representation of `ty`.
diff --git a/flang/include/flang/Optimizer/Dialect/FIRTypes.td b/flang/include/flang/Optimizer/Dialect/FIRTypes.td
index 3b876e4642da..7378ed93944c 100644
--- a/flang/include/flang/Optimizer/Dialect/FIRTypes.td
+++ b/flang/include/flang/Optimizer/Dialect/FIRTypes.td
@@ -578,7 +578,7 @@ def fir_VoidType : FIR_Type<"Void", "void"> {
// Whether a type is a BaseBoxType
def IsBaseBoxTypePred
- : CPred<"$_self.isa<::fir::BaseBoxType>()">;
+ : CPred<"mlir::isa<::fir::BaseBoxType>($_self)">;
def fir_BaseBoxType : Type<IsBaseBoxTypePred, "fir.box or fir.class type">;
// Generalized FIR and standard dialect types representing intrinsic types
diff --git a/flang/include/flang/Optimizer/Dialect/FortranVariableInterface.td b/flang/include/flang/Optimizer/Dialect/FortranVariableInterface.td
index 6405afbf1bfb..3f78a93a2515 100644
--- a/flang/include/flang/Optimizer/Dialect/FortranVariableInterface.td
+++ b/flang/include/flang/Optimizer/Dialect/FortranVariableInterface.td
@@ -75,7 +75,7 @@ def fir_FortranVariableOpInterface : OpInterface<"FortranVariableOpInterface"> {
/// variable.
mlir::Type getElementOrSequenceType() {
mlir::Type type = fir::unwrapPassByRefType(fir::unwrapRefType(getBase().getType()));
- if (auto boxCharType = type.dyn_cast<fir::BoxCharType>())
+ if (auto boxCharType = mlir::dyn_cast<fir::BoxCharType>(type))
return boxCharType.getEleTy();
return type;
}
@@ -87,13 +87,13 @@ def fir_FortranVariableOpInterface : OpInterface<"FortranVariableOpInterface"> {
/// Is the variable an array?
bool isArray() {
- return getElementOrSequenceType().isa<fir::SequenceType>();
+ return mlir::isa<fir::SequenceType>(getElementOrSequenceType());
}
/// Return the rank of the entity if it is known at compile time.
std::optional<unsigned> getRank() {
if (auto sequenceType =
- getElementOrSequenceType().dyn_cast<fir::SequenceType>()) {
+ mlir::dyn_cast<fir::SequenceType>(getElementOrSequenceType())) {
if (sequenceType.hasUnknownShape())
return {};
return sequenceType.getDimension();
@@ -133,7 +133,7 @@ def fir_FortranVariableOpInterface : OpInterface<"FortranVariableOpInterface"> {
/// Is this a Fortran character variable?
bool isCharacter() {
- return getElementType().isa<fir::CharacterType>();
+ return mlir::isa<fir::CharacterType>(getElementType());
}
/// Is this a Fortran character variable with an explicit length?
@@ -149,7 +149,7 @@ def fir_FortranVariableOpInterface : OpInterface<"FortranVariableOpInterface"> {
/// Is this variable represented as a fir.box or fir.class value?
bool isBoxValue() {
- return getBase().getType().isa<fir::BaseBoxType>();
+ return mlir::isa<fir::BaseBoxType>(getBase().getType());
}
/// Is this variable represented as a fir.box or fir.class address?
diff --git a/flang/include/flang/Optimizer/HLFIR/HLFIRDialect.h b/flang/include/flang/Optimizer/HLFIR/HLFIRDialect.h
index aa68d0811c48..3830237f96f3 100644
--- a/flang/include/flang/Optimizer/HLFIR/HLFIRDialect.h
+++ b/flang/include/flang/Optimizer/HLFIR/HLFIRDialect.h
@@ -40,9 +40,9 @@ namespace hlfir {
inline mlir::Type getFortranElementType(mlir::Type type) {
type = fir::unwrapSequenceType(
fir::unwrapPassByRefType(fir::unwrapRefType(type)));
- if (auto exprType = type.dyn_cast<hlfir::ExprType>())
+ if (auto exprType = mlir::dyn_cast<hlfir::ExprType>(type))
return exprType.getEleTy();
- if (auto boxCharType = type.dyn_cast<fir::BoxCharType>())
+ if (auto boxCharType = mlir::dyn_cast<fir::BoxCharType>(type))
return boxCharType.getEleTy();
return type;
}
@@ -51,12 +51,12 @@ inline mlir::Type getFortranElementType(mlir::Type type) {
/// fir.array type. Otherwise, returns the Fortran element type of the entity.
inline mlir::Type getFortranElementOrSequenceType(mlir::Type type) {
type = fir::unwrapPassByRefType(fir::unwrapRefType(type));
- if (auto exprType = type.dyn_cast<hlfir::ExprType>()) {
+ if (auto exprType = mlir::dyn_cast<hlfir::ExprType>(type)) {
if (exprType.isArray())
return fir::SequenceType::get(exprType.getShape(), exprType.getEleTy());
return exprType.getEleTy();
}
- if (auto boxCharType = type.dyn_cast<fir::BoxCharType>())
+ if (auto boxCharType = mlir::dyn_cast<fir::BoxCharType>(type))
return boxCharType.getEleTy();
return type;
}
@@ -64,16 +64,16 @@ inline mlir::Type getFortranElementOrSequenceType(mlir::Type type) {
/// Is this a fir.box or fir.class address type?
inline bool isBoxAddressType(mlir::Type type) {
type = fir::dyn_cast_ptrEleTy(type);
- return type && type.isa<fir::BaseBoxType>();
+ return type && mlir::isa<fir::BaseBoxType>(type);
}
/// Is this a fir.box or fir.class address or value type?
inline bool isBoxAddressOrValueType(mlir::Type type) {
- return fir::unwrapRefType(type).isa<fir::BaseBoxType>();
+ return mlir::isa<fir::BaseBoxType>(fir::unwrapRefType(type));
}
inline bool isPolymorphicType(mlir::Type type) {
- if (auto exprType = type.dyn_cast<hlfir::ExprType>())
+ if (auto exprType = mlir::dyn_cast<hlfir::ExprType>(type))
return exprType.isPolymorphic();
return fir::isPolymorphicType(type);
}
@@ -81,14 +81,14 @@ inline bool isPolymorphicType(mlir::Type type) {
/// Is this an SSA value type for the value of a Fortran procedure
/// designator ?
inline bool isFortranProcedureValue(mlir::Type type) {
- return type.isa<fir::BoxProcType>() ||
- (type.isa<mlir::TupleType>() &&
+ return mlir::isa<fir::BoxProcType>(type) ||
+ (mlir::isa<mlir::TupleType>(type) &&
fir::isCharacterProcedureTuple(type, /*acceptRawFunc=*/false));
}
/// Is this an SSA value type for the value of a Fortran expression?
inline bool isFortranValueType(mlir::Type type) {
- return type.isa<hlfir::ExprType>() || fir::isa_trivial(type) ||
+ return mlir::isa<hlfir::ExprType>(type) || fir::isa_trivial(type) ||
isFortranProcedureValue(type);
}
diff --git a/flang/include/flang/Optimizer/Support/Utils.h b/flang/include/flang/Optimizer/Support/Utils.h
index 2b4fa50e0e42..2da6f24da40e 100644
--- a/flang/include/flang/Optimizer/Support/Utils.h
+++ b/flang/include/flang/Optimizer/Support/Utils.h
@@ -29,7 +29,9 @@
namespace fir {
/// Return the integer value of an arith::ConstantOp.
inline std::int64_t toInt(mlir::arith::ConstantOp cop) {
- return cop.getValue().cast<mlir::IntegerAttr>().getValue().getSExtValue();
+ return mlir::cast<mlir::IntegerAttr>(cop.getValue())
+ .getValue()
+ .getSExtValue();
}
// Reconstruct binding tables for dynamic dispatch.
diff --git a/flang/include/flang/Optimizer/Transforms/Passes.h b/flang/include/flang/Optimizer/Transforms/Passes.h
index fd7a4a3883c9..547fe742967a 100644
--- a/flang/include/flang/Optimizer/Transforms/Passes.h
+++ b/flang/include/flang/Optimizer/Transforms/Passes.h
@@ -47,6 +47,8 @@ namespace fir {
#define GEN_PASS_DECL_POLYMORPHICOPCONVERSION
#define GEN_PASS_DECL_OPENACCDATAOPERANDCONVERSION
#define GEN_PASS_DECL_ADDDEBUGINFO
+#define GEN_PASS_DECL_STACKARRAYS
+#define GEN_PASS_DECL_LOOPVERSIONING
#include "flang/Optimizer/Transforms/Passes.h.inc"
std::unique_ptr<mlir::Pass> createAffineDemotionPass();
@@ -57,15 +59,10 @@ std::unique_ptr<mlir::Pass>
createExternalNameConversionPass(bool appendUnderscore);
std::unique_ptr<mlir::Pass> createMemDataFlowOptPass();
std::unique_ptr<mlir::Pass> createPromoteToAffinePass();
-std::unique_ptr<mlir::Pass> createMemoryAllocationPass();
-std::unique_ptr<mlir::Pass> createStackArraysPass();
std::unique_ptr<mlir::Pass> createAliasTagsPass();
std::unique_ptr<mlir::Pass>
createAddDebugInfoPass(fir::AddDebugInfoOptions options = {});
-std::unique_ptr<mlir::Pass> createLoopVersioningPass();
-std::unique_ptr<mlir::Pass>
-createMemoryAllocationPass(bool dynOnHeap, std::size_t maxStackSize);
std::unique_ptr<mlir::Pass> createAnnotateConstantOperandsPass();
std::unique_ptr<mlir::Pass> createAlgebraicSimplificationPass();
std::unique_ptr<mlir::Pass>
diff --git a/flang/include/flang/Optimizer/Transforms/Passes.td b/flang/include/flang/Optimizer/Transforms/Passes.td
index c3d5c336af40..020b8a6b64a9 100644
--- a/flang/include/flang/Optimizer/Transforms/Passes.td
+++ b/flang/include/flang/Optimizer/Transforms/Passes.td
@@ -247,7 +247,6 @@ def MemoryAllocationOpt : Pass<"memory-allocation-opt", "mlir::func::FuncOp"> {
"std::size_t", /*default=*/"~static_cast<std::size_t>(0)",
"Set maximum number of elements of an array allocated on the stack.">
];
- let constructor = "::fir::createMemoryAllocationPass()";
}
def StackArrays : Pass<"stack-arrays", "mlir::ModuleOp"> {
@@ -257,7 +256,6 @@ def StackArrays : Pass<"stack-arrays", "mlir::ModuleOp"> {
allocations.
}];
let dependentDialects = [ "fir::FIROpsDialect" ];
- let constructor = "::fir::createStackArraysPass()";
}
def AddAliasTags : Pass<"fir-add-alias-tags", "mlir::ModuleOp"> {
@@ -321,7 +319,6 @@ def LoopVersioning : Pass<"loop-versioning", "mlir::func::FuncOp"> {
an array has an element-sized stride. The element-sized stride allows some
loops to be vectorized, as well as enabling other loop optimizations.
}];
- let constructor = "::fir::createLoopVersioningPass()";
let dependentDialects = [ "fir::FIROpsDialect" ];
}
diff --git a/flang/include/flang/Runtime/descriptor.h b/flang/include/flang/Runtime/descriptor.h
index 96d56d9b43a6..1b0b7e23ce6c 100644
--- a/flang/include/flang/Runtime/descriptor.h
+++ b/flang/include/flang/Runtime/descriptor.h
@@ -456,6 +456,7 @@ public:
assert(descriptor().rank() <= maxRank);
assert(descriptor().SizeInBytes() <= byteSize);
if (DescriptorAddendum * addendum{descriptor().Addendum()}) {
+ (void)addendum;
assert(hasAddendum);
assert(addendum->LenParameters() <= maxLengthTypeParameters);
} else {
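
The `(void)addendum` is presumably there to keep `NDEBUG` builds warning-free: once the `assert`s compile away, the pointer bound in the `if` initializer would otherwise go unreferenced in the body. The idiom in isolation:

```
#include <cassert>

void check(int *p) { // hypothetical
  (void)p;           // no-op use; silences -Wunused-variable under NDEBUG
  assert(p != nullptr);
}
```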
diff --git a/flang/include/flang/Tools/CLOptions.inc b/flang/include/flang/Tools/CLOptions.inc
index f24716333d9a..34af9f1c21f8 100644
--- a/flang/include/flang/Tools/CLOptions.inc
+++ b/flang/include/flang/Tools/CLOptions.inc
@@ -104,7 +104,7 @@ void addNestedPassToOps(mlir::PassManager &pm, PassConstructor ctor) {
void addNestedPassToAllTopLevelOperations(
mlir::PassManager &pm, PassConstructor ctor) {
addNestedPassToOps<mlir::func::FuncOp, mlir::omp::DeclareReductionOp,
- fir::GlobalOp>(pm, ctor);
+ mlir::omp::PrivateClauseOp, fir::GlobalOp>(pm, ctor);
}
void addNestedPassToAllTopLevelOperationsConditionally(mlir::PassManager &pm,
@@ -163,8 +163,8 @@ inline void addAVC(
inline void addMemoryAllocationOpt(mlir::PassManager &pm) {
addNestedPassConditionally<mlir::func::FuncOp>(pm, disableFirMao, [&]() {
- return fir::createMemoryAllocationPass(
- dynamicArrayStackToHeapAllocation, arrayStackAllocationThreshold);
+ return fir::createMemoryAllocationOpt(
+ {dynamicArrayStackToHeapAllocation, arrayStackAllocationThreshold});
});
}
@@ -253,12 +253,12 @@ inline void createDefaultFIROptimizerPassPipeline(
}
if (pc.LoopVersioning)
- pm.addPass(fir::createLoopVersioningPass());
+ pm.addPass(fir::createLoopVersioning());
pm.addPass(mlir::createCSEPass());
if (pc.StackArrays)
- pm.addPass(fir::createStackArraysPass());
+ pm.addPass(fir::createStackArrays());
else
fir::addMemoryAllocationOpt(pm);
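
With the `let constructor` lines removed from Passes.td above, these passes fall back to the constructors that MLIR tablegen generates, which the new GEN_PASS_DECL_STACKARRAYS/GEN_PASS_DECL_LOOPVERSIONING defines pull out of Passes.h.inc. A minimal sketch of the resulting call sites, assuming the standard tablegen naming scheme (create<PassName>(), plus a generated options struct for passes with options; buildPipeline and the option values are hypothetical):

#include "flang/Optimizer/Transforms/Passes.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Pass/PassManager.h"

void buildPipeline(mlir::PassManager &pm) {
  // Generated no-argument constructors replace the hand-written
  // createLoopVersioningPass()/createStackArraysPass() wrappers.
  pm.addPass(fir::createLoopVersioning());
  pm.addPass(fir::createStackArrays());
  // A pass with tablegen options takes a generated options struct; the
  // brace-init follows the order of the old hand-written signature
  // createMemoryAllocationPass(bool dynOnHeap, std::size_t maxStackSize).
  pm.addNestedPass<mlir::func::FuncOp>(fir::createMemoryAllocationOpt(
      {/*dynOnHeap=*/true, /*maxStackSize=*/512}));
}
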
diff --git a/flang/include/flang/Tools/PointerModels.h b/flang/include/flang/Tools/PointerModels.h
index 7acaf2f9fda5..c3c0977d6e54 100644
--- a/flang/include/flang/Tools/PointerModels.h
+++ b/flang/include/flang/Tools/PointerModels.h
@@ -20,7 +20,7 @@ struct OpenMPPointerLikeModel
: public mlir::omp::PointerLikeType::ExternalModel<
OpenMPPointerLikeModel<T>, T> {
mlir::Type getElementType(mlir::Type pointer) const {
- return pointer.cast<T>().getElementType();
+ return mlir::cast<T>(pointer).getElementType();
}
};
@@ -29,7 +29,7 @@ struct OpenACCPointerLikeModel
: public mlir::acc::PointerLikeType::ExternalModel<
OpenACCPointerLikeModel<T>, T> {
mlir::Type getElementType(mlir::Type pointer) const {
- return pointer.cast<T>().getElementType();
+ return mlir::cast<T>(pointer).getElementType();
}
};
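
Most of the hunks that follow apply a single mechanical rewrite: the member-function casts on MLIR handles (ty.isa<T>(), ty.cast<T>(), ty.dyn_cast<T>()) are replaced with the equivalent free functions, the direction the MLIR cast API has been moving. A minimal sketch of the three spellings on a plain mlir::Type (the helper functions are hypothetical; the cast API is standard MLIR):

#include "mlir/IR/BuiltinTypes.h"

// isa/cast pair: cast asserts on mismatch, so guard it with isa.
mlir::Type elementOrSelf(mlir::Type ty) {
  if (mlir::isa<mlir::ShapedType>(ty)) // was: ty.isa<mlir::ShapedType>()
    // was: ty.cast<mlir::ShapedType>().getElementType()
    return mlir::cast<mlir::ShapedType>(ty).getElementType();
  return ty;
}

// dyn_cast: returns a null handle instead of asserting when the cast fails.
bool hasStaticShape(mlir::Type ty) {
  // was: ty.dyn_cast<mlir::ShapedType>()
  if (auto shaped = mlir::dyn_cast<mlir::ShapedType>(ty))
    return shaped.hasStaticShape();
  return false;
}
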
diff --git a/flang/lib/Lower/Allocatable.cpp b/flang/lib/Lower/Allocatable.cpp
index 8e84ea2fc5d5..a1957c0eb1bb 100644
--- a/flang/lib/Lower/Allocatable.cpp
+++ b/flang/lib/Lower/Allocatable.cpp
@@ -162,7 +162,7 @@ static void genRuntimeInitCharacter(fir::FirOpBuilder &builder,
args.push_back(builder.createConvert(loc, inputTypes[0], box.getAddr()));
args.push_back(builder.createConvert(loc, inputTypes[1], len));
if (kind == 0)
- kind = box.getEleTy().cast<fir::CharacterType>().getFKind();
+ kind = mlir::cast<fir::CharacterType>(box.getEleTy()).getFKind();
args.push_back(builder.createIntegerConstant(loc, inputTypes[2], kind));
int rank = box.rank();
args.push_back(builder.createIntegerConstant(loc, inputTypes[3], rank));
@@ -879,7 +879,7 @@ void Fortran::lower::genDeallocateIfAllocated(
builder.genIfThen(loc, isAllocated)
.genThen([&]() {
if (mlir::Type eleType = box.getEleTy();
- eleType.isa<fir::RecordType>() && box.isPolymorphic()) {
+ mlir::isa<fir::RecordType>(eleType) && box.isPolymorphic()) {
mlir::Value declaredTypeDesc = builder.create<fir::TypeDescOp>(
loc, mlir::TypeAttr::get(eleType));
genDeallocateBox(converter, box, loc, sym, declaredTypeDesc);
@@ -918,7 +918,7 @@ void Fortran::lower::genDeallocateStmt(
mlir::Value declaredTypeDesc = {};
if (box.isPolymorphic()) {
mlir::Type eleType = box.getEleTy();
- if (eleType.isa<fir::RecordType>())
+ if (mlir::isa<fir::RecordType>(eleType))
if (const Fortran::semantics::DerivedTypeSpec *derivedTypeSpec =
symbol.GetType()->AsDerived()) {
declaredTypeDesc =
@@ -1007,7 +1007,7 @@ createMutableProperties(Fortran::lower::AbstractConverter &converter,
fir::MutableProperties mutableProperties;
std::string name = converter.mangleName(sym);
mlir::Type baseAddrTy = converter.genType(sym);
- if (auto boxType = baseAddrTy.dyn_cast<fir::BaseBoxType>())
+ if (auto boxType = mlir::dyn_cast<fir::BaseBoxType>(baseAddrTy))
baseAddrTy = boxType.getEleTy();
// Allocate and set a variable to hold the address.
// It will be set to null in setUnallocatedStatus.
@@ -1032,9 +1032,9 @@ createMutableProperties(Fortran::lower::AbstractConverter &converter,
mlir::Type eleTy = baseAddrTy;
if (auto newTy = fir::dyn_cast_ptrEleTy(eleTy))
eleTy = newTy;
- if (auto seqTy = eleTy.dyn_cast<fir::SequenceType>())
+ if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(eleTy))
eleTy = seqTy.getEleTy();
- if (auto record = eleTy.dyn_cast<fir::RecordType>())
+ if (auto record = mlir::dyn_cast<fir::RecordType>(eleTy))
if (record.getNumLenParams() != 0)
TODO(loc, "deferred length type parameters.");
if (fir::isa_char(eleTy) && nonDeferredParams.empty()) {
diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp
index f66607dfa22f..b42909eaaacc 100644
--- a/flang/lib/Lower/Bridge.cpp
+++ b/flang/lib/Lower/Bridge.cpp
@@ -36,6 +36,7 @@
#include "flang/Optimizer/Builder/Runtime/Character.h"
#include "flang/Optimizer/Builder/Runtime/Derived.h"
#include "flang/Optimizer/Builder/Runtime/EnvironmentDefaults.h"
+#include "flang/Optimizer/Builder/Runtime/Main.h"
#include "flang/Optimizer/Builder/Runtime/Ragged.h"
#include "flang/Optimizer/Builder/Runtime/Stop.h"
#include "flang/Optimizer/Builder/Todo.h"
@@ -359,8 +360,10 @@ public:
// not need to be generated even if no defaults are specified.
// However, generating main or changing when the runtime reads
// environment variables is required to do so.
- fir::runtime::genEnvironmentDefaults(*builder, toLocation(),
- bridge.getEnvironmentDefaults());
+ auto env = fir::runtime::genEnvironmentDefaults(
+ *builder, toLocation(), bridge.getEnvironmentDefaults());
+
+ fir::runtime::genMain(*builder, toLocation(), env);
});
finalizeOpenACCLowering();
@@ -683,7 +686,7 @@ public:
auto if_builder = builder->genIfThenElse(loc, isAllocated);
if_builder.genThen([&]() {
std::string name = mangleName(sym) + ".alloc";
- if (auto seqTy = symType.dyn_cast<fir::SequenceType>()) {
+ if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(symType)) {
fir::ExtendedValue read = fir::factory::genMutableBoxRead(
*builder, loc, box, /*mayBePolymorphic=*/false);
if (auto read_arr_box = read.getBoxOf<fir::ArrayBoxValue>()) {
@@ -1132,7 +1135,7 @@ private:
fir::ExtendedValue lhs = symBoxToExtendedValue(lhs_sb);
fir::ExtendedValue rhs = symBoxToExtendedValue(rhs_sb);
mlir::Type symType = genType(sym);
- if (auto seqTy = symType.dyn_cast<fir::SequenceType>()) {
+ if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(symType)) {
Fortran::lower::StatementContext stmtCtx;
Fortran::lower::createSomeArrayAssignment(*this, lhs, rhs, localSymbols,
stmtCtx);
@@ -1355,7 +1358,7 @@ private:
return;
}
mlir::Type selectorType = selector.getType();
- bool realSelector = selectorType.isa<mlir::FloatType>();
+ bool realSelector = mlir::isa<mlir::FloatType>(selectorType);
assert((inArithmeticIfContext || !realSelector) && "invalid selector type");
mlir::Value zero;
if (inArithmeticIfContext)
@@ -1630,7 +1633,7 @@ private:
stmtCtx);
stmtCtx.finalizeAndReset();
// Raise an exception if REAL expr is a NaN.
- if (expr.getType().isa<mlir::FloatType>())
+ if (mlir::isa<mlir::FloatType>(expr.getType()))
expr = builder->create<mlir::arith::AddFOp>(toLocation(), expr, expr);
// An empty valueList indicates to genMultiwayBranch that the branch is
// an ArithmeticIfStmt that has two branches on value 0 or 0.0.
@@ -2807,7 +2810,7 @@ private:
auto caseValue = valueList.begin();
auto caseBlock = blockList.begin();
for (mlir::Attribute attr : attrList) {
- if (attr.isa<mlir::UnitAttr>()) {
+ if (mlir::isa<mlir::UnitAttr>(attr)) {
genBranch(*caseBlock++);
break;
}
@@ -2825,7 +2828,7 @@ private:
rhsVal.second);
};
mlir::Block *newBlock = insertBlock(*caseBlock);
- if (attr.isa<fir::ClosedIntervalAttr>()) {
+ if (mlir::isa<fir::ClosedIntervalAttr>(attr)) {
mlir::Block *newBlock2 = insertBlock(*caseBlock);
mlir::Value cond =
genCond(*caseValue++, mlir::arith::CmpIPredicate::sge);
@@ -2838,12 +2841,12 @@ private:
continue;
}
mlir::arith::CmpIPredicate pred;
- if (attr.isa<fir::PointIntervalAttr>()) {
+ if (mlir::isa<fir::PointIntervalAttr>(attr)) {
pred = mlir::arith::CmpIPredicate::eq;
- } else if (attr.isa<fir::LowerBoundAttr>()) {
+ } else if (mlir::isa<fir::LowerBoundAttr>(attr)) {
pred = mlir::arith::CmpIPredicate::sge;
} else {
- assert(attr.isa<fir::UpperBoundAttr>() && "unexpected predicate");
+ assert(mlir::isa<fir::UpperBoundAttr>(attr) && "unexpected predicate");
pred = mlir::arith::CmpIPredicate::sle;
}
mlir::Value cond = genCond(*caseValue++, pred);
@@ -3105,7 +3108,7 @@ private:
bool isPointer = fir::isPointerType(baseTy);
bool isAllocatable = fir::isAllocatableType(baseTy);
bool isArray =
- fir::dyn_cast_ptrOrBoxEleTy(baseTy).isa<fir::SequenceType>();
+ mlir::isa<fir::SequenceType>(fir::dyn_cast_ptrOrBoxEleTy(baseTy));
const fir::BoxValue *selectorBox = selector.getBoxOf<fir::BoxValue>();
if (std::holds_alternative<Fortran::parser::Default>(guard.u)) {
// CLASS DEFAULT
@@ -3114,12 +3117,12 @@ private:
std::get_if<Fortran::parser::TypeSpec>(&guard.u)) {
// TYPE IS
fir::ExactTypeAttr attr =
- typeGuardAttr.dyn_cast<fir::ExactTypeAttr>();
+ mlir::dyn_cast<fir::ExactTypeAttr>(typeGuardAttr);
mlir::Value exactValue;
mlir::Type addrTy = attr.getType();
if (isArray) {
- auto seqTy = fir::dyn_cast_ptrOrBoxEleTy(baseTy)
- .dyn_cast<fir::SequenceType>();
+ auto seqTy = mlir::dyn_cast<fir::SequenceType>(
+ fir::dyn_cast_ptrOrBoxEleTy(baseTy));
addrTy = fir::SequenceType::get(seqTy.getShape(), attr.getType());
}
if (isPointer)
@@ -3141,7 +3144,7 @@ private:
addAssocEntitySymbol(selectorBox->clone(exact));
} else if (intrinsic->category() ==
Fortran::common::TypeCategory::Character) {
- auto charTy = attr.getType().dyn_cast<fir::CharacterType>();
+ auto charTy = mlir::dyn_cast<fir::CharacterType>(attr.getType());
mlir::Value charLen =
fir::factory::CharacterExprHelper(*builder, loc)
.readLengthFromBox(fir::getBase(selector), charTy);
@@ -3158,11 +3161,12 @@ private:
} else if (std::holds_alternative<Fortran::parser::DerivedTypeSpec>(
guard.u)) {
// CLASS IS
- fir::SubclassAttr attr = typeGuardAttr.dyn_cast<fir::SubclassAttr>();
+ fir::SubclassAttr attr =
+ mlir::dyn_cast<fir::SubclassAttr>(typeGuardAttr);
mlir::Type addrTy = attr.getType();
if (isArray) {
- auto seqTy = fir::dyn_cast_ptrOrBoxEleTy(baseTy)
- .dyn_cast<fir::SequenceType>();
+ auto seqTy = mlir::dyn_cast<fir::SequenceType>(
+ fir::dyn_cast_ptrOrBoxEleTy(baseTy));
addrTy = fir::SequenceType::get(seqTy.getShape(), attr.getType());
}
if (isPointer)
@@ -3806,16 +3810,34 @@ private:
return temps;
}
+ // Check if the insertion point is currently in a device context. HostDevice
+ // subprograms are not considered a fully device context, so this returns
+ // false for them.
+ static bool isDeviceContext(fir::FirOpBuilder &builder) {
+ if (builder.getRegion().getParentOfType<fir::CUDAKernelOp>())
+ return true;
+ if (auto funcOp =
+ builder.getRegion().getParentOfType<mlir::func::FuncOp>()) {
+ if (auto cudaProcAttr =
+ funcOp.getOperation()->getAttrOfType<fir::CUDAProcAttributeAttr>(
+ fir::getCUDAAttrName())) {
+ return cudaProcAttr.getValue() != fir::CUDAProcAttribute::Host &&
+ cudaProcAttr.getValue() != fir::CUDAProcAttribute::HostDevice;
+ }
+ }
+ return false;
+ }
+
void genDataAssignment(
const Fortran::evaluate::Assignment &assign,
const Fortran::evaluate::ProcedureRef *userDefinedAssignment) {
mlir::Location loc = getCurrentLocation();
fir::FirOpBuilder &builder = getFirOpBuilder();
- bool isInDeviceContext =
- builder.getRegion().getParentOfType<fir::CUDAKernelOp>();
- bool isCUDATransfer = Fortran::evaluate::HasCUDAAttrs(assign.lhs) ||
- Fortran::evaluate::HasCUDAAttrs(assign.rhs);
+ bool isInDeviceContext = isDeviceContext(builder);
+ bool isCUDATransfer = (Fortran::evaluate::HasCUDAAttrs(assign.lhs) ||
+ Fortran::evaluate::HasCUDAAttrs(assign.rhs)) &&
+ !isInDeviceContext;
bool hasCUDAImplicitTransfer =
Fortran::evaluate::HasCUDAImplicitTransfer(assign.rhs);
llvm::SmallVector<mlir::Value> implicitTemps;
@@ -3878,7 +3900,7 @@ private:
Fortran::lower::StatementContext localStmtCtx;
hlfir::Entity rhs = evaluateRhs(localStmtCtx);
hlfir::Entity lhs = evaluateLhs(localStmtCtx);
- if (isCUDATransfer && !hasCUDAImplicitTransfer && !isInDeviceContext)
+ if (isCUDATransfer && !hasCUDAImplicitTransfer)
genCUDADataTransfer(builder, loc, assign, lhs, rhs);
else
builder.create<hlfir::AssignOp>(loc, rhs, lhs,
@@ -4139,7 +4161,7 @@ private:
} else if (isDerivedCategory(lhsType->category())) {
// Handle parent component.
if (Fortran::lower::isParentComponent(assign.lhs)) {
- if (!fir::getBase(lhs).getType().isa<fir::BaseBoxType>())
+ if (!mlir::isa<fir::BaseBoxType>(fir::getBase(lhs).getType()))
lhs = fir::getBase(builder->createBox(loc, lhs));
lhs = Fortran::lower::updateBoxForParentComponent(*this, lhs,
assign.lhs);
@@ -5490,7 +5512,7 @@ Fortran::lower::LoweringBridge::LoweringBridge(
default:
break;
}
- if (!diag.getLocation().isa<mlir::UnknownLoc>())
+ if (!mlir::isa<mlir::UnknownLoc>(diag.getLocation()))
os << diag.getLocation() << ": ";
os << diag << '\n';
os.flush();
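
The behavioral consequence of the isDeviceContext refactor above is easier to read with the booleans pulled out of genDataAssignment: an assignment whose operands carry CUDA attributes no longer counts as a data transfer once lowering already sits in device code. A distilled sketch (the free function is hypothetical; the flag logic mirrors the patch):

// Mirrors the updated genDataAssignment logic: inside a device context a
// CUDA-attributed assignment lowers to a plain hlfir.assign rather than a
// host<->device transfer.
bool emitsCUDADataTransfer(bool operandsHaveCUDAAttrs, bool inDeviceContext,
                           bool hasImplicitTransfer) {
  bool isCUDATransfer = operandsHaveCUDAAttrs && !inDeviceContext;
  return isCUDATransfer && !hasImplicitTransfer;
}
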
diff --git a/flang/lib/Lower/CallInterface.cpp b/flang/lib/Lower/CallInterface.cpp
index 5ad244600328..c1f54ad39287 100644
--- a/flang/lib/Lower/CallInterface.cpp
+++ b/flang/lib/Lower/CallInterface.cpp
@@ -1182,7 +1182,7 @@ private:
Property prop = Property::BaseAddress;
if (isValueAttr) {
bool isBuiltinCptrType = fir::isa_builtin_cptr_type(type);
- if (isBindC || (!type.isa<fir::SequenceType>() &&
+ if (isBindC || (!mlir::isa<fir::SequenceType>(type) &&
!obj.attrs.test(Attrs::Optional) &&
(dynamicType.category() !=
Fortran::common::TypeCategory::Derived ||
@@ -1190,7 +1190,7 @@ private:
passBy = PassEntityBy::Value;
prop = Property::Value;
if (isBuiltinCptrType) {
- auto recTy = type.dyn_cast<fir::RecordType>();
+ auto recTy = mlir::dyn_cast<fir::RecordType>(type);
mlir::Type fieldTy = recTy.getTypeList()[0].second;
passType = fir::ReferenceType::get(fieldTy);
} else {
@@ -1714,7 +1714,7 @@ mlir::Type Fortran::lower::getDummyProcedureType(
}
bool Fortran::lower::isCPtrArgByValueType(mlir::Type ty) {
- return ty.isa<fir::ReferenceType>() &&
+ return mlir::isa<fir::ReferenceType>(ty) &&
fir::isa_integer(fir::unwrapRefType(ty));
}
diff --git a/flang/lib/Lower/ConvertArrayConstructor.cpp b/flang/lib/Lower/ConvertArrayConstructor.cpp
index 24aa9beba6bf..a5b5838fe6b6 100644
--- a/flang/lib/Lower/ConvertArrayConstructor.cpp
+++ b/flang/lib/Lower/ConvertArrayConstructor.cpp
@@ -336,7 +336,7 @@ public:
if (!extent)
extent = builder.createIntegerConstant(loc, builder.getIndexType(), 0);
if (missingLengthParameters) {
- if (declaredType.getEleTy().isa<fir::CharacterType>())
+ if (mlir::isa<fir::CharacterType>(declaredType.getEleTy()))
emboxLengths.push_back(builder.createIntegerConstant(
loc, builder.getCharacterLengthType(), 0));
else
@@ -357,7 +357,7 @@ public:
bool useSimplePushRuntime(hlfir::Entity value) {
return value.isScalar() &&
- !arrayConstructorElementType.isa<fir::CharacterType>() &&
+ !mlir::isa<fir::CharacterType>(arrayConstructorElementType) &&
!fir::isRecordWithAllocatableMember(arrayConstructorElementType) &&
!fir::isRecordWithTypeParameters(arrayConstructorElementType);
}
@@ -370,7 +370,7 @@ public:
auto [addrExv, cleanUp] = hlfir::convertToAddress(
loc, builder, value, arrayConstructorElementType);
mlir::Value addr = fir::getBase(addrExv);
- if (addr.getType().isa<fir::BaseBoxType>())
+ if (mlir::isa<fir::BaseBoxType>(addr.getType()))
addr = builder.create<fir::BoxAddrOp>(loc, addr);
fir::runtime::genPushArrayConstructorSimpleScalar(
loc, builder, arrayConstructorVector, addr);
@@ -564,7 +564,7 @@ struct LengthAndTypeCollector<Character<Kind>> {
/// lowering an ac-value and must be delayed?
static bool missingLengthParameters(mlir::Type elementType,
llvm::ArrayRef<mlir::Value> lengths) {
- return (elementType.isa<fir::CharacterType>() ||
+ return (mlir::isa<fir::CharacterType>(elementType) ||
fir::isRecordWithTypeParameters(elementType)) &&
lengths.empty();
}
@@ -702,7 +702,8 @@ static ArrayCtorLoweringStrategy selectArrayCtorLoweringStrategy(
// Based on what was gathered and the result of the analysis, select and
// instantiate the right lowering strategy for the array constructor.
if (!extent || needToEvaluateOneExprToGetLengthParameters ||
- analysis.anyArrayExpr || declaredType.getEleTy().isa<fir::RecordType>())
+ analysis.anyArrayExpr ||
+ mlir::isa<fir::RecordType>(declaredType.getEleTy()))
return RuntimeTempStrategy(
loc, builder, stmtCtx, symMap, declaredType,
extent ? std::optional<mlir::Value>(extent) : std::nullopt, lengths,
diff --git a/flang/lib/Lower/ConvertCall.cpp b/flang/lib/Lower/ConvertCall.cpp
index c6f7d3410ad5..e4a0cc8d4730 100644
--- a/flang/lib/Lower/ConvertCall.cpp
+++ b/flang/lib/Lower/ConvertCall.cpp
@@ -49,15 +49,15 @@ static fir::ExtendedValue toExtendedValue(mlir::Location loc, mlir::Value base,
llvm::ArrayRef<mlir::Value> extents,
llvm::ArrayRef<mlir::Value> lengths) {
mlir::Type type = base.getType();
- if (type.isa<fir::BaseBoxType>())
+ if (mlir::isa<fir::BaseBoxType>(type))
return fir::BoxValue(base, /*lbounds=*/{}, lengths, extents);
type = fir::unwrapRefType(type);
- if (type.isa<fir::BaseBoxType>())
+ if (mlir::isa<fir::BaseBoxType>(type))
return fir::MutableBoxValue(base, lengths, /*mutableProperties*/ {});
- if (auto seqTy = type.dyn_cast<fir::SequenceType>()) {
+ if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(type)) {
if (seqTy.getDimension() != extents.size())
fir::emitFatalError(loc, "incorrect number of extents for array");
- if (seqTy.getEleTy().isa<fir::CharacterType>()) {
+ if (mlir::isa<fir::CharacterType>(seqTy.getEleTy())) {
if (lengths.empty())
fir::emitFatalError(loc, "missing length for character");
assert(lengths.size() == 1);
@@ -65,7 +65,7 @@ static fir::ExtendedValue toExtendedValue(mlir::Location loc, mlir::Value base,
}
return fir::ArrayBoxValue(base, extents);
}
- if (type.isa<fir::CharacterType>()) {
+ if (mlir::isa<fir::CharacterType>(type)) {
if (lengths.empty())
fir::emitFatalError(loc, "missing length for character");
assert(lengths.size() == 1);
@@ -193,7 +193,7 @@ static mlir::Value remapActualToDummyDescriptor(
llvm::SmallVector<mlir::Value> lengths;
mlir::Type dummyBoxType = caller.getDummyArgumentType(arg);
mlir::Type dummyBaseType = fir::unwrapPassByRefType(dummyBoxType);
- if (dummyBaseType.isa<fir::SequenceType>())
+ if (mlir::isa<fir::SequenceType>(dummyBaseType))
caller.walkDummyArgumentExtents(
arg, [&](const Fortran::lower::SomeExpr &e, bool isAssumedSizeExtent) {
extents.emplace_back(lowerSpecExpr(e, isAssumedSizeExtent));
@@ -338,7 +338,7 @@ std::pair<fir::ExtendedValue, bool> Fortran::lower::genCallOpAndResult(
if (!caller.callerAllocateResult())
return {};
mlir::Type type = caller.getResultStorageType();
- if (type.isa<fir::SequenceType>())
+ if (mlir::isa<fir::SequenceType>(type))
caller.walkResultExtents(
[&](const Fortran::lower::SomeExpr &e, bool isAssumedSizeExtent) {
assert(!isAssumedSizeExtent && "result cannot be assumed-size");
@@ -353,7 +353,7 @@ std::pair<fir::ExtendedValue, bool> Fortran::lower::genCallOpAndResult(
// Result length parameters should not be provided to box storage
// allocation and save_results, but they are still useful information to
// keep in the ExtendedValue if non-deferred.
- if (!type.isa<fir::BoxType>()) {
+ if (!mlir::isa<fir::BoxType>(type)) {
if (fir::isa_char(fir::unwrapSequenceType(type)) && lengths.empty()) {
// Calling an assumed length function. This is only possible if this
// is a call to a character dummy procedure.
@@ -478,7 +478,7 @@ std::pair<fir::ExtendedValue, bool> Fortran::lower::genCallOpAndResult(
// FIR.
if (funcPointer) {
operands.push_back(
- funcPointer.getType().isa<fir::BoxProcType>()
+ mlir::isa<fir::BoxProcType>(funcPointer.getType())
? builder.create<fir::BoxAddrOp>(loc, funcType, funcPointer)
: builder.createConvert(loc, funcType, funcPointer));
}
@@ -492,8 +492,8 @@ std::pair<fir::ExtendedValue, bool> Fortran::lower::genCallOpAndResult(
// arguments of any type and vice versa.
mlir::Value cast;
auto *context = builder.getContext();
- if (snd.isa<fir::BoxProcType>() &&
- fst.getType().isa<mlir::FunctionType>()) {
+ if (mlir::isa<fir::BoxProcType>(snd) &&
+ mlir::isa<mlir::FunctionType>(fst.getType())) {
auto funcTy =
mlir::FunctionType::get(context, std::nullopt, std::nullopt);
auto boxProcTy = builder.getBoxProcType(funcTy);
@@ -734,9 +734,9 @@ std::pair<fir::ExtendedValue, bool> Fortran::lower::genCallOpAndResult(
// Call a BIND(C) function that return a char.
if (caller.characterize().IsBindC() &&
- funcType.getResults()[0].isa<fir::CharacterType>()) {
+ mlir::isa<fir::CharacterType>(funcType.getResults()[0])) {
fir::CharacterType charTy =
- funcType.getResults()[0].dyn_cast<fir::CharacterType>();
+ mlir::dyn_cast<fir::CharacterType>(funcType.getResults()[0]);
mlir::Value len = builder.createIntegerConstant(
loc, builder.getCharacterLengthType(), charTy.getLen());
return {fir::CharBoxValue{callResult, len}, /*resultIsFinalized=*/false};
@@ -890,7 +890,7 @@ extendedValueToHlfirEntity(mlir::Location loc, fir::FirOpBuilder &builder,
mlir::Type firBaseTy = firBase.getType();
if (fir::isa_trivial(firBaseTy))
return hlfir::EntityWithAttributes{firBase};
- if (auto charTy = firBase.getType().dyn_cast<fir::CharacterType>()) {
+ if (auto charTy = mlir::dyn_cast<fir::CharacterType>(firBase.getType())) {
// CHAR() intrinsic and BIND(C) procedures returning CHARACTER(1)
// are lowered to a fir.char<kind,1> that is not in memory.
// This tends to cause a lot of bugs because the rest of the
@@ -1061,7 +1061,7 @@ static hlfir::Entity fixProcedureDummyMismatch(mlir::Location loc,
fir::FirOpBuilder &builder,
hlfir::Entity actual,
mlir::Type dummyType) {
- if (actual.getType().isa<fir::BoxProcType>() &&
+ if (mlir::isa<fir::BoxProcType>(actual.getType()) &&
fir::isCharacterProcedureTuple(dummyType)) {
mlir::Value length =
builder.create<fir::UndefOp>(loc, builder.getCharacterLengthType());
@@ -1070,7 +1070,7 @@ static hlfir::Entity fixProcedureDummyMismatch(mlir::Location loc,
return hlfir::Entity{tuple};
}
assert(fir::isCharacterProcedureTuple(actual.getType()) &&
- dummyType.isa<fir::BoxProcType>() &&
+ mlir::isa<fir::BoxProcType>(dummyType) &&
"unsupported dummy procedure mismatch with the actual argument");
mlir::Value boxProc = fir::factory::extractCharacterProcedureTuple(
builder, loc, actual, /*openBoxProc=*/false)
@@ -1143,7 +1143,7 @@ static PreparedDummyArgument preparePresentUserCallActualArgument(
assert(actual.isProcedure());
// Do nothing if this is a procedure argument. It is already a
// fir.boxproc/fir.tuple<fir.boxproc, len> as it should.
- if (!actual.getType().isa<fir::BoxProcType>() &&
+ if (!mlir::isa<fir::BoxProcType>(actual.getType()) &&
actual.getType() != dummyType)
// The actual argument may be a procedure that returns character (a
// fir.tuple<fir.boxproc, len>) while the dummy is not. Extract the tuple
@@ -1164,7 +1164,7 @@ static PreparedDummyArgument preparePresentUserCallActualArgument(
// dynamic type matters to determine the contiguity.
const bool mustSetDynamicTypeToDummyType =
passingPolymorphicToNonPolymorphic &&
- (actual.isArray() || dummyType.isa<fir::BaseBoxType>());
+ (actual.isArray() || mlir::isa<fir::BaseBoxType>(dummyType));
// The simple contiguity of the actual is "lost" when passing a polymorphic
// to a non polymorphic entity because the dummy dynamic type matters for
@@ -1236,7 +1236,7 @@ static PreparedDummyArgument preparePresentUserCallActualArgument(
preparedDummy.pushExprAssociateCleanUp(associate);
} else if (mustDoCopyInOut) {
// Copy-in non contiguous variables.
- assert(entity.getType().isa<fir::BaseBoxType>() &&
+ assert(mlir::isa<fir::BaseBoxType>(entity.getType()) &&
"expect non simply contiguous variables to be boxes");
if (actualIsAssumedRank)
TODO(loc, "copy-in and copy-out of assumed-rank arguments");
@@ -1294,13 +1294,14 @@ static PreparedDummyArgument preparePresentUserCallActualArgument(
// Step 3: now that the dummy argument storage has been prepared, package
// it according to the interface.
mlir::Value addr;
- if (dummyTypeWithActualRank.isa<fir::BoxCharType>()) {
+ if (mlir::isa<fir::BoxCharType>(dummyTypeWithActualRank)) {
addr = hlfir::genVariableBoxChar(loc, builder, entity);
- } else if (dummyTypeWithActualRank.isa<fir::BaseBoxType>()) {
+ } else if (mlir::isa<fir::BaseBoxType>(dummyTypeWithActualRank)) {
entity = hlfir::genVariableBox(loc, builder, entity);
// Ensures the box has the right attributes and that it holds an
// addendum if needed.
- fir::BaseBoxType actualBoxType = entity.getType().cast<fir::BaseBoxType>();
+ fir::BaseBoxType actualBoxType =
+ mlir::cast<fir::BaseBoxType>(entity.getType());
mlir::Type boxEleType = actualBoxType.getEleTy();
// For now, assume it is not OK to pass the allocatable/pointer
// descriptor to a non pointer/allocatable dummy. That is a strict
@@ -1567,7 +1568,7 @@ genUserCall(Fortran::lower::PreparedActualArguments &loweredActuals,
// callee side, and it is illegal to use NULL without a MOLD if any
// dummy length parameters are assumed.
mlir::Type boxTy = fir::dyn_cast_ptrEleTy(argTy);
- assert(boxTy && boxTy.isa<fir::BaseBoxType>() &&
+ assert(boxTy && mlir::isa<fir::BaseBoxType>(boxTy) &&
"must be a fir.box type");
mlir::Value boxStorage =
fir::factory::genNullBoxStorage(builder, loc, boxTy);
@@ -1635,7 +1636,8 @@ genUserCall(Fortran::lower::PreparedActualArguments &loweredActuals,
caller, callSiteType, callContext.resultType,
callContext.isElementalProcWithArrayArgs());
// For procedure pointer function result, just return the call.
- if (callContext.resultType && callContext.resultType->isa<fir::BoxProcType>())
+ if (callContext.resultType &&
+ mlir::isa<fir::BoxProcType>(*callContext.resultType))
return hlfir::EntityWithAttributes(fir::getBase(result));
/// Clean-up associations and copy-in.
@@ -2115,9 +2117,9 @@ public:
hlfir::getFortranElementType(*callContext.resultType);
// Get result length parameters.
llvm::SmallVector<mlir::Value> typeParams;
- if (elementType.isa<fir::CharacterType>() ||
+ if (mlir::isa<fir::CharacterType>(elementType) ||
fir::isRecordWithTypeParameters(elementType)) {
- auto charType = elementType.dyn_cast<fir::CharacterType>();
+ auto charType = mlir::dyn_cast<fir::CharacterType>(elementType);
if (charType && charType.hasConstantLen())
typeParams.push_back(builder.createIntegerConstant(
loc, builder.getIndexType(), charType.getLen()));
@@ -2523,7 +2525,7 @@ genIntrinsicRef(const Fortran::evaluate::SpecificIntrinsic *intrinsic,
}
std::optional<hlfir::EntityWithAttributes> result = genHLFIRIntrinsicRefCore(
loweredActuals, intrinsic, argLowering, callContext);
- if (result && result->getType().isa<hlfir::ExprType>()) {
+ if (result && mlir::isa<hlfir::ExprType>(result->getType())) {
fir::FirOpBuilder *bldr = &callContext.getBuilder();
callContext.stmtCtx.attachCleanup(
[=]() { bldr->create<hlfir::DestroyOp>(loc, *result); });
diff --git a/flang/lib/Lower/ConvertConstant.cpp b/flang/lib/Lower/ConvertConstant.cpp
index ed389bbe4ae5..653e874a969c 100644
--- a/flang/lib/Lower/ConvertConstant.cpp
+++ b/flang/lib/Lower/ConvertConstant.cpp
@@ -184,8 +184,8 @@ private:
if (!attributeElementType || attributes.empty())
return {};
- assert(symTy.isa<fir::SequenceType>() && "expecting an array global");
- auto arrTy = symTy.cast<fir::SequenceType>();
+ assert(mlir::isa<fir::SequenceType>(symTy) && "expecting an array global");
+ auto arrTy = mlir::cast<fir::SequenceType>(symTy);
llvm::SmallVector<int64_t> tensorShape(arrTy.getShape());
std::reverse(tensorShape.begin(), tensorShape.end());
auto tensorTy =
@@ -423,14 +423,14 @@ static mlir::Value genStructureComponentInit(
// address field, which ought to be an intptr_t on the target.
mlir::Value addr = fir::getBase(
Fortran::lower::genExtAddrInInitializer(converter, loc, expr));
- if (addr.getType().isa<fir::BoxProcType>())
+ if (mlir::isa<fir::BoxProcType>(addr.getType()))
addr = builder.create<fir::BoxAddrOp>(loc, addr);
assert((fir::isa_ref_type(addr.getType()) ||
- addr.getType().isa<mlir::FunctionType>()) &&
+ mlir::isa<mlir::FunctionType>(addr.getType())) &&
"expect reference type for address field");
assert(fir::isa_derived(componentTy) &&
"expect C_PTR, C_FUNPTR to be a record");
- auto cPtrRecTy = componentTy.cast<fir::RecordType>();
+ auto cPtrRecTy = mlir::cast<fir::RecordType>(componentTy);
llvm::StringRef addrFieldName = Fortran::lower::builtin::cptrFieldName;
mlir::Type addrFieldTy = cPtrRecTy.getType(addrFieldName);
auto addrField = builder.create<fir::FieldIndexOp>(
@@ -460,7 +460,7 @@ static mlir::Value genInlinedStructureCtorLitImpl(
Fortran::lower::AbstractConverter &converter, mlir::Location loc,
const Fortran::evaluate::StructureConstructor &ctor, mlir::Type type) {
fir::FirOpBuilder &builder = converter.getFirOpBuilder();
- auto recTy = type.cast<fir::RecordType>();
+ auto recTy = mlir::cast<fir::RecordType>(type);
if (!converter.getLoweringOptions().getLowerToHighLevelFIR()) {
mlir::Value res = builder.create<fir::UndefOp>(loc, recTy);
@@ -587,7 +587,7 @@ genInlinedArrayLit(Fortran::lower::AbstractConverter &converter,
} while (con.IncrementSubscripts(subscripts));
} else if constexpr (T::category == Fortran::common::TypeCategory::Derived) {
do {
- mlir::Type eleTy = arrayTy.cast<fir::SequenceType>().getEleTy();
+ mlir::Type eleTy = mlir::cast<fir::SequenceType>(arrayTy).getEleTy();
mlir::Value elementVal =
genScalarLit(converter, loc, con.At(subscripts), eleTy,
/*outlineInReadOnlyMemory=*/false);
@@ -597,7 +597,7 @@ genInlinedArrayLit(Fortran::lower::AbstractConverter &converter,
} else {
llvm::SmallVector<mlir::Attribute> rangeStartIdx;
uint64_t rangeSize = 0;
- mlir::Type eleTy = arrayTy.cast<fir::SequenceType>().getEleTy();
+ mlir::Type eleTy = mlir::cast<fir::SequenceType>(arrayTy).getEleTy();
do {
auto getElementVal = [&]() {
return builder.createConvert(loc, eleTy,
@@ -620,12 +620,11 @@ genInlinedArrayLit(Fortran::lower::AbstractConverter &converter,
llvm::SmallVector<int64_t> rangeBounds;
llvm::SmallVector<mlir::Attribute> idx = createIdx();
for (size_t i = 0; i < idx.size(); ++i) {
- rangeBounds.push_back(rangeStartIdx[i]
- .cast<mlir::IntegerAttr>()
+ rangeBounds.push_back(mlir::cast<mlir::IntegerAttr>(rangeStartIdx[i])
.getValue()
.getSExtValue());
rangeBounds.push_back(
- idx[i].cast<mlir::IntegerAttr>().getValue().getSExtValue());
+ mlir::cast<mlir::IntegerAttr>(idx[i]).getValue().getSExtValue());
}
array = builder.create<fir::InsertOnRangeOp>(
loc, arrayTy, array, getElementVal(),
@@ -647,7 +646,7 @@ genOutlineArrayLit(Fortran::lower::AbstractConverter &converter,
mlir::Location loc, mlir::Type arrayTy,
const Fortran::evaluate::Constant<T> &constant) {
fir::FirOpBuilder &builder = converter.getFirOpBuilder();
- mlir::Type eleTy = arrayTy.cast<fir::SequenceType>().getEleTy();
+ mlir::Type eleTy = mlir::cast<fir::SequenceType>(arrayTy).getEleTy();
llvm::StringRef globalName = converter.getUniqueLitName(
loc, std::make_unique<Fortran::lower::SomeExpr>(toEvExpr(constant)),
eleTy);
diff --git a/flang/lib/Lower/ConvertExpr.cpp b/flang/lib/Lower/ConvertExpr.cpp
index fb7807718ff8..9567685aa3d2 100644
--- a/flang/lib/Lower/ConvertExpr.cpp
+++ b/flang/lib/Lower/ConvertExpr.cpp
@@ -267,7 +267,7 @@ arrayLoadExtValue(fir::FirOpBuilder &builder, mlir::Location loc,
mlir::Type ty = fir::applyPathToType(arrTy, path);
if (!ty)
fir::emitFatalError(loc, "path does not apply to type");
- if (!ty.isa<fir::SequenceType>()) {
+ if (!mlir::isa<fir::SequenceType>(ty)) {
if (fir::isa_char(ty)) {
mlir::Value len = newLen;
if (!len)
@@ -282,7 +282,7 @@ arrayLoadExtValue(fir::FirOpBuilder &builder, mlir::Location loc,
}
return newBase;
}
- arrTy = ty.cast<fir::SequenceType>();
+ arrTy = mlir::cast<fir::SequenceType>(ty);
}
auto arrayToExtendedValue =
@@ -412,15 +412,15 @@ static fir::ExtendedValue genLoad(fir::FirOpBuilder &builder,
return addr.match(
[](const fir::CharBoxValue &box) -> fir::ExtendedValue { return box; },
[&](const fir::PolymorphicValue &p) -> fir::ExtendedValue {
- if (fir::unwrapRefType(fir::getBase(p).getType())
- .isa<fir::RecordType>())
+ if (mlir::isa<fir::RecordType>(
+ fir::unwrapRefType(fir::getBase(p).getType())))
return p;
mlir::Value load = builder.create<fir::LoadOp>(loc, fir::getBase(p));
return fir::PolymorphicValue(load, p.getSourceBox());
},
[&](const fir::UnboxedValue &v) -> fir::ExtendedValue {
- if (fir::unwrapRefType(fir::getBase(v).getType())
- .isa<fir::RecordType>())
+ if (mlir::isa<fir::RecordType>(
+ fir::unwrapRefType(fir::getBase(v).getType())))
return v;
return builder.create<fir::LoadOp>(loc, fir::getBase(v));
},
@@ -536,8 +536,8 @@ static mlir::Value
createBoxProcCharTuple(Fortran::lower::AbstractConverter &converter,
mlir::Type argTy, mlir::Value funcAddr,
mlir::Value charLen) {
- auto boxTy =
- argTy.cast<mlir::TupleType>().getType(0).cast<fir::BoxProcType>();
+ auto boxTy = mlir::cast<fir::BoxProcType>(
+ mlir::cast<mlir::TupleType>(argTy).getType(0));
mlir::Location loc = converter.getCurrentLocation();
auto &builder = converter.getFirOpBuilder();
@@ -549,7 +549,7 @@ createBoxProcCharTuple(Fortran::lower::AbstractConverter &converter,
mlir::Type toTy = boxTy.getEleTy();
if (fir::isa_ref_type(fromTy))
funcAddr = builder.createConvert(loc, toTy, funcAddr);
- else if (fromTy.isa<fir::BoxProcType>())
+ else if (mlir::isa<fir::BoxProcType>(fromTy))
funcAddr = builder.create<fir::BoxAddrOp>(loc, toTy, funcAddr);
auto boxProc = [&]() -> mlir::Value {
@@ -575,7 +575,7 @@ absentBoxToUnallocatedBox(fir::FirOpBuilder &builder, mlir::Location loc,
mlir::Value isPresent) {
mlir::Value box = fir::getBase(exv);
mlir::Type boxType = box.getType();
- assert(boxType.isa<fir::BoxType>() && "argument must be a fir.box");
+ assert(mlir::isa<fir::BoxType>(boxType) && "argument must be a fir.box");
mlir::Value emptyBox =
fir::factory::createUnallocatedBox(builder, loc, boxType, std::nullopt);
auto safeToReadBox =
@@ -915,7 +915,7 @@ public:
if (inInitializer)
return Fortran::lower::genInlinedStructureCtorLit(converter, loc, ctor);
mlir::Type ty = translateSomeExprToFIRType(converter, toEvExpr(ctor));
- auto recTy = ty.cast<fir::RecordType>();
+ auto recTy = mlir::cast<fir::RecordType>(ty);
auto fieldTy = fir::FieldType::get(ty.getContext());
mlir::Value res = builder.createTemporary(loc, recTy);
mlir::Value box = builder.createBox(loc, fir::ExtendedValue{res});
@@ -1172,8 +1172,8 @@ public:
if (!charBox)
fir::emitFatalError(loc, "expected scalar character");
mlir::Value charAddr = charBox->getAddr();
- auto charType =
- fir::unwrapPassByRefType(charAddr.getType()).cast<fir::CharacterType>();
+ auto charType = mlir::cast<fir::CharacterType>(
+ fir::unwrapPassByRefType(charAddr.getType()));
if (charType.hasConstantLen()) {
// Erase previous constant length from the base type.
fir::CharacterType::LenType newLen = fir::CharacterType::unknownLen();
@@ -1441,7 +1441,7 @@ public:
auto fldTy = fir::FieldType::get(&converter.getMLIRContext());
// FIXME: need to thread the LEN type parameters here.
for (const Fortran::evaluate::Component *field : list) {
- auto recTy = ty.cast<fir::RecordType>();
+ auto recTy = mlir::cast<fir::RecordType>(ty);
const Fortran::semantics::Symbol &sym = getLastSym(*field);
std::string name = converter.getRecordTypeFieldName(sym);
coorArgs.push_back(builder.create<fir::FieldIndexOp>(
@@ -1478,7 +1478,7 @@ public:
mlir::Type genSubType(mlir::Type arrTy, unsigned dims) {
mlir::Type unwrapTy = fir::dyn_cast_ptrOrBoxEleTy(arrTy);
assert(unwrapTy && "must be a pointer or box type");
- auto seqTy = unwrapTy.cast<fir::SequenceType>();
+ auto seqTy = mlir::cast<fir::SequenceType>(unwrapTy);
llvm::ArrayRef<int64_t> shape = seqTy.getShape();
assert(shape.size() > 0 && "removing columns for sequence sans shape");
assert(dims <= shape.size() && "removing more columns than exist");
@@ -1550,9 +1550,9 @@ public:
}
mlir::Type eleTy = fir::dyn_cast_ptrOrBoxEleTy(base.getType());
- if (auto classTy = eleTy.dyn_cast<fir::ClassType>())
+ if (auto classTy = mlir::dyn_cast<fir::ClassType>(eleTy))
eleTy = classTy.getEleTy();
- auto seqTy = eleTy.cast<fir::SequenceType>();
+ auto seqTy = mlir::cast<fir::SequenceType>(eleTy);
assert(args.size() == seqTy.getDimension());
mlir::Type ty = builder.getRefType(seqTy.getEleTy());
auto addr = builder.create<fir::CoordinateOp>(loc, ty, base, args);
@@ -1571,7 +1571,7 @@ public:
mlir::Location loc = getLoc();
mlir::Value addr = fir::getBase(array);
mlir::Type arrTy = fir::dyn_cast_ptrEleTy(addr.getType());
- auto eleTy = arrTy.cast<fir::SequenceType>().getEleTy();
+ auto eleTy = mlir::cast<fir::SequenceType>(arrTy).getEleTy();
mlir::Type seqTy = builder.getRefType(builder.getVarLenSeqTy(eleTy));
mlir::Type refTy = builder.getRefType(eleTy);
mlir::Value base = builder.createConvert(loc, seqTy, addr);
@@ -1656,7 +1656,7 @@ public:
mlir::Location loc = getLoc();
mlir::Value addr = fir::getBase(exv);
mlir::Type arrTy = fir::dyn_cast_ptrOrBoxEleTy(addr.getType());
- mlir::Type eleTy = arrTy.cast<fir::SequenceType>().getEleTy();
+ mlir::Type eleTy = mlir::cast<fir::SequenceType>(arrTy).getEleTy();
mlir::Type refTy = builder.getRefType(eleTy);
mlir::IndexType idxTy = builder.getIndexType();
llvm::SmallVector<mlir::Value> arrayCoorArgs;
@@ -1766,8 +1766,9 @@ public:
mlir::Location loc = getLoc();
ExtValue exv = genBoxArg(expr);
auto exvTy = fir::getBase(exv).getType();
- if (exvTy.isa<mlir::FunctionType>()) {
- auto boxProcTy = builder.getBoxProcType(exvTy.cast<mlir::FunctionType>());
+ if (mlir::isa<mlir::FunctionType>(exvTy)) {
+ auto boxProcTy =
+ builder.getBoxProcType(mlir::cast<mlir::FunctionType>(exvTy));
return builder.create<fir::EmboxProcOp>(loc, boxProcTy,
fir::getBase(exv));
}
@@ -1861,7 +1862,7 @@ public:
// IS_CONTIGUOUS may require an assumed size TYPE(*) to be passed to
// the intrinsic library utility as a fir.box.
if (argRules.lowerAs == fir::LowerIntrinsicArgAs::Box &&
- !fir::getBase(exv).getType().isa<fir::BaseBoxType>()) {
+ !mlir::isa<fir::BaseBoxType>(fir::getBase(exv).getType())) {
operands.emplace_back(
fir::factory::createBoxValue(builder, loc, exv));
continue;
@@ -2005,7 +2006,7 @@ public:
fir::getTypeParams(mold);
mlir::Value charLen;
mlir::Type elementType = fir::unwrapSequenceType(type);
- if (auto charType = elementType.dyn_cast<fir::CharacterType>()) {
+ if (auto charType = mlir::dyn_cast<fir::CharacterType>(elementType)) {
charLen = allocMemTypeParams.empty()
? fir::factory::readCharLen(builder, loc, mold)
: allocMemTypeParams[0];
@@ -2017,7 +2018,7 @@ public:
mlir::Value temp = builder.create<fir::AllocMemOp>(
loc, type, tempName, allocMemTypeParams, extents);
- if (fir::unwrapSequenceType(type).isa<fir::CharacterType>())
+ if (mlir::isa<fir::CharacterType>(fir::unwrapSequenceType(type)))
return fir::CharArrayBoxValue{temp, charLen, extents};
return fir::ArrayBoxValue{temp, extents};
}
@@ -2166,7 +2167,7 @@ public:
// We have to initialize the temp if it may have components
// that need initialization. If there are no components
// requiring initialization, then the call is a no-op.
- if (getElementTypeOf(temp).isa<fir::RecordType>()) {
+ if (mlir::isa<fir::RecordType>(getElementTypeOf(temp))) {
mlir::Value tempBox = fir::getBase(builder.createBox(loc, temp));
fir::runtime::genDerivedTypeInitialize(builder, loc, tempBox);
}
@@ -2312,7 +2313,7 @@ public:
if (!copyOutPair.restrictCopyAndFreeAtRuntime) {
doCopyOut();
- if (fir::getElementTypeOf(copyOutPair.temp).isa<fir::RecordType>()) {
+ if (mlir::isa<fir::RecordType>(fir::getElementTypeOf(copyOutPair.temp))) {
// Destroy components of the temporary (if any).
// If there are no components requiring destruction, then the call
// is a no-op.
@@ -2330,7 +2331,8 @@ public:
builder.genIfThen(loc, *copyOutPair.restrictCopyAndFreeAtRuntime)
.genThen([&]() {
doCopyOut();
- if (fir::getElementTypeOf(copyOutPair.temp).isa<fir::RecordType>()) {
+ if (mlir::isa<fir::RecordType>(
+ fir::getElementTypeOf(copyOutPair.temp))) {
// Destroy components of the temporary (if any).
// If there are no components requiring destruction, then the call
// is a no-op.
@@ -2381,7 +2383,7 @@ public:
mlir::Value actualArgBase = fir::getBase(actualArg);
mlir::Value isPresent = builder.create<fir::IsPresentOp>(
loc, builder.getI1Type(), actualArgBase);
- if (!actualArgBase.getType().isa<fir::BoxType>())
+ if (!mlir::isa<fir::BoxType>(actualArgBase.getType()))
return {actualArg, isPresent};
ExtValue safeToReadBox =
absentBoxToUnallocatedBox(builder, loc, actualArg, isPresent);
@@ -2408,7 +2410,7 @@ public:
fir::getAdaptToByRefAttr(builder)});
return fir::CharBoxValue{temp, len};
}
- assert((fir::isa_trivial(type) || type.isa<fir::RecordType>()) &&
+ assert((fir::isa_trivial(type) || mlir::isa<fir::RecordType>(type)) &&
"must be simple scalar");
return builder.createTemporary(loc, type,
llvm::ArrayRef<mlir::NamedAttribute>{
@@ -2585,7 +2587,7 @@ public:
// callee side, and it is illegal to use NULL without a MOLD if any
// dummy length parameters are assumed.
mlir::Type boxTy = fir::dyn_cast_ptrEleTy(argTy);
- assert(boxTy && boxTy.isa<fir::BaseBoxType>() &&
+ assert(boxTy && mlir::isa<fir::BaseBoxType>(boxTy) &&
"must be a fir.box type");
mlir::Value boxStorage = builder.createTemporary(loc, boxTy);
mlir::Value nullBox = fir::factory::createUnallocatedBox(
@@ -2643,10 +2645,11 @@ public:
// If a character procedure was passed instead, handle the
// mismatch.
auto funcTy =
- x.getAddr().getType().dyn_cast<mlir::FunctionType>();
+ mlir::dyn_cast<mlir::FunctionType>(x.getAddr().getType());
if (funcTy && funcTy.getNumResults() == 1 &&
- funcTy.getResult(0).isa<fir::BoxCharType>()) {
- auto boxTy = funcTy.getResult(0).cast<fir::BoxCharType>();
+ mlir::isa<fir::BoxCharType>(funcTy.getResult(0))) {
+ auto boxTy =
+ mlir::cast<fir::BoxCharType>(funcTy.getResult(0));
mlir::Value ref = builder.createConvert(
loc, builder.getRefType(boxTy.getEleTy()), x.getAddr());
auto len = builder.create<fir::UndefOp>(
@@ -2667,7 +2670,7 @@ public:
// free-casting the base address to be a !fir.char reference and
// setting the LEN argument to undefined. What could go wrong?
auto dataPtr = fir::getBase(x);
- assert(!dataPtr.getType().template isa<fir::BoxType>());
+ assert(!mlir::isa<fir::BoxType>(dataPtr.getType()));
return builder.convertWithSemantics(
loc, argTy, dataPtr,
/*allowCharacterConversion=*/true);
@@ -2742,7 +2745,7 @@ public:
loc,
fir::ClassType::get(mlir::NoneType::get(builder.getContext())),
box);
- } else if (box.getType().isa<fir::BoxType>() &&
+ } else if (mlir::isa<fir::BoxType>(box.getType()) &&
fir::isPolymorphicType(argTy)) {
box = builder.create<fir::ReboxOp>(loc, argTy, box, mlir::Value{},
/*slice=*/mlir::Value{});
@@ -2791,7 +2794,7 @@ public:
: builder.createBox(getLoc(), genTempExtAddr(*expr),
fir::isPolymorphicType(argTy),
fir::isAssumedType(argTy));
- if (box.getType().isa<fir::BoxType>() &&
+ if (mlir::isa<fir::BoxType>(box.getType()) &&
fir::isPolymorphicType(argTy) && !fir::isAssumedType(argTy)) {
mlir::Type actualTy = argTy;
if (Fortran::lower::isParentComponent(*expr))
@@ -3030,10 +3033,11 @@ private:
Fortran::common::ScopedSet(semant, PushVal);
static bool isAdjustedArrayElementType(mlir::Type t) {
- return fir::isa_char(t) || fir::isa_derived(t) || t.isa<fir::SequenceType>();
+ return fir::isa_char(t) || fir::isa_derived(t) ||
+ mlir::isa<fir::SequenceType>(t);
}
static bool elementTypeWasAdjusted(mlir::Type t) {
- if (auto ty = t.dyn_cast<fir::ReferenceType>())
+ if (auto ty = mlir::dyn_cast<fir::ReferenceType>(t))
return isAdjustedArrayElementType(ty.getEleTy());
return false;
}
@@ -3050,15 +3054,15 @@ static void genScalarUserDefinedAssignmentCall(fir::FirOpBuilder &builder,
auto prepareUserDefinedArg =
[](fir::FirOpBuilder &builder, mlir::Location loc,
const fir::ExtendedValue &value, mlir::Type argType) -> mlir::Value {
- if (argType.isa<fir::BoxCharType>()) {
+ if (mlir::isa<fir::BoxCharType>(argType)) {
const fir::CharBoxValue *charBox = value.getCharBox();
assert(charBox && "argument type mismatch in elemental user assignment");
return fir::factory::CharacterExprHelper{builder, loc}.createEmbox(
*charBox);
}
- if (argType.isa<fir::BaseBoxType>()) {
+ if (mlir::isa<fir::BaseBoxType>(argType)) {
mlir::Value box =
- builder.createBox(loc, value, argType.isa<fir::ClassType>());
+ builder.createBox(loc, value, mlir::isa<fir::ClassType>(argType));
return builder.createConvert(loc, argType, box);
}
// Simple pass by address.
@@ -3170,7 +3174,7 @@ convertToArrayBoxValue(mlir::Location loc, fir::FirOpBuilder &builder,
mlir::Value val, mlir::Value len) {
mlir::Type ty = fir::unwrapRefType(val.getType());
mlir::IndexType idxTy = builder.getIndexType();
- auto seqTy = ty.cast<fir::SequenceType>();
+ auto seqTy = mlir::cast<fir::SequenceType>(ty);
auto undef = builder.create<fir::UndefOp>(loc, idxTy);
llvm::SmallVector<mlir::Value> extents(seqTy.getDimension(), undef);
if (fir::isa_char(seqTy.getEleTy()))
@@ -3462,7 +3466,7 @@ public:
[&](const auto &e) {
auto f = genarr(e);
ExtValue exv = f(IterationSpace{});
- if (fir::getBase(exv).getType().template isa<fir::BaseBoxType>())
+ if (mlir::isa<fir::BaseBoxType>(fir::getBase(exv).getType()))
return exv;
fir::emitFatalError(getLoc(), "array must be emboxed");
},
@@ -3487,10 +3491,9 @@ public:
tempRes, dest.getSlice(),
dest.getTypeparams());
- auto arrTy =
- fir::dyn_cast_ptrEleTy(tempRes.getType()).cast<fir::SequenceType>();
- if (auto charTy =
- arrTy.getEleTy().template dyn_cast<fir::CharacterType>()) {
+ auto arrTy = mlir::cast<fir::SequenceType>(
+ fir::dyn_cast_ptrEleTy(tempRes.getType()));
+ if (auto charTy = mlir::dyn_cast<fir::CharacterType>(arrTy.getEleTy())) {
if (fir::characterWithDynamicLen(charTy))
TODO(loc, "CHARACTER does not have constant LEN");
mlir::Value len = builder.createIntegerConstant(
@@ -3912,17 +3915,18 @@ private:
mlir::Value convertElementForUpdate(mlir::Location loc, mlir::Type eleTy,
mlir::Value origVal) {
if (auto origEleTy = fir::dyn_cast_ptrEleTy(origVal.getType()))
- if (origEleTy.isa<fir::BaseBoxType>()) {
+ if (mlir::isa<fir::BaseBoxType>(origEleTy)) {
// If origVal is a box variable, load it so it is in the value domain.
origVal = builder.create<fir::LoadOp>(loc, origVal);
}
- if (origVal.getType().isa<fir::BoxType>() && !eleTy.isa<fir::BoxType>()) {
+ if (mlir::isa<fir::BoxType>(origVal.getType()) &&
+ !mlir::isa<fir::BoxType>(eleTy)) {
if (isPointerAssignment())
TODO(loc, "lhs of pointer assignment returned unexpected value");
TODO(loc, "invalid box conversion in elemental computation");
}
- if (isPointerAssignment() && eleTy.isa<fir::BoxType>() &&
- !origVal.getType().isa<fir::BoxType>()) {
+ if (isPointerAssignment() && mlir::isa<fir::BoxType>(eleTy) &&
+ !mlir::isa<fir::BoxType>(origVal.getType())) {
// This is a pointer assignment and the rhs is a raw reference to a TARGET
// in memory. Embox the reference so it can be stored to the boxed
// POINTER variable.
@@ -3930,7 +3934,7 @@ private:
if (auto eleTy = fir::dyn_cast_ptrEleTy(origVal.getType());
fir::hasDynamicSize(eleTy))
TODO(loc, "TARGET of pointer assignment with runtime size/shape");
- auto memrefTy = fir::boxMemRefType(eleTy.cast<fir::BoxType>());
+ auto memrefTy = fir::boxMemRefType(mlir::cast<fir::BoxType>(eleTy));
auto castTo = builder.createConvert(loc, memrefTy, origVal);
origVal = builder.create<fir::EmboxOp>(loc, eleTy, castTo);
}
@@ -3982,7 +3986,7 @@ private:
auto arrayOp = builder.create<fir::ArrayAccessOp>(
loc, resRefTy, innerArg, iterSpace.iterVec(),
fir::factory::getTypeParams(loc, builder, destination));
- if (auto charTy = eleTy.dyn_cast<fir::CharacterType>()) {
+ if (auto charTy = mlir::dyn_cast<fir::CharacterType>(eleTy)) {
llvm::SmallVector<mlir::Value> substringBounds;
populateBounds(substringBounds, substring);
mlir::Value dstLen = fir::factory::genLenOfCharacter(
@@ -3996,7 +4000,7 @@ private:
loc, destination, builder, arrayOp, exv, eleTy, innerArg);
return abstractArrayExtValue(amend /*FIXME: typeparams?*/);
}
- assert(eleTy.isa<fir::SequenceType>() && "must be an array");
+ assert(mlir::isa<fir::SequenceType>(eleTy) && "must be an array");
TODO(loc, "array (as element) assignment");
}
// By value semantics. The element is being assigned by value.
@@ -4060,7 +4064,7 @@ private:
llvm::SmallVector<mlir::Value> getShape(ArrayOperand array) {
if (array.slice)
return computeSliceShape(array.slice);
- if (array.memref.getType().isa<fir::BaseBoxType>())
+ if (mlir::isa<fir::BaseBoxType>(array.memref.getType()))
return fir::factory::readExtents(builder, getLoc(),
fir::BoxValue{array.memref});
return fir::factory::getExtents(array.shape);
@@ -4133,7 +4137,7 @@ private:
mlir::Location loc = getLoc();
return [=, builder = &converter.getFirOpBuilder()](IterSpace iters) {
mlir::Type arrTy = fir::dyn_cast_ptrOrBoxEleTy(tmp.getType());
- auto eleTy = arrTy.cast<fir::SequenceType>().getEleTy();
+ auto eleTy = mlir::cast<fir::SequenceType>(arrTy).getEleTy();
mlir::Type eleRefTy = builder->getRefType(eleTy);
mlir::IntegerType i1Ty = builder->getI1Type();
// Adjust indices for any shift of the origin of the array.
@@ -4442,15 +4446,15 @@ private:
TODO(loc, "polymorphic array temporary");
if (ccLoadDest)
return (*ccLoadDest)(shape);
- auto seqTy = type.dyn_cast<fir::SequenceType>();
+ auto seqTy = mlir::dyn_cast<fir::SequenceType>(type);
assert(seqTy && "must be an array");
// TODO: Need to thread the LEN parameters here. For character, they may
// differ from the operands length (e.g concatenation). So the array loads
// type parameters are not enough.
- if (auto charTy = seqTy.getEleTy().dyn_cast<fir::CharacterType>())
+ if (auto charTy = mlir::dyn_cast<fir::CharacterType>(seqTy.getEleTy()))
if (charTy.hasDynamicLen())
TODO(loc, "character array expression temp with dynamic length");
- if (auto recTy = seqTy.getEleTy().dyn_cast<fir::RecordType>())
+ if (auto recTy = mlir::dyn_cast<fir::RecordType>(seqTy.getEleTy()))
if (recTy.getNumLenParams() > 0)
TODO(loc, "derived type array expression temp with LEN parameters");
if (mlir::Type eleTy = fir::unwrapSequenceType(type);
@@ -4827,7 +4831,7 @@ private:
});
} else {
ExtValue exv = asScalarRef(*expr);
- if (fir::getBase(exv).getType().isa<fir::BaseBoxType>()) {
+ if (mlir::isa<fir::BaseBoxType>(fir::getBase(exv).getType())) {
operands.emplace_back(
[=](IterSpace iters) -> ExtValue { return exv; });
} else {
@@ -5565,7 +5569,7 @@ private:
}
static mlir::Type unwrapBoxEleTy(mlir::Type ty) {
- if (auto boxTy = ty.dyn_cast<fir::BaseBoxType>())
+ if (auto boxTy = mlir::dyn_cast<fir::BaseBoxType>(ty))
return fir::unwrapRefType(boxTy.getEleTy());
return ty;
}
@@ -5575,7 +5579,7 @@ private:
ty = unwrapBoxEleTy(ty);
mlir::Location loc = getLoc();
mlir::IndexType idxTy = builder.getIndexType();
- for (auto extent : ty.cast<fir::SequenceType>().getShape()) {
+ for (auto extent : mlir::cast<fir::SequenceType>(ty).getShape()) {
auto v = extent == fir::SequenceType::getUnknownExtent()
? builder.create<fir::UndefOp>(loc, idxTy).getResult()
: builder.createIntegerConstant(loc, idxTy, extent);
@@ -5638,7 +5642,8 @@ private:
mlir::Location loc = getLoc();
mlir::Value memref = fir::getBase(extMemref);
mlir::Type arrTy = fir::dyn_cast_ptrOrBoxEleTy(memref.getType());
- assert(arrTy.isa<fir::SequenceType>() && "memory ref must be an array");
+ assert(mlir::isa<fir::SequenceType>(arrTy) &&
+ "memory ref must be an array");
mlir::Value shape = builder.createShape(loc, extMemref);
mlir::Value slice;
if (components.isSlice()) {
@@ -5688,12 +5693,12 @@ private:
components.suffixComponents);
}
if (components.hasComponents()) {
- auto seqTy = arrTy.cast<fir::SequenceType>();
+ auto seqTy = mlir::cast<fir::SequenceType>(arrTy);
mlir::Type eleTy =
fir::applyPathToType(seqTy.getEleTy(), components.suffixComponents);
if (!eleTy)
fir::emitFatalError(loc, "slicing path is ill-formed");
- if (auto realTy = eleTy.dyn_cast<fir::RealType>())
+ if (auto realTy = mlir::dyn_cast<fir::RealType>(eleTy))
eleTy = Fortran::lower::convertReal(realTy.getContext(),
realTy.getFKind());
@@ -5713,13 +5718,14 @@ private:
// value. The value of the box is forwarded in the continuation.
mlir::Type reduceTy = reduceRank(arrTy, slice);
mlir::Type boxTy = fir::BoxType::get(reduceTy);
- if (memref.getType().isa<fir::ClassType>() && !components.hasComponents())
+ if (mlir::isa<fir::ClassType>(memref.getType()) &&
+ !components.hasComponents())
boxTy = fir::ClassType::get(reduceTy);
if (components.substring) {
// Adjust char length to substring size.
fir::CharacterType charTy =
fir::factory::CharacterExprHelper::getCharType(reduceTy);
- auto seqTy = reduceTy.cast<fir::SequenceType>();
+ auto seqTy = mlir::cast<fir::SequenceType>(reduceTy);
// TODO: Use a constant for fir.char LEN if we can compute it.
boxTy = fir::BoxType::get(
fir::SequenceType::get(fir::CharacterType::getUnknownLen(
@@ -5734,7 +5740,7 @@ private:
nonDeferredLenParams = fir::factory::getNonDeferredLenParams(extMemref);
}
mlir::Value embox =
- memref.getType().isa<fir::BaseBoxType>()
+ mlir::isa<fir::BaseBoxType>(memref.getType())
? builder.create<fir::ReboxOp>(loc, boxTy, memref, shape, slice)
.getResult()
: builder
@@ -5745,7 +5751,7 @@ private:
return fir::BoxValue(embox, lbounds, nonDeferredLenParams);
};
}
- auto eleTy = arrTy.cast<fir::SequenceType>().getEleTy();
+ auto eleTy = mlir::cast<fir::SequenceType>(arrTy).getEleTy();
if (isReferentiallyOpaque()) {
// Semantics are an opaque reference to an array.
// This case forwards a continuation that will generate the address
@@ -5760,12 +5766,12 @@ private:
mlir::Value coor = builder.create<fir::ArrayCoorOp>(
loc, refEleTy, memref, shape, slice, indices,
fir::getTypeParams(extMemref));
- if (auto charTy = eleTy.dyn_cast<fir::CharacterType>()) {
+ if (auto charTy = mlir::dyn_cast<fir::CharacterType>(eleTy)) {
llvm::SmallVector<mlir::Value> substringBounds;
populateBounds(substringBounds, components.substring);
if (!substringBounds.empty()) {
mlir::Value dstLen = fir::factory::genLenOfCharacter(
- builder, loc, arrTy.cast<fir::SequenceType>(), memref,
+ builder, loc, mlir::cast<fir::SequenceType>(arrTy), memref,
fir::getTypeParams(extMemref), iters.iterVec(),
substringBounds);
fir::CharBoxValue dstChar(coor, dstLen);
@@ -5863,7 +5869,7 @@ private:
mlir::Type eleRefTy = builder.getRefType(eleTy);
mlir::Value arrayOp = builder.create<fir::ArrayAccessOp>(
loc, eleRefTy, arrLd, iters.iterVec(), arrLdTypeParams);
- if (auto charTy = eleTy.dyn_cast<fir::CharacterType>()) {
+ if (auto charTy = mlir::dyn_cast<fir::CharacterType>(eleTy)) {
llvm::SmallVector<mlir::Value> substringBounds;
populateBounds(substringBounds, components.substring);
if (!substringBounds.empty()) {
@@ -5896,7 +5902,7 @@ private:
const bool hasOptionalAttr =
fir::valueHasFirAttribute(base, fir::getOptionalAttrName());
mlir::Type baseType = fir::unwrapRefType(base.getType());
- const bool isBox = baseType.isa<fir::BoxType>();
+ const bool isBox = mlir::isa<fir::BoxType>(baseType);
const bool isAllocOrPtr =
Fortran::evaluate::IsAllocatableOrPointerObject(expr);
mlir::Type arrType = fir::unwrapPassByRefType(baseType);
@@ -5989,7 +5995,7 @@ private:
if (slice) {
auto slOp = mlir::dyn_cast<fir::SliceOp>(slice.getDefiningOp());
assert(slOp && "expected slice op");
- auto seqTy = arrTy.dyn_cast<fir::SequenceType>();
+ auto seqTy = mlir::dyn_cast<fir::SequenceType>(arrTy);
assert(seqTy && "expected array type");
mlir::Operation::operand_range triples = slOp.getTriples();
fir::SequenceType::Shape shape;
@@ -6053,7 +6059,7 @@ private:
mlir::IndexType idxTy = builder.getIndexType();
mlir::Value multiplier = builder.createIntegerConstant(loc, idxTy, 1);
if (fir::hasDynamicSize(eleTy)) {
- if (auto charTy = eleTy.dyn_cast<fir::CharacterType>()) {
+ if (auto charTy = mlir::dyn_cast<fir::CharacterType>(eleTy)) {
// Array of char with dynamic LEN parameter. Downcast to an array
// of singleton char, and scale by the len type parameter from
// `exv`.
@@ -6074,7 +6080,7 @@ private:
});
fir::CharacterType newEleTy = fir::CharacterType::getSingleton(
eleTy.getContext(), charTy.getFKind());
- if (auto seqTy = resTy.dyn_cast<fir::SequenceType>()) {
+ if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(resTy)) {
assert(eleTy == seqTy.getEleTy());
resTy = fir::SequenceType::get(seqTy.getShape(), newEleTy);
}
@@ -6161,7 +6167,7 @@ private:
if (!eleSz) {
// Compute the element size at runtime.
assert(fir::hasDynamicSize(eleTy));
- if (auto charTy = eleTy.dyn_cast<fir::CharacterType>()) {
+ if (auto charTy = mlir::dyn_cast<fir::CharacterType>(eleTy)) {
auto charBytes =
builder.getKindMap().getCharacterBitsize(charTy.getFKind()) / 8;
mlir::Value bytes =
@@ -6181,7 +6187,7 @@ private:
auto computeCoordinate = [&](mlir::Value buff, mlir::Value off) {
mlir::Type refTy = eleRefTy;
if (fir::hasDynamicSize(eleTy)) {
- if (auto charTy = eleTy.dyn_cast<fir::CharacterType>()) {
+ if (auto charTy = mlir::dyn_cast<fir::CharacterType>(eleTy)) {
// Scale a simple pointer using dynamic length and offset values.
auto chTy = fir::CharacterType::getSingleton(charTy.getContext(),
charTy.getFKind());
@@ -6308,7 +6314,7 @@ private:
builder.createConvert(loc, idxTy, fir::getBase(asScalar(x.upper())));
mlir::Value step =
builder.createConvert(loc, idxTy, fir::getBase(asScalar(x.stride())));
- auto seqTy = resTy.template cast<fir::SequenceType>();
+ auto seqTy = mlir::cast<fir::SequenceType>(resTy);
mlir::Type eleTy = fir::unwrapSequenceType(seqTy);
auto loop =
builder.create<fir::DoLoopOp>(loc, lo, up, step, /*unordered=*/false,
@@ -6375,7 +6381,7 @@ private:
auto evExpr = toEvExpr(x);
mlir::Type resTy = translateSomeExprToFIRType(converter, evExpr);
mlir::IndexType idxTy = builder.getIndexType();
- auto seqTy = resTy.template cast<fir::SequenceType>();
+ auto seqTy = mlir::cast<fir::SequenceType>(resTy);
mlir::Type eleTy = fir::unwrapSequenceType(resTy);
mlir::Value buffSize = builder.createTemporary(loc, idxTy, ".buff.size");
mlir::Value zero = builder.createIntegerConstant(loc, idxTy, 0);
@@ -6719,7 +6725,7 @@ private:
auto fieldTy = fir::FieldType::get(builder.getContext());
std::string name =
converter.getRecordTypeFieldName(getLastSym(*x));
- if (auto recTy = ty.dyn_cast<fir::RecordType>()) {
+ if (auto recTy = mlir::dyn_cast<fir::RecordType>(ty)) {
ty = recTy.getType(name);
auto fld = builder.create<fir::FieldIndexOp>(
loc, fieldTy, name, recTy, fir::getTypeParams(arrayExv));
@@ -6728,7 +6734,7 @@ private:
// Need an intermediate dereference if the boxed value
// appears in the middle of the component path or if it is
// on the right and this is not a pointer assignment.
- if (auto boxTy = ty.dyn_cast<fir::BaseBoxType>()) {
+ if (auto boxTy = mlir::dyn_cast<fir::BaseBoxType>(ty)) {
auto currentFunc = components.getExtendCoorRef();
auto loc = getLoc();
auto *bldr = &converter.getFirOpBuilder();
@@ -6739,9 +6745,9 @@ private:
deref = true;
}
}
- } else if (auto boxTy = ty.dyn_cast<fir::BaseBoxType>()) {
+ } else if (auto boxTy = mlir::dyn_cast<fir::BaseBoxType>(ty)) {
ty = fir::unwrapRefType(boxTy.getEleTy());
- auto recTy = ty.cast<fir::RecordType>();
+ auto recTy = mlir::cast<fir::RecordType>(ty);
ty = recTy.getType(name);
auto fld = builder.create<fir::FieldIndexOp>(
loc, fieldTy, name, recTy, fir::getTypeParams(arrayExv));
@@ -6790,7 +6796,7 @@ private:
auto arrayOp = builder.create<fir::ArrayAccessOp>(
loc, eleRefTy, innerArg, iters.iterVec(),
fir::factory::getTypeParams(loc, builder, load));
- if (auto charTy = eleTy.dyn_cast<fir::CharacterType>()) {
+ if (auto charTy = mlir::dyn_cast<fir::CharacterType>(eleTy)) {
mlir::Value dstLen = fir::factory::genLenOfCharacter(
builder, loc, load, iters.iterVec(), substringBounds);
fir::ArrayAmendOp amend = createCharArrayAmend(
@@ -6806,13 +6812,13 @@ private:
return arrayLoadExtValue(builder, loc, load, iters.iterVec(),
amend);
}
- assert(eleTy.isa<fir::SequenceType>());
+ assert(mlir::isa<fir::SequenceType>(eleTy));
TODO(loc, "array (as element) assignment");
}
if (components.hasExtendCoorRef()) {
auto eleBoxTy =
fir::applyPathToType(innerArg.getType(), iters.iterVec());
- if (!eleBoxTy || !eleBoxTy.isa<fir::BoxType>())
+ if (!eleBoxTy || !mlir::isa<fir::BoxType>(eleBoxTy))
TODO(loc, "assignment in a FORALL involving a designator with a "
"POINTER or ALLOCATABLE component part-ref");
auto arrayOp = builder.create<fir::ArrayAccessOp>(
@@ -6824,7 +6830,7 @@ private:
// assignment, then insert the dereference of the box before any
// conversion and store.
if (!isPointerAssignment()) {
- if (auto boxTy = eleTy.dyn_cast<fir::BaseBoxType>()) {
+ if (auto boxTy = mlir::dyn_cast<fir::BaseBoxType>(eleTy)) {
eleTy = fir::boxMemRefType(boxTy);
addr = builder.create<fir::BoxAddrOp>(loc, eleTy, addr);
eleTy = fir::unwrapRefType(eleTy);
@@ -6885,7 +6891,7 @@ private:
}
if (components.hasExtendCoorRef()) {
auto eleBoxTy = fir::applyPathToType(load.getType(), iters.iterVec());
- if (!eleBoxTy || !eleBoxTy.isa<fir::BoxType>())
+ if (!eleBoxTy || !mlir::isa<fir::BoxType>(eleBoxTy))
TODO(loc, "assignment in a FORALL involving a designator with a "
"POINTER or ALLOCATABLE component part-ref");
auto access = builder.create<fir::ArrayAccessOp>(
@@ -6897,7 +6903,7 @@ private:
}
if (isPointerAssignment()) {
auto eleTy = fir::applyPathToType(load.getType(), iters.iterVec());
- if (!eleTy.isa<fir::BoxType>()) {
+ if (!mlir::isa<fir::BoxType>(eleTy)) {
// Rhs is a regular expression that will need to be boxed before
// assigning to the boxed variable.
auto typeParams = fir::factory::getTypeParams(loc, builder, load);
@@ -7615,7 +7621,7 @@ mlir::Value Fortran::lower::addCrayPointerInst(mlir::Location loc,
auto box = builder.create<fir::EmboxOp>(loc, boxTy, ptrVal, empty, empty,
emptyRange);
mlir::Value addrof =
- (ptrTy.isa<fir::ReferenceType>())
+ (mlir::isa<fir::ReferenceType>(ptrTy))
? builder.create<fir::BoxAddrOp>(loc, ptrTy, box)
: builder.create<fir::BoxAddrOp>(loc, builder.getRefType(ptrTy), box);
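Note: every hunk in the file above is the same mechanical rewrite: the deprecated member casts on mlir::Type (ty.isa<T>(), ty.cast<T>(), ty.dyn_cast<T>()) become the free functions mlir::isa<T>(ty), mlir::cast<T>(ty), and mlir::dyn_cast<T>(ty). The semantics are unchanged: isa answers a yes/no query, cast asserts on mismatch, dyn_cast returns a null handle. A side benefit shows up in the two hunks that touched resTy.template cast<fir::SequenceType>(): the free function needs no `template` disambiguator in dependent contexts. A minimal sketch of the before/after shape, using a hypothetical helper that is not part of this patch:

#include "flang/Optimizer/Dialect/FIRType.h"
#include "mlir/IR/Types.h"

// Illustrative only: classify an element type the way the hunks above do.
static mlir::Type elementOf(mlir::Type ty) {
  // Before: ty.dyn_cast<fir::CharacterType>(); returns null on mismatch.
  if (auto charTy = mlir::dyn_cast<fir::CharacterType>(ty))
    return charTy;
  // Before: ty.isa<...>() guarding ty.cast<...>(), which asserts on mismatch.
  if (mlir::isa<fir::SequenceType>(ty))
    return mlir::cast<fir::SequenceType>(ty).getEleTy();
  return ty;
}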
diff --git a/flang/lib/Lower/ConvertExprToHLFIR.cpp b/flang/lib/Lower/ConvertExprToHLFIR.cpp
index 6e57b31d022b..93bdf650f9ff 100644
--- a/flang/lib/Lower/ConvertExprToHLFIR.cpp
+++ b/flang/lib/Lower/ConvertExprToHLFIR.cpp
@@ -138,8 +138,8 @@ public:
mlir::Location loc = getLoc();
mlir::Type idxTy = builder.getIndexType();
llvm::SmallVector<mlir::Value> extents;
- auto seqTy = hlfir::getFortranElementOrSequenceType(fieldType)
- .cast<fir::SequenceType>();
+ auto seqTy = mlir::cast<fir::SequenceType>(
+ hlfir::getFortranElementOrSequenceType(fieldType));
for (auto extent : seqTy.getShape()) {
if (extent == fir::SequenceType::getUnknownExtent()) {
// We have already generated invalid hlfir.declare
@@ -199,7 +199,7 @@ private:
const T &designatorNode) {
// Get base's shape if it's a sequence type with no previously computed
// result shape
- if (partInfo.base && resultValueType.isa<fir::SequenceType>() &&
+ if (partInfo.base && mlir::isa<fir::SequenceType>(resultValueType) &&
!partInfo.resultShape)
partInfo.resultShape =
hlfir::genShape(getLoc(), getBuilder(), *partInfo.base);
@@ -209,7 +209,7 @@ private:
return fir::ClassType::get(resultValueType);
// Character scalar with dynamic length needs a fir.boxchar to hold the
// designator length.
- auto charType = resultValueType.dyn_cast<fir::CharacterType>();
+ auto charType = mlir::dyn_cast<fir::CharacterType>(resultValueType);
if (charType && charType.hasDynamicLen())
return fir::BoxCharType::get(charType.getContext(), charType.getFKind());
// Arrays with non default lower bounds or dynamic length or dynamic extent
@@ -218,7 +218,7 @@ private:
hasNonDefaultLowerBounds(partInfo))
return fir::BoxType::get(resultValueType);
// Non simply contiguous refs require a fir.box to carry the byte stride.
- if (resultValueType.isa<fir::SequenceType>() &&
+ if (mlir::isa<fir::SequenceType>(resultValueType) &&
!Fortran::evaluate::IsSimplyContiguous(
designatorNode, getConverter().getFoldingContext()))
return fir::BoxType::get(resultValueType);
@@ -398,8 +398,8 @@ private:
partInfo.typeParams[0] =
fir::factory::genMaxWithZero(builder, loc, rawLen);
}
- auto kind = hlfir::getFortranElementType(baseStringType)
- .cast<fir::CharacterType>()
+ auto kind = mlir::cast<fir::CharacterType>(
+ hlfir::getFortranElementType(baseStringType))
.getFKind();
auto newCharTy = fir::CharacterType::get(
baseStringType.getContext(), kind,
@@ -579,7 +579,7 @@ private:
return createVectorSubscriptElementAddrOp(partInfo, baseType,
resultExtents);
- mlir::Type resultType = baseType.cast<fir::SequenceType>().getEleTy();
+ mlir::Type resultType = mlir::cast<fir::SequenceType>(baseType).getEleTy();
if (!resultTypeShape.empty()) {
// Ranked array section. The result shape comes from the array section
// subscripts.
@@ -612,8 +612,8 @@ private:
}
static bool hasNonDefaultLowerBounds(const PartInfo &partInfo) {
return partInfo.resultShape &&
- (partInfo.resultShape.getType().isa<fir::ShiftType>() ||
- partInfo.resultShape.getType().isa<fir::ShapeShiftType>());
+ mlir::isa<fir::ShiftType, fir::ShapeShiftType>(
+ partInfo.resultShape.getType());
}
mlir::Type visit(const Fortran::evaluate::Component &component,
@@ -705,7 +705,7 @@ private:
const Fortran::semantics::Symbol &componentSym = component.GetLastSymbol();
partInfo.componentName = converter.getRecordTypeFieldName(componentSym);
auto recordType =
- hlfir::getFortranElementType(baseType).cast<fir::RecordType>();
+ mlir::cast<fir::RecordType>(hlfir::getFortranElementType(baseType));
if (recordType.isDependentType())
TODO(getLoc(), "Designate derived type with length parameters in HLFIR");
mlir::Type fieldType = recordType.getType(partInfo.componentName);
@@ -718,7 +718,7 @@ private:
if (fir::isRecordWithTypeParameters(fieldEleType))
TODO(loc,
"lower a component that is a parameterized derived type to HLFIR");
- if (auto charTy = fieldEleType.dyn_cast<fir::CharacterType>()) {
+ if (auto charTy = mlir::dyn_cast<fir::CharacterType>(fieldEleType)) {
mlir::Location loc = getLoc();
mlir::Type idxTy = builder.getIndexType();
if (charTy.hasConstantLen())
@@ -811,7 +811,7 @@ private:
}
}
builder.setInsertionPoint(elementalAddrOp);
- return baseType.cast<fir::SequenceType>().getEleTy();
+ return mlir::cast<fir::SequenceType>(baseType).getEleTy();
}
/// Yield the designator for the final part-ref inside the
@@ -1665,7 +1665,7 @@ private:
mlir::Location loc = getLoc();
fir::FirOpBuilder &builder = getBuilder();
mlir::Type ty = translateSomeExprToFIRType(converter, toEvExpr(ctor));
- auto recTy = ty.cast<fir::RecordType>();
+ auto recTy = mlir::cast<fir::RecordType>(ty);
if (recTy.isDependentType())
TODO(loc, "structure constructor for derived type with length parameters "
diff --git a/flang/lib/Lower/ConvertProcedureDesignator.cpp b/flang/lib/Lower/ConvertProcedureDesignator.cpp
index 2446be3a1908..aa0d7ce54788 100644
--- a/flang/lib/Lower/ConvertProcedureDesignator.cpp
+++ b/flang/lib/Lower/ConvertProcedureDesignator.cpp
@@ -107,11 +107,11 @@ static hlfir::EntityWithAttributes designateProcedurePointerComponent(
procComponentSym);
/// Passed argument may be a descriptor. This is a scalar reference, so the
/// base address can be directly addressed.
- if (base.getType().isa<fir::BaseBoxType>())
+ if (mlir::isa<fir::BaseBoxType>(base.getType()))
base = builder.create<fir::BoxAddrOp>(loc, base);
std::string fieldName = converter.getRecordTypeFieldName(procComponentSym);
auto recordType =
- hlfir::getFortranElementType(base.getType()).cast<fir::RecordType>();
+ mlir::cast<fir::RecordType>(hlfir::getFortranElementType(base.getType()));
mlir::Type fieldType = recordType.getType(fieldName);
// Note: semantics turns x%p() into x%t%p() when the procedure pointer
// component is part of parent component t.
@@ -164,7 +164,7 @@ hlfir::EntityWithAttributes Fortran::lower::convertProcedureDesignatorToHLFIR(
fir::FirOpBuilder &builder = converter.getFirOpBuilder();
mlir::Value funcAddr = fir::getBase(procExv);
- if (!funcAddr.getType().isa<fir::BoxProcType>()) {
+ if (!mlir::isa<fir::BoxProcType>(funcAddr.getType())) {
mlir::Type boxTy =
Fortran::lower::getUntypedBoxProcType(&converter.getMLIRContext());
if (auto host = Fortran::lower::argumentHostAssocs(converter, funcAddr))
diff --git a/flang/lib/Lower/ConvertVariable.cpp b/flang/lib/Lower/ConvertVariable.cpp
index 21db0cac11bf..413563fe95ca 100644
--- a/flang/lib/Lower/ConvertVariable.cpp
+++ b/flang/lib/Lower/ConvertVariable.cpp
@@ -389,13 +389,13 @@ static mlir::Value genDefaultInitializerValue(
fir::FirOpBuilder &builder = converter.getFirOpBuilder();
mlir::Type scalarType = symTy;
fir::SequenceType sequenceType;
- if (auto ty = symTy.dyn_cast<fir::SequenceType>()) {
+ if (auto ty = mlir::dyn_cast<fir::SequenceType>(symTy)) {
sequenceType = ty;
scalarType = ty.getEleTy();
}
// Build a scalar default value of the symbol type, looping through the
// components to build each component initial value.
- auto recTy = scalarType.cast<fir::RecordType>();
+ auto recTy = mlir::cast<fir::RecordType>(scalarType);
mlir::Value initialValue = builder.create<fir::UndefOp>(loc, scalarType);
const Fortran::semantics::DeclTypeSpec *declTy = sym.GetType();
assert(declTy && "var with default initialization must have a type");
@@ -493,11 +493,11 @@ static fir::GlobalOp defineGlobal(Fortran::lower::AbstractConverter &converter,
// with a tensor mlir type. This optimization currently only supports
// Fortran arrays of integer, real, complex, or logical. The tensor
// type does not support nested structures.
- if (symTy.isa<fir::SequenceType>() &&
+ if (mlir::isa<fir::SequenceType>(symTy) &&
!Fortran::semantics::IsAllocatableOrPointer(sym)) {
- mlir::Type eleTy = symTy.cast<fir::SequenceType>().getEleTy();
- if (eleTy.isa<mlir::IntegerType, mlir::FloatType, fir::ComplexType,
- fir::LogicalType>()) {
+ mlir::Type eleTy = mlir::cast<fir::SequenceType>(symTy).getEleTy();
+ if (mlir::isa<mlir::IntegerType, mlir::FloatType, fir::ComplexType,
+ fir::LogicalType>(eleTy)) {
const auto *details =
sym.detailsIf<Fortran::semantics::ObjectEntityDetails>();
if (details->init()) {
@@ -1292,7 +1292,7 @@ static void finalizeCommonBlockDefinition(
fir::GlobalOp global,
const Fortran::semantics::MutableSymbolVector &cmnBlkMems) {
fir::FirOpBuilder &builder = converter.getFirOpBuilder();
- mlir::TupleType commonTy = global.getType().cast<mlir::TupleType>();
+ mlir::TupleType commonTy = mlir::cast<mlir::TupleType>(global.getType());
auto initFunc = [&](fir::FirOpBuilder &builder) {
mlir::IndexType idxTy = builder.getIndexType();
mlir::Value cb = builder.create<fir::ZeroOp>(loc, commonTy);
@@ -1407,7 +1407,7 @@ static bool lowerToBoxValue(const Fortran::semantics::Symbol &sym,
mlir::Value dummyArg,
Fortran::lower::AbstractConverter &converter) {
// Only dummy arguments coming as fir.box can be tracked in a BoxValue.
- if (!dummyArg || !dummyArg.getType().isa<fir::BaseBoxType>())
+ if (!dummyArg || !mlir::isa<fir::BaseBoxType>(dummyArg.getType()))
return false;
// Non contiguous arrays must be tracked in a BoxValue.
if (sym.Rank() > 0 && !Fortran::evaluate::IsSimplyContiguous(
@@ -1905,7 +1905,7 @@ void Fortran::lower::mapSymbolAttributes(
// Do not keep scalar characters as fir.box (even when optional).
// Lowering and FIR are not meant to deal with scalar characters as
// fir.box outside of calls.
- auto boxTy = dummyArg.getType().dyn_cast<fir::BaseBoxType>();
+ auto boxTy = mlir::dyn_cast<fir::BaseBoxType>(dummyArg.getType());
mlir::Type refTy = builder.getRefType(boxTy.getEleTy());
mlir::Type lenType = builder.getCharacterLengthType();
mlir::Value addr, len;
@@ -1984,8 +1984,8 @@ void Fortran::lower::mapSymbolAttributes(
// a non pointer/allocatable symbol to be mapped to a MutableBox.
mlir::Type ty = converter.genType(var);
bool isPolymorphic = false;
- if (auto boxTy = ty.dyn_cast<fir::BaseBoxType>()) {
- isPolymorphic = ty.isa<fir::ClassType>();
+ if (auto boxTy = mlir::dyn_cast<fir::BaseBoxType>(ty)) {
+ isPolymorphic = mlir::isa<fir::ClassType>(ty);
ty = boxTy.getEleTy();
}
Fortran::lower::genDeclareSymbol(
@@ -2092,7 +2092,7 @@ void Fortran::lower::mapSymbolAttributes(
mlir::Value addr = preAlloc;
if (arg)
- if (auto boxTy = arg.getType().dyn_cast<fir::BaseBoxType>()) {
+ if (auto boxTy = mlir::dyn_cast<fir::BaseBoxType>(arg.getType())) {
// Contiguous assumed shape that can be tracked without a fir.box.
mlir::Type refTy = builder.getRefType(boxTy.getEleTy());
addr = builder.create<fir::BoxAddrOp>(loc, refTy, arg);
@@ -2134,7 +2134,7 @@ void Fortran::lower::mapSymbolAttributes(
} else if (!len) {
// Assumed length fir.box (possible for contiguous assumed shapes).
// Read length from box.
- assert(arg && arg.getType().isa<fir::BoxType>() &&
+ assert(arg && mlir::isa<fir::BoxType>(arg.getType()) &&
"must be character dummy fir.box");
len = charHelp.readLengthFromBox(arg);
}
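Note: the mapSymbolAttributes hunks above rely on the bind-and-test idiom, which carries over unchanged: inside the if, the dyn_cast result is a typed, non-null handle; outside, the original value is untouched. A compressed sketch of that shape (helper name invented, not part of this patch):

#include "flang/Optimizer/Dialect/FIRType.h"

// Unwrap one level of fir.box/fir.class if present, reporting polymorphism.
static mlir::Type unwrapBoxIfAny(mlir::Type ty, bool &isPolymorphic) {
  if (auto boxTy = mlir::dyn_cast<fir::BaseBoxType>(ty)) {
    isPolymorphic = mlir::isa<fir::ClassType>(ty);
    return boxTy.getEleTy();
  }
  isPolymorphic = false;
  return ty;
}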
diff --git a/flang/lib/Lower/CustomIntrinsicCall.cpp b/flang/lib/Lower/CustomIntrinsicCall.cpp
index 439fc3d915b4..30c6ce7f53b3 100644
--- a/flang/lib/Lower/CustomIntrinsicCall.cpp
+++ b/flang/lib/Lower/CustomIntrinsicCall.cpp
@@ -227,22 +227,23 @@ lowerIshftc(fir::FirOpBuilder &builder, mlir::Location loc,
args.push_back(getOperand(1, loadOperand));
auto iPC = isPresentCheck(2);
assert(iPC.has_value());
- args.push_back(builder
- .genIfOp(loc, {resultType}, *iPC,
- /*withElseRegion=*/true)
- .genThen([&]() {
- fir::ExtendedValue sizeExv = getOperand(2, loadOperand);
- mlir::Value size = builder.createConvert(
- loc, resultType, fir::getBase(sizeExv));
- builder.create<fir::ResultOp>(loc, size);
- })
- .genElse([&]() {
- mlir::Value bitSize = builder.createIntegerConstant(
- loc, resultType,
- resultType.cast<mlir::IntegerType>().getWidth());
- builder.create<fir::ResultOp>(loc, bitSize);
- })
- .getResults()[0]);
+ args.push_back(
+ builder
+ .genIfOp(loc, {resultType}, *iPC,
+ /*withElseRegion=*/true)
+ .genThen([&]() {
+ fir::ExtendedValue sizeExv = getOperand(2, loadOperand);
+ mlir::Value size =
+ builder.createConvert(loc, resultType, fir::getBase(sizeExv));
+ builder.create<fir::ResultOp>(loc, size);
+ })
+ .genElse([&]() {
+ mlir::Value bitSize = builder.createIntegerConstant(
+ loc, resultType,
+ mlir::cast<mlir::IntegerType>(resultType).getWidth());
+ builder.create<fir::ResultOp>(loc, bitSize);
+ })
+ .getResults()[0]);
return genIntrinsicCall(builder, loc, name, resultType, args, stmtCtx);
}
@@ -282,7 +283,7 @@ lowerAssociated(fir::FirOpBuilder &builder, mlir::Location loc,
builder.create<fir::IsPresentOp>(loc, builder.getI1Type(), targetBase);
mlir::Type targetType = fir::unwrapRefType(targetBase.getType());
mlir::Type targetValueType = fir::unwrapPassByRefType(targetType);
- mlir::Type boxType = targetType.isa<fir::BaseBoxType>()
+ mlir::Type boxType = mlir::isa<fir::BaseBoxType>(targetType)
? targetType
: fir::BoxType::get(targetValueType);
fir::BoxValue targetBox =
diff --git a/flang/lib/Lower/DirectivesCommon.h b/flang/lib/Lower/DirectivesCommon.h
index 3ebf3fd965da..42bd3868196b 100644
--- a/flang/lib/Lower/DirectivesCommon.h
+++ b/flang/lib/Lower/DirectivesCommon.h
@@ -642,14 +642,14 @@ getDataOperandBaseAddr(Fortran::lower::AbstractConverter &converter,
isPresent =
builder.create<fir::IsPresentOp>(loc, builder.getI1Type(), rawInput);
- if (auto boxTy =
- fir::unwrapRefType(symAddr.getType()).dyn_cast<fir::BaseBoxType>()) {
- if (boxTy.getEleTy().isa<fir::RecordType>())
+ if (auto boxTy = mlir::dyn_cast<fir::BaseBoxType>(
+ fir::unwrapRefType(symAddr.getType()))) {
+ if (mlir::isa<fir::RecordType>(boxTy.getEleTy()))
TODO(loc, "derived type");
// Load the box when baseAddr is a `fir.ref<fir.box<T>>` or a
// `fir.ref<fir.class<T>>` type.
- if (symAddr.getType().isa<fir::ReferenceType>()) {
+ if (mlir::isa<fir::ReferenceType>(symAddr.getType())) {
if (Fortran::semantics::IsOptional(sym)) {
mlir::Value addr =
builder.genIfOp(loc, {boxTy}, isPresent, /*withElseRegion=*/true)
@@ -722,7 +722,7 @@ genBoundsOpsFromBox(fir::FirOpBuilder &builder, mlir::Location loc,
mlir::Type idxTy = builder.getIndexType();
mlir::Type boundTy = builder.getType<BoundsType>();
- assert(info.addr.getType().isa<fir::BaseBoxType>() &&
+ assert(mlir::isa<fir::BaseBoxType>(info.addr.getType()) &&
"expect fir.box or fir.class");
if (info.isPresent) {
@@ -909,7 +909,8 @@ genBoundsOps(fir::FirOpBuilder &builder, mlir::Location loc,
mlir::Value stride = one;
bool strideInBytes = false;
- if (fir::unwrapRefType(info.addr.getType()).isa<fir::BaseBoxType>()) {
+ if (mlir::isa<fir::BaseBoxType>(
+ fir::unwrapRefType(info.addr.getType()))) {
if (info.isPresent) {
stride =
builder
@@ -1020,8 +1021,8 @@ genBoundsOps(fir::FirOpBuilder &builder, mlir::Location loc,
}
}
- if (info.isPresent &&
- fir::unwrapRefType(info.addr.getType()).isa<fir::BaseBoxType>()) {
+ if (info.isPresent && mlir::isa<fir::BaseBoxType>(
+ fir::unwrapRefType(info.addr.getType()))) {
extent =
builder
.genIfOp(loc, idxTy, info.isPresent, /*withElseRegion=*/true)
@@ -1157,7 +1158,7 @@ AddrAndBoundsInfo gatherDataOperandAddrAndBounds(
converter.genExprAddr(operandLocation, designator, stmtCtx);
info.addr = fir::getBase(compExv);
info.rawInput = info.addr;
- if (fir::unwrapRefType(info.addr.getType()).isa<fir::SequenceType>())
+ if (mlir::isa<fir::SequenceType>(fir::unwrapRefType(info.addr.getType())))
bounds = genBaseBoundsOps<BoundsOp, BoundsType>(builder, operandLocation,
converter, compExv,
/*isAssumedSize=*/false);
@@ -1199,13 +1200,14 @@ AddrAndBoundsInfo gatherDataOperandAddrAndBounds(
fir::ExtendedValue dataExv = converter.getSymbolExtendedValue(*symRef);
info =
getDataOperandBaseAddr(converter, builder, *symRef, operandLocation);
- if (fir::unwrapRefType(info.addr.getType()).isa<fir::BaseBoxType>()) {
+ if (mlir::isa<fir::BaseBoxType>(
+ fir::unwrapRefType(info.addr.getType()))) {
bounds = genBoundsOpsFromBox<BoundsOp, BoundsType>(
builder, operandLocation, converter, dataExv, info);
}
bool dataExvIsAssumedSize =
Fortran::semantics::IsAssumedSizeArray(symRef->get().GetUltimate());
- if (fir::unwrapRefType(info.addr.getType()).isa<fir::SequenceType>())
+ if (mlir::isa<fir::SequenceType>(fir::unwrapRefType(info.addr.getType())))
bounds = genBaseBoundsOps<BoundsOp, BoundsType>(
builder, operandLocation, converter, dataExv, dataExvIsAssumedSize);
asFortran << symRef->get().name().ToString();
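Note: as the bounds-generation hunks above show, the operand of the free function can be an arbitrary expression, so query chains like fir::unwrapRefType(info.addr.getType()).isa<...>() invert into a query-type-first form with no temporary needed. A one-line sketch (the predicate name is hypothetical):

#include "flang/Optimizer/Dialect/FIRType.h"
#include "mlir/IR/Value.h"

// True when the mapped address is a descriptor once references are peeled.
static bool needsBoxBounds(mlir::Value addr) {
  // Before: fir::unwrapRefType(addr.getType()).isa<fir::BaseBoxType>()
  return mlir::isa<fir::BaseBoxType>(fir::unwrapRefType(addr.getType()));
}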
diff --git a/flang/lib/Lower/HlfirIntrinsics.cpp b/flang/lib/Lower/HlfirIntrinsics.cpp
index bda04fa9689b..310b62697f71 100644
--- a/flang/lib/Lower/HlfirIntrinsics.cpp
+++ b/flang/lib/Lower/HlfirIntrinsics.cpp
@@ -265,7 +265,7 @@ HlfirTransformationalIntrinsic::computeResultType(mlir::Value argArray,
mlir::Type stmtResultType) {
mlir::Type normalisedResult =
hlfir::getFortranElementOrSequenceType(stmtResultType);
- if (auto array = normalisedResult.dyn_cast<fir::SequenceType>()) {
+ if (auto array = mlir::dyn_cast<fir::SequenceType>(normalisedResult)) {
hlfir::ExprType::Shape resultShape =
hlfir::ExprType::Shape{array.getShape()};
mlir::Type elementType = array.getEleTy();
@@ -341,7 +341,7 @@ mlir::Value HlfirTransposeLowering::lowerImpl(
hlfir::ExprType::Shape resultShape;
mlir::Type normalisedResult =
hlfir::getFortranElementOrSequenceType(stmtResultType);
- auto array = normalisedResult.cast<fir::SequenceType>();
+ auto array = mlir::cast<fir::SequenceType>(normalisedResult);
llvm::ArrayRef<int64_t> arrayShape = array.getShape();
assert(arrayShape.size() == 2 && "arguments to transpose have a rank of 2");
mlir::Type elementType = array.getEleTy();
diff --git a/flang/lib/Lower/HostAssociations.cpp b/flang/lib/Lower/HostAssociations.cpp
index 2e2656356719..75a5bed56655 100644
--- a/flang/lib/Lower/HostAssociations.cpp
+++ b/flang/lib/Lower/HostAssociations.cpp
@@ -219,7 +219,7 @@ public:
static mlir::Type getType(Fortran::lower::AbstractConverter &converter,
const Fortran::semantics::Symbol &sym) {
fir::KindTy kind =
- converter.genType(sym).cast<fir::CharacterType>().getFKind();
+ mlir::cast<fir::CharacterType>(converter.genType(sym)).getFKind();
return fir::BoxCharType::get(&converter.getMLIRContext(), kind);
}
@@ -293,7 +293,7 @@ public:
mlir::Location loc = args.loc;
mlir::Value box = args.valueInTuple;
if (Fortran::semantics::IsOptional(sym)) {
- auto boxTy = box.getType().cast<fir::BaseBoxType>();
+ auto boxTy = mlir::cast<fir::BaseBoxType>(box.getType());
auto eleTy = boxTy.getEleTy();
if (!fir::isa_ref_type(eleTy))
eleTy = builder.getRefType(eleTy);
@@ -381,8 +381,8 @@ public:
const Fortran::semantics::Symbol &sym) {
mlir::Type type = converter.genType(sym);
bool isPolymorphic = Fortran::semantics::IsPolymorphic(sym);
- assert((type.isa<fir::SequenceType>() ||
- (isPolymorphic && type.isa<fir::ClassType>())) &&
+ assert((mlir::isa<fir::SequenceType>(type) ||
+ (isPolymorphic && mlir::isa<fir::ClassType>(type))) &&
"must be a sequence type");
if (isPolymorphic)
return type;
@@ -459,7 +459,7 @@ public:
// (absent boxes are null descriptor addresses, not descriptors containing
// a null base address).
if (Fortran::semantics::IsOptional(sym)) {
- auto boxTy = box.getType().cast<fir::BaseBoxType>();
+ auto boxTy = mlir::cast<fir::BaseBoxType>(box.getType());
auto eleTy = boxTy.getEleTy();
if (!fir::isa_ref_type(eleTy))
eleTy = builder.getRefType(eleTy);
@@ -527,7 +527,7 @@ walkCaptureCategories(T visitor, Fortran::lower::AbstractConverter &converter,
// `t` should be the result of getArgumentType, which has a type of
// `!fir.ref<tuple<...>>`.
static mlir::TupleType unwrapTupleTy(mlir::Type t) {
- return fir::dyn_cast_ptrEleTy(t).cast<mlir::TupleType>();
+ return mlir::cast<mlir::TupleType>(fir::dyn_cast_ptrEleTy(t));
}
static mlir::Value genTupleCoor(fir::FirOpBuilder &builder, mlir::Location loc,
@@ -535,7 +535,7 @@ static mlir::Value genTupleCoor(fir::FirOpBuilder &builder, mlir::Location loc,
mlir::Value offset) {
// fir.ref<fir.ref> and fir.ptr<fir.ref> are forbidden. Use
// fir.llvm_ptr if needed.
- auto ty = varTy.isa<fir::ReferenceType>()
+ auto ty = mlir::isa<fir::ReferenceType>(varTy)
? mlir::Type(fir::LLVMPointerType::get(varTy))
: mlir::Type(builder.getRefType(varTy));
return builder.create<fir::CoordinateOp>(loc, ty, tupleArg, offset);
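Note: invariant checks keep their shape too; only the query moves into the assert, as in the captured-array assert above. A compressed sketch under the same assumptions (helper name invented, the non-polymorphic handling is abbreviated):

#include "flang/Optimizer/Dialect/FIRType.h"
#include <cassert>

// A captured host variable of array type must be a sequence, or a class if
// it is polymorphic.
static mlir::Type captureType(mlir::Type type, bool isPolymorphic) {
  assert((mlir::isa<fir::SequenceType>(type) ||
          (isPolymorphic && mlir::isa<fir::ClassType>(type))) &&
         "must be a sequence type");
  if (isPolymorphic)
    return type;
  return mlir::cast<fir::SequenceType>(type).getEleTy();
}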
diff --git a/flang/lib/Lower/IO.cpp b/flang/lib/Lower/IO.cpp
index ac82276bcddb..ed0afad9197d 100644
--- a/flang/lib/Lower/IO.cpp
+++ b/flang/lib/Lower/IO.cpp
@@ -168,7 +168,7 @@ static constexpr fir::runtime::FuncTypeBuilderFunc getTypeModel() {
}
inline int64_t getLength(mlir::Type argTy) {
- return argTy.cast<fir::SequenceType>().getShape()[0];
+ return mlir::cast<fir::SequenceType>(argTy).getShape()[0];
}
/// Get (or generate) the MLIR FuncOp for a given IO runtime function.
@@ -656,11 +656,11 @@ static void genNamelistIO(Fortran::lower::AbstractConverter &converter,
static mlir::func::FuncOp getOutputFunc(mlir::Location loc,
fir::FirOpBuilder &builder,
mlir::Type type, bool isFormatted) {
- if (fir::unwrapPassByRefType(type).isa<fir::RecordType>())
+ if (mlir::isa<fir::RecordType>(fir::unwrapPassByRefType(type)))
return getIORuntimeFunc<mkIOKey(OutputDerivedType)>(loc, builder);
if (!isFormatted)
return getIORuntimeFunc<mkIOKey(OutputDescriptor)>(loc, builder);
- if (auto ty = type.dyn_cast<mlir::IntegerType>()) {
+ if (auto ty = mlir::dyn_cast<mlir::IntegerType>(type)) {
switch (ty.getWidth()) {
case 1:
return getIORuntimeFunc<mkIOKey(OutputLogical)>(loc, builder);
@@ -677,14 +677,14 @@ static mlir::func::FuncOp getOutputFunc(mlir::Location loc,
}
llvm_unreachable("unknown OutputInteger kind");
}
- if (auto ty = type.dyn_cast<mlir::FloatType>()) {
+ if (auto ty = mlir::dyn_cast<mlir::FloatType>(type)) {
if (auto width = ty.getWidth(); width == 32)
return getIORuntimeFunc<mkIOKey(OutputReal32)>(loc, builder);
else if (width == 64)
return getIORuntimeFunc<mkIOKey(OutputReal64)>(loc, builder);
}
auto kindMap = fir::getKindMapping(builder.getModule());
- if (auto ty = type.dyn_cast<fir::ComplexType>()) {
+ if (auto ty = mlir::dyn_cast<fir::ComplexType>(type)) {
// COMPLEX(KIND=k) corresponds to a pair of REAL(KIND=k).
auto width = kindMap.getRealBitsize(ty.getFKind());
if (width == 32)
@@ -692,7 +692,7 @@ static mlir::func::FuncOp getOutputFunc(mlir::Location loc,
else if (width == 64)
return getIORuntimeFunc<mkIOKey(OutputComplex64)>(loc, builder);
}
- if (type.isa<fir::LogicalType>())
+ if (mlir::isa<fir::LogicalType>(type))
return getIORuntimeFunc<mkIOKey(OutputLogical)>(loc, builder);
if (fir::factory::CharacterExprHelper::isCharacterScalar(type)) {
// TODO: What would it mean if the default CHARACTER KIND is set to a wide
@@ -731,14 +731,14 @@ static void genOutputItemList(
mlir::func::FuncOp outputFunc =
getOutputFunc(loc, builder, itemTy, isFormatted);
mlir::Type argType = outputFunc.getFunctionType().getInput(1);
- assert((isFormatted || argType.isa<fir::BoxType>()) &&
+ assert((isFormatted || mlir::isa<fir::BoxType>(argType)) &&
"expect descriptor for unformatted IO runtime");
llvm::SmallVector<mlir::Value> outputFuncArgs = {cookie};
fir::factory::CharacterExprHelper helper{builder, loc};
- if (argType.isa<fir::BoxType>()) {
+ if (mlir::isa<fir::BoxType>(argType)) {
mlir::Value box = fir::getBase(converter.genExprBox(loc, *expr, stmtCtx));
outputFuncArgs.push_back(builder.createConvert(loc, argType, box));
- if (fir::unwrapPassByRefType(itemTy).isa<fir::RecordType>())
+ if (mlir::isa<fir::RecordType>(fir::unwrapPassByRefType(itemTy)))
outputFuncArgs.push_back(getNonTbpDefinedIoTableAddr(converter));
} else if (helper.isCharacterScalar(itemTy)) {
fir::ExtendedValue exv = converter.genExprAddr(loc, expr, stmtCtx);
@@ -773,29 +773,29 @@ static void genOutputItemList(
static mlir::func::FuncOp getInputFunc(mlir::Location loc,
fir::FirOpBuilder &builder,
mlir::Type type, bool isFormatted) {
- if (fir::unwrapPassByRefType(type).isa<fir::RecordType>())
+ if (mlir::isa<fir::RecordType>(fir::unwrapPassByRefType(type)))
return getIORuntimeFunc<mkIOKey(InputDerivedType)>(loc, builder);
if (!isFormatted)
return getIORuntimeFunc<mkIOKey(InputDescriptor)>(loc, builder);
- if (auto ty = type.dyn_cast<mlir::IntegerType>())
+ if (auto ty = mlir::dyn_cast<mlir::IntegerType>(type))
return ty.getWidth() == 1
? getIORuntimeFunc<mkIOKey(InputLogical)>(loc, builder)
: getIORuntimeFunc<mkIOKey(InputInteger)>(loc, builder);
- if (auto ty = type.dyn_cast<mlir::FloatType>()) {
+ if (auto ty = mlir::dyn_cast<mlir::FloatType>(type)) {
if (auto width = ty.getWidth(); width == 32)
return getIORuntimeFunc<mkIOKey(InputReal32)>(loc, builder);
else if (width == 64)
return getIORuntimeFunc<mkIOKey(InputReal64)>(loc, builder);
}
auto kindMap = fir::getKindMapping(builder.getModule());
- if (auto ty = type.dyn_cast<fir::ComplexType>()) {
+ if (auto ty = mlir::dyn_cast<fir::ComplexType>(type)) {
auto width = kindMap.getRealBitsize(ty.getFKind());
if (width == 32)
return getIORuntimeFunc<mkIOKey(InputComplex32)>(loc, builder);
else if (width == 64)
return getIORuntimeFunc<mkIOKey(InputComplex64)>(loc, builder);
}
- if (type.isa<fir::LogicalType>())
+ if (mlir::isa<fir::LogicalType>(type))
return getIORuntimeFunc<mkIOKey(InputLogical)>(loc, builder);
if (fir::factory::CharacterExprHelper::isCharacterScalar(type)) {
auto asciiKind = kindMap.defaultCharacterKind();
@@ -830,12 +830,12 @@ createIoRuntimeCallForItem(Fortran::lower::AbstractConverter &converter,
fir::FirOpBuilder &builder = converter.getFirOpBuilder();
mlir::Type argType = inputFunc.getFunctionType().getInput(1);
llvm::SmallVector<mlir::Value> inputFuncArgs = {cookie};
- if (argType.isa<fir::BaseBoxType>()) {
+ if (mlir::isa<fir::BaseBoxType>(argType)) {
mlir::Value box = fir::getBase(item);
- auto boxTy = box.getType().dyn_cast<fir::BaseBoxType>();
+ auto boxTy = mlir::dyn_cast<fir::BaseBoxType>(box.getType());
assert(boxTy && "must be previously emboxed");
inputFuncArgs.push_back(builder.createConvert(loc, argType, box));
- if (fir::unwrapPassByRefType(boxTy).isa<fir::RecordType>())
+ if (mlir::isa<fir::RecordType>(fir::unwrapPassByRefType(boxTy)))
inputFuncArgs.push_back(getNonTbpDefinedIoTableAddr(converter));
} else {
mlir::Value itemAddr = fir::getBase(item);
@@ -846,16 +846,16 @@ createIoRuntimeCallForItem(Fortran::lower::AbstractConverter &converter,
mlir::Value len = fir::getLen(item);
inputFuncArgs.push_back(builder.createConvert(
loc, inputFunc.getFunctionType().getInput(2), len));
- } else if (itemTy.isa<mlir::IntegerType>()) {
+ } else if (mlir::isa<mlir::IntegerType>(itemTy)) {
inputFuncArgs.push_back(builder.create<mlir::arith::ConstantOp>(
loc, builder.getI32IntegerAttr(
- itemTy.cast<mlir::IntegerType>().getWidth() / 8)));
+ mlir::cast<mlir::IntegerType>(itemTy).getWidth() / 8)));
}
}
auto call = builder.create<fir::CallOp>(loc, inputFunc, inputFuncArgs);
auto itemAddr = fir::getBase(item);
auto itemTy = fir::unwrapRefType(itemAddr.getType());
- if (itemTy.isa<fir::LogicalType>())
+ if (mlir::isa<fir::LogicalType>(itemTy))
boolRefToLogical(loc, builder, itemAddr);
return call.getResult(0);
}
@@ -886,7 +886,7 @@ static void genInputItemList(Fortran::lower::AbstractConverter &converter,
mlir::func::FuncOp inputFunc = getInputFunc(
loc, builder, vectorSubscriptBox.getElementType(), isFormatted);
const bool mustBox =
- inputFunc.getFunctionType().getInput(1).isa<fir::BoxType>();
+ mlir::isa<fir::BoxType>(inputFunc.getFunctionType().getInput(1));
if (!checkResult) {
auto elementalGenerator = [&](const fir::ExtendedValue &element) {
createIoRuntimeCallForItem(converter, loc, inputFunc, cookie,
@@ -911,9 +911,10 @@ static void genInputItemList(Fortran::lower::AbstractConverter &converter,
mlir::Type itemTy = converter.genType(*expr);
mlir::func::FuncOp inputFunc =
getInputFunc(loc, builder, itemTy, isFormatted);
- auto itemExv = inputFunc.getFunctionType().getInput(1).isa<fir::BoxType>()
- ? converter.genExprBox(loc, *expr, stmtCtx)
- : converter.genExprAddr(loc, expr, stmtCtx);
+ auto itemExv =
+ mlir::isa<fir::BoxType>(inputFunc.getFunctionType().getInput(1))
+ ? converter.genExprBox(loc, *expr, stmtCtx)
+ : converter.genExprAddr(loc, expr, stmtCtx);
ok = createIoRuntimeCallForItem(converter, loc, inputFunc, cookie, itemExv);
}
}
@@ -1772,8 +1773,8 @@ static mlir::Value genIOUnitNumber(Fortran::lower::AbstractConverter &converter,
auto &builder = converter.getFirOpBuilder();
auto rawUnit = fir::getBase(converter.genExprValue(loc, iounit, stmtCtx));
unsigned rawUnitWidth =
- rawUnit.getType().cast<mlir::IntegerType>().getWidth();
- unsigned runtimeArgWidth = ty.cast<mlir::IntegerType>().getWidth();
+ mlir::cast<mlir::IntegerType>(rawUnit.getType()).getWidth();
+ unsigned runtimeArgWidth = mlir::cast<mlir::IntegerType>(ty).getWidth();
// The IO runtime supports `int` unit numbers, if the unit number may
// overflow when passed to the IO runtime, check that the unit number is
// in range before calling the BeginXXX.
@@ -2331,7 +2332,7 @@ mlir::Value genInquireSpec<Fortran::parser::InquireSpec::IntVar>(
if (!eleTy)
fir::emitFatalError(loc,
"internal error: expected a memory reference type");
- auto width = eleTy.cast<mlir::IntegerType>().getWidth();
+ auto width = mlir::cast<mlir::IntegerType>(eleTy).getWidth();
mlir::IndexType idxTy = builder.getIndexType();
mlir::Value kind = builder.createIntegerConstant(loc, idxTy, width / 8);
llvm::SmallVector<mlir::Value> args = {
diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp
index b56bdedc07bf..eae2afc760e6 100644
--- a/flang/lib/Lower/OpenACC.cpp
+++ b/flang/lib/Lower/OpenACC.cpp
@@ -65,7 +65,7 @@ static Op createDataEntryOp(fir::FirOpBuilder &builder, mlir::Location loc,
mlir::acc::DataClause dataClause, mlir::Type retTy,
mlir::Value isPresent = {}) {
mlir::Value varPtrPtr;
- if (auto boxTy = baseAddr.getType().dyn_cast<fir::BaseBoxType>()) {
+ if (auto boxTy = mlir::dyn_cast<fir::BaseBoxType>(baseAddr.getType())) {
if (isPresent) {
mlir::Type ifRetTy = boxTy.getEleTy();
if (!fir::isa_ref_type(ifRetTy))
@@ -2658,7 +2658,7 @@ genACCHostDataOp(Fortran::lower::AbstractConverter &converter,
if (ifCond) {
if (auto cst =
mlir::dyn_cast<mlir::arith::ConstantOp>(ifCond.getDefiningOp()))
- if (auto boolAttr = cst.getValue().dyn_cast<mlir::BoolAttr>()) {
+ if (auto boolAttr = mlir::dyn_cast<mlir::BoolAttr>(cst.getValue())) {
if (boolAttr.getValue()) {
// get rid of the if condition if it is always true.
ifCond = mlir::Value();
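Note: the genACCHostDataOp hunk above applies the same free functions to mlir::Attribute; mlir::isa/cast/dyn_cast are generic over MLIR handle classes, so Type and Attribute queries now read identically. A sketch of the constant-true test (helper name invented):

#include "mlir/IR/BuiltinAttributes.h"

// True when the condition folded to a BoolAttr holding `true`.
static bool isAlwaysTrue(mlir::Attribute attr) {
  // Before: attr.dyn_cast<mlir::BoolAttr>()
  if (auto boolAttr = mlir::dyn_cast<mlir::BoolAttr>(attr))
    return boolAttr.getValue();
  return false;
}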
diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
index 4c51b61f6bf0..79525d6dfe7a 100644
--- a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
+++ b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
@@ -23,10 +23,10 @@ namespace omp {
/// Check for unsupported map operand types.
static void checkMapType(mlir::Location location, mlir::Type type) {
- if (auto refType = type.dyn_cast<fir::ReferenceType>())
+ if (auto refType = mlir::dyn_cast<fir::ReferenceType>(type))
type = refType.getElementType();
- if (auto boxType = type.dyn_cast_or_null<fir::BoxType>())
- if (!boxType.getElementType().isa<fir::PointerType>())
+ if (auto boxType = mlir::dyn_cast_or_null<fir::BoxType>(type))
+ if (!mlir::isa<fir::PointerType>(boxType.getElementType()))
TODO(location, "OMPD_target_data MapOperand BoxType");
}
@@ -814,7 +814,7 @@ createMapInfoOp(fir::FirOpBuilder &builder, mlir::Location loc,
llvm::ArrayRef<mlir::Value> members, uint64_t mapType,
mlir::omp::VariableCaptureKind mapCaptureType, mlir::Type retTy,
bool isVal) {
- if (auto boxTy = baseAddr.getType().dyn_cast<fir::BaseBoxType>()) {
+ if (auto boxTy = mlir::dyn_cast<fir::BaseBoxType>(baseAddr.getType())) {
baseAddr = builder.create<fir::BoxAddrOp>(loc, baseAddr);
retTy = baseAddr.getType();
}
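Note: checkMapType above uses the _or_null variant. Plain mlir::dyn_cast asserts on a null input; mlir::dyn_cast_or_null additionally tolerates null and is the drop-in replacement for the old member dyn_cast_or_null. A compressed sketch of that check (helper name invented):

#include "flang/Optimizer/Dialect/FIRType.h"

// True when `type`, possibly behind a fir.ref, is a box of a pointer.
static bool isBoxedPointer(mlir::Type type) {
  if (auto refType = mlir::dyn_cast<fir::ReferenceType>(type))
    type = refType.getElementType();
  // _or_null: safe even if an earlier step produced a null type.
  if (auto boxType = mlir::dyn_cast_or_null<fir::BoxType>(type))
    return mlir::isa<fir::PointerType>(boxType.getElementType());
  return false;
}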
diff --git a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp
index b419686e8ce4..d94c32375c0d 100644
--- a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp
+++ b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp
@@ -339,6 +339,7 @@ void DataSharingProcessor::defaultPrivatize(
if (!Fortran::semantics::IsProcedure(*sym) &&
!sym->GetUltimate().has<Fortran::semantics::DerivedTypeDetails>() &&
!sym->GetUltimate().has<Fortran::semantics::NamelistDetails>() &&
+ !Fortran::semantics::IsImpliedDoIndex(sym->GetUltimate()) &&
!symbolsInNestedRegions.contains(sym) &&
!symbolsInParentRegions.contains(sym) &&
!privatizedSymbols.contains(sym))
diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp
index f454f5a45a51..64ec5ae65c82 100644
--- a/flang/lib/Lower/OpenMP/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP/OpenMP.cpp
@@ -84,7 +84,7 @@ static fir::GlobalOp globalInitialization(
// Create default initialization for non-character scalar.
if (Fortran::semantics::IsAllocatableOrObjectPointer(&sym)) {
- mlir::Type baseAddrType = ty.dyn_cast<fir::BoxType>().getEleTy();
+ mlir::Type baseAddrType = mlir::dyn_cast<fir::BoxType>(ty).getEleTy();
Fortran::lower::createGlobalInitialization(
firOpBuilder, global, [&](fir::FirOpBuilder &b) {
mlir::Value nullAddr =
@@ -778,7 +778,7 @@ static void genBodyOfTargetDataOp(
for (auto [argIndex, argSymbol] : llvm::enumerate(useDeviceSymbols)) {
const mlir::BlockArgument &arg = region.front().getArgument(argIndex);
fir::ExtendedValue extVal = converter.getSymbolExtendedValue(*argSymbol);
- if (auto refType = arg.getType().dyn_cast<fir::ReferenceType>()) {
+ if (auto refType = mlir::dyn_cast<fir::ReferenceType>(arg.getType())) {
if (fir::isa_builtin_cptr_type(refType.getElementType())) {
converter.bindSymbol(*argSymbol, arg);
} else {
@@ -1570,13 +1570,15 @@ genTargetOp(Fortran::lower::AbstractConverter &converter,
Fortran::lower::AddrAndBoundsInfo info = getDataOperandBaseAddr(
converter, firOpBuilder, sym, converter.getCurrentLocation());
- if (fir::unwrapRefType(info.addr.getType()).isa<fir::BaseBoxType>())
+ if (mlir::isa<fir::BaseBoxType>(
+ fir::unwrapRefType(info.addr.getType())))
bounds =
Fortran::lower::genBoundsOpsFromBox<mlir::omp::MapBoundsOp,
mlir::omp::MapBoundsType>(
firOpBuilder, converter.getCurrentLocation(), converter,
dataExv, info);
- if (fir::unwrapRefType(info.addr.getType()).isa<fir::SequenceType>()) {
+ if (mlir::isa<fir::SequenceType>(
+ fir::unwrapRefType(info.addr.getType()))) {
bool dataExvIsAssumedSize =
Fortran::semantics::IsAssumedSizeArray(sym.GetUltimate());
bounds = Fortran::lower::genBaseBoundsOps<mlir::omp::MapBoundsOp,
@@ -1591,7 +1593,7 @@ genTargetOp(Fortran::lower::AbstractConverter &converter,
mlir::omp::VariableCaptureKind::ByRef;
mlir::Type eleType = baseOp.getType();
- if (auto refType = baseOp.getType().dyn_cast<fir::ReferenceType>())
+ if (auto refType = mlir::dyn_cast<fir::ReferenceType>(baseOp.getType()))
eleType = refType.getElementType();
// If a variable is specified in declare target link and if device
diff --git a/flang/lib/Lower/OpenMP/ReductionProcessor.cpp b/flang/lib/Lower/OpenMP/ReductionProcessor.cpp
index 895340549f7c..38edd1b46821 100644
--- a/flang/lib/Lower/OpenMP/ReductionProcessor.cpp
+++ b/flang/lib/Lower/OpenMP/ReductionProcessor.cpp
@@ -138,7 +138,7 @@ ReductionProcessor::getReductionInitValue(mlir::Location loc, mlir::Type type,
TODO(loc, "Reduction of some types is not supported");
switch (redId) {
case ReductionIdentifier::MAX: {
- if (auto ty = type.dyn_cast<mlir::FloatType>()) {
+ if (auto ty = mlir::dyn_cast<mlir::FloatType>(type)) {
const llvm::fltSemantics &sem = ty.getFloatSemantics();
return builder.createRealConstant(
loc, type, llvm::APFloat::getLargest(sem, /*Negative=*/true));
@@ -148,7 +148,7 @@ ReductionProcessor::getReductionInitValue(mlir::Location loc, mlir::Type type,
return builder.createIntegerConstant(loc, type, minInt);
}
case ReductionIdentifier::MIN: {
- if (auto ty = type.dyn_cast<mlir::FloatType>()) {
+ if (auto ty = mlir::dyn_cast<mlir::FloatType>(type)) {
const llvm::fltSemantics &sem = ty.getFloatSemantics();
return builder.createRealConstant(
loc, type, llvm::APFloat::getLargest(sem, /*Negative=*/false));
@@ -188,12 +188,12 @@ ReductionProcessor::getReductionInitValue(mlir::Location loc, mlir::Type type,
return fir::factory::Complex{builder, loc}.createComplex(type, initRe,
initIm);
}
- if (type.isa<mlir::FloatType>())
+ if (mlir::isa<mlir::FloatType>(type))
return builder.create<mlir::arith::ConstantOp>(
loc, type,
builder.getFloatAttr(type, (double)getOperationIdentity(redId, loc)));
- if (type.isa<fir::LogicalType>()) {
+ if (mlir::isa<fir::LogicalType>(type)) {
mlir::Value intConst = builder.create<mlir::arith::ConstantOp>(
loc, builder.getI1Type(),
builder.getIntegerAttr(builder.getI1Type(),
@@ -474,11 +474,11 @@ createReductionCleanupRegion(fir::FirOpBuilder &builder, mlir::Location loc,
// like fir::unwrapSeqOrBoxedSeqType except it also works for non-sequence boxes
static mlir::Type unwrapSeqOrBoxedType(mlir::Type ty) {
- if (auto seqTy = ty.dyn_cast<fir::SequenceType>())
+ if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(ty))
return seqTy.getEleTy();
- if (auto boxTy = ty.dyn_cast<fir::BaseBoxType>()) {
+ if (auto boxTy = mlir::dyn_cast<fir::BaseBoxType>(ty)) {
auto eleTy = fir::unwrapRefType(boxTy.getEleTy());
- if (auto seqTy = eleTy.dyn_cast<fir::SequenceType>())
+ if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(eleTy))
return seqTy.getEleTy();
return eleTy;
}
@@ -790,7 +790,7 @@ void ReductionProcessor::addDeclareReduction(
for (mlir::Value symVal : reductionVars) {
auto redType = mlir::cast<fir::ReferenceType>(symVal.getType());
const auto &kindMap = firOpBuilder.getKindMap();
- if (redType.getEleTy().isa<fir::LogicalType>())
+ if (mlir::isa<fir::LogicalType>(redType.getEleTy()))
decl = createDeclareReduction(firOpBuilder,
getReductionName(intrinsicOp, kindMap,
firOpBuilder.getI1Type(),
@@ -816,7 +816,7 @@ void ReductionProcessor::addDeclareReduction(
mlir::Value symVal = converter.getSymbolAddress(*symbol);
if (auto declOp = symVal.getDefiningOp<hlfir::DeclareOp>())
symVal = declOp.getBase();
- auto redType = symVal.getType().cast<fir::ReferenceType>();
+ auto redType = mlir::cast<fir::ReferenceType>(symVal.getType());
if (!redType.getEleTy().isIntOrIndexOrFloat())
TODO(currentLocation, "User Defined Reduction on non-trivial type");
decl = createDeclareReduction(
diff --git a/flang/lib/Lower/VectorSubscripts.cpp b/flang/lib/Lower/VectorSubscripts.cpp
index 7439b9f7df8f..d7a311d32d59 100644
--- a/flang/lib/Lower/VectorSubscripts.cpp
+++ b/flang/lib/Lower/VectorSubscripts.cpp
@@ -105,7 +105,7 @@ private:
}
mlir::Type gen(const Fortran::evaluate::Component &component) {
- auto recTy = gen(component.base()).cast<fir::RecordType>();
+ auto recTy = mlir::cast<fir::RecordType>(gen(component.base()));
const Fortran::semantics::Symbol &componentSymbol =
component.GetLastSymbol();
// Parent components will not be found here, they are not part
diff --git a/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp b/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp
index c403b9effbfa..f723e8f66e3e 100644
--- a/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp
+++ b/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp
@@ -68,7 +68,7 @@ bool AliasAnalysis::Source::isPointerReference(mlir::Type ty) {
if (!eleTy)
return false;
- return fir::isPointerType(eleTy) || eleTy.isa<fir::PointerType>();
+ return fir::isPointerType(eleTy) || mlir::isa<fir::PointerType>(eleTy);
}
bool AliasAnalysis::Source::isTargetOrPointer() const {
@@ -81,7 +81,7 @@ bool AliasAnalysis::Source::isRecordWithPointerComponent() const {
if (!eleTy)
return false;
// TO DO: Look for pointer components
- return eleTy.isa<fir::RecordType>();
+ return mlir::isa<fir::RecordType>(eleTy);
}
AliasResult AliasAnalysis::alias(Value lhs, Value rhs) {
diff --git a/flang/lib/Optimizer/Builder/BoxValue.cpp b/flang/lib/Optimizer/Builder/BoxValue.cpp
index 361fa59e2040..a90ce5570de7 100644
--- a/flang/lib/Optimizer/Builder/BoxValue.cpp
+++ b/flang/lib/Optimizer/Builder/BoxValue.cpp
@@ -191,7 +191,7 @@ bool fir::MutableBoxValue::verify() const {
mlir::Type type = fir::dyn_cast_ptrEleTy(getAddr().getType());
if (!type)
return false;
- auto box = type.dyn_cast<fir::BaseBoxType>();
+ auto box = mlir::dyn_cast<fir::BaseBoxType>(type);
if (!box)
return false;
// A boxed value always takes a memory reference,
@@ -210,7 +210,7 @@ bool fir::MutableBoxValue::verify() const {
/// Debug verifier for BoxValue ctor. There is no guarantee this will
/// always be called.
bool fir::BoxValue::verify() const {
- if (!addr.getType().isa<fir::BaseBoxType>())
+ if (!mlir::isa<fir::BaseBoxType>(addr.getType()))
return false;
if (!lbounds.empty() && lbounds.size() != rank())
return false;
diff --git a/flang/lib/Optimizer/Builder/CMakeLists.txt b/flang/lib/Optimizer/Builder/CMakeLists.txt
index 06339b116cd8..6d0aeb429d35 100644
--- a/flang/lib/Optimizer/Builder/CMakeLists.txt
+++ b/flang/lib/Optimizer/Builder/CMakeLists.txt
@@ -23,6 +23,7 @@ add_flang_library(FIRBuilder
Runtime/Execute.cpp
Runtime/Inquiry.cpp
Runtime/Intrinsics.cpp
+ Runtime/Main.cpp
Runtime/Numeric.cpp
Runtime/Pointer.cpp
Runtime/Ragged.cpp
diff --git a/flang/lib/Optimizer/Builder/Character.cpp b/flang/lib/Optimizer/Builder/Character.cpp
index af0786809cc2..b7a7453efdb3 100644
--- a/flang/lib/Optimizer/Builder/Character.cpp
+++ b/flang/lib/Optimizer/Builder/Character.cpp
@@ -26,11 +26,11 @@
/// Unwrap all the ref and box types and return the inner element type.
static mlir::Type unwrapBoxAndRef(mlir::Type type) {
- if (auto boxType = type.dyn_cast<fir::BoxCharType>())
+ if (auto boxType = mlir::dyn_cast<fir::BoxCharType>(type))
return boxType.getEleTy();
while (true) {
type = fir::unwrapRefType(type);
- if (auto boxTy = type.dyn_cast<fir::BoxType>())
+ if (auto boxTy = mlir::dyn_cast<fir::BoxType>(type))
type = boxTy.getEleTy();
else
break;
@@ -41,19 +41,19 @@ static mlir::Type unwrapBoxAndRef(mlir::Type type) {
/// Unwrap base fir.char<kind,len> type.
static fir::CharacterType recoverCharacterType(mlir::Type type) {
type = fir::unwrapSequenceType(unwrapBoxAndRef(type));
- if (auto charTy = type.dyn_cast<fir::CharacterType>())
+ if (auto charTy = mlir::dyn_cast<fir::CharacterType>(type))
return charTy;
llvm::report_fatal_error("expected a character type");
}
bool fir::factory::CharacterExprHelper::isCharacterScalar(mlir::Type type) {
type = unwrapBoxAndRef(type);
- return !type.isa<fir::SequenceType>() && fir::isa_char(type);
+ return !mlir::isa<fir::SequenceType>(type) && fir::isa_char(type);
}
bool fir::factory::CharacterExprHelper::isArray(mlir::Type type) {
type = unwrapBoxAndRef(type);
- if (auto seqTy = type.dyn_cast<fir::SequenceType>())
+ if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(type))
return fir::isa_char(seqTy.getEleTy());
return false;
}
@@ -92,7 +92,8 @@ getCompileTimeLength(const fir::CharBoxValue &box) {
/// Detect the precondition that the value `str` does not reside in memory. Such
/// values will have a type `!fir.array<...x!fir.char<N>>` or `!fir.char<N>`.
LLVM_ATTRIBUTE_UNUSED static bool needToMaterialize(mlir::Value str) {
- return str.getType().isa<fir::SequenceType>() || fir::isa_char(str.getType());
+ return mlir::isa<fir::SequenceType>(str.getType()) ||
+ fir::isa_char(str.getType());
}
/// This is called only if `str` does not reside in memory. Such a bare string
@@ -103,7 +104,7 @@ fir::factory::CharacterExprHelper::materializeValue(mlir::Value str) {
assert(needToMaterialize(str));
auto ty = str.getType();
assert(isCharacterScalar(ty) && "expected scalar character");
- auto charTy = ty.dyn_cast<fir::CharacterType>();
+ auto charTy = mlir::dyn_cast<fir::CharacterType>(ty);
if (!charTy || charTy.getLen() == fir::CharacterType::unknownLen()) {
LLVM_DEBUG(llvm::dbgs() << "cannot materialize: " << str << '\n');
llvm_unreachable("must be a !fir.char<N> type");
@@ -129,7 +130,7 @@ fir::factory::CharacterExprHelper::toExtendedValue(mlir::Value character,
if (auto eleType = fir::dyn_cast_ptrEleTy(type))
type = eleType;
- if (auto arrayType = type.dyn_cast<fir::SequenceType>()) {
+ if (auto arrayType = mlir::dyn_cast<fir::SequenceType>(type)) {
type = arrayType.getEleTy();
auto indexType = builder.getIndexType();
for (auto extent : arrayType.getShape()) {
@@ -145,10 +146,10 @@ fir::factory::CharacterExprHelper::toExtendedValue(mlir::Value character,
mlir::emitError(loc, "cannot retrieve array extents from type");
}
- if (auto charTy = type.dyn_cast<fir::CharacterType>()) {
+ if (auto charTy = mlir::dyn_cast<fir::CharacterType>(type)) {
if (!resultLen && charTy.getLen() != fir::CharacterType::unknownLen())
resultLen = builder.createIntegerConstant(loc, lenType, charTy.getLen());
- } else if (auto boxCharType = type.dyn_cast<fir::BoxCharType>()) {
+ } else if (auto boxCharType = mlir::dyn_cast<fir::BoxCharType>(type)) {
auto refType = builder.getRefType(boxCharType.getEleTy());
// If the embox is accessible, use its operand to avoid filling
// the generated fir with embox/unbox.
@@ -168,7 +169,7 @@ fir::factory::CharacterExprHelper::toExtendedValue(mlir::Value character,
if (!resultLen) {
resultLen = boxCharLen;
}
- } else if (type.isa<fir::BoxType>()) {
+ } else if (mlir::isa<fir::BoxType>(type)) {
mlir::emitError(loc, "descriptor or derived type not yet handled");
} else {
llvm_unreachable("Cannot translate mlir::Value to character ExtendedValue");
@@ -221,7 +222,7 @@ fir::factory::CharacterExprHelper::createEmbox(const fir::CharBoxValue &box) {
fir::CharBoxValue fir::factory::CharacterExprHelper::toScalarCharacter(
const fir::CharArrayBoxValue &box) {
- if (box.getBuffer().getType().isa<fir::PointerType>())
+ if (mlir::isa<fir::PointerType>(box.getBuffer().getType()))
TODO(loc, "concatenating non contiguous character array into a scalar");
// TODO: add a fast path multiplying new length at compile time if the info is
@@ -655,7 +656,7 @@ fir::factory::CharacterExprHelper::createUnboxChar(mlir::Value boxChar) {
}
bool fir::factory::CharacterExprHelper::isCharacterLiteral(mlir::Type type) {
- if (auto seqType = type.dyn_cast<fir::SequenceType>())
+ if (auto seqType = mlir::dyn_cast<fir::SequenceType>(type))
return (seqType.getShape().size() == 1) &&
fir::isa_char(seqType.getEleTy());
return false;
@@ -728,9 +729,9 @@ mlir::Value fir::factory::CharacterExprHelper::getLength(mlir::Value memref) {
if (charType.hasConstantLen())
return builder.createIntegerConstant(loc, builder.getCharacterLengthType(),
charType.getLen());
- if (memrefType.isa<fir::BoxType>())
+ if (mlir::isa<fir::BoxType>(memrefType))
return readLengthFromBox(memref);
- if (memrefType.isa<fir::BoxCharType>())
+ if (mlir::isa<fir::BoxCharType>(memrefType))
return createUnboxChar(memref).second;
// Length cannot be deduced from memref.
@@ -742,14 +743,14 @@ fir::factory::extractCharacterProcedureTuple(fir::FirOpBuilder &builder,
mlir::Location loc,
mlir::Value tuple,
bool openBoxProc) {
- mlir::TupleType tupleType = tuple.getType().cast<mlir::TupleType>();
+ mlir::TupleType tupleType = mlir::cast<mlir::TupleType>(tuple.getType());
mlir::Value addr = builder.create<fir::ExtractValueOp>(
loc, tupleType.getType(0), tuple,
builder.getArrayAttr(
{builder.getIntegerAttr(builder.getIndexType(), 0)}));
mlir::Value proc = [&]() -> mlir::Value {
if (openBoxProc)
- if (auto addrTy = addr.getType().dyn_cast<fir::BoxProcType>())
+ if (auto addrTy = mlir::dyn_cast<fir::BoxProcType>(addr.getType()))
return builder.create<fir::BoxAddrOp>(loc, addrTy.getEleTy(), addr);
return addr;
}();
@@ -763,7 +764,7 @@ fir::factory::extractCharacterProcedureTuple(fir::FirOpBuilder &builder,
mlir::Value fir::factory::createCharacterProcedureTuple(
fir::FirOpBuilder &builder, mlir::Location loc, mlir::Type argTy,
mlir::Value addr, mlir::Value len) {
- mlir::TupleType tupleType = argTy.cast<mlir::TupleType>();
+ mlir::TupleType tupleType = mlir::cast<mlir::TupleType>(argTy);
addr = builder.createConvert(loc, tupleType.getType(0), addr);
if (len)
len = builder.createConvert(loc, tupleType.getType(1), len);
@@ -866,7 +867,7 @@ fir::factory::convertCharacterKind(fir::FirOpBuilder &builder,
auto kindMap = builder.getKindMap();
mlir::Value boxCharAddr = srcBoxChar.getAddr();
auto fromTy = boxCharAddr.getType();
- if (auto charTy = fromTy.dyn_cast<fir::CharacterType>()) {
+ if (auto charTy = mlir::dyn_cast<fir::CharacterType>(fromTy)) {
// boxchar is a value, not a variable. Turn it into a temporary.
// As a value, it ought to have a constant LEN value.
assert(charTy.hasConstantLen() && "must have constant length");
@@ -875,7 +876,7 @@ fir::factory::convertCharacterKind(fir::FirOpBuilder &builder,
boxCharAddr = tmp;
}
auto fromBits = kindMap.getCharacterBitsize(
- fir::unwrapRefType(fromTy).cast<fir::CharacterType>().getFKind());
+ mlir::cast<fir::CharacterType>(fir::unwrapRefType(fromTy)).getFKind());
auto toBits = kindMap.getCharacterBitsize(toKind);
if (toBits < fromBits) {
// Scale by relative ratio to give a buffer of the same length.
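Note: unwrapBoxAndRef above shows the idiom inside a peeling loop, where dyn_cast both tests for and binds the next wrapper level. A compressed sketch of that loop (assumptions as in the sketches above):

#include "flang/Optimizer/Dialect/FIRType.h"

// Strip boxchar, ref, and box wrappers down to the inner element type.
static mlir::Type unwrapToElement(mlir::Type type) {
  if (auto boxCharTy = mlir::dyn_cast<fir::BoxCharType>(type))
    return boxCharTy.getEleTy();
  while (true) {
    type = fir::unwrapRefType(type);
    if (auto boxTy = mlir::dyn_cast<fir::BoxType>(type))
      type = boxTy.getEleTy();
    else
      break;
  }
  return type;
}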
diff --git a/flang/lib/Optimizer/Builder/Complex.cpp b/flang/lib/Optimizer/Builder/Complex.cpp
index e97cb3067808..cbcd4f850014 100644
--- a/flang/lib/Optimizer/Builder/Complex.cpp
+++ b/flang/lib/Optimizer/Builder/Complex.cpp
@@ -14,7 +14,8 @@
mlir::Type
fir::factory::Complex::getComplexPartType(mlir::Type complexType) const {
- return builder.getRealType(complexType.cast<fir::ComplexType>().getFKind());
+ return builder.getRealType(
+ mlir::cast<fir::ComplexType>(complexType).getFKind());
}
mlir::Type fir::factory::Complex::getComplexPartType(mlir::Value cplx) const {
diff --git a/flang/lib/Optimizer/Builder/FIRBuilder.cpp b/flang/lib/Optimizer/Builder/FIRBuilder.cpp
index a0fbae5b614c..a6da38763726 100644
--- a/flang/lib/Optimizer/Builder/FIRBuilder.cpp
+++ b/flang/lib/Optimizer/Builder/FIRBuilder.cpp
@@ -90,7 +90,7 @@ fir::FirOpBuilder::getNamedGlobal(mlir::ModuleOp modOp,
}
mlir::Type fir::FirOpBuilder::getRefType(mlir::Type eleTy) {
- assert(!eleTy.isa<fir::ReferenceType>() && "cannot be a reference type");
+ assert(!mlir::isa<fir::ReferenceType>(eleTy) && "cannot be a reference type");
return fir::ReferenceType::get(eleTy);
}
@@ -147,7 +147,7 @@ mlir::Value
fir::FirOpBuilder::createRealConstant(mlir::Location loc, mlir::Type fltTy,
llvm::APFloat::integerPart val) {
auto apf = [&]() -> llvm::APFloat {
- if (auto ty = fltTy.dyn_cast<fir::RealType>())
+ if (auto ty = mlir::dyn_cast<fir::RealType>(fltTy))
return llvm::APFloat(kindMap.getFloatSemantics(ty.getFKind()), val);
if (fltTy.isF16())
return llvm::APFloat(llvm::APFloat::IEEEhalf(), val);
@@ -169,7 +169,7 @@ fir::FirOpBuilder::createRealConstant(mlir::Location loc, mlir::Type fltTy,
mlir::Value fir::FirOpBuilder::createRealConstant(mlir::Location loc,
mlir::Type fltTy,
const llvm::APFloat &value) {
- if (fltTy.isa<mlir::FloatType>()) {
+ if (mlir::isa<mlir::FloatType>(fltTy)) {
auto attr = getFloatAttr(fltTy, value);
return create<mlir::arith::ConstantOp>(loc, fltTy, attr);
}
@@ -178,7 +178,7 @@ mlir::Value fir::FirOpBuilder::createRealConstant(mlir::Location loc,
static llvm::SmallVector<mlir::Value>
elideExtentsAlreadyInType(mlir::Type type, mlir::ValueRange shape) {
- auto arrTy = type.dyn_cast<fir::SequenceType>();
+ auto arrTy = mlir::dyn_cast<fir::SequenceType>(type);
if (shape.empty() || !arrTy)
return {};
// elide the constant dimensions before construction
@@ -195,7 +195,7 @@ static llvm::SmallVector<mlir::Value>
elideLengthsAlreadyInType(mlir::Type type, mlir::ValueRange lenParams) {
if (lenParams.empty())
return {};
- if (auto arrTy = type.dyn_cast<fir::SequenceType>())
+ if (auto arrTy = mlir::dyn_cast<fir::SequenceType>(type))
type = arrTy.getEleTy();
if (fir::hasDynamicSize(type))
return lenParams;
@@ -264,7 +264,7 @@ mlir::Value fir::FirOpBuilder::createTemporaryAlloc(
mlir::Location loc, mlir::Type type, llvm::StringRef name,
mlir::ValueRange lenParams, mlir::ValueRange shape,
llvm::ArrayRef<mlir::NamedAttribute> attrs) {
- assert(!type.isa<fir::ReferenceType>() && "cannot be a reference");
+ assert(!mlir::isa<fir::ReferenceType>(type) && "cannot be a reference");
// If the alloca is inside an OpenMP Op which will be outlined then pin
// the alloca here.
const bool pinned =
@@ -310,7 +310,7 @@ mlir::Value fir::FirOpBuilder::createHeapTemporary(
llvm::SmallVector<mlir::Value> dynamicLength =
elideLengthsAlreadyInType(type, lenParams);
- assert(!type.isa<fir::ReferenceType>() && "cannot be a reference");
+ assert(!mlir::isa<fir::ReferenceType>(type) && "cannot be a reference");
return create<fir::AllocMemOp>(loc, type, /*unique_name=*/llvm::StringRef{},
name, dynamicLength, dynamicShape, attrs);
}
@@ -376,8 +376,9 @@ mlir::Value fir::FirOpBuilder::convertWithSemantics(
// imaginary part is zero
auto eleTy = helper.getComplexPartType(toTy);
auto cast = createConvert(loc, eleTy, val);
- llvm::APFloat zero{
- kindMap.getFloatSemantics(toTy.cast<fir::ComplexType>().getFKind()), 0};
+ llvm::APFloat zero{kindMap.getFloatSemantics(
+ mlir::cast<fir::ComplexType>(toTy).getFKind()),
+ 0};
auto imag = createRealConstant(loc, eleTy, zero);
return helper.createComplex(toTy, cast, imag);
}
@@ -388,14 +389,14 @@ mlir::Value fir::FirOpBuilder::convertWithSemantics(
return createConvert(loc, toTy, rp);
}
if (allowCharacterConversion) {
- if (fromTy.isa<fir::BoxCharType>()) {
+ if (mlir::isa<fir::BoxCharType>(fromTy)) {
// Extract the address of the character string and pass it
fir::factory::CharacterExprHelper charHelper{*this, loc};
std::pair<mlir::Value, mlir::Value> unboxchar =
charHelper.createUnboxChar(val);
return createConvert(loc, toTy, unboxchar.first);
}
- if (auto boxType = toTy.dyn_cast<fir::BoxCharType>()) {
+ if (auto boxType = mlir::dyn_cast<fir::BoxCharType>(toTy)) {
// Extract the address of the actual argument and create a boxed
// character value with an undefined length
// TODO: We should really calculate the total size of the actual
@@ -415,10 +416,10 @@ mlir::Value fir::FirOpBuilder::convertWithSemantics(
"element types expected to match"));
return create<fir::BoxAddrOp>(loc, toTy, val);
}
- if (fir::isa_ref_type(fromTy) && toTy.isa<fir::BoxProcType>()) {
+ if (fir::isa_ref_type(fromTy) && mlir::isa<fir::BoxProcType>(toTy)) {
// Call is expecting a boxed procedure, not a reference to other data type.
// Convert the reference to a procedure and embox it.
- mlir::Type procTy = toTy.cast<fir::BoxProcType>().getEleTy();
+ mlir::Type procTy = mlir::cast<fir::BoxProcType>(toTy).getEleTy();
mlir::Value proc = createConvert(loc, procTy, val);
return create<fir::EmboxProcOp>(loc, toTy, proc);
}
@@ -428,7 +429,7 @@ mlir::Value fir::FirOpBuilder::convertWithSemantics(
if (((fir::isPolymorphicType(fromTy) &&
(fir::isAllocatableType(fromTy) || fir::isPointerType(fromTy)) &&
fir::isPolymorphicType(toTy)) ||
- (fir::isPolymorphicType(fromTy) && toTy.isa<fir::BoxType>())) &&
+ (fir::isPolymorphicType(fromTy) && mlir::isa<fir::BoxType>(toTy))) &&
!(fir::isUnlimitedPolymorphicType(fromTy) && fir::isAssumedType(toTy)))
return create<fir::ReboxOp>(loc, toTy, val, mlir::Value{},
/*slice=*/mlir::Value{});
@@ -581,7 +582,7 @@ mlir::Value fir::FirOpBuilder::createBox(mlir::Location loc,
bool isPolymorphic,
bool isAssumedType) {
mlir::Value itemAddr = fir::getBase(exv);
- if (itemAddr.getType().isa<fir::BaseBoxType>())
+ if (mlir::isa<fir::BaseBoxType>(itemAddr.getType()))
return itemAddr;
auto elementType = fir::dyn_cast_ptrEleTy(itemAddr.getType());
if (!elementType) {
@@ -592,7 +593,7 @@ mlir::Value fir::FirOpBuilder::createBox(mlir::Location loc,
mlir::Type boxTy;
mlir::Value tdesc;
// Avoid to wrap a box/class with box/class.
- if (elementType.isa<fir::BaseBoxType>()) {
+ if (mlir::isa<fir::BaseBoxType>(elementType)) {
boxTy = elementType;
} else {
boxTy = fir::BoxType::get(elementType);
@@ -709,7 +710,7 @@ mlir::Value fir::FirOpBuilder::genAbsentOp(mlir::Location loc,
return create<fir::AbsentOp>(loc, argTy);
auto boxProc =
- create<fir::AbsentOp>(loc, argTy.cast<mlir::TupleType>().getType(0));
+ create<fir::AbsentOp>(loc, mlir::cast<mlir::TupleType>(argTy).getType(0));
mlir::Value charLen = create<fir::UndefOp>(loc, getCharacterLengthType());
return fir::factory::createCharacterProcedureTuple(*this, loc, argTy, boxProc,
charLen);
@@ -958,14 +959,14 @@ static llvm::SmallVector<mlir::Value> getFromBox(mlir::Location loc,
fir::FirOpBuilder &builder,
mlir::Type valTy,
mlir::Value boxVal) {
- if (auto boxTy = valTy.dyn_cast<fir::BaseBoxType>()) {
+ if (auto boxTy = mlir::dyn_cast<fir::BaseBoxType>(valTy)) {
auto eleTy = fir::unwrapAllRefAndSeqType(boxTy.getEleTy());
- if (auto recTy = eleTy.dyn_cast<fir::RecordType>()) {
+ if (auto recTy = mlir::dyn_cast<fir::RecordType>(eleTy)) {
if (recTy.getNumLenParams() > 0) {
// Walk each type parameter in the record and get the value.
TODO(loc, "generate code to get LEN type parameters");
}
- } else if (auto charTy = eleTy.dyn_cast<fir::CharacterType>()) {
+ } else if (auto charTy = mlir::dyn_cast<fir::CharacterType>(eleTy)) {
if (charTy.hasDynamicLen()) {
auto idxTy = builder.getIndexType();
auto eleSz = builder.create<fir::BoxEleSizeOp>(loc, idxTy, boxVal);
@@ -1012,7 +1013,7 @@ llvm::SmallVector<mlir::Value>
fir::factory::getTypeParams(mlir::Location loc, fir::FirOpBuilder &builder,
fir::ArrayLoadOp load) {
mlir::Type memTy = load.getMemref().getType();
- if (auto boxTy = memTy.dyn_cast<fir::BaseBoxType>())
+ if (auto boxTy = mlir::dyn_cast<fir::BaseBoxType>(memTy))
return getFromBox(loc, builder, boxTy, load.getMemref());
return load.getTypeparams();
}
@@ -1039,7 +1040,7 @@ std::string fir::factory::uniqueCGIdent(llvm::StringRef prefix,
mlir::Value fir::factory::locationToFilename(fir::FirOpBuilder &builder,
mlir::Location loc) {
- if (auto flc = loc.dyn_cast<mlir::FileLineColLoc>()) {
+ if (auto flc = mlir::dyn_cast<mlir::FileLineColLoc>(loc)) {
// must be encoded as asciiz, C string
auto fn = flc.getFilename().str() + '\0';
return fir::getBase(createStringLiteral(builder, loc, fn));
@@ -1050,7 +1051,7 @@ mlir::Value fir::factory::locationToFilename(fir::FirOpBuilder &builder,
mlir::Value fir::factory::locationToLineNo(fir::FirOpBuilder &builder,
mlir::Location loc,
mlir::Type type) {
- if (auto flc = loc.dyn_cast<mlir::FileLineColLoc>())
+ if (auto flc = mlir::dyn_cast<mlir::FileLineColLoc>(loc))
return builder.createIntegerConstant(loc, type, flc.getLine());
return builder.createIntegerConstant(loc, type, 0);
}
@@ -1108,10 +1109,10 @@ fir::ExtendedValue fir::factory::componentToExtendedValue(
auto fieldTy = component.getType();
if (auto ty = fir::dyn_cast_ptrEleTy(fieldTy))
fieldTy = ty;
- if (fieldTy.isa<fir::BaseBoxType>()) {
+ if (mlir::isa<fir::BaseBoxType>(fieldTy)) {
llvm::SmallVector<mlir::Value> nonDeferredTypeParams;
auto eleTy = fir::unwrapSequenceType(fir::dyn_cast_ptrOrBoxEleTy(fieldTy));
- if (auto charTy = eleTy.dyn_cast<fir::CharacterType>()) {
+ if (auto charTy = mlir::dyn_cast<fir::CharacterType>(eleTy)) {
auto lenTy = builder.getCharacterLengthType();
if (charTy.hasConstantLen())
nonDeferredTypeParams.emplace_back(
@@ -1120,7 +1121,7 @@ fir::ExtendedValue fir::factory::componentToExtendedValue(
// on a PDT length parameter. There is no way to make a difference with
// deferred length here yet.
}
- if (auto recTy = eleTy.dyn_cast<fir::RecordType>())
+ if (auto recTy = mlir::dyn_cast<fir::RecordType>(eleTy))
if (recTy.getNumLenParams() > 0)
TODO(loc, "allocatable and pointer components non deferred length "
"parameters");
@@ -1129,7 +1130,7 @@ fir::ExtendedValue fir::factory::componentToExtendedValue(
/*mutableProperties=*/{});
}
llvm::SmallVector<mlir::Value> extents;
- if (auto seqTy = fieldTy.dyn_cast<fir::SequenceType>()) {
+ if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(fieldTy)) {
fieldTy = seqTy.getEleTy();
auto idxTy = builder.getIndexType();
for (auto extent : seqTy.getShape()) {
@@ -1138,7 +1139,7 @@ fir::ExtendedValue fir::factory::componentToExtendedValue(
extents.emplace_back(builder.createIntegerConstant(loc, idxTy, extent));
}
}
- if (auto charTy = fieldTy.dyn_cast<fir::CharacterType>()) {
+ if (auto charTy = mlir::dyn_cast<fir::CharacterType>(fieldTy)) {
auto cstLen = charTy.getLen();
if (cstLen == fir::CharacterType::unknownLen())
TODO(loc, "get character component length from length type parameters");
@@ -1148,7 +1149,7 @@ fir::ExtendedValue fir::factory::componentToExtendedValue(
return fir::CharArrayBoxValue{component, len, extents};
return fir::CharBoxValue{component, len};
}
- if (auto recordTy = fieldTy.dyn_cast<fir::RecordType>())
+ if (auto recordTy = mlir::dyn_cast<fir::RecordType>(fieldTy))
if (recordTy.getNumLenParams() != 0)
TODO(loc,
"lower component ref that is a derived type with length parameter");
@@ -1211,14 +1212,14 @@ void fir::factory::genScalarAssignment(fir::FirOpBuilder &builder,
assert(lhs.rank() == 0 && rhs.rank() == 0 && "must be scalars");
auto type = fir::unwrapSequenceType(
fir::unwrapPassByRefType(fir::getBase(lhs).getType()));
- if (type.isa<fir::CharacterType>()) {
+ if (mlir::isa<fir::CharacterType>(type)) {
const fir::CharBoxValue *toChar = lhs.getCharBox();
const fir::CharBoxValue *fromChar = rhs.getCharBox();
assert(toChar && fromChar);
fir::factory::CharacterExprHelper helper{builder, loc};
helper.createAssign(fir::ExtendedValue{*toChar},
fir::ExtendedValue{*fromChar});
- } else if (type.isa<fir::RecordType>()) {
+ } else if (mlir::isa<fir::RecordType>(type)) {
fir::factory::genRecordAssignment(builder, loc, lhs, rhs, needFinalization,
isTemporaryLHS);
} else {
@@ -1239,10 +1240,10 @@ static void genComponentByComponentAssignment(fir::FirOpBuilder &builder,
const fir::ExtendedValue &rhs,
bool isTemporaryLHS) {
auto lbaseType = fir::unwrapPassByRefType(fir::getBase(lhs).getType());
- auto lhsType = lbaseType.dyn_cast<fir::RecordType>();
+ auto lhsType = mlir::dyn_cast<fir::RecordType>(lbaseType);
assert(lhsType && "lhs must be a scalar record type");
auto rbaseType = fir::unwrapPassByRefType(fir::getBase(rhs).getType());
- auto rhsType = rbaseType.dyn_cast<fir::RecordType>();
+ auto rhsType = mlir::dyn_cast<fir::RecordType>(rbaseType);
assert(rhsType && "rhs must be a scalar record type");
auto fieldIndexType = fir::FieldType::get(lhsType.getContext());
for (auto [lhsPair, rhsPair] :
@@ -1261,7 +1262,7 @@ static void genComponentByComponentAssignment(fir::FirOpBuilder &builder,
mlir::Value toCoor = builder.create<fir::CoordinateOp>(
loc, fieldRefType, fir::getBase(lhs), field);
std::optional<fir::DoLoopOp> outerLoop;
- if (auto sequenceType = lFieldTy.dyn_cast<fir::SequenceType>()) {
+ if (auto sequenceType = mlir::dyn_cast<fir::SequenceType>(lFieldTy)) {
// Create loops to assign array components elements by elements.
// Note that, since these are components, they either do not overlap,
// or are the same and exactly overlap. They also have compile time
@@ -1288,10 +1289,9 @@ static void genComponentByComponentAssignment(fir::FirOpBuilder &builder,
fromCoor, indices);
}
if (auto fieldEleTy = fir::unwrapSequenceType(lFieldTy);
- fieldEleTy.isa<fir::BaseBoxType>()) {
- assert(fieldEleTy.cast<fir::BaseBoxType>()
- .getEleTy()
- .isa<fir::PointerType>() &&
+ mlir::isa<fir::BaseBoxType>(fieldEleTy)) {
+ assert(mlir::isa<fir::PointerType>(
+ mlir::cast<fir::BaseBoxType>(fieldEleTy).getEleTy()) &&
"allocatable members require deep copy");
auto fromPointerValue = builder.create<fir::LoadOp>(loc, fromCoor);
auto castTo = builder.createConvert(loc, fieldEleTy, fromPointerValue);
@@ -1320,11 +1320,11 @@ static bool recordTypeCanBeMemCopied(fir::RecordType recordType) {
for (auto [_, fieldType] : recordType.getTypeList()) {
// Derived type component may have user assignment (so far, we cannot tell
// in FIR, so assume it is always the case, TODO: get the actual info).
- if (fir::unwrapSequenceType(fieldType).isa<fir::RecordType>())
+ if (mlir::isa<fir::RecordType>(fir::unwrapSequenceType(fieldType)))
return false;
// Allocatable components need deep copy.
- if (auto boxType = fieldType.dyn_cast<fir::BaseBoxType>())
- if (boxType.getEleTy().isa<fir::HeapType>())
+ if (auto boxType = mlir::dyn_cast<fir::BaseBoxType>(fieldType))
+ if (mlir::isa<fir::HeapType>(boxType.getEleTy()))
return false;
}
// Constant size components without user defined assignment and pointers can
@@ -1353,9 +1353,10 @@ void fir::factory::genRecordAssignment(fir::FirOpBuilder &builder,
// Box operands may be polymorphic, it is not entirely clear from 10.2.1.3
// if the assignment is performed on the dynamic of declared type. Use the
// runtime assuming it is performed on the dynamic type.
- bool hasBoxOperands = fir::getBase(lhs).getType().isa<fir::BaseBoxType>() ||
- fir::getBase(rhs).getType().isa<fir::BaseBoxType>();
- auto recTy = baseTy.dyn_cast<fir::RecordType>();
+ bool hasBoxOperands =
+ mlir::isa<fir::BaseBoxType>(fir::getBase(lhs).getType()) ||
+ mlir::isa<fir::BaseBoxType>(fir::getBase(rhs).getType());
+ auto recTy = mlir::dyn_cast<fir::RecordType>(baseTy);
assert(recTy && "must be a record type");
if ((needFinalization && mayHaveFinalizer(recTy, builder)) ||
hasBoxOperands || !recordTypeCanBeMemCopied(recTy)) {
@@ -1401,7 +1402,7 @@ mlir::Value fir::factory::genLenOfCharacter(
llvm::ArrayRef<mlir::Value> path, llvm::ArrayRef<mlir::Value> substring) {
llvm::SmallVector<mlir::Value> typeParams(arrLoad.getTypeparams());
return genLenOfCharacter(builder, loc,
- arrLoad.getType().cast<fir::SequenceType>(),
+ mlir::cast<fir::SequenceType>(arrLoad.getType()),
arrLoad.getMemref(), typeParams, path, substring);
}
@@ -1429,7 +1430,7 @@ mlir::Value fir::factory::genLenOfCharacter(
lower = builder.createConvert(loc, idxTy, substring.front());
auto eleTy = fir::applyPathToType(seqTy, path);
if (!fir::hasDynamicSize(eleTy)) {
- if (auto charTy = eleTy.dyn_cast<fir::CharacterType>()) {
+ if (auto charTy = mlir::dyn_cast<fir::CharacterType>(eleTy)) {
// Use LEN from the type.
return builder.createIntegerConstant(loc, idxTy, charTy.getLen());
}
@@ -1438,9 +1439,9 @@ mlir::Value fir::factory::genLenOfCharacter(
"application of path did not result in a !fir.char");
}
if (fir::isa_box_type(memref.getType())) {
- if (memref.getType().isa<fir::BoxCharType>())
+ if (mlir::isa<fir::BoxCharType>(memref.getType()))
return builder.create<fir::BoxCharLenOp>(loc, idxTy, memref);
- if (memref.getType().isa<fir::BoxType>())
+ if (mlir::isa<fir::BoxType>(memref.getType()))
return CharacterExprHelper(builder, loc).readLengthFromBox(memref);
fir::emitFatalError(loc, "memref has wrong type");
}
@@ -1457,7 +1458,7 @@ mlir::Value fir::factory::genLenOfCharacter(
mlir::Value fir::factory::createZeroValue(fir::FirOpBuilder &builder,
mlir::Location loc, mlir::Type type) {
mlir::Type i1 = builder.getIntegerType(1);
- if (type.isa<fir::LogicalType>() || type == i1)
+ if (mlir::isa<fir::LogicalType>(type) || type == i1)
return builder.createConvert(loc, type, builder.createBool(loc, false));
if (fir::isa_integer(type))
return builder.createIntegerConstant(loc, type, 0);
@@ -1507,7 +1508,7 @@ mlir::Value fir::factory::genMaxWithZero(fir::FirOpBuilder &builder,
mlir::Value zero = builder.createIntegerConstant(loc, value.getType(), 0);
if (mlir::Operation *definingOp = value.getDefiningOp())
if (auto cst = mlir::dyn_cast<mlir::arith::ConstantOp>(definingOp))
- if (auto intAttr = cst.getValue().dyn_cast<mlir::IntegerAttr>())
+ if (auto intAttr = mlir::dyn_cast<mlir::IntegerAttr>(cst.getValue()))
return intAttr.getInt() > 0 ? value : zero;
mlir::Value valueIsGreater = builder.create<mlir::arith::CmpIOp>(
loc, mlir::arith::CmpIPredicate::sgt, value, zero);
@@ -1519,8 +1520,8 @@ mlir::Value fir::factory::genCPtrOrCFunptrAddr(fir::FirOpBuilder &builder,
mlir::Location loc,
mlir::Value cPtr,
mlir::Type ty) {
- assert(ty.isa<fir::RecordType>());
- auto recTy = ty.dyn_cast<fir::RecordType>();
+ assert(mlir::isa<fir::RecordType>(ty));
+ auto recTy = mlir::dyn_cast<fir::RecordType>(ty);
assert(recTy.getTypeList().size() == 1);
auto fieldName = recTy.getTypeList()[0].first;
mlir::Type fieldTy = recTy.getTypeList()[0].second;
@@ -1582,7 +1583,7 @@ mlir::Value fir::factory::genCPtrOrCFunptrValue(fir::FirOpBuilder &builder,
mlir::Value fir::factory::createNullBoxProc(fir::FirOpBuilder &builder,
mlir::Location loc,
mlir::Type boxType) {
- auto boxTy{boxType.dyn_cast<fir::BoxProcType>()};
+ auto boxTy{mlir::dyn_cast<fir::BoxProcType>(boxType)};
if (!boxTy)
fir::emitFatalError(loc, "Procedure pointer must be of BoxProcType");
auto boxEleTy{fir::unwrapRefType(boxTy.getEleTy())};
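
Note from the locationToFilename/locationToLineNo and genMaxWithZero hunks above that the same free-function spelling covers locations and attributes, not just types. A short sketch (assumes mlir/IR/Location.h; helper name hypothetical):

    // dyn_cast on a Location: yields a null FileLineColLoc when loc is some
    // other location kind (fused, unknown, ...), so the guard is the test.
    unsigned lineOrZero(mlir::Location loc) {
      if (auto flc = mlir::dyn_cast<mlir::FileLineColLoc>(loc))
        return flc.getLine();
      return 0;
    }
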
diff --git a/flang/lib/Optimizer/Builder/HLFIRTools.cpp b/flang/lib/Optimizer/Builder/HLFIRTools.cpp
index db638ceb4070..44779427ab55 100644
--- a/flang/lib/Optimizer/Builder/HLFIRTools.cpp
+++ b/flang/lib/Optimizer/Builder/HLFIRTools.cpp
@@ -38,10 +38,10 @@ hlfir::getExplicitExtentsFromShape(mlir::Value shape,
} else if (mlir::dyn_cast_or_null<fir::ShiftOp>(shapeOp)) {
return {};
} else if (auto s = mlir::dyn_cast_or_null<hlfir::ShapeOfOp>(shapeOp)) {
- hlfir::ExprType expr = s.getExpr().getType().cast<hlfir::ExprType>();
+ hlfir::ExprType expr = mlir::cast<hlfir::ExprType>(s.getExpr().getType());
llvm::ArrayRef<int64_t> exprShape = expr.getShape();
mlir::Type indexTy = builder.getIndexType();
- fir::ShapeType shapeTy = shape.getType().cast<fir::ShapeType>();
+ fir::ShapeType shapeTy = mlir::cast<fir::ShapeType>(shape.getType());
result.reserve(shapeTy.getRank());
for (unsigned i = 0; i < shapeTy.getRank(); ++i) {
int64_t extent = exprShape[i];
@@ -99,7 +99,7 @@ genLboundsAndExtentsFromBox(mlir::Location loc, fir::FirOpBuilder &builder,
hlfir::Entity boxEntity,
llvm::SmallVectorImpl<mlir::Value> &lbounds,
llvm::SmallVectorImpl<mlir::Value> *extents) {
- assert(boxEntity.getType().isa<fir::BaseBoxType>() && "must be a box");
+ assert(mlir::isa<fir::BaseBoxType>(boxEntity.getType()) && "must be a box");
mlir::Type idxTy = builder.getIndexType();
const int rank = boxEntity.getRank();
for (int i = 0; i < rank; ++i) {
@@ -154,7 +154,7 @@ static mlir::Value genCharacterVariableLength(mlir::Location loc,
hlfir::Entity var) {
if (mlir::Value len = tryGettingNonDeferredCharLen(var))
return len;
- auto charType = var.getFortranElementType().cast<fir::CharacterType>();
+ auto charType = mlir::cast<fir::CharacterType>(var.getFortranElementType());
if (charType.hasConstantLen())
return builder.createIntegerConstant(loc, builder.getIndexType(),
charType.getLen());
@@ -172,7 +172,7 @@ static fir::CharBoxValue genUnboxChar(mlir::Location loc,
if (auto emboxChar = boxChar.getDefiningOp<fir::EmboxCharOp>())
return {emboxChar.getMemref(), emboxChar.getLen()};
mlir::Type refType = fir::ReferenceType::get(
- boxChar.getType().cast<fir::BoxCharType>().getEleTy());
+ mlir::cast<fir::BoxCharType>(boxChar.getType()).getEleTy());
auto unboxed = builder.create<fir::UnboxCharOp>(
loc, refType, builder.getIndexType(), boxChar);
mlir::Value addr = unboxed.getResult(0);
@@ -252,8 +252,8 @@ hlfir::genAssociateExpr(mlir::Location loc, fir::FirOpBuilder &builder,
// and the other static).
mlir::Type varEleTy = getFortranElementType(variableType);
mlir::Type valueEleTy = getFortranElementType(value.getType());
- if (varEleTy != valueEleTy && !(valueEleTy.isa<fir::CharacterType>() &&
- varEleTy.isa<fir::CharacterType>())) {
+ if (varEleTy != valueEleTy && !(mlir::isa<fir::CharacterType>(valueEleTy) &&
+ mlir::isa<fir::CharacterType>(varEleTy))) {
assert(value.isScalar() && fir::isa_trivial(value.getType()));
source = builder.createConvert(loc, fir::unwrapPassByRefType(variableType),
value);
@@ -278,9 +278,9 @@ mlir::Value hlfir::genVariableRawAddress(mlir::Location loc,
if (var.isMutableBox())
baseAddr = builder.create<fir::LoadOp>(loc, baseAddr);
// Get raw address.
- if (var.getType().isa<fir::BoxCharType>())
+ if (mlir::isa<fir::BoxCharType>(var.getType()))
baseAddr = genUnboxChar(loc, builder, var.getBase()).getAddr();
- if (baseAddr.getType().isa<fir::BaseBoxType>())
+ if (mlir::isa<fir::BaseBoxType>(baseAddr.getType()))
baseAddr = builder.create<fir::BoxAddrOp>(loc, baseAddr);
return baseAddr;
}
@@ -289,13 +289,13 @@ mlir::Value hlfir::genVariableBoxChar(mlir::Location loc,
fir::FirOpBuilder &builder,
hlfir::Entity var) {
assert(var.isVariable() && "only address of variables can be taken");
- if (var.getType().isa<fir::BoxCharType>())
+ if (mlir::isa<fir::BoxCharType>(var.getType()))
return var;
mlir::Value addr = genVariableRawAddress(loc, builder, var);
llvm::SmallVector<mlir::Value> lengths;
genLengthParameters(loc, builder, var, lengths);
assert(lengths.size() == 1);
- auto charType = var.getFortranElementType().cast<fir::CharacterType>();
+ auto charType = mlir::cast<fir::CharacterType>(var.getFortranElementType());
auto boxCharType =
fir::BoxCharType::get(builder.getContext(), charType.getFKind());
auto scalarAddr =
@@ -309,7 +309,7 @@ hlfir::Entity hlfir::genVariableBox(mlir::Location loc,
hlfir::Entity var) {
assert(var.isVariable() && "must be a variable");
var = hlfir::derefPointersAndAllocatables(loc, builder, var);
- if (var.getType().isa<fir::BaseBoxType>())
+ if (mlir::isa<fir::BaseBoxType>(var.getType()))
return var;
// Note: if the var is not a fir.box/fir.class at that point, it has default
// lower bounds and is not polymorphic.
@@ -317,11 +317,11 @@ hlfir::Entity hlfir::genVariableBox(mlir::Location loc,
var.isArray() ? hlfir::genShape(loc, builder, var) : mlir::Value{};
llvm::SmallVector<mlir::Value> typeParams;
auto maybeCharType =
- var.getFortranElementType().dyn_cast<fir::CharacterType>();
+ mlir::dyn_cast<fir::CharacterType>(var.getFortranElementType());
if (!maybeCharType || maybeCharType.hasDynamicLen())
hlfir::genLengthParameters(loc, builder, var, typeParams);
mlir::Value addr = var.getBase();
- if (var.getType().isa<fir::BoxCharType>())
+ if (mlir::isa<fir::BoxCharType>(var.getType()))
addr = genVariableRawAddress(loc, builder, var);
mlir::Type boxType = fir::BoxType::get(var.getElementOrSequenceType());
auto embox =
@@ -348,7 +348,7 @@ hlfir::Entity hlfir::getElementAt(mlir::Location loc,
return entity;
llvm::SmallVector<mlir::Value> lenParams;
genLengthParameters(loc, builder, entity, lenParams);
- if (entity.getType().isa<hlfir::ExprType>())
+ if (mlir::isa<hlfir::ExprType>(entity.getType()))
return hlfir::Entity{builder.create<hlfir::ApplyOp>(
loc, entity, oneBasedIndices, lenParams)};
// Build hlfir.designate. The lower bounds may need to be added to
@@ -394,7 +394,7 @@ static mlir::Value genUBound(mlir::Location loc, fir::FirOpBuilder &builder,
llvm::SmallVector<std::pair<mlir::Value, mlir::Value>>
hlfir::genBounds(mlir::Location loc, fir::FirOpBuilder &builder,
Entity entity) {
- if (entity.getType().isa<hlfir::ExprType>())
+ if (mlir::isa<hlfir::ExprType>(entity.getType()))
TODO(loc, "bounds of expressions in hlfir");
auto [exv, cleanup] = translateToExtendedValue(loc, builder, entity);
assert(!cleanup && "translation of entity should not yield cleanup");
@@ -415,8 +415,8 @@ hlfir::genBounds(mlir::Location loc, fir::FirOpBuilder &builder,
llvm::SmallVector<std::pair<mlir::Value, mlir::Value>>
hlfir::genBounds(mlir::Location loc, fir::FirOpBuilder &builder,
mlir::Value shape) {
- assert((shape.getType().isa<fir::ShapeShiftType>() ||
- shape.getType().isa<fir::ShapeType>()) &&
+ assert((mlir::isa<fir::ShapeShiftType>(shape.getType()) ||
+ mlir::isa<fir::ShapeType>(shape.getType())) &&
"shape must contain extents");
auto extents = hlfir::getExplicitExtentsFromShape(shape, builder);
auto lowers = getExplicitLboundsFromShape(shape);
@@ -474,7 +474,7 @@ static mlir::Value computeVariableExtent(mlir::Location loc,
if (typeExtent != fir::SequenceType::getUnknownExtent())
return builder.createIntegerConstant(loc, idxTy, typeExtent);
}
- assert(variable.getType().isa<fir::BaseBoxType>() &&
+ assert(mlir::isa<fir::BaseBoxType>(variable.getType()) &&
"array variable with dynamic extent must be boxed");
mlir::Value dimVal = builder.createIntegerConstant(loc, idxTy, dim);
auto dimInfo = builder.create<fir::BoxDimsOp>(loc, idxTy, idxTy, idxTy,
@@ -496,9 +496,8 @@ llvm::SmallVector<mlir::Value> getVariableExtents(mlir::Location loc,
variable = hlfir::derefPointersAndAllocatables(loc, builder, variable);
// Use the type shape information, and/or the fir.box/fir.class shape
// information if any extents are not static.
- fir::SequenceType seqTy =
- hlfir::getFortranElementOrSequenceType(variable.getType())
- .cast<fir::SequenceType>();
+ fir::SequenceType seqTy = mlir::cast<fir::SequenceType>(
+ hlfir::getFortranElementOrSequenceType(variable.getType()));
unsigned rank = seqTy.getShape().size();
for (unsigned dim = 0; dim < rank; ++dim)
extents.push_back(
@@ -507,7 +506,7 @@ llvm::SmallVector<mlir::Value> getVariableExtents(mlir::Location loc,
}
static mlir::Value tryRetrievingShapeOrShift(hlfir::Entity entity) {
- if (entity.getType().isa<hlfir::ExprType>()) {
+ if (mlir::isa<hlfir::ExprType>(entity.getType())) {
if (auto elemental = entity.getDefiningOp<hlfir::ElementalOp>())
return elemental.getShape();
return mlir::Value{};
@@ -523,13 +522,13 @@ mlir::Value hlfir::genShape(mlir::Location loc, fir::FirOpBuilder &builder,
entity = followShapeInducingSource(entity);
assert(entity && "what?");
if (auto shape = tryRetrievingShapeOrShift(entity)) {
- if (shape.getType().isa<fir::ShapeType>())
+ if (mlir::isa<fir::ShapeType>(shape.getType()))
return shape;
- if (shape.getType().isa<fir::ShapeShiftType>())
+ if (mlir::isa<fir::ShapeShiftType>(shape.getType()))
if (auto s = shape.getDefiningOp<fir::ShapeShiftOp>())
return builder.create<fir::ShapeOp>(loc, s.getExtents());
}
- if (entity.getType().isa<hlfir::ExprType>())
+ if (mlir::isa<hlfir::ExprType>(entity.getType()))
return builder.create<hlfir::ShapeOfOp>(loc, entity.getBase());
// There is no shape lying around for this entity. Retrieve the extents and
// build a new fir.shape.
@@ -563,9 +562,8 @@ mlir::Value hlfir::genExtent(mlir::Location loc, fir::FirOpBuilder &builder,
entity = hlfir::derefPointersAndAllocatables(loc, builder, entity);
// Use the type shape information, and/or the fir.box/fir.class shape
// information if any extents are not static.
- fir::SequenceType seqTy =
- hlfir::getFortranElementOrSequenceType(entity.getType())
- .cast<fir::SequenceType>();
+ fir::SequenceType seqTy = mlir::cast<fir::SequenceType>(
+ hlfir::getFortranElementOrSequenceType(entity.getType()));
return computeVariableExtent(loc, builder, entity, seqTy, dim);
}
TODO(loc, "get extent from HLFIR expr without producer holding the shape");
@@ -584,7 +582,7 @@ mlir::Value hlfir::genLBound(mlir::Location loc, fir::FirOpBuilder &builder,
}
if (entity.isMutableBox())
entity = hlfir::derefPointersAndAllocatables(loc, builder, entity);
- assert(entity.getType().isa<fir::BaseBoxType>() && "must be a box");
+ assert(mlir::isa<fir::BaseBoxType>(entity.getType()) && "must be a box");
mlir::Type idxTy = builder.getIndexType();
mlir::Value dimVal = builder.createIntegerConstant(loc, idxTy, dim);
auto dimInfo =
@@ -597,7 +595,7 @@ void hlfir::genLengthParameters(mlir::Location loc, fir::FirOpBuilder &builder,
llvm::SmallVectorImpl<mlir::Value> &result) {
if (!entity.hasLengthParameters())
return;
- if (entity.getType().isa<hlfir::ExprType>()) {
+ if (mlir::isa<hlfir::ExprType>(entity.getType())) {
mlir::Value expr = entity;
if (auto reassoc = expr.getDefiningOp<hlfir::NoReassocOp>())
expr = reassoc.getVal();
@@ -654,8 +652,8 @@ static mlir::Value asEmboxShape(mlir::Location loc, fir::FirOpBuilder &builder,
// fir.shape_shift) since this information is already in the input fir.box,
// it only accepts fir.shift because local lower bounds may not be reflected
// in the fir.box.
- if (fir::getBase(exv).getType().isa<fir::BaseBoxType>() &&
- !shape.getType().isa<fir::ShiftType>())
+ if (mlir::isa<fir::BaseBoxType>(fir::getBase(exv).getType()) &&
+ !mlir::isa<fir::ShiftType>(shape.getType()))
return builder.createShape(loc, exv);
return shape;
}
@@ -686,7 +684,7 @@ hlfir::Entity hlfir::derefPointersAndAllocatables(mlir::Location loc,
if (!entity.isPolymorphic() && !entity.hasLengthParameters())
return hlfir::Entity{builder.create<fir::BoxAddrOp>(loc, boxLoad)};
mlir::Type elementType = boxLoad.getFortranElementType();
- if (auto charType = elementType.dyn_cast<fir::CharacterType>()) {
+ if (auto charType = mlir::dyn_cast<fir::CharacterType>(elementType)) {
mlir::Value base = builder.create<fir::BoxAddrOp>(loc, boxLoad);
if (charType.hasConstantLen())
return hlfir::Entity{base};
@@ -716,7 +714,7 @@ mlir::Type hlfir::getVariableElementType(hlfir::Entity variable) {
mlir::Type eleTy = variable.getFortranElementType();
if (variable.isPolymorphic())
return fir::ClassType::get(eleTy);
- if (auto charType = eleTy.dyn_cast<fir::CharacterType>()) {
+ if (auto charType = mlir::dyn_cast<fir::CharacterType>(eleTy)) {
if (charType.hasDynamicLen())
return fir::BoxCharType::get(charType.getContext(), charType.getFKind());
} else if (fir::isRecordWithTypeParameters(eleTy)) {
@@ -737,7 +735,7 @@ mlir::Type hlfir::getEntityElementType(hlfir::Entity entity) {
static hlfir::ExprType getArrayExprType(mlir::Type elementType,
mlir::Value shape, bool isPolymorphic) {
- unsigned rank = shape.getType().cast<fir::ShapeType>().getRank();
+ unsigned rank = mlir::cast<fir::ShapeType>(shape.getType()).getRank();
hlfir::ExprType::Shape typeShape(rank, hlfir::ExprType::getUnknownExtent());
if (auto shapeOp = shape.getDefiningOp<fir::ShapeOp>())
for (auto extent : llvm::enumerate(shapeOp.getExtents()))
@@ -859,7 +857,7 @@ translateVariableToExtendedValue(mlir::Location loc, fir::FirOpBuilder &builder,
return fir::MutableBoxValue(base, getExplicitTypeParams(variable),
fir::MutableProperties{});
- if (base.getType().isa<fir::BaseBoxType>()) {
+ if (mlir::isa<fir::BaseBoxType>(base.getType())) {
if (!variable.isSimplyContiguous() || variable.isPolymorphic() ||
variable.isDerivedWithLengthParameters() || variable.isOptional()) {
llvm::SmallVector<mlir::Value> nonDefaultLbounds =
@@ -874,7 +872,7 @@ translateVariableToExtendedValue(mlir::Location loc, fir::FirOpBuilder &builder,
if (variable.isScalar()) {
if (variable.isCharacter()) {
- if (base.getType().isa<fir::BoxCharType>())
+ if (mlir::isa<fir::BoxCharType>(base.getType()))
return genUnboxChar(loc, builder, base);
mlir::Value len = genCharacterVariableLength(loc, builder, variable);
return fir::CharBoxValue{base, len};
@@ -883,7 +881,7 @@ translateVariableToExtendedValue(mlir::Location loc, fir::FirOpBuilder &builder,
}
llvm::SmallVector<mlir::Value> extents;
llvm::SmallVector<mlir::Value> nonDefaultLbounds;
- if (variable.getType().isa<fir::BaseBoxType>() &&
+ if (mlir::isa<fir::BaseBoxType>(variable.getType()) &&
!variable.getIfVariableInterface()) {
// This special case avoids generating two sets of identical
// fir.box_dim to get both the lower bounds and extents.
@@ -923,7 +921,7 @@ hlfir::translateToExtendedValue(mlir::Location loc, fir::FirOpBuilder &builder,
return {static_cast<mlir::Value>(entity), std::nullopt};
}
- if (entity.getType().isa<hlfir::ExprType>()) {
+ if (mlir::isa<hlfir::ExprType>(entity.getType())) {
mlir::NamedAttribute byRefAttr = fir::getAdaptToByRefAttr(builder);
hlfir::AssociateOp associate = hlfir::genAssociateExpr(
loc, builder, entity, entity.getType(), "", byRefAttr);
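
Two idioms dominate the hunks above: mlir::dyn_cast<> in an if-initializer, where the guard doubles as the type test, and mlir::isa<> inside asserts to state preconditions. A condensed sketch combining the first idiom with the accessors used above (helper name hypothetical):

    #include "flang/Optimizer/Builder/FIRBuilder.h"

    // Returns the compile-time character length if eleTy is a character
    // type with constant LEN, otherwise a null Value.
    mlir::Value constCharLen(fir::FirOpBuilder &builder, mlir::Location loc,
                             mlir::Type eleTy) {
      if (auto charTy = mlir::dyn_cast<fir::CharacterType>(eleTy))
        if (charTy.hasConstantLen())
          return builder.createIntegerConstant(loc, builder.getIndexType(),
                                               charTy.getLen());
      return {};
    }
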
diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
index e28d14cd318d..9d72e76e2369 100644
--- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
@@ -786,7 +786,7 @@ mlir::Value genLibSplitComplexArgsCall(fir::FirOpBuilder &builder,
auto getSplitComplexArgsType = [&builder, &args]() -> mlir::FunctionType {
mlir::Type ctype = args[0].getType();
- auto fKind = ctype.cast<fir::ComplexType>().getFKind();
+ auto fKind = mlir::cast<fir::ComplexType>(ctype).getFKind();
mlir::Type ftype;
if (fKind == 2)
@@ -894,8 +894,8 @@ mlir::Value genComplexMathOp(fir::FirOpBuilder &builder, mlir::Location loc,
LLVM_DEBUG(llvm::dbgs() << "Generating '" << mathLibFuncName
<< "' operation with type ";
mathLibFuncType.dump(); llvm::dbgs() << "\n");
- auto type = mathLibFuncType.getInput(0).cast<fir::ComplexType>();
- auto kind = type.getElementType().cast<fir::RealType>().getFKind();
+ auto type = mlir::cast<fir::ComplexType>(mathLibFuncType.getInput(0));
+ auto kind = mlir::cast<fir::RealType>(type.getElementType()).getFKind();
auto realTy = builder.getRealType(kind);
auto mComplexTy = mlir::ComplexType::get(realTy);
@@ -1394,14 +1394,14 @@ private:
// Floating point can be mlir::FloatType or fir::real
static unsigned getFloatingPointWidth(mlir::Type t) {
- if (auto f{t.dyn_cast<mlir::FloatType>()})
+ if (auto f{mlir::dyn_cast<mlir::FloatType>(t)})
return f.getWidth();
// FIXME: Get width another way for fir.real/complex
// - use fir/KindMapping.h and llvm::Type
// - or use evaluate/type.h
- if (auto r{t.dyn_cast<fir::RealType>()})
+ if (auto r{mlir::dyn_cast<fir::RealType>(t)})
return r.getFKind() * 4;
- if (auto cplx{t.dyn_cast<fir::ComplexType>()})
+ if (auto cplx{mlir::dyn_cast<fir::ComplexType>(t)})
return cplx.getFKind() * 4;
llvm_unreachable("not a floating-point type");
}
@@ -1410,8 +1410,8 @@ private:
if (from == to)
return Conversion::None;
- if (auto fromIntTy{from.dyn_cast<mlir::IntegerType>()}) {
- if (auto toIntTy{to.dyn_cast<mlir::IntegerType>()}) {
+ if (auto fromIntTy{mlir::dyn_cast<mlir::IntegerType>(from)}) {
+ if (auto toIntTy{mlir::dyn_cast<mlir::IntegerType>(to)}) {
return fromIntTy.getWidth() > toIntTy.getWidth() ? Conversion::Narrow
: Conversion::Extend;
}
@@ -1423,8 +1423,8 @@ private:
: Conversion::Extend;
}
- if (auto fromCplxTy{from.dyn_cast<fir::ComplexType>()}) {
- if (auto toCplxTy{to.dyn_cast<fir::ComplexType>()}) {
+ if (auto fromCplxTy{mlir::dyn_cast<fir::ComplexType>(from)}) {
+ if (auto toCplxTy{mlir::dyn_cast<fir::ComplexType>(to)}) {
return getFloatingPointWidth(fromCplxTy) >
getFloatingPointWidth(toCplxTy)
? Conversion::Narrow
@@ -1550,10 +1550,10 @@ fir::ExtendedValue toExtendedValue(mlir::Value val, fir::FirOpBuilder &builder,
if (charHelper.isCharacterScalar(type))
return charHelper.toExtendedValue(val);
- if (auto refType = type.dyn_cast<fir::ReferenceType>())
+ if (auto refType = mlir::dyn_cast<fir::ReferenceType>(type))
type = refType.getEleTy();
- if (auto arrayType = type.dyn_cast<fir::SequenceType>()) {
+ if (auto arrayType = mlir::dyn_cast<fir::SequenceType>(type)) {
type = arrayType.getEleTy();
for (fir::SequenceType::Extent extent : arrayType.getShape()) {
if (extent == fir::SequenceType::getUnknownExtent())
@@ -1566,7 +1566,8 @@ fir::ExtendedValue toExtendedValue(mlir::Value val, fir::FirOpBuilder &builder,
// have been used in the interface).
if (extents.size() + 1 < arrayType.getShape().size())
mlir::emitError(loc, "cannot retrieve array extents from type");
- } else if (type.isa<fir::BoxType>() || type.isa<fir::RecordType>()) {
+ } else if (mlir::isa<fir::BoxType>(type) ||
+ mlir::isa<fir::RecordType>(type)) {
fir::emitFatalError(loc, "not yet implemented: descriptor or derived type");
}
@@ -1580,10 +1581,10 @@ mlir::Value toValue(const fir::ExtendedValue &val, fir::FirOpBuilder &builder,
if (const fir::CharBoxValue *charBox = val.getCharBox()) {
mlir::Value buffer = charBox->getBuffer();
auto buffTy = buffer.getType();
- if (buffTy.isa<mlir::FunctionType>())
+ if (mlir::isa<mlir::FunctionType>(buffTy))
fir::emitFatalError(
loc, "A character's buffer type cannot be a function type.");
- if (buffTy.isa<fir::BoxCharType>())
+ if (mlir::isa<fir::BoxCharType>(buffTy))
return buffer;
return fir::factory::CharacterExprHelper{builder, loc}.createEmboxChar(
buffer, charBox->getLen());
@@ -1827,27 +1828,27 @@ IntrinsicLibrary::invokeGenerator(SubroutineGenerator generator,
/// Note: mlir has Type::dump(ostream) methods but it may add "!" that is not
/// suitable for function names.
static std::string typeToString(mlir::Type t) {
- if (auto refT{t.dyn_cast<fir::ReferenceType>()})
+ if (auto refT{mlir::dyn_cast<fir::ReferenceType>(t)})
return "ref_" + typeToString(refT.getEleTy());
- if (auto i{t.dyn_cast<mlir::IntegerType>()}) {
+ if (auto i{mlir::dyn_cast<mlir::IntegerType>(t)}) {
return "i" + std::to_string(i.getWidth());
}
- if (auto cplx{t.dyn_cast<fir::ComplexType>()}) {
+ if (auto cplx{mlir::dyn_cast<fir::ComplexType>(t)}) {
return "z" + std::to_string(cplx.getFKind());
}
- if (auto real{t.dyn_cast<fir::RealType>()}) {
+ if (auto real{mlir::dyn_cast<fir::RealType>(t)}) {
return "r" + std::to_string(real.getFKind());
}
- if (auto f{t.dyn_cast<mlir::FloatType>()}) {
+ if (auto f{mlir::dyn_cast<mlir::FloatType>(t)}) {
return "f" + std::to_string(f.getWidth());
}
- if (auto logical{t.dyn_cast<fir::LogicalType>()}) {
+ if (auto logical{mlir::dyn_cast<fir::LogicalType>(t)}) {
return "l" + std::to_string(logical.getFKind());
}
- if (auto character{t.dyn_cast<fir::CharacterType>()}) {
+ if (auto character{mlir::dyn_cast<fir::CharacterType>(t)}) {
return "c" + std::to_string(character.getFKind());
}
- if (auto boxCharacter{t.dyn_cast<fir::BoxCharType>()}) {
+ if (auto boxCharacter{mlir::dyn_cast<fir::BoxCharType>(t)}) {
return "bc" + std::to_string(boxCharacter.getEleTy().getFKind());
}
llvm_unreachable("no mangling for type");
@@ -1907,7 +1908,7 @@ mlir::func::FuncOp IntrinsicLibrary::getWrapper(GeneratorType generator,
mlir::Location localLoc = localBuilder->getUnknownLoc();
llvm::SmallVector<mlir::Value> localArguments;
for (mlir::BlockArgument bArg : function.front().getArguments()) {
- auto refType = bArg.getType().dyn_cast<fir::ReferenceType>();
+ auto refType = mlir::dyn_cast<fir::ReferenceType>(bArg.getType());
if (loadRefArguments && refType) {
auto loaded = localBuilder->create<fir::LoadOp>(localLoc, bArg);
localArguments.push_back(loaded);
@@ -2060,7 +2061,7 @@ mlir::SymbolRefAttr IntrinsicLibrary::getUnrestrictedIntrinsicSymbolRefAttr(
if (!funcOp) {
llvm::SmallVector<mlir::Type> argTypes;
for (mlir::Type type : signature.getInputs()) {
- if (auto refType = type.dyn_cast<fir::ReferenceType>())
+ if (auto refType = mlir::dyn_cast<fir::ReferenceType>(type))
argTypes.push_back(refType.getEleTy());
else
argTypes.push_back(type);
@@ -2145,7 +2146,7 @@ mlir::Value IntrinsicLibrary::genAbs(mlir::Type resultType,
// math::AbsFOp but it does not support all fir floating point types.
return genRuntimeCall("abs", resultType, args);
}
- if (auto intType = type.dyn_cast<mlir::IntegerType>()) {
+ if (auto intType = mlir::dyn_cast<mlir::IntegerType>(type)) {
// At the time of this implementation there is no abs op in mlir.
// So, implement abs here without branching.
mlir::Value shift =
@@ -2379,8 +2380,8 @@ IntrinsicLibrary::genAssociated(mlir::Type resultType,
llvm::ArrayRef<fir::ExtendedValue> args) {
assert(args.size() == 2);
mlir::Type ptrTy = fir::getBase(args[0]).getType();
- if (ptrTy &&
- (fir::isBoxProcAddressType(ptrTy) || ptrTy.isa<fir::BoxProcType>())) {
+ if (ptrTy && (fir::isBoxProcAddressType(ptrTy) ||
+ mlir::isa<fir::BoxProcType>(ptrTy))) {
mlir::Value pointerBoxProc =
fir::isBoxProcAddressType(ptrTy)
? builder.create<fir::LoadOp>(loc, fir::getBase(args[0]))
@@ -2392,7 +2393,7 @@ IntrinsicLibrary::genAssociated(mlir::Type resultType,
mlir::Value target = fir::getBase(args[1]);
if (fir::isBoxProcAddressType(target.getType()))
target = builder.create<fir::LoadOp>(loc, target);
- if (target.getType().isa<fir::BoxProcType>())
+ if (mlir::isa<fir::BoxProcType>(target.getType()))
target = builder.create<fir::BoxAddrOp>(loc, target);
mlir::Type intPtrTy = builder.getIntPtrType();
mlir::Value pointerInt =
@@ -2649,7 +2650,7 @@ static mlir::Value getAddrFromBox(fir::FirOpBuilder &builder,
mlir::Value argValue = fir::getBase(arg);
mlir::Value addr{nullptr};
if (isFunc) {
- auto funcTy = argValue.getType().cast<fir::BoxProcType>().getEleTy();
+ auto funcTy = mlir::cast<fir::BoxProcType>(argValue.getType()).getEleTy();
addr = builder.create<fir::BoxAddrOp>(loc, funcTy, argValue);
} else {
const auto *box = arg.getBoxOf<fir::BoxValue>();
@@ -3029,7 +3030,7 @@ void IntrinsicLibrary::genDateAndTime(llvm::ArrayRef<fir::ExtendedValue> args) {
mlir::Value IntrinsicLibrary::genDim(mlir::Type resultType,
llvm::ArrayRef<mlir::Value> args) {
assert(args.size() == 2);
- if (resultType.isa<mlir::IntegerType>()) {
+ if (mlir::isa<mlir::IntegerType>(resultType)) {
mlir::Value zero = builder.createIntegerConstant(loc, resultType, 0);
auto diff = builder.create<mlir::arith::SubIOp>(loc, args[0], args[1]);
auto cmp = builder.create<mlir::arith::CmpIOp>(
@@ -3574,7 +3575,7 @@ IntrinsicLibrary::genReduction(FN func, FD funcDim, llvm::StringRef errMsg,
if (absentDim || rank == 1) {
mlir::Type ty = array.getType();
mlir::Type arrTy = fir::dyn_cast_ptrOrBoxEleTy(ty);
- auto eleTy = arrTy.cast<fir::SequenceType>().getEleTy();
+ auto eleTy = mlir::cast<fir::SequenceType>(arrTy).getEleTy();
if (fir::isa_complex(eleTy)) {
mlir::Value result = builder.createTemporary(loc, eleTy);
func(builder, loc, array, mask, result);
@@ -3646,7 +3647,7 @@ mlir::Value IntrinsicLibrary::genIbits(mlir::Type resultType,
mlir::Value pos = builder.createConvert(loc, resultType, args[1]);
mlir::Value len = builder.createConvert(loc, resultType, args[2]);
mlir::Value bitSize = builder.createIntegerConstant(
- loc, resultType, resultType.cast<mlir::IntegerType>().getWidth());
+ loc, resultType, mlir::cast<mlir::IntegerType>(resultType).getWidth());
auto shiftCount = builder.create<mlir::arith::SubIOp>(loc, bitSize, len);
mlir::Value zero = builder.createIntegerConstant(loc, resultType, 0);
mlir::Value ones = builder.createAllOnesInteger(loc, resultType);
@@ -3686,7 +3687,7 @@ IntrinsicLibrary::genIchar(mlir::Type resultType,
mlir::Value buffer = charBox->getBuffer();
mlir::Type bufferTy = buffer.getType();
mlir::Value charVal;
- if (auto charTy = bufferTy.dyn_cast<fir::CharacterType>()) {
+ if (auto charTy = mlir::dyn_cast<fir::CharacterType>(bufferTy)) {
assert(charTy.singleton());
charVal = buffer;
} else {
@@ -3759,7 +3760,7 @@ void IntrinsicLibrary::genRaiseExcept(int except, mlir::Value cond) {
static std::pair<mlir::Value, mlir::Type>
getFieldRef(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value rec) {
auto recType =
- fir::unwrapPassByRefType(rec.getType()).dyn_cast<fir::RecordType>();
+ mlir::dyn_cast<fir::RecordType>(fir::unwrapPassByRefType(rec.getType()));
assert(recType.getTypeList().size() == 1 && "expected exactly one component");
auto [fieldName, fieldTy] = recType.getTypeList().front();
mlir::Value field = builder.create<fir::FieldIndexOp>(
@@ -3808,7 +3809,7 @@ mlir::Value IntrinsicLibrary::genIeeeClass(mlir::Type resultType,
assert(args.size() == 1);
mlir::Value realVal = args[0];
- mlir::FloatType realType = realVal.getType().dyn_cast<mlir::FloatType>();
+ mlir::FloatType realType = mlir::dyn_cast<mlir::FloatType>(realVal.getType());
const unsigned intWidth = realType.getWidth();
mlir::Type intType = builder.getIntegerType(intWidth);
mlir::Value intVal =
@@ -4056,8 +4057,10 @@ IntrinsicLibrary::genIeeeCopySign(mlir::Type resultType,
assert(args.size() == 2);
mlir::Value xRealVal = args[0];
mlir::Value yRealVal = args[1];
- mlir::FloatType xRealType = xRealVal.getType().dyn_cast<mlir::FloatType>();
- mlir::FloatType yRealType = yRealVal.getType().dyn_cast<mlir::FloatType>();
+ mlir::FloatType xRealType =
+ mlir::dyn_cast<mlir::FloatType>(xRealVal.getType());
+ mlir::FloatType yRealType =
+ mlir::dyn_cast<mlir::FloatType>(yRealVal.getType());
if (yRealType == mlir::FloatType::getBF16(builder.getContext())) {
// Workaround: CopySignOp and BitcastOp don't work for kind 3 arg Y.
@@ -4106,7 +4109,7 @@ void IntrinsicLibrary::genIeeeGetFlag(llvm::ArrayRef<fir::ExtendedValue> args) {
mlir::Value flag = fir::getBase(args[0]);
mlir::Value flagValue = fir::getBase(args[1]);
mlir::Type resultTy =
- flagValue.getType().dyn_cast<fir::ReferenceType>().getEleTy();
+ mlir::dyn_cast<fir::ReferenceType>(flagValue.getType()).getEleTy();
mlir::Type i32Ty = builder.getIntegerType(32);
mlir::Value zero = builder.createIntegerConstant(loc, i32Ty, 0);
auto [fieldRef, ignore] = getFieldRef(builder, loc, flag);
@@ -4130,7 +4133,7 @@ void IntrinsicLibrary::genIeeeGetHaltingMode(
mlir::Value flag = fir::getBase(args[0]);
mlir::Value halting = fir::getBase(args[1]);
mlir::Type resultTy =
- halting.getType().dyn_cast<fir::ReferenceType>().getEleTy();
+ mlir::dyn_cast<fir::ReferenceType>(halting.getType()).getEleTy();
mlir::Type i32Ty = builder.getIntegerType(32);
mlir::Value zero = builder.createIntegerConstant(loc, i32Ty, 0);
auto [fieldRef, ignore] = getFieldRef(builder, loc, flag);
@@ -4248,7 +4251,7 @@ mlir::Value IntrinsicLibrary::genIeeeLogb(mlir::Type resultType,
// : ieee_copy_sign(X, 1.0) // +infinity or NaN
assert(args.size() == 1);
mlir::Value realVal = args[0];
- mlir::FloatType realType = realVal.getType().dyn_cast<mlir::FloatType>();
+ mlir::FloatType realType = mlir::dyn_cast<mlir::FloatType>(realVal.getType());
int bitWidth = realType.getWidth();
mlir::Type intType = builder.getIntegerType(realType.getWidth());
mlir::Value intVal =
@@ -4545,7 +4548,7 @@ mlir::Value IntrinsicLibrary::genIeeeSignbit(mlir::Type resultType,
// Check if the sign bit of arg X is set.
assert(args.size() == 1);
mlir::Value realVal = args[0];
- mlir::FloatType realType = realVal.getType().dyn_cast<mlir::FloatType>();
+ mlir::FloatType realType = mlir::dyn_cast<mlir::FloatType>(realVal.getType());
int bitWidth = realType.getWidth();
if (realType == mlir::FloatType::getBF16(builder.getContext())) {
// Workaround: can't bitcast or convert real(3) to integer(2) or real(2).
@@ -4642,7 +4645,7 @@ mlir::Value IntrinsicLibrary::genIeeeValue(mlir::Type resultType,
// A compiler generated call has one argument:
// - arg[0] is an index constant
assert(args.size() == 1 || args.size() == 2);
- mlir::FloatType realType = resultType.dyn_cast<mlir::FloatType>();
+ mlir::FloatType realType = mlir::dyn_cast<mlir::FloatType>(resultType);
int bitWidth = realType.getWidth();
mlir::Type intType = builder.getIntegerType(bitWidth);
mlir::Type valueTy = bitWidth <= 64 ? intType : builder.getIntegerType(64);
@@ -4884,7 +4887,7 @@ mlir::Value IntrinsicLibrary::genIshft(mlir::Type resultType,
// : I << abs(SHIFT)
assert(args.size() == 2);
mlir::Value bitSize = builder.createIntegerConstant(
- loc, resultType, resultType.cast<mlir::IntegerType>().getWidth());
+ loc, resultType, mlir::cast<mlir::IntegerType>(resultType).getWidth());
mlir::Value zero = builder.createIntegerConstant(loc, resultType, 0);
mlir::Value shift = builder.createConvert(loc, resultType, args[1]);
mlir::Value absShift = genAbs(resultType, {shift});
@@ -4920,7 +4923,7 @@ mlir::Value IntrinsicLibrary::genIshftc(mlir::Type resultType,
// Return: SHIFT == 0 || SIZE == abs(SHIFT) ? I : (unchanged | left | right)
assert(args.size() == 3);
mlir::Value bitSize = builder.createIntegerConstant(
- loc, resultType, resultType.cast<mlir::IntegerType>().getWidth());
+ loc, resultType, mlir::cast<mlir::IntegerType>(resultType).getWidth());
mlir::Value I = args[0];
mlir::Value shift = builder.createConvert(loc, resultType, args[1]);
mlir::Value size =
@@ -5027,7 +5030,7 @@ IntrinsicLibrary::genLoc(mlir::Type resultType,
mlir::Value box = fir::getBase(args[0]);
assert(fir::isa_box_type(box.getType()) &&
"argument must have been lowered to box type");
- bool isFunc = box.getType().isa<fir::BoxProcType>();
+ bool isFunc = mlir::isa<fir::BoxProcType>(box.getType());
if (!isOptional(box)) {
mlir::Value argAddr = getAddrFromBox(builder, loc, args[0], isFunc);
return builder.createConvert(loc, resultType, argAddr);
@@ -5156,7 +5159,7 @@ IntrinsicLibrary::genMerge(mlir::Type,
auto convertToStaticType = [&](mlir::Value polymorphic,
mlir::Value other) -> mlir::Value {
mlir::Type otherType = other.getType();
- if (otherType.isa<fir::BaseBoxType>())
+ if (mlir::isa<fir::BaseBoxType>(otherType))
return builder.create<fir::ReboxOp>(loc, otherType, polymorphic,
/*shape*/ mlir::Value{},
/*slice=*/mlir::Value{});
@@ -5209,7 +5212,7 @@ mlir::Value IntrinsicLibrary::genMergeBits(mlir::Type resultType,
mlir::Value IntrinsicLibrary::genMod(mlir::Type resultType,
llvm::ArrayRef<mlir::Value> args) {
assert(args.size() == 2);
- if (resultType.isa<mlir::IntegerType>())
+ if (mlir::isa<mlir::IntegerType>(resultType))
return builder.create<mlir::arith::RemSIOp>(loc, args[0], args[1]);
// Use runtime.
@@ -5231,7 +5234,7 @@ mlir::Value IntrinsicLibrary::genModulo(mlir::Type resultType,
// - Otherwise, when A/P < 0 and MOD(A,P) !=0, then MODULO(A, P) =
// A-FLOOR(A/P)*P = A-(INT(A/P)-1)*P = A-INT(A/P)*P+P = MOD(A,P)+P
// Note that A/P < 0 if and only if A and P signs are different.
- if (resultType.isa<mlir::IntegerType>()) {
+ if (mlir::isa<mlir::IntegerType>(resultType)) {
auto remainder =
builder.create<mlir::arith::RemSIOp>(loc, args[0], args[1]);
auto argXor = builder.create<mlir::arith::XOrIOp>(loc, args[0], args[1]);
@@ -5344,7 +5347,7 @@ void IntrinsicLibrary::genMvbits(llvm::ArrayRef<fir::ExtendedValue> args) {
mlir::Value zero = builder.createIntegerConstant(loc, resultType, 0);
mlir::Value ones = builder.createAllOnesInteger(loc, resultType);
mlir::Value bitSize = builder.createIntegerConstant(
- loc, resultType, resultType.cast<mlir::IntegerType>().getWidth());
+ loc, resultType, mlir::cast<mlir::IntegerType>(resultType).getWidth());
auto shiftCount = builder.create<mlir::arith::SubIOp>(loc, bitSize, len);
auto mask = builder.create<mlir::arith::ShRUIOp>(loc, ones, shiftCount);
auto unchangedTmp1 = builder.create<mlir::arith::ShLIOp>(loc, mask, topos);
@@ -5628,7 +5631,7 @@ IntrinsicLibrary::genReshape(mlir::Type resultType,
assert(fir::BoxValue(shape).rank() == 1);
mlir::Type shapeTy = shape.getType();
mlir::Type shapeArrTy = fir::dyn_cast_ptrOrBoxEleTy(shapeTy);
- auto resultRank = shapeArrTy.cast<fir::SequenceType>().getShape()[0];
+ auto resultRank = mlir::cast<fir::SequenceType>(shapeArrTy).getShape()[0];
if (resultRank == fir::SequenceType::getUnknownExtent())
TODO(loc, "intrinsic: reshape requires computing rank of result");
@@ -5921,7 +5924,7 @@ void IntrinsicLibrary::genSignalSubroutine(
mlir::Value IntrinsicLibrary::genSign(mlir::Type resultType,
llvm::ArrayRef<mlir::Value> args) {
assert(args.size() == 2);
- if (resultType.isa<mlir::IntegerType>()) {
+ if (mlir::isa<mlir::IntegerType>(resultType)) {
mlir::Value abs = genAbs(resultType, {args[0]});
mlir::Value zero = builder.createIntegerConstant(loc, resultType, 0);
auto neg = builder.create<mlir::arith::SubIOp>(loc, zero, abs);
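
The typeToString rewrite above shows how the free functions read in a dispatch ladder: each dyn_cast guard binds the derived type so the width/kind accessors become available in its branch. A trimmed sketch of the same shape (illustrative subset, not the full function):

    #include <string>

    std::string shortTypeName(mlir::Type t) {
      if (auto i = mlir::dyn_cast<mlir::IntegerType>(t))
        return "i" + std::to_string(i.getWidth());
      if (auto f = mlir::dyn_cast<mlir::FloatType>(t))
        return "f" + std::to_string(f.getWidth());
      if (auto c = mlir::dyn_cast<fir::CharacterType>(t))
        return "c" + std::to_string(c.getFKind());
      return "unknown"; // the real function aborts on unhandled types
    }
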
diff --git a/flang/lib/Optimizer/Builder/MutableBox.cpp b/flang/lib/Optimizer/Builder/MutableBox.cpp
index d4012e9c3d9d..76b920dba869 100644
--- a/flang/lib/Optimizer/Builder/MutableBox.cpp
+++ b/flang/lib/Optimizer/Builder/MutableBox.cpp
@@ -28,7 +28,7 @@ createNewFirBox(fir::FirOpBuilder &builder, mlir::Location loc,
const fir::MutableBoxValue &box, mlir::Value addr,
mlir::ValueRange lbounds, mlir::ValueRange extents,
mlir::ValueRange lengths, mlir::Value tdesc = {}) {
- if (addr.getType().isa<fir::BaseBoxType>())
+ if (mlir::isa<fir::BaseBoxType>(addr.getType()))
// The entity is already boxed.
return builder.createConvert(loc, box.getBoxTy(), addr);
@@ -53,20 +53,21 @@ createNewFirBox(fir::FirOpBuilder &builder, mlir::Location loc,
// error in the embox).
llvm::SmallVector<mlir::Value> cleanedLengths;
auto cleanedAddr = addr;
- if (auto charTy = box.getEleTy().dyn_cast<fir::CharacterType>()) {
+ if (auto charTy = mlir::dyn_cast<fir::CharacterType>(box.getEleTy())) {
// Cast address to box type so that both input and output type have
// unknown or constant lengths.
auto bt = box.getBaseTy();
auto addrTy = addr.getType();
- auto type = addrTy.isa<fir::HeapType>() ? fir::HeapType::get(bt)
- : addrTy.isa<fir::PointerType>() ? fir::PointerType::get(bt)
- : builder.getRefType(bt);
+ auto type = mlir::isa<fir::HeapType>(addrTy) ? fir::HeapType::get(bt)
+ : mlir::isa<fir::PointerType>(addrTy)
+ ? fir::PointerType::get(bt)
+ : builder.getRefType(bt);
cleanedAddr = builder.createConvert(loc, type, addr);
if (charTy.getLen() == fir::CharacterType::unknownLen())
cleanedLengths.append(lengths.begin(), lengths.end());
} else if (fir::isUnlimitedPolymorphicType(box.getBoxTy())) {
- if (auto charTy = fir::dyn_cast_ptrEleTy(addr.getType())
- .dyn_cast<fir::CharacterType>()) {
+ if (auto charTy = mlir::dyn_cast<fir::CharacterType>(
+ fir::dyn_cast_ptrEleTy(addr.getType()))) {
if (charTy.getLen() == fir::CharacterType::unknownLen())
cleanedLengths.append(lengths.begin(), lengths.end());
}
@@ -328,18 +329,18 @@ private:
mlir::Value fir::factory::createUnallocatedBox(
fir::FirOpBuilder &builder, mlir::Location loc, mlir::Type boxType,
mlir::ValueRange nonDeferredParams, mlir::Value typeSourceBox) {
- auto baseAddrType = boxType.dyn_cast<fir::BaseBoxType>().getEleTy();
+ auto baseAddrType = mlir::dyn_cast<fir::BaseBoxType>(boxType).getEleTy();
if (!fir::isa_ref_type(baseAddrType))
baseAddrType = builder.getRefType(baseAddrType);
auto type = fir::unwrapRefType(baseAddrType);
auto eleTy = fir::unwrapSequenceType(type);
- if (auto recTy = eleTy.dyn_cast<fir::RecordType>())
+ if (auto recTy = mlir::dyn_cast<fir::RecordType>(eleTy))
if (recTy.getNumLenParams() > 0)
TODO(loc, "creating unallocated fir.box of derived type with length "
"parameters");
auto nullAddr = builder.createNullConstant(loc, baseAddrType);
mlir::Value shape;
- if (auto seqTy = type.dyn_cast<fir::SequenceType>()) {
+ if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(type)) {
auto zero = builder.createIntegerConstant(loc, builder.getIndexType(), 0);
llvm::SmallVector<mlir::Value> extents(seqTy.getDimension(), zero);
shape = builder.createShape(
@@ -348,7 +349,7 @@ mlir::Value fir::factory::createUnallocatedBox(
// Provide dummy length parameters if they are dynamic. If a length parameter
// is deferred. It is set to zero here and will be set on allocation.
llvm::SmallVector<mlir::Value> lenParams;
- if (auto charTy = eleTy.dyn_cast<fir::CharacterType>()) {
+ if (auto charTy = mlir::dyn_cast<fir::CharacterType>(eleTy)) {
if (charTy.getLen() == fir::CharacterType::unknownLen()) {
if (!nonDeferredParams.empty()) {
lenParams.push_back(nonDeferredParams[0]);
@@ -592,7 +593,7 @@ void fir::factory::associateMutableBoxWithRemap(
auto cast = [&](mlir::Value addr) -> mlir::Value {
// Cast base addr to new sequence type.
auto ty = fir::dyn_cast_ptrEleTy(addr.getType());
- if (auto seqTy = ty.dyn_cast<fir::SequenceType>()) {
+ if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(ty)) {
fir::SequenceType::Shape shape(newRank,
fir::SequenceType::getUnknownExtent());
ty = fir::SequenceType::get(shape, seqTy.getEleTy());
@@ -673,10 +674,10 @@ void fir::factory::disassociateMutableBox(fir::FirOpBuilder &builder,
if (box.isPolymorphic() && polymorphicSetType) {
// 7.3.2.3 point 7. The dynamic type of a disassociated pointer is the
// same as its declared type.
- auto boxTy = box.getBoxTy().dyn_cast<fir::BaseBoxType>();
+ auto boxTy = mlir::dyn_cast<fir::BaseBoxType>(box.getBoxTy());
auto eleTy = fir::unwrapPassByRefType(boxTy.getEleTy());
mlir::Type derivedType = fir::getDerivedType(eleTy);
- if (auto recTy = derivedType.dyn_cast<fir::RecordType>()) {
+ if (auto recTy = mlir::dyn_cast<fir::RecordType>(derivedType)) {
fir::runtime::genNullifyDerivedType(builder, loc, box.getAddr(), recTy,
box.rank());
return;
@@ -690,7 +691,7 @@ getNewLengths(fir::FirOpBuilder &builder, mlir::Location loc,
const fir::MutableBoxValue &box, mlir::ValueRange lenParams) {
llvm::SmallVector<mlir::Value> lengths;
auto idxTy = builder.getIndexType();
- if (auto charTy = box.getEleTy().dyn_cast<fir::CharacterType>()) {
+ if (auto charTy = mlir::dyn_cast<fir::CharacterType>(box.getEleTy())) {
if (charTy.getLen() == fir::CharacterType::unknownLen()) {
if (box.hasNonDeferredLenParams()) {
lengths.emplace_back(
@@ -717,7 +718,7 @@ static mlir::Value allocateAndInitNewStorage(fir::FirOpBuilder &builder,
auto lengths = getNewLengths(builder, loc, box, lenParams);
auto newStorage = builder.create<fir::AllocMemOp>(
loc, box.getBaseTy(), allocName, lengths, extents);
- if (box.getEleTy().isa<fir::RecordType>()) {
+ if (mlir::isa<fir::RecordType>(box.getEleTy())) {
// TODO: skip runtime initialization if this is not required. Currently,
// there is no way to know here if a derived type needs it or not. But the
// information is available at compile time and could be reflected here
@@ -742,7 +743,7 @@ void fir::factory::genInlinedAllocation(
lengths, safeExtents);
MutablePropertyWriter{builder, loc, box}.updateMutableBox(
heap, lbounds, safeExtents, lengths);
- if (box.getEleTy().isa<fir::RecordType>()) {
+ if (mlir::isa<fir::RecordType>(box.getEleTy())) {
// TODO: skip runtime initialization if this is not required. Currently,
// there is no way to know here if a derived type needs it or not. But the
// information is available at compile time and could be reflected here
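
One subtlety the mechanical rewrite preserves: createUnallocatedBox above still calls getEleTy() directly on a dyn_cast result, which is only safe because boxType is always a box at that call site. Where such an invariant holds, cast<> states it explicitly; a sketch of that more defensive spelling (hypothetical helper, not a change this patch makes):

    // cast<> asserts boxType really is a fir.box/fir.class, rather than
    // invoking getEleTy() on a null type when the dyn_cast would fail.
    mlir::Type boxElementType(mlir::Type boxType) {
      return mlir::cast<fir::BaseBoxType>(boxType).getEleTy();
    }
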
diff --git a/flang/lib/Optimizer/Builder/PPCIntrinsicCall.cpp b/flang/lib/Optimizer/Builder/PPCIntrinsicCall.cpp
index 160118e2c050..7f09e8822844 100644
--- a/flang/lib/Optimizer/Builder/PPCIntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/PPCIntrinsicCall.cpp
@@ -1119,7 +1119,7 @@ PPCIntrinsicLibrary::genVecAbs(mlir::Type resultType,
funcOp = builder.createFunction(loc, fname, ftype);
auto callOp{builder.create<fir::CallOp>(loc, funcOp, argBases[0])};
return callOp.getResult(0);
- } else if (auto eleTy = vTypeInfo.eleTy.dyn_cast<mlir::IntegerType>()) {
+ } else if (auto eleTy = mlir::dyn_cast<mlir::IntegerType>(vTypeInfo.eleTy)) {
// vec_abs(arg1) = max(0 - arg1, arg1)
auto newVecTy{mlir::VectorType::get(vTypeInfo.len, eleTy)};
@@ -1173,12 +1173,13 @@ fir::ExtendedValue PPCIntrinsicLibrary::genVecAddAndMulSubXor(
assert(args.size() == 2);
auto argBases{getBasesForArgs(args)};
auto argsTy{getTypesForArgs(argBases)};
- assert(argsTy[0].isa<fir::VectorType>() && argsTy[1].isa<fir::VectorType>());
+ assert(mlir::isa<fir::VectorType>(argsTy[0]) &&
+ mlir::isa<fir::VectorType>(argsTy[1]));
auto vecTyInfo{getVecTypeFromFir(argBases[0])};
- const auto isInteger{vecTyInfo.eleTy.isa<mlir::IntegerType>()};
- const auto isFloat{vecTyInfo.eleTy.isa<mlir::FloatType>()};
+ const auto isInteger{mlir::isa<mlir::IntegerType>(vecTyInfo.eleTy)};
+ const auto isFloat{mlir::isa<mlir::FloatType>(vecTyInfo.eleTy)};
assert((isInteger || isFloat) && "unknown vector type");
auto vargs{convertVecArgs(builder, loc, vecTyInfo, argBases)};
@@ -1212,7 +1213,7 @@ fir::ExtendedValue PPCIntrinsicLibrary::genVecAddAndMulSubXor(
arg2 = vargs[1];
} else if (isFloat) {
// bitcast the arguments to integer
- auto wd{vecTyInfo.eleTy.dyn_cast<mlir::FloatType>().getWidth()};
+ auto wd{mlir::dyn_cast<mlir::FloatType>(vecTyInfo.eleTy).getWidth()};
auto ftype{builder.getIntegerType(wd)};
auto bcVecTy{mlir::VectorType::get(vecTyInfo.len, ftype)};
arg1 = builder.create<mlir::vector::BitCastOp>(loc, bcVecTy, vargs[0]);
@@ -1450,7 +1451,7 @@ PPCIntrinsicLibrary::genVecCmp(mlir::Type resultType,
mlir::Value res{nullptr};
- if (auto eTy = vecTyInfo.eleTy.dyn_cast<mlir::IntegerType>()) {
+ if (auto eTy = mlir::dyn_cast<mlir::IntegerType>(vecTyInfo.eleTy)) {
constexpr int firstArg{0};
constexpr int secondArg{1};
std::map<VecOp, std::array<int, 2>> argOrder{
@@ -1559,7 +1560,7 @@ PPCIntrinsicLibrary::genVecConvert(mlir::Type resultType,
case VecOp::Ctf: {
assert(args.size() == 2);
auto convArg{builder.createConvert(loc, i32Ty, argBases[1])};
- auto eTy{vecTyInfo.eleTy.dyn_cast<mlir::IntegerType>()};
+ auto eTy{mlir::dyn_cast<mlir::IntegerType>(vecTyInfo.eleTy)};
assert(eTy && "Unsupported vector type");
const auto isUnsigned{eTy.isUnsignedInteger()};
const auto width{eTy.getWidth()};
@@ -1587,10 +1588,9 @@ PPCIntrinsicLibrary::genVecConvert(mlir::Type resultType,
: builder.create<mlir::LLVM::SIToFPOp>(loc, ty, vArg1)};
// construct vector<1./(1<<arg1), 1.0/(1<<arg1)>
- auto constInt{
+ auto constInt{mlir::dyn_cast_or_null<mlir::IntegerAttr>(
mlir::dyn_cast<mlir::arith::ConstantOp>(argBases[1].getDefiningOp())
- .getValue()
- .dyn_cast_or_null<mlir::IntegerAttr>()};
+ .getValue())};
assert(constInt && "expected integer constant argument");
double f{1.0 / (1 << constInt.getInt())};
llvm::SmallVector<double> vals{f, f};
@@ -1815,7 +1815,7 @@ static mlir::Value addOffsetToAddress(fir::FirOpBuilder &builder,
static mlir::Value reverseVectorElements(fir::FirOpBuilder &builder,
mlir::Location loc, mlir::Value v,
int64_t len) {
- assert(v.getType().isa<mlir::VectorType>());
+ assert(mlir::isa<mlir::VectorType>(v.getType()));
assert(len > 0);
llvm::SmallVector<int64_t, 16> mask;
for (int64_t i = 0; i < len; ++i) {
@@ -2144,10 +2144,9 @@ PPCIntrinsicLibrary::genVecPerm(mlir::Type resultType,
}
case VecOp::Permi: {
// arg3 is a constant
- auto constIntOp{
+ auto constIntOp{mlir::dyn_cast_or_null<mlir::IntegerAttr>(
mlir::dyn_cast<mlir::arith::ConstantOp>(argBases[2].getDefiningOp())
- .getValue()
- .dyn_cast_or_null<mlir::IntegerAttr>()};
+ .getValue())};
assert(constIntOp && "expected integer constant argument");
auto constInt{constIntOp.getInt()};
// arg1, arg2, and result type share same VecTypeInfo
@@ -2321,10 +2320,9 @@ PPCIntrinsicLibrary::genVecShift(mlir::Type resultType,
}
} else if (vop == VecOp::Sld || vop == VecOp::Sldw) {
assert(args.size() == 3);
- auto constIntOp =
+ auto constIntOp = mlir::dyn_cast_or_null<mlir::IntegerAttr>(
mlir::dyn_cast<mlir::arith::ConstantOp>(argBases[2].getDefiningOp())
- .getValue()
- .dyn_cast_or_null<mlir::IntegerAttr>();
+ .getValue());
assert(constIntOp && "expected integer constant argument");
// Bitcast to vector<16xi8>
@@ -2797,16 +2795,16 @@ void PPCIntrinsicLibrary::genMmaIntr(llvm::ArrayRef<fir::ExtendedValue> args) {
auto vType{v.getType()};
mlir::Type targetType{intrFuncType.getInput(j)};
if (vType != targetType) {
- if (targetType.isa<mlir::VectorType>()) {
+ if (mlir::isa<mlir::VectorType>(targetType)) {
// Perform vector type conversion for arguments passed by value.
- auto eleTy{vType.dyn_cast<fir::VectorType>().getEleTy()};
- auto len{vType.dyn_cast<fir::VectorType>().getLen()};
+ auto eleTy{mlir::dyn_cast<fir::VectorType>(vType).getEleTy()};
+ auto len{mlir::dyn_cast<fir::VectorType>(vType).getLen()};
mlir::VectorType mlirType = mlir::VectorType::get(len, eleTy);
auto v0{builder.createConvert(loc, mlirType, v)};
auto v1{builder.create<mlir::vector::BitCastOp>(loc, targetType, v0)};
intrArgs.push_back(v1);
- } else if (targetType.isa<mlir::IntegerType>() &&
- vType.isa<mlir::IntegerType>()) {
+ } else if (mlir::isa<mlir::IntegerType>(targetType) &&
+ mlir::isa<mlir::IntegerType>(vType)) {
auto v0{builder.createConvert(loc, targetType, v)};
intrArgs.push_back(v0);
} else {
@@ -2861,7 +2859,7 @@ void PPCIntrinsicLibrary::genVecStore(llvm::ArrayRef<fir::ExtendedValue> args) {
if (arg1TyInfo.isFloat32()) {
stTy = mlir::VectorType::get(len, i32ty);
fname = "llvm.ppc.altivec.stvewx";
- } else if (arg1TyInfo.eleTy.isa<mlir::IntegerType>()) {
+ } else if (mlir::isa<mlir::IntegerType>(arg1TyInfo.eleTy)) {
stTy = mlir::VectorType::get(len, mlir::IntegerType::get(context, width));
switch (width) {
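
Several hunks above share one idiom: fetch an operand's defining arith.constant and read its value through dyn_cast_or_null, since getValue() is not guaranteed to hold an IntegerAttr. A defensive standalone sketch with a hypothetical helper name (the code above instead asserts on failure):

  #include "mlir/Dialect/Arith/IR/Arith.h"
  #include <cstdint>
  #include <optional>

  static std::optional<int64_t> getConstInt(mlir::Value val) {
    auto cst = val.getDefiningOp<mlir::arith::ConstantOp>();
    if (!cst)
      return std::nullopt;
    // dyn_cast_or_null tolerates a null or non-integer attribute.
    if (auto intAttr =
            mlir::dyn_cast_or_null<mlir::IntegerAttr>(cst.getValue()))
      return intAttr.getInt();
    return std::nullopt;
  }
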
diff --git a/flang/lib/Optimizer/Builder/Runtime/Allocatable.cpp b/flang/lib/Optimizer/Builder/Runtime/Allocatable.cpp
index abff0e150ab4..70a88ff18cb1 100644
--- a/flang/lib/Optimizer/Builder/Runtime/Allocatable.cpp
+++ b/flang/lib/Optimizer/Builder/Runtime/Allocatable.cpp
@@ -27,7 +27,7 @@ mlir::Value fir::runtime::genMoveAlloc(fir::FirOpBuilder &builder,
if (fir::isPolymorphicType(from.getType()) &&
!fir::isUnlimitedPolymorphicType(from.getType())) {
fir::ClassType clTy =
- fir::dyn_cast_ptrEleTy(from.getType()).dyn_cast<fir::ClassType>();
+ mlir::dyn_cast<fir::ClassType>(fir::dyn_cast_ptrEleTy(from.getType()));
mlir::Type derivedType = fir::unwrapInnerType(clTy.getEleTy());
declaredTypeDesc =
builder.create<fir::TypeDescOp>(loc, mlir::TypeAttr::get(derivedType));
diff --git a/flang/lib/Optimizer/Builder/Runtime/Character.cpp b/flang/lib/Optimizer/Builder/Runtime/Character.cpp
index f3663439fdd5..b16819915d5a 100644
--- a/flang/lib/Optimizer/Builder/Runtime/Character.cpp
+++ b/flang/lib/Optimizer/Builder/Runtime/Character.cpp
@@ -39,15 +39,15 @@ static void genCharacterSearch(FN func, fir::FirOpBuilder &builder,
/// Helper function to recover the KIND from the FIR type.
static int discoverKind(mlir::Type ty) {
- if (auto charTy = ty.dyn_cast<fir::CharacterType>())
+ if (auto charTy = mlir::dyn_cast<fir::CharacterType>(ty))
return charTy.getFKind();
if (auto eleTy = fir::dyn_cast_ptrEleTy(ty))
return discoverKind(eleTy);
- if (auto arrTy = ty.dyn_cast<fir::SequenceType>())
+ if (auto arrTy = mlir::dyn_cast<fir::SequenceType>(ty))
return discoverKind(arrTy.getEleTy());
- if (auto boxTy = ty.dyn_cast<fir::BoxCharType>())
+ if (auto boxTy = mlir::dyn_cast<fir::BoxCharType>(ty))
return discoverKind(boxTy.getEleTy());
- if (auto boxTy = ty.dyn_cast<fir::BoxType>())
+ if (auto boxTy = mlir::dyn_cast<fir::BoxType>(ty))
return discoverKind(boxTy.getEleTy());
llvm_unreachable("unexpected character type");
}
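
discoverKind peels wrapper types until it reaches a character type; for example, !fir.ref<!fir.array<10x!fir.char<2,?>>> unwraps to KIND 2. The same recursion can be spelled with llvm::TypeSwitch; a sketch that omits the pointer-unwrapping step for brevity:

  #include "llvm/ADT/TypeSwitch.h"

  static int kindOf(mlir::Type ty) {
    return llvm::TypeSwitch<mlir::Type, int>(ty)
        .Case([](fir::CharacterType t) { return (int)t.getFKind(); })
        .Case([](fir::SequenceType t) { return kindOf(t.getEleTy()); })
        .Case([](fir::BoxCharType t) { return kindOf(t.getEleTy()); })
        .Default([](mlir::Type) { return -1; });
  }
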
diff --git a/flang/lib/Optimizer/Builder/Runtime/EnvironmentDefaults.cpp b/flang/lib/Optimizer/Builder/Runtime/EnvironmentDefaults.cpp
index a11b9339681e..6e280ac0c06c 100755
--- a/flang/lib/Optimizer/Builder/Runtime/EnvironmentDefaults.cpp
+++ b/flang/lib/Optimizer/Builder/Runtime/EnvironmentDefaults.cpp
@@ -13,7 +13,7 @@
#include "flang/Optimizer/Support/InternalNames.h"
#include "llvm/ADT/ArrayRef.h"
-void fir::runtime::genEnvironmentDefaults(
+fir::GlobalOp fir::runtime::genEnvironmentDefaults(
fir::FirOpBuilder &builder, mlir::Location loc,
const std::vector<Fortran::lower::EnvironmentDefault> &envDefaults) {
std::string envDefaultListPtrName =
@@ -34,14 +34,13 @@ void fir::runtime::genEnvironmentDefaults(
// If no defaults were specified, initialize with a null pointer.
if (envDefaults.empty()) {
- builder.createGlobalConstant(
+ return builder.createGlobalConstant(
loc, envDefaultListRefTy, envDefaultListPtrName,
[&](fir::FirOpBuilder &builder) {
mlir::Value nullVal =
builder.createNullConstant(loc, envDefaultListRefTy);
builder.create<fir::HasValueOp>(loc, nullVal);
});
- return;
}
// Create the Item list.
@@ -99,7 +98,7 @@ void fir::runtime::genEnvironmentDefaults(
envDefaultListBuilder, linkOnce);
// Define the pointer to the list used by the runtime.
- builder.createGlobalConstant(
+ return builder.createGlobalConstant(
loc, envDefaultListRefTy, envDefaultListPtrName,
[&](fir::FirOpBuilder &builder) {
mlir::Value addr = builder.create<fir::AddrOfOp>(
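
The return type of genEnvironmentDefaults changes from void to fir::GlobalOp, presumably so callers can thread the created global into the new genMain added below. A hypothetical call-site sketch, assuming builder, loc, and envDefaults are in scope:

  fir::GlobalOp envGlobal =
      fir::runtime::genEnvironmentDefaults(builder, loc, envDefaults);
  fir::runtime::genMain(builder, loc, envGlobal);
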
diff --git a/flang/lib/Optimizer/Builder/Runtime/Intrinsics.cpp b/flang/lib/Optimizer/Builder/Runtime/Intrinsics.cpp
index 57c47da0f3f8..8b78a1688c73 100644
--- a/flang/lib/Optimizer/Builder/Runtime/Intrinsics.cpp
+++ b/flang/lib/Optimizer/Builder/Runtime/Intrinsics.cpp
@@ -228,7 +228,8 @@ void fir::runtime::genSystemClock(fir::FirOpBuilder &builder,
fir::IfOp ifOp{};
const bool isOptionalArg =
fir::valueHasFirAttribute(arg, fir::getOptionalAttrName());
- if (type.dyn_cast<fir::PointerType>() || type.dyn_cast<fir::HeapType>()) {
+ if (mlir::dyn_cast<fir::PointerType>(type) ||
+ mlir::dyn_cast<fir::HeapType>(type)) {
// Check for a disassociated pointer or an unallocated allocatable.
assert(!isOptionalArg && "invalid optional argument");
ifOp = builder.create<fir::IfOp>(loc, builder.genIsNotNullAddr(loc, arg),
@@ -242,7 +243,8 @@ void fir::runtime::genSystemClock(fir::FirOpBuilder &builder,
builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
mlir::Type kindTy = func.getFunctionType().getInput(0);
int integerKind = 8;
- if (auto intType = fir::unwrapRefType(type).dyn_cast<mlir::IntegerType>())
+ if (auto intType =
+ mlir::dyn_cast<mlir::IntegerType>(fir::unwrapRefType(type)))
integerKind = intType.getWidth() / 8;
mlir::Value kind = builder.createIntegerConstant(loc, kindTy, integerKind);
mlir::Value res =
diff --git a/flang/lib/Optimizer/Builder/Runtime/Main.cpp b/flang/lib/Optimizer/Builder/Runtime/Main.cpp
new file mode 100644
index 000000000000..3b24fbca9cdb
--- /dev/null
+++ b/flang/lib/Optimizer/Builder/Runtime/Main.cpp
@@ -0,0 +1,62 @@
+//===-- Main.cpp - generate main runtime API calls --------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "flang/Optimizer/Builder/Runtime/Main.h"
+#include "flang/Optimizer/Builder/BoxValue.h"
+#include "flang/Optimizer/Builder/FIRBuilder.h"
+#include "flang/Optimizer/Builder/Runtime/RTBuilder.h"
+#include "flang/Optimizer/Dialect/FIROps.h"
+#include "flang/Optimizer/Dialect/FIRType.h"
+#include "flang/Runtime/main.h"
+#include "flang/Runtime/stop.h"
+
+using namespace Fortran::runtime;
+
+/// Create an `int main(...)` that calls the Fortran entry point
+void fir::runtime::genMain(fir::FirOpBuilder &builder, mlir::Location loc,
+ fir::GlobalOp &env) {
+ auto *context = builder.getContext();
+ auto argcTy = builder.getDefaultIntegerType();
+ auto ptrTy = mlir::LLVM::LLVMPointerType::get(context);
+
+ // void ProgramStart(int argc, char** argv, char** envp,
+ // _QQEnvironmentDefaults* env)
+ auto startFn = builder.createFunction(
+ loc, RTNAME_STRING(ProgramStart),
+ mlir::FunctionType::get(context, {argcTy, ptrTy, ptrTy, ptrTy}, {}));
+ // void ProgramStop()
+ auto stopFn =
+ builder.createFunction(loc, RTNAME_STRING(ProgramEndStatement),
+ mlir::FunctionType::get(context, {}, {}));
+
+ // int main(int argc, char** argv, char** envp)
+ auto mainFn = builder.createFunction(
+ loc, "main",
+ mlir::FunctionType::get(context, {argcTy, ptrTy, ptrTy}, argcTy));
+ // void _QQmain()
+ auto qqMainFn = builder.createFunction(
+ loc, "_QQmain", mlir::FunctionType::get(context, {}, {}));
+
+ mainFn.setPublic();
+
+ auto *block = mainFn.addEntryBlock();
+ mlir::OpBuilder::InsertionGuard insertGuard(builder);
+ builder.setInsertionPointToStart(block);
+
+ llvm::SmallVector<mlir::Value, 4> args(block->getArguments());
+ auto envAddr =
+ builder.create<fir::AddrOfOp>(loc, env.getType(), env.getSymbol());
+ args.push_back(envAddr);
+
+ builder.create<fir::CallOp>(loc, startFn, args);
+ builder.create<fir::CallOp>(loc, qqMainFn);
+ builder.create<fir::CallOp>(loc, stopFn);
+
+ mlir::Value ret = builder.createIntegerConstant(loc, argcTy, 0);
+ builder.create<mlir::func::ReturnOp>(loc, ret);
+}
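
In C terms, genMain emits roughly the following skeleton (a sketch: the _FortranA prefix is the conventional RTNAME expansion, and the environment-defaults symbol name is illustrative, not taken from this patch):

  extern "C" void _FortranAProgramStart(int, char **, char **, const void *);
  extern "C" void _FortranAProgramEndStatement(void);
  extern "C" void _QQmain(void);
  extern "C" const void *_QQEnvironmentDefaults; // hypothetical symbol name

  int main(int argc, char **argv, char **envp) {
    _FortranAProgramStart(argc, argv, envp, &_QQEnvironmentDefaults);
    _QQmain();
    _FortranAProgramEndStatement();
    return 0;
  }
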
diff --git a/flang/lib/Optimizer/Builder/Runtime/Ragged.cpp b/flang/lib/Optimizer/Builder/Runtime/Ragged.cpp
index 4d33282a35d9..e5d0fb0fb27a 100644
--- a/flang/lib/Optimizer/Builder/Runtime/Ragged.cpp
+++ b/flang/lib/Optimizer/Builder/Runtime/Ragged.cpp
@@ -32,7 +32,8 @@ void fir::runtime::genRaggedArrayAllocate(mlir::Location loc,
// Position of the bufferPointer in the header struct.
auto one = builder.createIntegerConstant(loc, i32Ty, 1);
auto eleTy = fir::unwrapSequenceType(fir::unwrapRefType(header.getType()));
- auto ptrTy = builder.getRefType(eleTy.cast<mlir::TupleType>().getType(1));
+ auto ptrTy =
+ builder.getRefType(mlir::cast<mlir::TupleType>(eleTy).getType(1));
auto ptr = builder.create<fir::CoordinateOp>(loc, ptrTy, header, one);
auto heap = builder.create<fir::LoadOp>(loc, ptr);
auto cmp = builder.genIsNullAddr(loc, heap);
diff --git a/flang/lib/Optimizer/Builder/Runtime/Reduction.cpp b/flang/lib/Optimizer/Builder/Runtime/Reduction.cpp
index 66fbaddcbda1..d4076067bf10 100644
--- a/flang/lib/Optimizer/Builder/Runtime/Reduction.cpp
+++ b/flang/lib/Optimizer/Builder/Runtime/Reduction.cpp
@@ -666,7 +666,7 @@ void fir::runtime::genMaxloc(fir::FirOpBuilder &builder, mlir::Location loc,
mlir::func::FuncOp func;
auto ty = arrayBox.getType();
auto arrTy = fir::dyn_cast_ptrOrBoxEleTy(ty);
- auto eleTy = arrTy.cast<fir::SequenceType>().getEleTy();
+ auto eleTy = mlir::cast<fir::SequenceType>(arrTy).getEleTy();
fir::factory::CharacterExprHelper charHelper{builder, loc};
if (eleTy.isF32())
func = fir::runtime::getRuntimeFunc<mkRTKey(MaxlocReal4)>(loc, builder);
@@ -713,7 +713,7 @@ mlir::Value fir::runtime::genMaxval(fir::FirOpBuilder &builder,
mlir::func::FuncOp func;
auto ty = arrayBox.getType();
auto arrTy = fir::dyn_cast_ptrOrBoxEleTy(ty);
- auto eleTy = arrTy.cast<fir::SequenceType>().getEleTy();
+ auto eleTy = mlir::cast<fir::SequenceType>(arrTy).getEleTy();
auto dim = builder.createIntegerConstant(loc, builder.getIndexType(), 0);
if (eleTy.isF32())
@@ -781,7 +781,7 @@ void fir::runtime::genMinloc(fir::FirOpBuilder &builder, mlir::Location loc,
mlir::func::FuncOp func;
auto ty = arrayBox.getType();
auto arrTy = fir::dyn_cast_ptrOrBoxEleTy(ty);
- auto eleTy = arrTy.cast<fir::SequenceType>().getEleTy();
+ auto eleTy = mlir::cast<fir::SequenceType>(arrTy).getEleTy();
fir::factory::CharacterExprHelper charHelper{builder, loc};
if (eleTy.isF32())
func = fir::runtime::getRuntimeFunc<mkRTKey(MinlocReal4)>(loc, builder);
@@ -853,7 +853,7 @@ mlir::Value fir::runtime::genMinval(fir::FirOpBuilder &builder,
mlir::func::FuncOp func;
auto ty = arrayBox.getType();
auto arrTy = fir::dyn_cast_ptrOrBoxEleTy(ty);
- auto eleTy = arrTy.cast<fir::SequenceType>().getEleTy();
+ auto eleTy = mlir::cast<fir::SequenceType>(arrTy).getEleTy();
auto dim = builder.createIntegerConstant(loc, builder.getIndexType(), 0);
if (eleTy.isF32())
@@ -895,7 +895,7 @@ void fir::runtime::genNorm2Dim(fir::FirOpBuilder &builder, mlir::Location loc,
mlir::func::FuncOp func;
auto ty = arrayBox.getType();
auto arrTy = fir::dyn_cast_ptrOrBoxEleTy(ty);
- auto eleTy = arrTy.cast<fir::SequenceType>().getEleTy();
+ auto eleTy = mlir::cast<fir::SequenceType>(arrTy).getEleTy();
if (eleTy.isF128())
func = fir::runtime::getRuntimeFunc<ForcedNorm2DimReal16>(loc, builder);
else
@@ -917,7 +917,7 @@ mlir::Value fir::runtime::genNorm2(fir::FirOpBuilder &builder,
mlir::func::FuncOp func;
auto ty = arrayBox.getType();
auto arrTy = fir::dyn_cast_ptrOrBoxEleTy(ty);
- auto eleTy = arrTy.cast<fir::SequenceType>().getEleTy();
+ auto eleTy = mlir::cast<fir::SequenceType>(arrTy).getEleTy();
auto dim = builder.createIntegerConstant(loc, builder.getIndexType(), 0);
if (eleTy.isF32())
@@ -968,7 +968,7 @@ mlir::Value fir::runtime::genProduct(fir::FirOpBuilder &builder,
mlir::func::FuncOp func;
auto ty = arrayBox.getType();
auto arrTy = fir::dyn_cast_ptrOrBoxEleTy(ty);
- auto eleTy = arrTy.cast<fir::SequenceType>().getEleTy();
+ auto eleTy = mlir::cast<fir::SequenceType>(arrTy).getEleTy();
auto dim = builder.createIntegerConstant(loc, builder.getIndexType(), 0);
if (eleTy.isF32())
@@ -1069,7 +1069,7 @@ mlir::Value fir::runtime::genDotProduct(fir::FirOpBuilder &builder,
else if (eleTy.isInteger(builder.getKindMap().getIntegerBitsize(16)))
func =
fir::runtime::getRuntimeFunc<ForcedDotProductInteger16>(loc, builder);
- else if (eleTy.isa<fir::LogicalType>())
+ else if (mlir::isa<fir::LogicalType>(eleTy))
func =
fir::runtime::getRuntimeFunc<mkRTKey(DotProductLogical)>(loc, builder);
else
@@ -1111,7 +1111,7 @@ mlir::Value fir::runtime::genSum(fir::FirOpBuilder &builder, mlir::Location loc,
mlir::func::FuncOp func;
auto ty = arrayBox.getType();
auto arrTy = fir::dyn_cast_ptrOrBoxEleTy(ty);
- auto eleTy = arrTy.cast<fir::SequenceType>().getEleTy();
+ auto eleTy = mlir::cast<fir::SequenceType>(arrTy).getEleTy();
auto dim = builder.createIntegerConstant(loc, builder.getIndexType(), 0);
if (eleTy.isF32())
@@ -1173,7 +1173,7 @@ mlir::Value fir::runtime::genSum(fir::FirOpBuilder &builder, mlir::Location loc,
mlir::func::FuncOp func; \
auto ty = arrayBox.getType(); \
auto arrTy = fir::dyn_cast_ptrOrBoxEleTy(ty); \
- auto eleTy = arrTy.cast<fir::SequenceType>().getEleTy(); \
+ auto eleTy = mlir::cast<fir::SequenceType>(arrTy).getEleTy(); \
auto dim = builder.createIntegerConstant(loc, builder.getIndexType(), 0); \
\
if (eleTy.isInteger(builder.getKindMap().getIntegerBitsize(1))) \
diff --git a/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp b/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp
index 48173033ecbe..5229d40f2250 100644
--- a/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp
+++ b/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp
@@ -51,9 +51,9 @@ public:
/// not at all depending on the implementation target's characteristics and
/// preference.
bool needsConversion(mlir::Type ty) {
- if (ty.isa<BoxProcType>())
+ if (mlir::isa<BoxProcType>(ty))
return true;
- if (auto funcTy = ty.dyn_cast<mlir::FunctionType>()) {
+ if (auto funcTy = mlir::dyn_cast<mlir::FunctionType>(ty)) {
for (auto t : funcTy.getInputs())
if (needsConversion(t))
return true;
@@ -62,13 +62,13 @@ public:
return true;
return false;
}
- if (auto tupleTy = ty.dyn_cast<mlir::TupleType>()) {
+ if (auto tupleTy = mlir::dyn_cast<mlir::TupleType>(ty)) {
for (auto t : tupleTy.getTypes())
if (needsConversion(t))
return true;
return false;
}
- if (auto recTy = ty.dyn_cast<RecordType>()) {
+ if (auto recTy = mlir::dyn_cast<RecordType>(ty)) {
auto visited = visitedTypes.find(ty);
if (visited != visitedTypes.end())
return visited->second;
@@ -97,11 +97,11 @@ public:
visitedTypes.find(ty)->second = result;
return result;
}
- if (auto boxTy = ty.dyn_cast<BaseBoxType>())
+ if (auto boxTy = mlir::dyn_cast<BaseBoxType>(ty))
return needsConversion(boxTy.getEleTy());
if (isa_ref_type(ty))
return needsConversion(unwrapRefType(ty));
- if (auto t = ty.dyn_cast<SequenceType>())
+ if (auto t = mlir::dyn_cast<SequenceType>(ty))
return needsConversion(unwrapSequenceType(ty));
return false;
}
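
needsConversion memoizes its answer per fir.type record so that recursive derived types terminate; it re-finds the entry before writing the result, presumably because the recursive calls may rehash the map and invalidate saved iterators. The idiom in isolation (a sketch assuming a llvm::DenseMap<mlir::Type, bool> visitedTypes member):

  if (auto recTy = mlir::dyn_cast<fir::RecordType>(ty)) {
    if (auto it = visitedTypes.find(ty); it != visitedTypes.end())
      return it->second;                 // cached or in progress: breaks cycles
    visitedTypes.try_emplace(ty, false); // provisional answer during recursion
    bool result = false;
    for (auto [fieldName, fieldTy] : recTy.getTypeList())
      result = result || needsConversion(fieldTy);
    // Re-find before writing: recursion may have grown the map.
    visitedTypes.find(ty)->second = result;
    return result;
  }
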
@@ -246,7 +246,7 @@ public:
if (typeConverter.needsConversion(ty)) {
rewriter.startOpModification(func);
auto toTy =
- typeConverter.convertType(ty).cast<mlir::FunctionType>();
+ mlir::cast<mlir::FunctionType>(typeConverter.convertType(ty));
if (!func.empty())
for (auto e : llvm::enumerate(toTy.getInputs())) {
unsigned i = e.index();
@@ -263,7 +263,7 @@ public:
// Rewrite all `fir.emboxproc` ops to either `fir.convert` or a thunk
// as required.
mlir::Type toTy = typeConverter.convertType(
- embox.getType().cast<BoxProcType>().getEleTy());
+ mlir::cast<BoxProcType>(embox.getType()).getEleTy());
rewriter.setInsertionPoint(embox);
if (embox.getHost()) {
// Create the thunk.
diff --git a/flang/lib/Optimizer/CodeGen/CGOps.cpp b/flang/lib/Optimizer/CodeGen/CGOps.cpp
index c3bcdeaf86db..44d07d26dd2b 100644
--- a/flang/lib/Optimizer/CodeGen/CGOps.cpp
+++ b/flang/lib/Optimizer/CodeGen/CGOps.cpp
@@ -41,24 +41,24 @@ unsigned fir::cg::XEmboxOp::getOutRank() {
}
unsigned fir::cg::XReboxOp::getOutRank() {
- if (auto seqTy =
- fir::dyn_cast_ptrOrBoxEleTy(getType()).dyn_cast<fir::SequenceType>())
+ if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(
+ fir::dyn_cast_ptrOrBoxEleTy(getType())))
return seqTy.getDimension();
return 0;
}
unsigned fir::cg::XReboxOp::getRank() {
- if (auto seqTy = fir::dyn_cast_ptrOrBoxEleTy(getBox().getType())
- .dyn_cast<fir::SequenceType>())
+ if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(
+ fir::dyn_cast_ptrOrBoxEleTy(getBox().getType())))
return seqTy.getDimension();
return 0;
}
unsigned fir::cg::XArrayCoorOp::getRank() {
auto memrefTy = getMemref().getType();
- if (memrefTy.isa<fir::BaseBoxType>())
- if (auto seqty =
- fir::dyn_cast_ptrOrBoxEleTy(memrefTy).dyn_cast<fir::SequenceType>())
+ if (mlir::isa<fir::BaseBoxType>(memrefTy))
+ if (auto seqty = mlir::dyn_cast<fir::SequenceType>(
+ fir::dyn_cast_ptrOrBoxEleTy(memrefTy)))
return seqty.getDimension();
return getShape().size();
}
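
Here the op verifiers guarantee a pointer- or box-like type, so the plain mlir::dyn_cast on the unwrapped result is safe. A standalone helper without that guarantee would want dyn_cast_or_null, because fir::dyn_cast_ptrOrBoxEleTy returns a null mlir::Type for other inputs; a sketch:

  static unsigned outRankOf(mlir::Type ty) {
    if (auto seqTy = mlir::dyn_cast_or_null<fir::SequenceType>(
            fir::dyn_cast_ptrOrBoxEleTy(ty)))
      return seqTy.getDimension();
    return 0;
  }
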
diff --git a/flang/lib/Optimizer/CodeGen/CodeGen.cpp b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
index 921eac2f8f4b..b4705aa47992 100644
--- a/flang/lib/Optimizer/CodeGen/CodeGen.cpp
+++ b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
@@ -101,7 +101,7 @@ static int64_t getConstantIntValue(mlir::Value val) {
}
static unsigned getTypeDescFieldId(mlir::Type ty) {
- auto isArray = fir::dyn_cast_ptrOrBoxEleTy(ty).isa<fir::SequenceType>();
+ auto isArray = mlir::isa<fir::SequenceType>(fir::dyn_cast_ptrOrBoxEleTy(ty));
return isArray ? kOptTypePtrPosInBox : kDimsPosInBox;
}
static unsigned getLenParamFieldId(mlir::Type ty) {
@@ -147,7 +147,7 @@ genAllocationScaleSize(OP op, mlir::Type ity,
mlir::ConversionPatternRewriter &rewriter) {
mlir::Location loc = op.getLoc();
mlir::Type dataTy = op.getInType();
- auto seqTy = dataTy.dyn_cast<fir::SequenceType>();
+ auto seqTy = mlir::dyn_cast<fir::SequenceType>(dataTy);
fir::SequenceType::Extent constSize = 1;
if (seqTy) {
int constRows = seqTy.getConstantRows();
@@ -191,13 +191,13 @@ struct AllocaOpConversion : public fir::FIROpConversion<fir::AllocaOp> {
for (; i < end; ++i)
lenParams.push_back(operands[i]);
mlir::Type scalarType = fir::unwrapSequenceType(alloc.getInType());
- if (auto chrTy = scalarType.dyn_cast<fir::CharacterType>()) {
+ if (auto chrTy = mlir::dyn_cast<fir::CharacterType>(scalarType)) {
fir::CharacterType rawCharTy = fir::CharacterType::getUnknownLen(
chrTy.getContext(), chrTy.getFKind());
llvmObjectType = convertType(rawCharTy);
assert(end == 1);
size = integerCast(loc, rewriter, ity, lenParams[0]);
- } else if (auto recTy = scalarType.dyn_cast<fir::RecordType>()) {
+ } else if (auto recTy = mlir::dyn_cast<fir::RecordType>(scalarType)) {
mlir::LLVM::LLVMFuncOp memSizeFn =
getDependentTypeMemSizeFn(recTy, alloc, rewriter);
if (!memSizeFn)
@@ -265,7 +265,8 @@ struct BoxAddrOpConversion : public fir::FIROpConversion<fir::BoxAddrOp> {
mlir::ConversionPatternRewriter &rewriter) const override {
mlir::Value a = adaptor.getOperands()[0];
auto loc = boxaddr.getLoc();
- if (auto argty = boxaddr.getVal().getType().dyn_cast<fir::BaseBoxType>()) {
+ if (auto argty =
+ mlir::dyn_cast<fir::BaseBoxType>(boxaddr.getVal().getType())) {
TypePair boxTyPair = getBoxTypePair(argty);
rewriter.replaceOp(boxaddr,
getBaseAddrFromBox(loc, boxTyPair, a, rewriter));
@@ -476,24 +477,25 @@ struct StringLitOpConversion : public fir::FIROpConversion<fir::StringLitOp> {
mlir::ConversionPatternRewriter &rewriter) const override {
auto ty = convertType(constop.getType());
auto attr = constop.getValue();
- if (attr.isa<mlir::StringAttr>()) {
+ if (mlir::isa<mlir::StringAttr>(attr)) {
rewriter.replaceOpWithNewOp<mlir::LLVM::ConstantOp>(constop, ty, attr);
return mlir::success();
}
- auto charTy = constop.getType().cast<fir::CharacterType>();
+ auto charTy = mlir::cast<fir::CharacterType>(constop.getType());
unsigned bits = lowerTy().characterBitsize(charTy);
mlir::Type intTy = rewriter.getIntegerType(bits);
mlir::Location loc = constop.getLoc();
mlir::Value cst = rewriter.create<mlir::LLVM::UndefOp>(loc, ty);
- if (auto arr = attr.dyn_cast<mlir::DenseElementsAttr>()) {
+ if (auto arr = mlir::dyn_cast<mlir::DenseElementsAttr>(attr)) {
cst = rewriter.create<mlir::LLVM::ConstantOp>(loc, ty, arr);
- } else if (auto arr = attr.dyn_cast<mlir::ArrayAttr>()) {
+ } else if (auto arr = mlir::dyn_cast<mlir::ArrayAttr>(attr)) {
for (auto a : llvm::enumerate(arr.getValue())) {
// convert each character to a precise bitsize
auto elemAttr = mlir::IntegerAttr::get(
intTy,
- a.value().cast<mlir::IntegerAttr>().getValue().zextOrTrunc(bits));
+ mlir::cast<mlir::IntegerAttr>(a.value()).getValue().zextOrTrunc(
+ bits));
auto elemCst =
rewriter.create<mlir::LLVM::ConstantOp>(loc, intTy, elemAttr);
cst = rewriter.create<mlir::LLVM::InsertValueOp>(loc, cst, elemCst,
@@ -528,9 +530,9 @@ struct CallOpConversion : public fir::FIROpConversion<fir::CallOp> {
} // namespace
static mlir::Type getComplexEleTy(mlir::Type complex) {
- if (auto cc = complex.dyn_cast<mlir::ComplexType>())
+ if (auto cc = mlir::dyn_cast<mlir::ComplexType>(complex))
return cc.getElementType();
- return complex.cast<fir::ComplexType>().getElementType();
+ return mlir::cast<fir::ComplexType>(complex).getElementType();
}
namespace {
@@ -599,7 +601,7 @@ struct ConstcOpConversion : public fir::FIROpConversion<fir::ConstcOp> {
}
inline llvm::APFloat getValue(mlir::Attribute attr) const {
- return attr.cast<fir::RealAttr>().getValue();
+ return mlir::cast<fir::RealAttr>(attr).getValue();
}
};
@@ -608,7 +610,7 @@ struct ConvertOpConversion : public fir::FIROpConversion<fir::ConvertOp> {
using FIROpConversion::FIROpConversion;
static bool isFloatingPointTy(mlir::Type ty) {
- return ty.isa<mlir::FloatType>();
+ return mlir::isa<mlir::FloatType>(ty);
}
mlir::LogicalResult
@@ -628,7 +630,8 @@ struct ConvertOpConversion : public fir::FIROpConversion<fir::ConvertOp> {
auto loc = convert.getLoc();
auto i1Type = mlir::IntegerType::get(convert.getContext(), 1);
- if (fromFirTy.isa<fir::LogicalType>() || toFirTy.isa<fir::LogicalType>()) {
+ if (mlir::isa<fir::LogicalType>(fromFirTy) ||
+ mlir::isa<fir::LogicalType>(toFirTy)) {
      // By specification, a fir::LogicalType value may be any number,
      // where a non-zero value represents .true. and a zero value
      // represents .false.
@@ -641,7 +644,8 @@ struct ConvertOpConversion : public fir::FIROpConversion<fir::ConvertOp> {
// Conversion from narrow logical to wide logical may be implemented
// as a zero or sign extension of the input, but it may use value
// normalization as well.
- if (!fromTy.isa<mlir::IntegerType>() || !toTy.isa<mlir::IntegerType>())
+ if (!mlir::isa<mlir::IntegerType>(fromTy) ||
+ !mlir::isa<mlir::IntegerType>(toTy))
return mlir::emitError(loc)
<< "unsupported types for logical conversion: " << fromTy
<< " -> " << toTy;
@@ -722,13 +726,13 @@ struct ConvertOpConversion : public fir::FIROpConversion<fir::ConvertOp> {
rewriter.replaceOp(convert, v);
return mlir::success();
}
- if (toTy.isa<mlir::IntegerType>()) {
+ if (mlir::isa<mlir::IntegerType>(toTy)) {
rewriter.replaceOpWithNewOp<mlir::LLVM::FPToSIOp>(convert, toTy, op0);
return mlir::success();
}
- } else if (fromTy.isa<mlir::IntegerType>()) {
+ } else if (mlir::isa<mlir::IntegerType>(fromTy)) {
// Integer to integer conversion.
- if (toTy.isa<mlir::IntegerType>()) {
+ if (mlir::isa<mlir::IntegerType>(toTy)) {
auto fromBits = mlir::LLVM::getPrimitiveTypeSizeInBits(fromTy);
auto toBits = mlir::LLVM::getPrimitiveTypeSizeInBits(toTy);
assert(fromBits != toBits);
@@ -749,18 +753,18 @@ struct ConvertOpConversion : public fir::FIROpConversion<fir::ConvertOp> {
return mlir::success();
}
// Integer to pointer conversion.
- if (toTy.isa<mlir::LLVM::LLVMPointerType>()) {
+ if (mlir::isa<mlir::LLVM::LLVMPointerType>(toTy)) {
rewriter.replaceOpWithNewOp<mlir::LLVM::IntToPtrOp>(convert, toTy, op0);
return mlir::success();
}
- } else if (fromTy.isa<mlir::LLVM::LLVMPointerType>()) {
+ } else if (mlir::isa<mlir::LLVM::LLVMPointerType>(fromTy)) {
// Pointer to integer conversion.
- if (toTy.isa<mlir::IntegerType>()) {
+ if (mlir::isa<mlir::IntegerType>(toTy)) {
rewriter.replaceOpWithNewOp<mlir::LLVM::PtrToIntOp>(convert, toTy, op0);
return mlir::success();
}
// Pointer to pointer conversion.
- if (toTy.isa<mlir::LLVM::LLVMPointerType>()) {
+ if (mlir::isa<mlir::LLVM::LLVMPointerType>(toTy)) {
rewriter.replaceOpWithNewOp<mlir::LLVM::BitcastOp>(convert, toTy, op0);
return mlir::success();
}
@@ -842,11 +846,11 @@ struct EmboxCharOpConversion : public fir::FIROpConversion<fir::EmboxCharOp> {
auto llvmStruct = rewriter.create<mlir::LLVM::UndefOp>(loc, llvmStructTy);
mlir::Type lenTy =
- llvmStructTy.cast<mlir::LLVM::LLVMStructType>().getBody()[1];
+ mlir::cast<mlir::LLVM::LLVMStructType>(llvmStructTy).getBody()[1];
mlir::Value lenAfterCast = integerCast(loc, rewriter, lenTy, charBufferLen);
mlir::Type addrTy =
- llvmStructTy.cast<mlir::LLVM::LLVMStructType>().getBody()[0];
+ mlir::cast<mlir::LLVM::LLVMStructType>(llvmStructTy).getBody()[0];
if (addrTy != charBuffer.getType())
charBuffer =
rewriter.create<mlir::LLVM::BitcastOp>(loc, addrTy, charBuffer);
@@ -979,9 +983,10 @@ static mlir::SymbolRefAttr getFree(fir::FreeMemOp op,
static unsigned getDimension(mlir::LLVM::LLVMArrayType ty) {
unsigned result = 1;
- for (auto eleTy = ty.getElementType().dyn_cast<mlir::LLVM::LLVMArrayType>();
- eleTy;
- eleTy = eleTy.getElementType().dyn_cast<mlir::LLVM::LLVMArrayType>())
+ for (auto eleTy =
+ mlir::dyn_cast<mlir::LLVM::LLVMArrayType>(ty.getElementType());
+ eleTy; eleTy = mlir::dyn_cast<mlir::LLVM::LLVMArrayType>(
+ eleTy.getElementType()))
++result;
return result;
}
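
getDimension counts the nesting depth of LLVM array types, e.g. !llvm.array<4 x array<3 x i32>> yields 2. An equivalent while-loop spelling (a sketch):

  static unsigned dimsOf(mlir::LLVM::LLVMArrayType ty) {
    unsigned result = 1;
    while (auto eleTy =
               mlir::dyn_cast<mlir::LLVM::LLVMArrayType>(ty.getElementType())) {
      ++result;
      ty = eleTy;
    }
    return result;
  }
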
@@ -1052,9 +1057,9 @@ struct EmboxCommonConversion : public fir::FIROpConversion<OP> {
static int getCFIAttr(fir::BaseBoxType boxTy) {
auto eleTy = boxTy.getEleTy();
- if (eleTy.isa<fir::PointerType>())
+ if (mlir::isa<fir::PointerType>(eleTy))
return CFI_attribute_pointer;
- if (eleTy.isa<fir::HeapType>())
+ if (mlir::isa<fir::HeapType>(eleTy))
return CFI_attribute_allocatable;
return CFI_attribute_other;
}
@@ -1082,27 +1087,29 @@ struct EmboxCommonConversion : public fir::FIROpConversion<OP> {
auto i64Ty = mlir::IntegerType::get(rewriter.getContext(), 64);
if (auto eleTy = fir::dyn_cast_ptrEleTy(boxEleTy))
boxEleTy = eleTy;
- if (auto seqTy = boxEleTy.dyn_cast<fir::SequenceType>())
+ if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(boxEleTy))
return getSizeAndTypeCode(loc, rewriter, seqTy.getEleTy(), lenParams);
- if (boxEleTy.isa<mlir::NoneType>()) // unlimited polymorphic or assumed type
+ if (mlir::isa<mlir::NoneType>(
+ boxEleTy)) // unlimited polymorphic or assumed type
return {rewriter.create<mlir::LLVM::ConstantOp>(loc, i64Ty, 0),
this->genConstantOffset(loc, rewriter, CFI_type_other)};
mlir::Value typeCodeVal = this->genConstantOffset(
loc, rewriter,
fir::getTypeCode(boxEleTy, this->lowerTy().getKindMap()));
- if (fir::isa_integer(boxEleTy) || boxEleTy.dyn_cast<fir::LogicalType>() ||
- fir::isa_real(boxEleTy) || fir::isa_complex(boxEleTy))
+ if (fir::isa_integer(boxEleTy) ||
+ mlir::dyn_cast<fir::LogicalType>(boxEleTy) || fir::isa_real(boxEleTy) ||
+ fir::isa_complex(boxEleTy))
return {genTypeStrideInBytes(loc, i64Ty, rewriter,
this->convertType(boxEleTy)),
typeCodeVal};
- if (auto charTy = boxEleTy.dyn_cast<fir::CharacterType>())
+ if (auto charTy = mlir::dyn_cast<fir::CharacterType>(boxEleTy))
return {getCharacterByteSize(loc, rewriter, charTy, lenParams),
typeCodeVal};
if (fir::isa_ref_type(boxEleTy)) {
auto ptrTy = ::getLlvmPtrType(rewriter.getContext());
return {genTypeStrideInBytes(loc, i64Ty, rewriter, ptrTy), typeCodeVal};
}
- if (boxEleTy.isa<fir::RecordType>())
+ if (mlir::isa<fir::RecordType>(boxEleTy))
return {genTypeStrideInBytes(loc, i64Ty, rewriter,
this->convertType(boxEleTy)),
typeCodeVal};
@@ -1211,8 +1218,8 @@ struct EmboxCommonConversion : public fir::FIROpConversion<OP> {
if (!typeDesc) {
if (useInputType) {
mlir::Type innerType = fir::unwrapInnerType(inputType);
- if (innerType && innerType.template isa<fir::RecordType>()) {
- auto recTy = innerType.template dyn_cast<fir::RecordType>();
+ if (innerType && mlir::isa<fir::RecordType>(innerType)) {
+ auto recTy = mlir::dyn_cast<fir::RecordType>(innerType);
typeDesc = getTypeDescriptor(mod, rewriter, loc, recTy);
} else {
// Unlimited polymorphic type descriptor with no record type. Set
@@ -1250,7 +1257,7 @@ struct EmboxCommonConversion : public fir::FIROpConversion<OP> {
mlir::ValueRange lenParams, mlir::Value sourceBox = {},
mlir::Type sourceBoxType = {}) const {
auto loc = box.getLoc();
- auto boxTy = box.getType().template dyn_cast<fir::BaseBoxType>();
+ auto boxTy = mlir::dyn_cast<fir::BaseBoxType>(box.getType());
bool useInputType = fir::isPolymorphicType(boxTy) &&
!fir::isUnlimitedPolymorphicType(inputType);
llvm::SmallVector<mlir::Value> typeparams = lenParams;
@@ -1293,8 +1300,8 @@ struct EmboxCommonConversion : public fir::FIROpConversion<OP> {
mlir::ValueRange lenParams,
mlir::Value typeDesc = {}) const {
auto loc = box.getLoc();
- auto boxTy = box.getType().dyn_cast<fir::BaseBoxType>();
- auto inputBoxTy = box.getBox().getType().dyn_cast<fir::BaseBoxType>();
+ auto boxTy = mlir::dyn_cast<fir::BaseBoxType>(box.getType());
+ auto inputBoxTy = mlir::dyn_cast<fir::BaseBoxType>(box.getBox().getType());
auto inputBoxTyPair = this->getBoxTypePair(inputBoxTy);
llvm::SmallVector<mlir::Value> typeparams = lenParams;
if (!box.getSubstr().empty() && fir::hasDynamicSize(boxTy.getEleTy()))
@@ -1343,7 +1350,7 @@ struct EmboxCommonConversion : public fir::FIROpConversion<OP> {
mlir::Type resultTy = llvmBaseObjectType;
// Fortran is column major, llvm GEP is row major: reverse the indices here.
for (mlir::Value interiorIndex : llvm::reverse(cstInteriorIndices)) {
- auto arrayTy = resultTy.dyn_cast<mlir::LLVM::LLVMArrayType>();
+ auto arrayTy = mlir::dyn_cast<mlir::LLVM::LLVMArrayType>(resultTy);
if (!arrayTy)
fir::emitFatalError(
loc,
@@ -1355,7 +1362,7 @@ struct EmboxCommonConversion : public fir::FIROpConversion<OP> {
convertSubcomponentIndices(loc, resultTy, componentIndices, &resultTy);
gepArgs.append(gepIndices.begin(), gepIndices.end());
if (substringOffset) {
- if (auto arrayTy = resultTy.dyn_cast<mlir::LLVM::LLVMArrayType>()) {
+ if (auto arrayTy = mlir::dyn_cast<mlir::LLVM::LLVMArrayType>(resultTy)) {
gepArgs.push_back(*substringOffset);
resultTy = arrayTy.getElementType();
} else {
@@ -1504,18 +1511,18 @@ struct XEmboxOpConversion : public EmboxCommonConversion<fir::cg::XEmboxOp> {
unsigned constRows = 0;
mlir::Value ptrOffset = zero;
mlir::Type memEleTy = fir::dyn_cast_ptrEleTy(xbox.getMemref().getType());
- assert(memEleTy.isa<fir::SequenceType>());
- auto seqTy = memEleTy.cast<fir::SequenceType>();
+ assert(mlir::isa<fir::SequenceType>(memEleTy));
+ auto seqTy = mlir::cast<fir::SequenceType>(memEleTy);
mlir::Type seqEleTy = seqTy.getEleTy();
// Adjust the element scaling factor if the element is a dependent type.
if (fir::hasDynamicSize(seqEleTy)) {
- if (auto charTy = seqEleTy.dyn_cast<fir::CharacterType>()) {
+ if (auto charTy = mlir::dyn_cast<fir::CharacterType>(seqEleTy)) {
// The GEP pointer type decays to llvm.ptr<i[width]>.
// The scaling factor is the runtime value of the length.
assert(!adaptor.getLenParams().empty());
prevPtrOff = FIROpConversion::integerCast(
loc, rewriter, i64Ty, adaptor.getLenParams().back());
- } else if (seqEleTy.isa<fir::RecordType>()) {
+ } else if (mlir::isa<fir::RecordType>(seqEleTy)) {
// prevPtrOff = ;
TODO(loc, "generate call to calculate size of PDT");
} else {
@@ -1540,7 +1547,7 @@ struct XEmboxOpConversion : public EmboxCommonConversion<fir::cg::XEmboxOp> {
} else if (hasSubstr) {
// We have a substring. The step value needs to be the number of bytes
// per CHARACTER element.
- auto charTy = seqEleTy.cast<fir::CharacterType>();
+ auto charTy = mlir::cast<fir::CharacterType>(seqEleTy);
if (fir::hasDynamicSize(charTy)) {
prevDimByteStride =
getCharacterByteSize(loc, rewriter, charTy, adaptor.getLenParams());
@@ -1589,7 +1596,7 @@ struct XEmboxOpConversion : public EmboxCommonConversion<fir::cg::XEmboxOp> {
// Lower bound is normalized to 0 for BIND(C) interoperability.
mlir::Value lb = zero;
const bool isaPointerOrAllocatable =
- eleTy.isa<fir::PointerType>() || eleTy.isa<fir::HeapType>();
+ mlir::isa<fir::PointerType, fir::HeapType>(eleTy);
    // Lower bound defaults to 1 for POINTER, ALLOCATABLE, and
// denormalized descriptors.
if (isaPointerOrAllocatable || !normalizedLowerBound(xbox))
@@ -1695,7 +1702,7 @@ struct XReboxOpConversion : public EmboxCommonConversion<fir::cg::XReboxOp> {
// Create new descriptor and fill its non-shape related data.
llvm::SmallVector<mlir::Value, 2> lenParams;
mlir::Type inputEleTy = getInputEleTy(rebox);
- if (auto charTy = inputEleTy.dyn_cast<fir::CharacterType>()) {
+ if (auto charTy = mlir::dyn_cast<fir::CharacterType>(inputEleTy)) {
if (charTy.hasConstantLen()) {
mlir::Value len =
genConstantIndex(loc, idxTy, rewriter, charTy.getLen());
@@ -1712,15 +1719,15 @@ struct XReboxOpConversion : public EmboxCommonConversion<fir::cg::XReboxOp> {
}
lenParams.emplace_back(len);
}
- } else if (auto recTy = inputEleTy.dyn_cast<fir::RecordType>()) {
+ } else if (auto recTy = mlir::dyn_cast<fir::RecordType>(inputEleTy)) {
if (recTy.getNumLenParams() != 0)
TODO(loc, "reboxing descriptor of derived type with length parameters");
}
// Rebox on polymorphic entities needs to carry over the dynamic type.
mlir::Value typeDescAddr;
- if (inputBoxTyPair.fir.isa<fir::ClassType>() &&
- rebox.getType().isa<fir::ClassType>())
+ if (mlir::isa<fir::ClassType>(inputBoxTyPair.fir) &&
+ mlir::isa<fir::ClassType>(rebox.getType()))
typeDescAddr =
loadTypeDescAddress(loc, inputBoxTyPair, loweredBox, rewriter);
@@ -1908,7 +1915,7 @@ private:
/// Return scalar element type of the input box.
static mlir::Type getInputEleTy(fir::cg::XReboxOp rebox) {
auto ty = fir::dyn_cast_ptrOrBoxEleTy(rebox.getBox().getType());
- if (auto seqTy = ty.dyn_cast<fir::SequenceType>())
+ if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(ty))
return seqTy.getEleTy();
return ty;
}
@@ -1936,7 +1943,7 @@ struct ValueOpCommon {
assert(ty && "type is null");
const auto end = indices.size();
for (std::remove_const_t<decltype(end)> i = 0; i < end; ++i) {
- if (auto seq = ty.dyn_cast<mlir::LLVM::LLVMArrayType>()) {
+ if (auto seq = mlir::dyn_cast<mlir::LLVM::LLVMArrayType>(ty)) {
const auto dim = getDimension(seq);
if (dim > 1) {
auto ub = std::min(i + dim, end);
@@ -1944,7 +1951,7 @@ struct ValueOpCommon {
i += dim - 1;
}
ty = getArrayElementType(seq);
- } else if (auto st = ty.dyn_cast<mlir::LLVM::LLVMStructType>()) {
+ } else if (auto st = mlir::dyn_cast<mlir::LLVM::LLVMStructType>(ty)) {
ty = st.getBody()[indices[i]];
} else {
llvm_unreachable("index into invalid type");
@@ -1957,13 +1964,13 @@ struct ValueOpCommon {
mlir::ArrayAttr arrAttr) {
llvm::SmallVector<int64_t> indices;
for (auto i = arrAttr.begin(), e = arrAttr.end(); i != e; ++i) {
- if (auto intAttr = i->dyn_cast<mlir::IntegerAttr>()) {
+ if (auto intAttr = mlir::dyn_cast<mlir::IntegerAttr>(*i)) {
indices.push_back(intAttr.getInt());
} else {
- auto fieldName = i->cast<mlir::StringAttr>().getValue();
+ auto fieldName = mlir::cast<mlir::StringAttr>(*i).getValue();
++i;
- auto ty = i->cast<mlir::TypeAttr>().getValue();
- auto index = ty.cast<fir::RecordType>().getFieldIndex(fieldName);
+ auto ty = mlir::cast<mlir::TypeAttr>(*i).getValue();
+ auto index = mlir::cast<fir::RecordType>(ty).getFieldIndex(fieldName);
indices.push_back(index);
}
}
@@ -1973,7 +1980,7 @@ struct ValueOpCommon {
private:
static mlir::Type getArrayElementType(mlir::LLVM::LLVMArrayType ty) {
auto eleTy = ty.getElementType();
- while (auto arrTy = eleTy.dyn_cast<mlir::LLVM::LLVMArrayType>())
+ while (auto arrTy = mlir::dyn_cast<mlir::LLVM::LLVMArrayType>(eleTy))
eleTy = arrTy.getElementType();
return eleTy;
}
@@ -2041,7 +2048,7 @@ struct InsertOnRangeOpConversion
auto type = adaptor.getOperands()[0].getType();
// Iteratively extract the array dimensions from the type.
- while (auto t = type.dyn_cast<mlir::LLVM::LLVMArrayType>()) {
+ while (auto t = mlir::dyn_cast<mlir::LLVM::LLVMArrayType>(type)) {
dims.push_back(t.getNumElements());
type = t.getElementType();
}
@@ -2107,7 +2114,8 @@ struct XArrayCoorOpConversion
mlir::Value offset = genConstantIndex(loc, idxTy, rewriter, 0);
const bool isShifted = !coor.getShift().empty();
const bool isSliced = !coor.getSlice().empty();
- const bool baseIsBoxed = coor.getMemref().getType().isa<fir::BaseBoxType>();
+ const bool baseIsBoxed =
+ mlir::isa<fir::BaseBoxType>(coor.getMemref().getType());
TypePair baseBoxTyPair =
baseIsBoxed ? getBoxTypePair(coor.getMemref().getType()) : TypePair{};
mlir::LLVM::IntegerOverflowFlags nsw =
@@ -2185,7 +2193,8 @@ struct XArrayCoorOpConversion
// components.
mlir::Type elementType =
getLlvmObjectTypeFromBoxType(coor.getMemref().getType());
- while (auto arrayTy = elementType.dyn_cast<mlir::LLVM::LLVMArrayType>())
+ while (auto arrayTy =
+ mlir::dyn_cast<mlir::LLVM::LLVMArrayType>(elementType))
elementType = arrayTy.getElementType();
args.clear();
args.push_back(0);
@@ -2275,11 +2284,12 @@ struct CoordinateOpConversion
}
// Boxed type - get the base pointer from the box
- if (baseObjectTy.dyn_cast<fir::BaseBoxType>())
+ if (mlir::dyn_cast<fir::BaseBoxType>(baseObjectTy))
return doRewriteBox(coor, operands, loc, rewriter);
// Reference, pointer or a heap type
- if (baseObjectTy.isa<fir::ReferenceType, fir::PointerType, fir::HeapType>())
+ if (mlir::isa<fir::ReferenceType, fir::PointerType, fir::HeapType>(
+ baseObjectTy))
return doRewriteRefOrPtr(coor, llvmObjectTy, operands, loc, rewriter);
return rewriter.notifyMatchFailure(
@@ -2295,7 +2305,7 @@ struct CoordinateOpConversion
}
static bool hasSubDimensions(mlir::Type type) {
- return type.isa<fir::SequenceType, fir::RecordType, mlir::TupleType>();
+ return mlir::isa<fir::SequenceType, fir::RecordType, mlir::TupleType>(type);
}
/// Check whether this form of `!fir.coordinate_of` is supported. These
@@ -2310,14 +2320,14 @@ struct CoordinateOpConversion
bool ptrEle = false;
for (; i < numOfCoors; ++i) {
mlir::Value nxtOpnd = coors[i];
- if (auto arrTy = type.dyn_cast<fir::SequenceType>()) {
+ if (auto arrTy = mlir::dyn_cast<fir::SequenceType>(type)) {
subEle = true;
i += arrTy.getDimension() - 1;
type = arrTy.getEleTy();
- } else if (auto recTy = type.dyn_cast<fir::RecordType>()) {
+ } else if (auto recTy = mlir::dyn_cast<fir::RecordType>(type)) {
subEle = true;
type = recTy.getType(getFieldNumber(recTy, nxtOpnd));
- } else if (auto tupTy = type.dyn_cast<mlir::TupleType>()) {
+ } else if (auto tupTy = mlir::dyn_cast<mlir::TupleType>(type)) {
subEle = true;
type = tupTy.getType(getConstantIntValue(nxtOpnd));
} else {
@@ -2335,14 +2345,14 @@ struct CoordinateOpConversion
static bool arraysHaveKnownShape(mlir::Type type, mlir::ValueRange coors) {
for (std::size_t i = 0, sz = coors.size(); i < sz; ++i) {
mlir::Value nxtOpnd = coors[i];
- if (auto arrTy = type.dyn_cast<fir::SequenceType>()) {
+ if (auto arrTy = mlir::dyn_cast<fir::SequenceType>(type)) {
if (fir::sequenceWithNonConstantShape(arrTy))
return false;
i += arrTy.getDimension() - 1;
type = arrTy.getEleTy();
- } else if (auto strTy = type.dyn_cast<fir::RecordType>()) {
+ } else if (auto strTy = mlir::dyn_cast<fir::RecordType>(type)) {
type = strTy.getType(getFieldNumber(strTy, nxtOpnd));
- } else if (auto strTy = type.dyn_cast<mlir::TupleType>()) {
+ } else if (auto strTy = mlir::dyn_cast<mlir::TupleType>(type)) {
type = strTy.getType(getConstantIntValue(nxtOpnd));
} else {
return true;
@@ -2357,7 +2367,8 @@ private:
mlir::Location loc,
mlir::ConversionPatternRewriter &rewriter) const {
mlir::Type boxObjTy = coor.getBaseType();
- assert(boxObjTy.dyn_cast<fir::BaseBoxType>() && "This is not a `fir.box`");
+ assert(mlir::dyn_cast<fir::BaseBoxType>(boxObjTy) &&
+ "This is not a `fir.box`");
TypePair boxTyPair = getBoxTypePair(boxObjTy);
mlir::Value boxBaseAddr = operands[0];
@@ -2399,7 +2410,7 @@ private:
mlir::LLVM::IntegerOverflowFlags::nsw;
for (unsigned i = 1, last = operands.size(); i < last; ++i) {
- if (auto arrTy = cpnTy.dyn_cast<fir::SequenceType>()) {
+ if (auto arrTy = mlir::dyn_cast<fir::SequenceType>(cpnTy)) {
if (i != 1)
TODO(loc, "fir.array nested inside other array and/or derived type");
// Applies byte strides from the box. Ignore lower bound from box
@@ -2421,7 +2432,7 @@ private:
llvm::ArrayRef<mlir::LLVM::GEPArg>{off});
i += arrTy.getDimension() - 1;
cpnTy = arrTy.getEleTy();
- } else if (auto recTy = cpnTy.dyn_cast<fir::RecordType>()) {
+ } else if (auto recTy = mlir::dyn_cast<fir::RecordType>(cpnTy)) {
mlir::Value nxtOpnd = operands[i];
cpnTy = recTy.getType(getFieldNumber(recTy, nxtOpnd));
auto llvmRecTy = lowerTy().convertType(recTy);
@@ -2456,7 +2467,7 @@ private:
// If only the column is `?`, then we can simply place the column value in
// the 0-th GEP position.
- if (auto arrTy = cpnTy.dyn_cast<fir::SequenceType>()) {
+ if (auto arrTy = mlir::dyn_cast<fir::SequenceType>(cpnTy)) {
if (!hasKnownShape) {
const unsigned sz = arrTy.getDimension();
if (arraysHaveKnownShape(arrTy.getEleTy(),
@@ -2500,29 +2511,29 @@ private:
dims = dimsLeft - 1;
continue;
}
- cpnTy = cpnTy.cast<fir::SequenceType>().getEleTy();
+ cpnTy = mlir::cast<fir::SequenceType>(cpnTy).getEleTy();
// append array range in reverse (FIR arrays are column-major)
offs.append(arrIdx.rbegin(), arrIdx.rend());
arrIdx.clear();
dims.reset();
continue;
}
- if (auto arrTy = cpnTy.dyn_cast<fir::SequenceType>()) {
+ if (auto arrTy = mlir::dyn_cast<fir::SequenceType>(cpnTy)) {
int d = arrTy.getDimension() - 1;
if (d > 0) {
dims = d;
arrIdx.push_back(nxtOpnd);
continue;
}
- cpnTy = cpnTy.cast<fir::SequenceType>().getEleTy();
+ cpnTy = mlir::cast<fir::SequenceType>(cpnTy).getEleTy();
offs.push_back(nxtOpnd);
continue;
}
// check if the i-th coordinate relates to a field
- if (auto recTy = cpnTy.dyn_cast<fir::RecordType>())
+ if (auto recTy = mlir::dyn_cast<fir::RecordType>(cpnTy))
cpnTy = recTy.getType(getFieldNumber(recTy, nxtOpnd));
- else if (auto tupTy = cpnTy.dyn_cast<mlir::TupleType>())
+ else if (auto tupTy = mlir::dyn_cast<mlir::TupleType>(cpnTy))
cpnTy = tupTy.getType(getConstantIntValue(nxtOpnd));
else
cpnTy = nullptr;
@@ -2551,7 +2562,7 @@ struct FieldIndexOpConversion : public fir::FIROpConversion<fir::FieldIndexOp> {
mlir::LogicalResult
matchAndRewrite(fir::FieldIndexOp field, OpAdaptor adaptor,
mlir::ConversionPatternRewriter &rewriter) const override {
- auto recTy = field.getOnType().cast<fir::RecordType>();
+ auto recTy = mlir::cast<fir::RecordType>(field.getOnType());
unsigned index = recTy.getFieldIndex(field.getFieldId());
if (!fir::hasDynamicSize(recTy)) {
@@ -2604,8 +2615,8 @@ struct TypeDescOpConversion : public fir::FIROpConversion<fir::TypeDescOp> {
matchAndRewrite(fir::TypeDescOp typeDescOp, OpAdaptor adaptor,
mlir::ConversionPatternRewriter &rewriter) const override {
mlir::Type inTy = typeDescOp.getInType();
- assert(inTy.isa<fir::RecordType>() && "expecting fir.type");
- auto recordType = inTy.dyn_cast<fir::RecordType>();
+ assert(mlir::isa<fir::RecordType>(inTy) && "expecting fir.type");
+ auto recordType = mlir::dyn_cast<fir::RecordType>(inTy);
auto module = typeDescOp.getOperation()->getParentOfType<mlir::ModuleOp>();
std::string typeDescName =
fir::NameUniquer::getTypeDescriptorName(recordType.getName());
@@ -2732,7 +2743,7 @@ struct GlobalOpConversion : public fir::FIROpConversion<fir::GlobalOp> {
mlir::Type vecType = mlir::VectorType::get(
insertOp.getType().getShape(), constant.getType());
auto denseAttr = mlir::DenseElementsAttr::get(
- vecType.cast<mlir::ShapedType>(), constant.getValue());
+ mlir::cast<mlir::ShapedType>(vecType), constant.getValue());
rewriter.setInsertionPointAfter(insertOp);
rewriter.replaceOpWithNewOp<mlir::arith::ConstantOp>(
insertOp, seqTyAttr, denseAttr);
@@ -2808,7 +2819,7 @@ struct LoadOpConversion : public fir::FIROpConversion<fir::LoadOp> {
matchAndRewrite(fir::LoadOp load, OpAdaptor adaptor,
mlir::ConversionPatternRewriter &rewriter) const override {
mlir::Type llvmLoadTy = convertObjectType(load.getType());
- if (auto boxTy = load.getType().dyn_cast<fir::BaseBoxType>()) {
+ if (auto boxTy = mlir::dyn_cast<fir::BaseBoxType>(load.getType())) {
      // fir.box is a special case because it is considered an ssa value in
// fir, but it is lowered as a pointer to a descriptor. So
// fir.ref<fir.box> and fir.box end up being the same llvm types and
@@ -2921,7 +2932,7 @@ struct SelectCaseOpConversion : public fir::FIROpConversion<fir::SelectCaseOp> {
llvm::ArrayRef<mlir::Attribute> cases = caseOp.getCases().getValue();
// Type can be CHARACTER, INTEGER, or LOGICAL (C1145)
auto ty = caseOp.getSelector().getType();
- if (ty.isa<fir::CharacterType>()) {
+ if (mlir::isa<fir::CharacterType>(ty)) {
TODO(caseOp.getLoc(), "fir.select_case codegen with character type");
return mlir::failure();
}
@@ -2935,25 +2946,25 @@ struct SelectCaseOpConversion : public fir::FIROpConversion<fir::SelectCaseOp> {
*caseOp.getCompareOperands(adaptor.getOperands(), t);
mlir::Value caseArg = *(cmpOps.value().begin());
mlir::Attribute attr = cases[t];
- if (attr.isa<fir::PointIntervalAttr>()) {
+ if (mlir::isa<fir::PointIntervalAttr>(attr)) {
auto cmp = rewriter.create<mlir::LLVM::ICmpOp>(
loc, mlir::LLVM::ICmpPredicate::eq, selector, caseArg);
genCaseLadderStep(loc, cmp, dest, destOps, rewriter);
continue;
}
- if (attr.isa<fir::LowerBoundAttr>()) {
+ if (mlir::isa<fir::LowerBoundAttr>(attr)) {
auto cmp = rewriter.create<mlir::LLVM::ICmpOp>(
loc, mlir::LLVM::ICmpPredicate::sle, caseArg, selector);
genCaseLadderStep(loc, cmp, dest, destOps, rewriter);
continue;
}
- if (attr.isa<fir::UpperBoundAttr>()) {
+ if (mlir::isa<fir::UpperBoundAttr>(attr)) {
auto cmp = rewriter.create<mlir::LLVM::ICmpOp>(
loc, mlir::LLVM::ICmpPredicate::sle, selector, caseArg);
genCaseLadderStep(loc, cmp, dest, destOps, rewriter);
continue;
}
- if (attr.isa<fir::ClosedIntervalAttr>()) {
+ if (mlir::isa<fir::ClosedIntervalAttr>(attr)) {
auto cmp = rewriter.create<mlir::LLVM::ICmpOp>(
loc, mlir::LLVM::ICmpPredicate::sle, caseArg, selector);
auto *thisBlock = rewriter.getInsertionBlock();
@@ -2969,7 +2980,7 @@ struct SelectCaseOpConversion : public fir::FIROpConversion<fir::SelectCaseOp> {
rewriter.setInsertionPointToEnd(newBlock2);
continue;
}
- assert(attr.isa<mlir::UnitAttr>());
+ assert(mlir::isa<mlir::UnitAttr>(attr));
assert((t + 1 == conds) && "unit must be last");
genBrOp(caseOp, dest, destOps, rewriter);
}
@@ -2997,13 +3008,13 @@ static void selectMatchAndRewrite(const fir::LLVMTypeConverter &lowering,
mlir::Block *dest = select.getSuccessor(t);
auto destOps = select.getSuccessorOperands(adaptor.getOperands(), t);
const mlir::Attribute &attr = cases[t];
- if (auto intAttr = attr.template dyn_cast<mlir::IntegerAttr>()) {
+ if (auto intAttr = mlir::dyn_cast<mlir::IntegerAttr>(attr)) {
destinations.push_back(dest);
destinationsOperands.push_back(destOps ? *destOps : mlir::ValueRange{});
caseValues.push_back(intAttr.getInt());
continue;
}
- assert(attr.template dyn_cast_or_null<mlir::UnitAttr>());
+ assert(mlir::dyn_cast_or_null<mlir::UnitAttr>(attr));
assert((t + 1 == conds) && "unit must be last");
defaultDestination = dest;
defaultOperands = destOps ? *destOps : mlir::ValueRange{};
@@ -3071,7 +3082,7 @@ struct StoreOpConversion : public fir::FIROpConversion<fir::StoreOp> {
mlir::Location loc = store.getLoc();
mlir::Type storeTy = store.getValue().getType();
mlir::LLVM::StoreOp newStoreOp;
- if (auto boxTy = storeTy.dyn_cast<fir::BaseBoxType>()) {
+ if (auto boxTy = mlir::dyn_cast<fir::BaseBoxType>(storeTy)) {
// fir.box value is actually in memory, load it first before storing it.
mlir::Type llvmBoxTy = lowerTy().convertBoxTypeAsStruct(boxTy);
auto val = rewriter.create<mlir::LLVM::LoadOp>(loc, llvmBoxTy,
@@ -3186,9 +3197,9 @@ struct IsPresentOpConversion : public fir::FIROpConversion<fir::IsPresentOp> {
mlir::Location loc = isPresent.getLoc();
auto ptr = adaptor.getOperands()[0];
- if (isPresent.getVal().getType().isa<fir::BoxCharType>()) {
+ if (mlir::isa<fir::BoxCharType>(isPresent.getVal().getType())) {
[[maybe_unused]] auto structTy =
- ptr.getType().cast<mlir::LLVM::LLVMStructType>();
+ mlir::cast<mlir::LLVM::LLVMStructType>(ptr.getType());
assert(!structTy.isOpaque() && !structTy.getBody().empty());
ptr = rewriter.create<mlir::LLVM::ExtractValueOp>(loc, ptr, 0);
@@ -3214,8 +3225,8 @@ struct AbsentOpConversion : public fir::FIROpConversion<fir::AbsentOp> {
mlir::Type ty = convertType(absent.getType());
mlir::Location loc = absent.getLoc();
- if (absent.getType().isa<fir::BoxCharType>()) {
- auto structTy = ty.cast<mlir::LLVM::LLVMStructType>();
+ if (mlir::isa<fir::BoxCharType>(absent.getType())) {
+ auto structTy = mlir::cast<mlir::LLVM::LLVMStructType>(ty);
assert(!structTy.isOpaque() && !structTy.getBody().empty());
auto undefStruct = rewriter.create<mlir::LLVM::UndefOp>(loc, ty);
auto nullField =
diff --git a/flang/lib/Optimizer/CodeGen/FIROpPatterns.cpp b/flang/lib/Optimizer/CodeGen/FIROpPatterns.cpp
index 26871d888815..d6dac4998fdc 100644
--- a/flang/lib/Optimizer/CodeGen/FIROpPatterns.cpp
+++ b/flang/lib/Optimizer/CodeGen/FIROpPatterns.cpp
@@ -20,7 +20,7 @@ static inline mlir::Type getLlvmPtrType(mlir::MLIRContext *context,
}
static unsigned getTypeDescFieldId(mlir::Type ty) {
- auto isArray = fir::dyn_cast_ptrOrBoxEleTy(ty).isa<fir::SequenceType>();
+ auto isArray = mlir::isa<fir::SequenceType>(fir::dyn_cast_ptrOrBoxEleTy(ty));
return isArray ? kOptTypePtrPosInBox : kDimsPosInBox;
}
@@ -37,7 +37,7 @@ ConvertFIRToLLVMPattern::ConvertFIRToLLVMPattern(
// reference.
mlir::Type
ConvertFIRToLLVMPattern::convertObjectType(mlir::Type firType) const {
- if (auto boxTy = firType.dyn_cast<fir::BaseBoxType>())
+ if (auto boxTy = mlir::dyn_cast<fir::BaseBoxType>(firType))
return lowerTy().convertBoxTypeAsStruct(boxTy);
return lowerTy().convertType(firType);
}
@@ -69,7 +69,7 @@ ConvertFIRToLLVMPattern::integerCast(mlir::Location loc,
auto valTy = val.getType();
// If the value was not yet lowered, lower its type so that it can
// be used in getPrimitiveTypeSizeInBits.
- if (!valTy.isa<mlir::IntegerType>())
+ if (!mlir::isa<mlir::IntegerType>(valTy))
valTy = convertType(valTy);
auto toSize = mlir::LLVM::getPrimitiveTypeSizeInBits(ty);
auto fromSize = mlir::LLVM::getPrimitiveTypeSizeInBits(valTy);
@@ -91,7 +91,7 @@ ConvertFIRToLLVMPattern::getBoxTypePair(mlir::Type firBoxTy) const {
mlir::Value ConvertFIRToLLVMPattern::getValueFromBox(
mlir::Location loc, TypePair boxTy, mlir::Value box, mlir::Type resultTy,
mlir::ConversionPatternRewriter &rewriter, int boxValue) const {
- if (box.getType().isa<mlir::LLVM::LLVMPointerType>()) {
+ if (mlir::isa<mlir::LLVM::LLVMPointerType>(box.getType())) {
auto pty = getLlvmPtrType(resultTy.getContext());
auto p = rewriter.create<mlir::LLVM::GEPOp>(
loc, pty, boxTy.llvm, box,
@@ -133,7 +133,7 @@ llvm::SmallVector<mlir::Value, 3> ConvertFIRToLLVMPattern::getDimsFromBox(
mlir::Value ConvertFIRToLLVMPattern::loadDimFieldFromBox(
mlir::Location loc, TypePair boxTy, mlir::Value box, mlir::Value dim,
int off, mlir::Type ty, mlir::ConversionPatternRewriter &rewriter) const {
- assert(box.getType().isa<mlir::LLVM::LLVMPointerType>() &&
+ assert(mlir::isa<mlir::LLVM::LLVMPointerType>(box.getType()) &&
"descriptor inquiry with runtime dim can only be done on descriptor "
"in memory");
mlir::LLVM::GEPOp p = genGEP(loc, boxTy.llvm, rewriter, box, 0,
@@ -146,7 +146,7 @@ mlir::Value ConvertFIRToLLVMPattern::loadDimFieldFromBox(
mlir::Value ConvertFIRToLLVMPattern::getDimFieldFromBox(
mlir::Location loc, TypePair boxTy, mlir::Value box, int dim, int off,
mlir::Type ty, mlir::ConversionPatternRewriter &rewriter) const {
- if (box.getType().isa<mlir::LLVM::LLVMPointerType>()) {
+ if (mlir::isa<mlir::LLVM::LLVMPointerType>(box.getType())) {
mlir::LLVM::GEPOp p = genGEP(loc, boxTy.llvm, rewriter, box, 0,
static_cast<int>(kDimsPosInBox), dim, off);
auto loadOp = rewriter.create<mlir::LLVM::LoadOp>(loc, ty, p);
@@ -184,12 +184,12 @@ mlir::Value ConvertFIRToLLVMPattern::getElementSizeFromBox(
mlir::Type ConvertFIRToLLVMPattern::getBoxEleTy(
mlir::Type type, llvm::ArrayRef<std::int64_t> indexes) const {
for (unsigned i : indexes) {
- if (auto t = type.dyn_cast<mlir::LLVM::LLVMStructType>()) {
+ if (auto t = mlir::dyn_cast<mlir::LLVM::LLVMStructType>(type)) {
assert(!t.isOpaque() && i < t.getBody().size());
type = t.getBody()[i];
- } else if (auto t = type.dyn_cast<mlir::LLVM::LLVMArrayType>()) {
+ } else if (auto t = mlir::dyn_cast<mlir::LLVM::LLVMArrayType>(type)) {
type = t.getElementType();
- } else if (auto t = type.dyn_cast<mlir::VectorType>()) {
+ } else if (auto t = mlir::dyn_cast<mlir::VectorType>(type)) {
type = t.getElementType();
} else {
fir::emitFatalError(mlir::UnknownLoc::get(type.getContext()),
@@ -243,6 +243,9 @@ ConvertFIRToLLVMPattern::getBlockForAllocaInsert(mlir::Operation *op) const {
return iface.getAllocaBlock();
if (auto llvmFuncOp = mlir::dyn_cast<mlir::LLVM::LLVMFuncOp>(op))
return &llvmFuncOp.front();
+ if (auto ompPrivateOp = mlir::dyn_cast<mlir::omp::PrivateClauseOp>(op))
+ return &ompPrivateOp.getAllocRegion().front();
+
return getBlockForAllocaInsert(op->getParentOp());
}
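
The only functional change in this file is the new mlir::omp::PrivateClauseOp case above: allocas created while lowering an omp.private op now land in the entry block of its alloc region rather than climbing out to an enclosing function. A condensed sketch of the recursion, using only names from the hunk (the helper name is hypothetical and the alloca-block interface case is elided):

    // Walk up the parent chain until an op that owns an alloca block is found.
    static mlir::Block *findAllocaBlock(mlir::Operation *op) {
      if (auto llvmFuncOp = mlir::dyn_cast<mlir::LLVM::LLVMFuncOp>(op))
        return &llvmFuncOp.front();                    // function entry block
      if (auto ompPrivateOp = mlir::dyn_cast<mlir::omp::PrivateClauseOp>(op))
        return &ompPrivateOp.getAllocRegion().front(); // stay inside the region
      return findAllocaBlock(op->getParentOp());       // keep climbing
    }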
diff --git a/flang/lib/Optimizer/CodeGen/PreCGRewrite.cpp b/flang/lib/Optimizer/CodeGen/PreCGRewrite.cpp
index 665bf09b8fc3..ce7ee22d5d77 100644
--- a/flang/lib/Optimizer/CodeGen/PreCGRewrite.cpp
+++ b/flang/lib/Optimizer/CodeGen/PreCGRewrite.cpp
@@ -86,10 +86,10 @@ public:
// If the embox does not include a shape, then do not convert it
if (auto shapeVal = embox.getShape())
return rewriteDynamicShape(embox, rewriter, shapeVal);
- if (embox.getType().isa<fir::ClassType>())
+ if (mlir::isa<fir::ClassType>(embox.getType()))
TODO(embox.getLoc(), "embox conversion for fir.class type");
- if (auto boxTy = embox.getType().dyn_cast<fir::BoxType>())
- if (auto seqTy = boxTy.getEleTy().dyn_cast<fir::SequenceType>())
+ if (auto boxTy = mlir::dyn_cast<fir::BoxType>(embox.getType()))
+ if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(boxTy.getEleTy()))
if (!seqTy.hasDynamicExtents())
return rewriteStaticShape(embox, rewriter, seqTy);
return mlir::failure();
@@ -294,10 +294,9 @@ public:
target.addIllegalOp<fir::ReboxOp>();
target.addIllegalOp<fir::DeclareOp>();
target.addDynamicallyLegalOp<fir::EmboxOp>([](fir::EmboxOp embox) {
- return !(embox.getShape() || embox.getType()
- .cast<fir::BaseBoxType>()
- .getEleTy()
- .isa<fir::SequenceType>());
+ return !(embox.getShape() ||
+ mlir::isa<fir::SequenceType>(
+ mlir::cast<fir::BaseBoxType>(embox.getType()).getEleTy()));
});
mlir::RewritePatternSet patterns(&context);
fir::populatePreCGRewritePatterns(patterns);
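
The bulk of this patch is the mechanical migration visible in the EmboxOp legality callback above: the deprecated member-function casts on mlir::Type and mlir::Attribute become the free functions from the mlir namespace. A minimal sketch of the correspondence (the helper name and its variable are hypothetical; the accessors are the ones used in the hunk):

    //   ty.isa<T>()              ->  mlir::isa<T>(ty)
    //   ty.dyn_cast<T>()         ->  mlir::dyn_cast<T>(ty)
    //   ty.dyn_cast_or_null<T>() ->  mlir::dyn_cast_or_null<T>(ty)
    //   ty.cast<T>()             ->  mlir::cast<T>(ty)
    static bool isStaticShapeArrayBox(mlir::Type ty) {
      // dyn_cast yields a null handle on mismatch, so it doubles as a test.
      if (auto boxTy = mlir::dyn_cast<fir::BoxType>(ty))
        if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(boxTy.getEleTy()))
          return !seqTy.hasDynamicExtents();
      return false;
    }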
diff --git a/flang/lib/Optimizer/CodeGen/TBAABuilder.cpp b/flang/lib/Optimizer/CodeGen/TBAABuilder.cpp
index b1b0e9b766a6..a21384e8d594 100644
--- a/flang/lib/Optimizer/CodeGen/TBAABuilder.cpp
+++ b/flang/lib/Optimizer/CodeGen/TBAABuilder.cpp
@@ -120,7 +120,7 @@ void TBAABuilder::attachTBAATag(AliasAnalysisOpInterface op, Type baseFIRType,
// with both data and descriptor accesses.
// Conservatively set any-access tag if there is any descriptor member.
tbaaTagSym = getAnyAccessTag(func);
- } else if (baseFIRType.isa<fir::BaseBoxType>()) {
+ } else if (mlir::isa<fir::BaseBoxType>(baseFIRType)) {
tbaaTagSym = getBoxAccessTag(baseFIRType, accessFIRType, gep, func);
} else {
tbaaTagSym = getDataAccessTag(baseFIRType, accessFIRType, gep, func);
diff --git a/flang/lib/Optimizer/CodeGen/Target.cpp b/flang/lib/Optimizer/CodeGen/Target.cpp
index cea7a1f97f41..652e2bddc1b8 100644
--- a/flang/lib/Optimizer/CodeGen/Target.cpp
+++ b/flang/lib/Optimizer/CodeGen/Target.cpp
@@ -41,9 +41,9 @@ llvm::StringRef Attributes::getIntExtensionAttrName() const {
static const llvm::fltSemantics &floatToSemantics(const KindMapping &kindMap,
mlir::Type type) {
assert(isa_real(type));
- if (auto ty = type.dyn_cast<fir::RealType>())
+ if (auto ty = mlir::dyn_cast<fir::RealType>(type))
return kindMap.getFloatSemantics(ty.getFKind());
- return type.cast<mlir::FloatType>().getFloatSemantics();
+ return mlir::cast<mlir::FloatType>(type).getFloatSemantics();
}
static void typeTodo(const llvm::fltSemantics *sem, mlir::Location loc,
diff --git a/flang/lib/Optimizer/CodeGen/TargetRewrite.cpp b/flang/lib/Optimizer/CodeGen/TargetRewrite.cpp
index 7bf31ec38695..616de78d0026 100644
--- a/flang/lib/Optimizer/CodeGen/TargetRewrite.cpp
+++ b/flang/lib/Optimizer/CodeGen/TargetRewrite.cpp
@@ -137,7 +137,7 @@ public:
if (!hasPortableSignature(dispatch.getFunctionType(), op))
convertCallOp(dispatch);
} else if (auto addr = mlir::dyn_cast<fir::AddrOfOp>(op)) {
- if (addr.getType().isa<mlir::FunctionType>() &&
+ if (mlir::isa<mlir::FunctionType>(addr.getType()) &&
!hasPortableSignature(addr.getType(), op))
convertAddrOp(addr);
}
@@ -601,7 +601,7 @@ public:
/// Taking the address of a function. Modify the signature as needed.
void convertAddrOp(fir::AddrOfOp addrOp) {
rewriter->setInsertionPoint(addrOp);
- auto addrTy = addrOp.getType().cast<mlir::FunctionType>();
+ auto addrTy = mlir::cast<mlir::FunctionType>(addrOp.getType());
fir::CodeGenSpecifics::Marshalling newInTyAndAttrs;
llvm::SmallVector<mlir::Type> newResTys;
auto loc = addrOp.getLoc();
@@ -705,22 +705,23 @@ public:
/// return `true`. Otherwise, the signature is not portable and `false` is
/// returned.
bool hasPortableSignature(mlir::Type signature, mlir::Operation *op) {
- assert(signature.isa<mlir::FunctionType>());
- auto func = signature.dyn_cast<mlir::FunctionType>();
+ assert(mlir::isa<mlir::FunctionType>(signature));
+ auto func = mlir::dyn_cast<mlir::FunctionType>(signature);
bool hasCCallingConv = isFuncWithCCallingConvention(op);
for (auto ty : func.getResults())
- if ((ty.isa<fir::BoxCharType>() && !noCharacterConversion) ||
+ if ((mlir::isa<fir::BoxCharType>(ty) && !noCharacterConversion) ||
(fir::isa_complex(ty) && !noComplexConversion) ||
- (ty.isa<mlir::IntegerType>() && hasCCallingConv)) {
+ (mlir::isa<mlir::IntegerType>(ty) && hasCCallingConv)) {
LLVM_DEBUG(llvm::dbgs() << "rewrite " << signature << " for target\n");
return false;
}
for (auto ty : func.getInputs())
- if (((ty.isa<fir::BoxCharType>() || fir::isCharacterProcedureTuple(ty)) &&
+ if (((mlir::isa<fir::BoxCharType>(ty) ||
+ fir::isCharacterProcedureTuple(ty)) &&
!noCharacterConversion) ||
(fir::isa_complex(ty) && !noComplexConversion) ||
- (ty.isa<mlir::IntegerType>() && hasCCallingConv) ||
- (ty.isa<fir::RecordType>() && !noStructConversion)) {
+ (mlir::isa<mlir::IntegerType>(ty) && hasCCallingConv) ||
+ (mlir::isa<fir::RecordType>(ty) && !noStructConversion)) {
LLVM_DEBUG(llvm::dbgs() << "rewrite " << signature << " for target\n");
return false;
}
@@ -740,7 +741,7 @@ public:
/// Rewrite the signatures and body of the `FuncOp`s in the module for
/// the immediately subsequent target code gen.
void convertSignature(mlir::func::FuncOp func) {
- auto funcTy = func.getFunctionType().cast<mlir::FunctionType>();
+ auto funcTy = mlir::cast<mlir::FunctionType>(func.getFunctionType());
if (hasPortableSignature(funcTy, func) && !hasHostAssociations(func))
return;
llvm::SmallVector<mlir::Type> newResTys;
diff --git a/flang/lib/Optimizer/CodeGen/TypeConverter.cpp b/flang/lib/Optimizer/CodeGen/TypeConverter.cpp
index 8fa423f35806..fb2ec3f0b2f5 100644
--- a/flang/lib/Optimizer/CodeGen/TypeConverter.cpp
+++ b/flang/lib/Optimizer/CodeGen/TypeConverter.cpp
@@ -103,10 +103,10 @@ LLVMTypeConverter::LLVMTypeConverter(mlir::ModuleOp module, bool applyTBAA,
for (auto mem : tuple.getTypes()) {
// Prevent fir.box from degenerating to a pointer to a descriptor in the
// context of a tuple type.
- if (auto box = mem.dyn_cast<fir::BaseBoxType>())
+ if (auto box = mlir::dyn_cast<fir::BaseBoxType>(mem))
members.push_back(convertBoxTypeAsStruct(box));
else
- members.push_back(convertType(mem).cast<mlir::Type>());
+ members.push_back(mlir::cast<mlir::Type>(convertType(mem)));
}
return mlir::LLVM::LLVMStructType::getLiteral(&getContext(), members,
/*isPacked=*/false);
@@ -181,10 +181,10 @@ std::optional<mlir::LogicalResult> LLVMTypeConverter::convertRecordType(
for (auto mem : derived.getTypeList()) {
// Prevent fir.box from degenerating to a pointer to a descriptor in the
// context of a record type.
- if (auto box = mem.second.dyn_cast<fir::BaseBoxType>())
+ if (auto box = mlir::dyn_cast<fir::BaseBoxType>(mem.second))
members.push_back(convertBoxTypeAsStruct(box));
else
- members.push_back(convertType(mem.second).cast<mlir::Type>());
+ members.push_back(mlir::cast<mlir::Type>(convertType(mem.second)));
}
if (mlir::failed(st.setBody(members, /*isPacked=*/false)))
return mlir::failure();
@@ -196,7 +196,7 @@ std::optional<mlir::LogicalResult> LLVMTypeConverter::convertRecordType(
// Extended descriptors are required for derived types.
bool LLVMTypeConverter::requiresExtendedDesc(mlir::Type boxElementType) const {
auto eleTy = fir::unwrapSequenceType(boxElementType);
- return eleTy.isa<fir::RecordType>();
+ return mlir::isa<fir::RecordType>(eleTy);
}
// This corresponds to the descriptor as defined in ISO_Fortran_binding.h and
@@ -211,7 +211,8 @@ mlir::Type LLVMTypeConverter::convertBoxTypeAsStruct(BaseBoxType box,
ele = removeIndirection;
auto eleTy = convertType(ele);
// base_addr*
- if (ele.isa<SequenceType>() && eleTy.isa<mlir::LLVM::LLVMPointerType>())
+ if (mlir::isa<SequenceType>(ele) &&
+ mlir::isa<mlir::LLVM::LLVMPointerType>(eleTy))
dataDescFields.push_back(eleTy);
else
dataDescFields.push_back(
@@ -236,7 +237,7 @@ mlir::Type LLVMTypeConverter::convertBoxTypeAsStruct(BaseBoxType box,
getDescFieldTypeModel<kF18AddendumPosInBox>()(&getContext()));
// [dims]
if (rank == unknownRank()) {
- if (auto seqTy = ele.dyn_cast<SequenceType>())
+ if (auto seqTy = mlir::dyn_cast<SequenceType>(ele))
rank = seqTy.getDimension();
else
rank = 0;
@@ -252,7 +253,8 @@ mlir::Type LLVMTypeConverter::convertBoxTypeAsStruct(BaseBoxType box,
auto rowTy =
getExtendedDescFieldTypeModel<kOptRowTypePosInBox>()(&getContext());
dataDescFields.push_back(mlir::LLVM::LLVMArrayType::get(rowTy, 1));
- if (auto recTy = fir::unwrapSequenceType(ele).dyn_cast<fir::RecordType>())
+ if (auto recTy =
+ mlir::dyn_cast<fir::RecordType>(fir::unwrapSequenceType(ele)))
if (recTy.getNumLenParams() > 0) {
// The descriptor design needs to be clarified regarding the number of
// length parameters in the addendum. Since it can change for
diff --git a/flang/lib/Optimizer/Dialect/FIRAttr.cpp b/flang/lib/Optimizer/Dialect/FIRAttr.cpp
index e43710f5627e..9ea3a0568f69 100644
--- a/flang/lib/Optimizer/Dialect/FIRAttr.cpp
+++ b/flang/lib/Optimizer/Dialect/FIRAttr.cpp
@@ -264,23 +264,23 @@ void fir::FortranVariableFlagsAttr::print(mlir::AsmPrinter &printer) const {
void fir::printFirAttribute(FIROpsDialect *dialect, mlir::Attribute attr,
mlir::DialectAsmPrinter &p) {
auto &os = p.getStream();
- if (auto exact = attr.dyn_cast<fir::ExactTypeAttr>()) {
+ if (auto exact = mlir::dyn_cast<fir::ExactTypeAttr>(attr)) {
os << fir::ExactTypeAttr::getAttrName() << '<';
p.printType(exact.getType());
os << '>';
- } else if (auto sub = attr.dyn_cast<fir::SubclassAttr>()) {
+ } else if (auto sub = mlir::dyn_cast<fir::SubclassAttr>(attr)) {
os << fir::SubclassAttr::getAttrName() << '<';
p.printType(sub.getType());
os << '>';
- } else if (attr.dyn_cast_or_null<fir::PointIntervalAttr>()) {
+ } else if (mlir::dyn_cast_or_null<fir::PointIntervalAttr>(attr)) {
os << fir::PointIntervalAttr::getAttrName();
- } else if (attr.dyn_cast_or_null<fir::ClosedIntervalAttr>()) {
+ } else if (mlir::dyn_cast_or_null<fir::ClosedIntervalAttr>(attr)) {
os << fir::ClosedIntervalAttr::getAttrName();
- } else if (attr.dyn_cast_or_null<fir::LowerBoundAttr>()) {
+ } else if (mlir::dyn_cast_or_null<fir::LowerBoundAttr>(attr)) {
os << fir::LowerBoundAttr::getAttrName();
- } else if (attr.dyn_cast_or_null<fir::UpperBoundAttr>()) {
+ } else if (mlir::dyn_cast_or_null<fir::UpperBoundAttr>(attr)) {
os << fir::UpperBoundAttr::getAttrName();
- } else if (auto a = attr.dyn_cast_or_null<fir::RealAttr>()) {
+ } else if (auto a = mlir::dyn_cast_or_null<fir::RealAttr>(attr)) {
os << fir::RealAttr::getAttrName() << '<' << a.getFKind() << ", i x";
llvm::SmallString<40> ss;
a.getValue().bitcastToAPInt().toStringUnsigned(ss, 16);
diff --git a/flang/lib/Optimizer/Dialect/FIROps.cpp b/flang/lib/Optimizer/Dialect/FIROps.cpp
index 24af94f9b90a..6773d0adced0 100644
--- a/flang/lib/Optimizer/Dialect/FIROps.cpp
+++ b/flang/lib/Optimizer/Dialect/FIROps.cpp
@@ -57,7 +57,7 @@ static void propagateAttributes(mlir::Operation *fromOp,
static bool verifyInType(mlir::Type inType,
llvm::SmallVectorImpl<llvm::StringRef> &visited,
unsigned dynamicExtents = 0) {
- if (auto st = inType.dyn_cast<fir::SequenceType>()) {
+ if (auto st = mlir::dyn_cast<fir::SequenceType>(inType)) {
auto shape = st.getShape();
if (shape.size() == 0)
return true;
@@ -67,7 +67,7 @@ static bool verifyInType(mlir::Type inType,
if (dynamicExtents-- == 0)
return true;
}
- } else if (auto rt = inType.dyn_cast<fir::RecordType>()) {
+ } else if (auto rt = mlir::dyn_cast<fir::RecordType>(inType)) {
// don't recurse if we're already visiting this one
if (llvm::is_contained(visited, rt.getName()))
return false;
@@ -84,13 +84,13 @@ static bool verifyInType(mlir::Type inType,
static bool verifyTypeParamCount(mlir::Type inType, unsigned numParams) {
auto ty = fir::unwrapSequenceType(inType);
if (numParams > 0) {
- if (auto recTy = ty.dyn_cast<fir::RecordType>())
+ if (auto recTy = mlir::dyn_cast<fir::RecordType>(ty))
return numParams != recTy.getNumLenParams();
- if (auto chrTy = ty.dyn_cast<fir::CharacterType>())
+ if (auto chrTy = mlir::dyn_cast<fir::CharacterType>(ty))
return !(numParams == 1 && chrTy.hasDynamicLen());
return true;
}
- if (auto chrTy = ty.dyn_cast<fir::CharacterType>())
+ if (auto chrTy = mlir::dyn_cast<fir::CharacterType>(ty))
return !chrTy.hasConstantLen();
return false;
}
@@ -171,13 +171,13 @@ static void printAllocatableOp(mlir::OpAsmPrinter &p, OP &op) {
/// Create a legal memory reference as return type
static mlir::Type wrapAllocaResultType(mlir::Type intype) {
// FIR semantics: memory references to memory references are disallowed
- if (intype.isa<fir::ReferenceType>())
+ if (mlir::isa<fir::ReferenceType>(intype))
return {};
return fir::ReferenceType::get(intype);
}
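// Illustration (FIR assembly, sketch): fir.alloca wraps the allocated type
// in !fir.ref<>, so allocating a reference type would produce a reference to
// a reference, which the check above rejects by returning a null type:
//   %1 = fir.alloca i32              // result type: !fir.ref<i32>
//   %2 = fir.alloca !fir.ref<i32>    // invalid FIR per the rule above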
mlir::Type fir::AllocaOp::getAllocatedType() {
- return getType().cast<fir::ReferenceType>().getEleTy();
+ return mlir::cast<fir::ReferenceType>(getType()).getEleTy();
}
mlir::Type fir::AllocaOp::getRefTy(mlir::Type ty) {
@@ -270,7 +270,7 @@ mlir::LogicalResult fir::AllocaOp::verify() {
if (verifyTypeParamCount(getInType(), numLenParams()))
return emitOpError("LEN params do not correspond to type");
mlir::Type outType = getType();
- if (!outType.isa<fir::ReferenceType>())
+ if (!mlir::isa<fir::ReferenceType>(outType))
return emitOpError("must be a !fir.ref type");
if (fir::isa_unknown_size_box(fir::dyn_cast_ptrEleTy(outType)))
return emitOpError("cannot allocate !fir.box of unknown rank or type");
@@ -286,14 +286,14 @@ static mlir::Type wrapAllocMemResultType(mlir::Type intype) {
// Fortran semantics: C852 an entity cannot be both ALLOCATABLE and POINTER
// 8.5.3 note 1 prohibits ALLOCATABLE procedures as well
// FIR semantics: one may not allocate a memory reference value
- if (intype.isa<fir::ReferenceType, fir::HeapType, fir::PointerType,
- mlir::FunctionType>())
+ if (mlir::isa<fir::ReferenceType, fir::HeapType, fir::PointerType,
+ mlir::FunctionType>(intype))
return {};
return fir::HeapType::get(intype);
}
mlir::Type fir::AllocMemOp::getAllocatedType() {
- return getType().cast<fir::HeapType>().getEleTy();
+ return mlir::cast<fir::HeapType>(getType()).getEleTy();
}
mlir::Type fir::AllocMemOp::getRefTy(mlir::Type ty) {
@@ -348,7 +348,7 @@ mlir::LogicalResult fir::AllocMemOp::verify() {
if (verifyTypeParamCount(getInType(), numLenParams()))
return emitOpError("LEN params do not correspond to type");
mlir::Type outType = getType();
- if (!outType.dyn_cast<fir::HeapType>())
+ if (!mlir::dyn_cast<fir::HeapType>(outType))
return emitOpError("must be a !fir.heap type");
if (fir::isa_unknown_size_box(fir::dyn_cast_ptrEleTy(outType)))
return emitOpError("cannot allocate !fir.box of unknown rank or type");
@@ -364,13 +364,13 @@ mlir::LogicalResult fir::AllocMemOp::verify() {
static bool validTypeParams(mlir::Type dynTy, mlir::ValueRange typeParams) {
dynTy = fir::unwrapAllRefAndSeqType(dynTy);
// A box value will contain type parameter values itself.
- if (dynTy.isa<fir::BoxType>())
+ if (mlir::isa<fir::BoxType>(dynTy))
return typeParams.size() == 0;
// Derived type must have all type parameters satisfied.
- if (auto recTy = dynTy.dyn_cast<fir::RecordType>())
+ if (auto recTy = mlir::dyn_cast<fir::RecordType>(dynTy))
return typeParams.size() == recTy.getNumLenParams();
// Characters with non-constant LEN must have a type parameter value.
- if (auto charTy = dynTy.dyn_cast<fir::CharacterType>())
+ if (auto charTy = mlir::dyn_cast<fir::CharacterType>(dynTy))
if (charTy.hasDynamicLen())
return typeParams.size() == 1;
// Otherwise, any type parameters are invalid.
@@ -379,7 +379,7 @@ static bool validTypeParams(mlir::Type dynTy, mlir::ValueRange typeParams) {
mlir::LogicalResult fir::ArrayCoorOp::verify() {
auto eleTy = fir::dyn_cast_ptrOrBoxEleTy(getMemref().getType());
- auto arrTy = eleTy.dyn_cast<fir::SequenceType>();
+ auto arrTy = mlir::dyn_cast<fir::SequenceType>(eleTy);
if (!arrTy)
return emitOpError("must be a reference to an array");
auto arrDim = arrTy.getDimension();
@@ -387,14 +387,14 @@ mlir::LogicalResult fir::ArrayCoorOp::verify() {
if (auto shapeOp = getShape()) {
auto shapeTy = shapeOp.getType();
unsigned shapeTyRank = 0;
- if (auto s = shapeTy.dyn_cast<fir::ShapeType>()) {
+ if (auto s = mlir::dyn_cast<fir::ShapeType>(shapeTy)) {
shapeTyRank = s.getRank();
- } else if (auto ss = shapeTy.dyn_cast<fir::ShapeShiftType>()) {
+ } else if (auto ss = mlir::dyn_cast<fir::ShapeShiftType>(shapeTy)) {
shapeTyRank = ss.getRank();
} else {
- auto s = shapeTy.cast<fir::ShiftType>();
+ auto s = mlir::cast<fir::ShiftType>(shapeTy);
shapeTyRank = s.getRank();
- if (!getMemref().getType().isa<fir::BaseBoxType>())
+ if (!mlir::isa<fir::BaseBoxType>(getMemref().getType()))
return emitOpError("shift can only be provided with fir.box memref");
}
if (arrDim && arrDim != shapeTyRank)
@@ -407,7 +407,7 @@ mlir::LogicalResult fir::ArrayCoorOp::verify() {
if (auto sl = mlir::dyn_cast_or_null<fir::SliceOp>(sliceOp.getDefiningOp()))
if (!sl.getSubstr().empty())
return emitOpError("array_coor cannot take a slice with substring");
- if (auto sliceTy = sliceOp.getType().dyn_cast<fir::SliceType>())
+ if (auto sliceTy = mlir::dyn_cast<fir::SliceType>(sliceOp.getType()))
if (sliceTy.getRank() != arrDim)
return emitOpError("rank of dimension in slice mismatched");
}
@@ -422,13 +422,13 @@ mlir::LogicalResult fir::ArrayCoorOp::verify() {
//===----------------------------------------------------------------------===//
static mlir::Type adjustedElementType(mlir::Type t) {
- if (auto ty = t.dyn_cast<fir::ReferenceType>()) {
+ if (auto ty = mlir::dyn_cast<fir::ReferenceType>(t)) {
auto eleTy = ty.getEleTy();
if (fir::isa_char(eleTy))
return eleTy;
if (fir::isa_derived(eleTy))
return eleTy;
- if (eleTy.isa<fir::SequenceType>())
+ if (mlir::isa<fir::SequenceType>(eleTy))
return eleTy;
}
return t;
@@ -448,7 +448,7 @@ std::vector<mlir::Value> fir::ArrayLoadOp::getExtents() {
mlir::LogicalResult fir::ArrayLoadOp::verify() {
auto eleTy = fir::dyn_cast_ptrOrBoxEleTy(getMemref().getType());
- auto arrTy = eleTy.dyn_cast<fir::SequenceType>();
+ auto arrTy = mlir::dyn_cast<fir::SequenceType>(eleTy);
if (!arrTy)
return emitOpError("must be a reference to an array");
auto arrDim = arrTy.getDimension();
@@ -456,14 +456,14 @@ mlir::LogicalResult fir::ArrayLoadOp::verify() {
if (auto shapeOp = getShape()) {
auto shapeTy = shapeOp.getType();
unsigned shapeTyRank = 0u;
- if (auto s = shapeTy.dyn_cast<fir::ShapeType>()) {
+ if (auto s = mlir::dyn_cast<fir::ShapeType>(shapeTy)) {
shapeTyRank = s.getRank();
- } else if (auto ss = shapeTy.dyn_cast<fir::ShapeShiftType>()) {
+ } else if (auto ss = mlir::dyn_cast<fir::ShapeShiftType>(shapeTy)) {
shapeTyRank = ss.getRank();
} else {
- auto s = shapeTy.cast<fir::ShiftType>();
+ auto s = mlir::cast<fir::ShiftType>(shapeTy);
shapeTyRank = s.getRank();
- if (!getMemref().getType().isa<fir::BaseBoxType>())
+ if (!mlir::isa<fir::BaseBoxType>(getMemref().getType()))
return emitOpError("shift can only be provided with fir.box memref");
}
if (arrDim && arrDim != shapeTyRank)
@@ -474,7 +474,7 @@ mlir::LogicalResult fir::ArrayLoadOp::verify() {
if (auto sl = mlir::dyn_cast_or_null<fir::SliceOp>(sliceOp.getDefiningOp()))
if (!sl.getSubstr().empty())
return emitOpError("array_load cannot take a slice with substring");
- if (auto sliceTy = sliceOp.getType().dyn_cast<fir::SliceType>())
+ if (auto sliceTy = mlir::dyn_cast<fir::SliceType>(sliceOp.getType()))
if (sliceTy.getRank() != arrDim)
return emitOpError("rank of dimension in slice mismatched");
}
@@ -502,7 +502,7 @@ mlir::LogicalResult fir::ArrayMergeStoreOp::verify() {
// This is an intra-object merge, where the slice is projecting the
// subfields that are to be overwritten by the merge operation.
auto eleTy = fir::dyn_cast_ptrOrBoxEleTy(getMemref().getType());
- if (auto seqTy = eleTy.dyn_cast<fir::SequenceType>()) {
+ if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(eleTy)) {
auto projTy =
fir::applyPathToType(seqTy.getEleTy(), sliceOp.getFields());
if (fir::unwrapSequenceType(getOriginal().getType()) != projTy)
@@ -540,7 +540,7 @@ mlir::Type validArraySubobject(A op) {
}
mlir::LogicalResult fir::ArrayFetchOp::verify() {
- auto arrTy = getSequence().getType().cast<fir::SequenceType>();
+ auto arrTy = mlir::cast<fir::SequenceType>(getSequence().getType());
auto indSize = getIndices().size();
if (indSize < arrTy.getDimension())
return emitOpError("number of indices != dimension of array");
@@ -562,7 +562,7 @@ mlir::LogicalResult fir::ArrayFetchOp::verify() {
//===----------------------------------------------------------------------===//
mlir::LogicalResult fir::ArrayAccessOp::verify() {
- auto arrTy = getSequence().getType().cast<fir::SequenceType>();
+ auto arrTy = mlir::cast<fir::SequenceType>(getSequence().getType());
std::size_t indSize = getIndices().size();
if (indSize < arrTy.getDimension())
return emitOpError("number of indices != dimension of array");
@@ -584,7 +584,7 @@ mlir::LogicalResult fir::ArrayAccessOp::verify() {
mlir::LogicalResult fir::ArrayUpdateOp::verify() {
if (fir::isa_ref_type(getMerge().getType()))
return emitOpError("does not support reference type for merge");
- auto arrTy = getSequence().getType().cast<fir::SequenceType>();
+ auto arrTy = mlir::cast<fir::SequenceType>(getSequence().getType());
auto indSize = getIndices().size();
if (indSize < arrTy.getDimension())
return emitOpError("number of indices != dimension of array");
@@ -604,7 +604,7 @@ mlir::LogicalResult fir::ArrayUpdateOp::verify() {
//===----------------------------------------------------------------------===//
mlir::LogicalResult fir::ArrayModifyOp::verify() {
- auto arrTy = getSequence().getType().cast<fir::SequenceType>();
+ auto arrTy = mlir::cast<fir::SequenceType>(getSequence().getType());
auto indSize = getIndices().size();
if (indSize < arrTy.getDimension())
return emitOpError("number of indices must match array dimension");
@@ -740,7 +740,7 @@ mlir::ParseResult fir::CallOp::parse(mlir::OpAsmParser &parser,
parser.parseType(type))
return mlir::failure();
- auto funcType = type.dyn_cast<mlir::FunctionType>();
+ auto funcType = mlir::dyn_cast<mlir::FunctionType>(type);
if (!funcType)
return parser.emitError(parser.getNameLoc(), "expected function type");
if (isDirect) {
@@ -785,7 +785,7 @@ void fir::CallOp::build(mlir::OpBuilder &builder, mlir::OperationState &result,
mlir::LogicalResult fir::CharConvertOp::verify() {
auto unwrap = [&](mlir::Type t) {
t = fir::unwrapSequenceType(fir::dyn_cast_ptrEleTy(t));
- return t.dyn_cast<fir::CharacterType>();
+ return mlir::dyn_cast<fir::CharacterType>(t);
};
auto inTy = unwrap(getFrom().getType());
auto outTy = unwrap(getTo().getType());
@@ -832,13 +832,13 @@ static mlir::ParseResult parseCmpOp(mlir::OpAsmParser &parser,
parser.resolveOperands(ops, type, result.operands))
return mlir::failure();
- if (!predicateNameAttr.isa<mlir::StringAttr>())
+ if (!mlir::isa<mlir::StringAttr>(predicateNameAttr))
return parser.emitError(parser.getNameLoc(),
"expected string comparison predicate attribute");
// Rewrite string attribute to an enum value.
llvm::StringRef predicateName =
- predicateNameAttr.cast<mlir::StringAttr>().getValue();
+ mlir::cast<mlir::StringAttr>(predicateNameAttr).getValue();
auto predicate = fir::CmpcOp::getPredicateByName(predicateName);
auto builder = parser.getBuilder();
mlir::Type i1Type = builder.getI1Type();
@@ -906,7 +906,7 @@ void fir::ConstcOp::print(mlir::OpAsmPrinter &p) {
}
mlir::LogicalResult fir::ConstcOp::verify() {
- if (!getType().isa<fir::ComplexType>())
+ if (!mlir::isa<fir::ComplexType>(getType()))
return emitOpError("must be a !fir.complex type");
return mlir::success();
}
@@ -929,15 +929,16 @@ mlir::OpFoldResult fir::ConvertOp::fold(FoldAdaptor adaptor) {
if (matchPattern(getValue(), mlir::m_Op<fir::ConvertOp>())) {
auto inner = mlir::cast<fir::ConvertOp>(getValue().getDefiningOp());
// (convert (convert 'a : logical -> i1) : i1 -> logical) ==> forward 'a
- if (auto toTy = getType().dyn_cast<fir::LogicalType>())
- if (auto fromTy = inner.getValue().getType().dyn_cast<fir::LogicalType>())
- if (inner.getType().isa<mlir::IntegerType>() && (toTy == fromTy))
+ if (auto toTy = mlir::dyn_cast<fir::LogicalType>(getType()))
+ if (auto fromTy =
+ mlir::dyn_cast<fir::LogicalType>(inner.getValue().getType()))
+ if (mlir::isa<mlir::IntegerType>(inner.getType()) && (toTy == fromTy))
return inner.getValue();
// (convert (convert 'a : i1 -> logical) : logical -> i1) ==> forward 'a
- if (auto toTy = getType().dyn_cast<mlir::IntegerType>())
+ if (auto toTy = mlir::dyn_cast<mlir::IntegerType>(getType()))
if (auto fromTy =
- inner.getValue().getType().dyn_cast<mlir::IntegerType>())
- if (inner.getType().isa<fir::LogicalType>() && (toTy == fromTy) &&
+ mlir::dyn_cast<mlir::IntegerType>(inner.getValue().getType()))
+ if (mlir::isa<fir::LogicalType>(inner.getType()) && (toTy == fromTy) &&
(fromTy.getWidth() == 1))
return inner.getValue();
}
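// Illustration of the two folds above (FIR assembly, sketch; %a is a
// hypothetical value of type !fir.logical<4>):
//   %1 = fir.convert %a : (!fir.logical<4>) -> i1
//   %2 = fir.convert %1 : (i1) -> !fir.logical<4>   // folds to plain %a
// The symmetric i1 -> logical -> i1 round trip folds the same way, guarded by
// the width-1 check, since only i1 round-trips through a logical losslessly.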
@@ -945,7 +946,7 @@ mlir::OpFoldResult fir::ConvertOp::fold(FoldAdaptor adaptor) {
}
bool fir::ConvertOp::isInteger(mlir::Type ty) {
- return ty.isa<mlir::IntegerType, mlir::IndexType, fir::IntegerType>();
+ return mlir::isa<mlir::IntegerType, mlir::IndexType, fir::IntegerType>(ty);
}
bool fir::ConvertOp::isIntegerCompatible(mlir::Type ty) {
@@ -953,13 +954,13 @@ bool fir::ConvertOp::isIntegerCompatible(mlir::Type ty) {
}
bool fir::ConvertOp::isFloatCompatible(mlir::Type ty) {
- return ty.isa<mlir::FloatType, fir::RealType>();
+ return mlir::isa<mlir::FloatType, fir::RealType>(ty);
}
bool fir::ConvertOp::isPointerCompatible(mlir::Type ty) {
- return ty.isa<fir::ReferenceType, fir::PointerType, fir::HeapType,
- fir::LLVMPointerType, mlir::MemRefType, mlir::FunctionType,
- fir::TypeDescType>();
+ return mlir::isa<fir::ReferenceType, fir::PointerType, fir::HeapType,
+ fir::LLVMPointerType, mlir::MemRefType, mlir::FunctionType,
+ fir::TypeDescType>(ty);
}
static std::optional<mlir::Type> getVectorElementType(mlir::Type ty) {
@@ -1026,12 +1027,14 @@ bool fir::ConvertOp::canBeConverted(mlir::Type inType, mlir::Type outType) {
(isFloatCompatible(inType) && isFloatCompatible(outType)) ||
(isIntegerCompatible(inType) && isPointerCompatible(outType)) ||
(isPointerCompatible(inType) && isIntegerCompatible(outType)) ||
- (inType.isa<fir::BoxType>() && outType.isa<fir::BoxType>()) ||
- (inType.isa<fir::BoxProcType>() && outType.isa<fir::BoxProcType>()) ||
+ (mlir::isa<fir::BoxType>(inType) &&
+ mlir::isa<fir::BoxType>(outType)) ||
+ (mlir::isa<fir::BoxProcType>(inType) &&
+ mlir::isa<fir::BoxProcType>(outType)) ||
(fir::isa_complex(inType) && fir::isa_complex(outType)) ||
(fir::isBoxedRecordType(inType) && fir::isPolymorphicType(outType)) ||
(fir::isPolymorphicType(inType) && fir::isPolymorphicType(outType)) ||
- (fir::isPolymorphicType(inType) && outType.isa<BoxType>()) ||
+ (fir::isPolymorphicType(inType) && mlir::isa<BoxType>(outType)) ||
areVectorsCompatible(inType, outType);
}
@@ -1079,7 +1082,7 @@ mlir::LogicalResult fir::CoordinateOp::verify() {
const mlir::Type refTy = getRef().getType();
if (fir::isa_ref_type(refTy)) {
auto eleTy = fir::dyn_cast_ptrEleTy(refTy);
- if (auto arrTy = eleTy.dyn_cast<fir::SequenceType>()) {
+ if (auto arrTy = mlir::dyn_cast<fir::SequenceType>(eleTy)) {
if (arrTy.hasUnknownShape())
return emitOpError("cannot find coordinate in unknown shape");
if (arrTy.getConstantRows() < arrTy.getDimension() - 1)
@@ -1094,8 +1097,8 @@ mlir::LogicalResult fir::CoordinateOp::verify() {
const unsigned numCoors = getCoor().size();
for (auto coorOperand : llvm::enumerate(getCoor())) {
auto co = coorOperand.value();
- if (dimension == 0 && eleTy.isa<fir::SequenceType>()) {
- dimension = eleTy.cast<fir::SequenceType>().getDimension();
+ if (dimension == 0 && mlir::isa<fir::SequenceType>(eleTy)) {
+ dimension = mlir::cast<fir::SequenceType>(eleTy).getDimension();
if (dimension == 0)
return emitOpError("cannot apply to array of unknown rank");
}
@@ -1104,7 +1107,7 @@ mlir::LogicalResult fir::CoordinateOp::verify() {
// Recovering a LEN type parameter only makes sense from a boxed
// value. For a bare reference, the LEN type parameters must be
// passed as additional arguments to `index`.
- if (refTy.isa<fir::BoxType>()) {
+ if (mlir::isa<fir::BoxType>(refTy)) {
if (coorOperand.index() != numCoors - 1)
return emitOpError("len_param_index must be last argument");
if (getNumOperands() != 2)
@@ -1117,7 +1120,7 @@ mlir::LogicalResult fir::CoordinateOp::verify() {
} else if (auto index = mlir::dyn_cast<fir::FieldIndexOp>(defOp)) {
if (eleTy != index.getOnType())
emitOpError("field_index type not compatible with reference type");
- if (auto recTy = eleTy.dyn_cast<fir::RecordType>()) {
+ if (auto recTy = mlir::dyn_cast<fir::RecordType>(eleTy)) {
eleTy = recTy.getType(index.getFieldName());
continue;
}
@@ -1126,21 +1129,21 @@ mlir::LogicalResult fir::CoordinateOp::verify() {
}
if (dimension) {
if (--dimension == 0)
- eleTy = eleTy.cast<fir::SequenceType>().getEleTy();
+ eleTy = mlir::cast<fir::SequenceType>(eleTy).getEleTy();
} else {
- if (auto t = eleTy.dyn_cast<mlir::TupleType>()) {
+ if (auto t = mlir::dyn_cast<mlir::TupleType>(eleTy)) {
// FIXME: Generally, we don't know which field of the tuple is being
// referred to unless the operand is a constant. Just assume everything
// is good in the tuple case for now.
return mlir::success();
- } else if (auto t = eleTy.dyn_cast<fir::RecordType>()) {
+ } else if (auto t = mlir::dyn_cast<fir::RecordType>(eleTy)) {
// FIXME: This is the same as the tuple case.
return mlir::success();
- } else if (auto t = eleTy.dyn_cast<fir::ComplexType>()) {
+ } else if (auto t = mlir::dyn_cast<fir::ComplexType>(eleTy)) {
eleTy = t.getElementType();
- } else if (auto t = eleTy.dyn_cast<mlir::ComplexType>()) {
+ } else if (auto t = mlir::dyn_cast<mlir::ComplexType>(eleTy)) {
eleTy = t.getElementType();
- } else if (auto t = eleTy.dyn_cast<fir::CharacterType>()) {
+ } else if (auto t = mlir::dyn_cast<fir::CharacterType>(eleTy)) {
if (t.getLen() == fir::CharacterType::singleton())
return emitOpError("cannot apply to character singleton");
eleTy = fir::CharacterType::getSingleton(t.getContext(), t.getFKind());
@@ -1216,17 +1219,17 @@ mlir::LogicalResult fir::TypeInfoOp::verify() {
mlir::LogicalResult fir::EmboxOp::verify() {
auto eleTy = fir::dyn_cast_ptrEleTy(getMemref().getType());
bool isArray = false;
- if (auto seqTy = eleTy.dyn_cast<fir::SequenceType>()) {
+ if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(eleTy)) {
eleTy = seqTy.getEleTy();
isArray = true;
}
if (hasLenParams()) {
auto lenPs = numLenParams();
- if (auto rt = eleTy.dyn_cast<fir::RecordType>()) {
+ if (auto rt = mlir::dyn_cast<fir::RecordType>(eleTy)) {
if (lenPs != rt.getNumLenParams())
return emitOpError("number of LEN params does not correspond"
" to the !fir.type type");
- } else if (auto strTy = eleTy.dyn_cast<fir::CharacterType>()) {
+ } else if (auto strTy = mlir::dyn_cast<fir::CharacterType>(eleTy)) {
if (strTy.getLen() != fir::CharacterType::unknownLen())
return emitOpError("CHARACTER already has static LEN");
} else {
@@ -1240,7 +1243,7 @@ mlir::LogicalResult fir::EmboxOp::verify() {
return emitOpError("shape must not be provided for a scalar");
if (getSlice() && !isArray)
return emitOpError("slice must not be provided for a scalar");
- if (getSourceBox() && !getResult().getType().isa<fir::ClassType>())
+ if (getSourceBox() && !mlir::isa<fir::ClassType>(getResult().getType()))
return emitOpError("source_box must be used with fir.class result type");
return mlir::success();
}
@@ -1251,7 +1254,7 @@ mlir::LogicalResult fir::EmboxOp::verify() {
mlir::LogicalResult fir::EmboxCharOp::verify() {
auto eleTy = fir::dyn_cast_ptrEleTy(getMemref().getType());
- if (!eleTy.dyn_cast_or_null<fir::CharacterType>())
+ if (!mlir::dyn_cast_or_null<fir::CharacterType>(eleTy))
return mlir::failure();
return mlir::success();
}
@@ -1263,8 +1266,8 @@ mlir::LogicalResult fir::EmboxCharOp::verify() {
mlir::LogicalResult fir::EmboxProcOp::verify() {
// host bindings (optional) must be a reference to a tuple
if (auto h = getHost()) {
- if (auto r = h.getType().dyn_cast<fir::ReferenceType>())
- if (r.getEleTy().isa<mlir::TupleType>())
+ if (auto r = mlir::dyn_cast<fir::ReferenceType>(h.getType()))
+ if (mlir::isa<mlir::TupleType>(r.getEleTy()))
return mlir::success();
return mlir::failure();
}
@@ -1300,7 +1303,7 @@ void fir::TypeDescOp::print(mlir::OpAsmPrinter &p) {
mlir::LogicalResult fir::TypeDescOp::verify() {
mlir::Type resultTy = getType();
- if (auto tdesc = resultTy.dyn_cast<fir::TypeDescType>()) {
+ if (auto tdesc = mlir::dyn_cast<fir::TypeDescType>(resultTy)) {
if (tdesc.getOfTy() != getInType())
return emitOpError("wrapped type mismatched");
return mlir::success();
@@ -1527,7 +1530,7 @@ mlir::ParseResult parseFieldLikeOp(mlir::OpAsmParser &parser,
return mlir::failure();
result.addAttribute(fir::FieldIndexOp::getFieldAttrName(),
builder.getStringAttr(fieldName));
- if (!recty.dyn_cast<fir::RecordType>())
+ if (!mlir::dyn_cast<fir::RecordType>(recty))
return mlir::failure();
result.addAttribute(fir::FieldIndexOp::getTypeAttrName(),
mlir::TypeAttr::get(recty));
@@ -1671,7 +1674,7 @@ mlir::LogicalResult fir::InsertOnRangeOp::verify() {
//===----------------------------------------------------------------------===//
static bool checkIsIntegerConstant(mlir::Attribute attr, std::int64_t conVal) {
- if (auto iattr = attr.dyn_cast<mlir::IntegerAttr>())
+ if (auto iattr = mlir::dyn_cast<mlir::IntegerAttr>(attr))
return iattr.getInt() == conVal;
return false;
}
@@ -1690,7 +1693,7 @@ struct UndoComplexPattern : public mlir::RewritePattern {
matchAndRewrite(mlir::Operation *op,
mlir::PatternRewriter &rewriter) const override {
auto insval = mlir::dyn_cast_or_null<fir::InsertValueOp>(op);
- if (!insval || !insval.getType().isa<fir::ComplexType>())
+ if (!insval || !mlir::isa<fir::ComplexType>(insval.getType()))
return mlir::failure();
auto insval2 = mlir::dyn_cast_or_null<fir::InsertValueOp>(
insval.getAdt().getDefiningOp());
@@ -1819,7 +1822,7 @@ mlir::ParseResult fir::IterWhileOp::parse(mlir::OpAsmParser &parser,
parser.parseRParen())
return mlir::failure();
// Type list must be "(index, i1)".
- if (typeList.size() != 2 || !typeList[0].isa<mlir::IndexType>() ||
+ if (typeList.size() != 2 || !mlir::isa<mlir::IndexType>(typeList[0]) ||
!typeList[1].isSignlessInteger(1))
return mlir::failure();
result.addTypes(typeList);
@@ -1873,7 +1876,7 @@ mlir::LogicalResult fir::IterWhileOp::verify() {
auto opNumResults = getNumResults();
if (getFinalValue()) {
// Result type must be "(index, i1, ...)".
- if (!getResult(0).getType().isa<mlir::IndexType>())
+ if (!mlir::isa<mlir::IndexType>(getResult(0).getType()))
return emitOpError("result #0 expected to be index");
if (!getResult(1).getType().isSignlessInteger(1))
return emitOpError("result #1 expected to be i1");
@@ -2316,7 +2319,7 @@ void fir::DTEntryOp::print(mlir::OpAsmPrinter &p) {
/// Example: return f32 for !fir.box<!fir.heap<!fir.array<?x?xf32>>>.
static mlir::Type getBoxScalarEleTy(mlir::Type boxTy) {
auto eleTy = fir::dyn_cast_ptrOrBoxEleTy(boxTy);
- if (auto seqTy = eleTy.dyn_cast<fir::SequenceType>())
+ if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(eleTy))
return seqTy.getEleTy();
return eleTy;
}
@@ -2324,8 +2327,8 @@ static mlir::Type getBoxScalarEleTy(mlir::Type boxTy) {
/// Test if \p t1 and \p t2 are compatible character types (if they can
/// represent the same type at runtime).
static bool areCompatibleCharacterTypes(mlir::Type t1, mlir::Type t2) {
- auto c1 = t1.dyn_cast<fir::CharacterType>();
- auto c2 = t2.dyn_cast<fir::CharacterType>();
+ auto c1 = mlir::dyn_cast<fir::CharacterType>(t1);
+ auto c2 = mlir::dyn_cast<fir::CharacterType>(t2);
if (!c1 || !c2)
return false;
if (c1.hasDynamicLen() || c2.hasDynamicLen())
@@ -2347,10 +2350,10 @@ mlir::LogicalResult fir::ReboxOp::verify() {
if (auto sliceVal = getSlice()) {
// Slicing case
- if (sliceVal.getType().cast<fir::SliceType>().getRank() != inputRank)
+ if (mlir::cast<fir::SliceType>(sliceVal.getType()).getRank() != inputRank)
return emitOpError("slice operand rank must match box operand rank");
if (auto shapeVal = getShape()) {
- if (auto shiftTy = shapeVal.getType().dyn_cast<fir::ShiftType>()) {
+ if (auto shiftTy = mlir::dyn_cast<fir::ShiftType>(shapeVal.getType())) {
if (shiftTy.getRank() != inputRank)
return emitOpError("shape operand and input box ranks must match "
"when there is a slice");
@@ -2370,12 +2373,12 @@ mlir::LogicalResult fir::ReboxOp::verify() {
unsigned shapeRank = inputRank;
if (auto shapeVal = getShape()) {
auto ty = shapeVal.getType();
- if (auto shapeTy = ty.dyn_cast<fir::ShapeType>()) {
+ if (auto shapeTy = mlir::dyn_cast<fir::ShapeType>(ty)) {
shapeRank = shapeTy.getRank();
- } else if (auto shapeShiftTy = ty.dyn_cast<fir::ShapeShiftType>()) {
+ } else if (auto shapeShiftTy = mlir::dyn_cast<fir::ShapeShiftType>(ty)) {
shapeRank = shapeShiftTy.getRank();
} else {
- auto shiftTy = ty.cast<fir::ShiftType>();
+ auto shiftTy = mlir::cast<fir::ShiftType>(ty);
shapeRank = shiftTy.getRank();
if (shapeRank != inputRank)
return emitOpError("shape operand and input box ranks must match "
@@ -2394,11 +2397,13 @@ mlir::LogicalResult fir::ReboxOp::verify() {
// the types is a character with dynamic length, the other type can be any
// character type.
const bool typeCanMismatch =
- inputEleTy.isa<fir::RecordType>() || outEleTy.isa<mlir::NoneType>() ||
- (inputEleTy.isa<mlir::NoneType>() && outEleTy.isa<fir::RecordType>()) ||
- (getSlice() && inputEleTy.isa<fir::CharacterType>()) ||
+ mlir::isa<fir::RecordType>(inputEleTy) ||
+ mlir::isa<mlir::NoneType>(outEleTy) ||
+ (mlir::isa<mlir::NoneType>(inputEleTy) &&
+ mlir::isa<fir::RecordType>(outEleTy)) ||
+ (getSlice() && mlir::isa<fir::CharacterType>(inputEleTy)) ||
(getSlice() && fir::isa_complex(inputEleTy) &&
- outEleTy.isa<mlir::FloatType>()) ||
+ mlir::isa<mlir::FloatType>(outEleTy)) ||
areCompatibleCharacterTypes(inputEleTy, outEleTy);
if (!typeCanMismatch)
return emitOpError(
@@ -2435,7 +2440,7 @@ mlir::LogicalResult fir::SaveResultOp::verify() {
if (fir::isa_unknown_size_box(resultType))
return emitOpError("cannot save !fir.box of unknown rank or type");
- if (resultType.isa<fir::BoxType>()) {
+ if (mlir::isa<fir::BoxType>(resultType)) {
if (getShape() || !getTypeparams().empty())
return emitOpError(
"must not have shape or length operands if the value is a fir.box");
@@ -2446,14 +2451,14 @@ mlir::LogicalResult fir::SaveResultOp::verify() {
unsigned shapeTyRank = 0;
if (auto shapeVal = getShape()) {
auto shapeTy = shapeVal.getType();
- if (auto s = shapeTy.dyn_cast<fir::ShapeType>())
+ if (auto s = mlir::dyn_cast<fir::ShapeType>(shapeTy))
shapeTyRank = s.getRank();
else
- shapeTyRank = shapeTy.cast<fir::ShapeShiftType>().getRank();
+ shapeTyRank = mlir::cast<fir::ShapeShiftType>(shapeTy).getRank();
}
auto eleTy = resultType;
- if (auto seqTy = resultType.dyn_cast<fir::SequenceType>()) {
+ if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(resultType)) {
if (seqTy.getDimension() != shapeTyRank)
emitOpError("shape operand must be provided and have the value rank "
"when the value is a fir.array");
@@ -2464,11 +2469,11 @@ mlir::LogicalResult fir::SaveResultOp::verify() {
"shape operand should only be provided if the value is a fir.array");
}
- if (auto recTy = eleTy.dyn_cast<fir::RecordType>()) {
+ if (auto recTy = mlir::dyn_cast<fir::RecordType>(eleTy)) {
if (recTy.getNumLenParams() != getTypeparams().size())
emitOpError("length parameters number must match with the value type "
"length parameters");
- } else if (auto charTy = eleTy.dyn_cast<fir::CharacterType>()) {
+ } else if (auto charTy = mlir::dyn_cast<fir::CharacterType>(eleTy)) {
if (getTypeparams().size() > 1)
emitOpError("no more than one length parameter must be provided for "
"character value");
@@ -2493,10 +2498,8 @@ static constexpr llvm::StringRef getTargetOffsetAttr() {
template <typename OpT>
static mlir::LogicalResult verifyIntegralSwitchTerminator(OpT op) {
- if (!op.getSelector()
- .getType()
- .template isa<mlir::IntegerType, mlir::IndexType,
- fir::IntegerType>())
+ if (!mlir::isa<mlir::IntegerType, mlir::IndexType, fir::IntegerType>(
+ op.getSelector().getType()))
return op.emitOpError("must be an integer");
auto cases =
op->template getAttrOfType<mlir::ArrayAttr>(op.getCasesAttr()).getValue();
@@ -2508,7 +2511,7 @@ static mlir::LogicalResult verifyIntegralSwitchTerminator(OpT op) {
if (op.targetOffsetSize() != count)
return op.emitOpError("incorrect number of successor operand groups");
for (decltype(count) i = 0; i != count; ++i) {
- if (!cases[i].template isa<mlir::IntegerAttr, mlir::UnitAttr>())
+ if (!mlir::isa<mlir::IntegerAttr, mlir::UnitAttr>(cases[i]))
return op.emitOpError("invalid case alternative");
}
return mlir::success();
@@ -2571,7 +2574,7 @@ static void printIntegralSwitchTerminator(OpT op, mlir::OpAsmPrinter &p) {
if (i)
p << ", ";
auto &attr = cases[i];
- if (auto intAttr = attr.template dyn_cast_or_null<mlir::IntegerAttr>())
+ if (auto intAttr = mlir::dyn_cast_or_null<mlir::IntegerAttr>(attr))
p << intAttr.getValue();
else
p.printAttribute(attr);
@@ -2620,7 +2623,7 @@ getMutableSuccessorOperands(unsigned pos, mlir::MutableOperandRange operands,
*owner->getAttrDictionary().getNamed(offsetAttr);
return getSubOperands(
pos, operands,
- targetOffsetAttr.getValue().cast<mlir::DenseI32ArrayAttr>(),
+ mlir::cast<mlir::DenseI32ArrayAttr>(targetOffsetAttr.getValue()),
mlir::MutableOperandRange::OperandSegment(pos, targetOffsetAttr));
}
@@ -2742,9 +2745,9 @@ mlir::ParseResult fir::SelectCaseOp::parse(mlir::OpAsmParser &parser,
parser.parseComma())
return mlir::failure();
attrs.push_back(attr);
- if (attr.dyn_cast_or_null<mlir::UnitAttr>()) {
+ if (mlir::dyn_cast_or_null<mlir::UnitAttr>(attr)) {
argOffs.push_back(0);
- } else if (attr.dyn_cast_or_null<fir::ClosedIntervalAttr>()) {
+ } else if (mlir::dyn_cast_or_null<fir::ClosedIntervalAttr>(attr)) {
mlir::OpAsmParser::UnresolvedOperand oper1;
mlir::OpAsmParser::UnresolvedOperand oper2;
if (parser.parseOperand(oper1) || parser.parseComma() ||
@@ -2806,11 +2809,11 @@ void fir::SelectCaseOp::print(mlir::OpAsmPrinter &p) {
if (i)
p << ", ";
p << cases[i] << ", ";
- if (!cases[i].isa<mlir::UnitAttr>()) {
+ if (!mlir::isa<mlir::UnitAttr>(cases[i])) {
auto caseArgs = *getCompareOperands(i);
p.printOperand(*caseArgs.begin());
p << ", ";
- if (cases[i].isa<fir::ClosedIntervalAttr>()) {
+ if (mlir::isa<fir::ClosedIntervalAttr>(cases[i])) {
p.printOperand(*(++caseArgs.begin()));
p << ", ";
}
@@ -2848,10 +2851,10 @@ void fir::SelectCaseOp::build(mlir::OpBuilder &builder,
llvm::SmallVector<int32_t> operOffs;
int32_t operSize = 0;
for (auto attr : compareAttrs) {
- if (attr.isa<fir::ClosedIntervalAttr>()) {
+ if (mlir::isa<fir::ClosedIntervalAttr>(attr)) {
operOffs.push_back(2);
operSize += 2;
- } else if (attr.isa<mlir::UnitAttr>()) {
+ } else if (mlir::isa<mlir::UnitAttr>(attr)) {
operOffs.push_back(0);
} else {
operOffs.push_back(1);
@@ -2900,10 +2903,10 @@ void fir::SelectCaseOp::build(mlir::OpBuilder &builder,
llvm::SmallVector<mlir::ValueRange> cmpOpers;
auto iter = cmpOpList.begin();
for (auto &attr : compareAttrs) {
- if (attr.isa<fir::ClosedIntervalAttr>()) {
+ if (mlir::isa<fir::ClosedIntervalAttr>(attr)) {
cmpOpers.push_back(mlir::ValueRange({iter, iter + 2}));
iter += 2;
- } else if (attr.isa<mlir::UnitAttr>()) {
+ } else if (mlir::isa<mlir::UnitAttr>(attr)) {
cmpOpers.push_back(mlir::ValueRange{});
} else {
cmpOpers.push_back(mlir::ValueRange({iter, iter + 1}));
@@ -2915,10 +2918,8 @@ void fir::SelectCaseOp::build(mlir::OpBuilder &builder,
}
mlir::LogicalResult fir::SelectCaseOp::verify() {
- if (!getSelector()
- .getType()
- .isa<mlir::IntegerType, mlir::IndexType, fir::IntegerType,
- fir::LogicalType, fir::CharacterType>())
+ if (!mlir::isa<mlir::IntegerType, mlir::IndexType, fir::IntegerType,
+ fir::LogicalType, fir::CharacterType>(getSelector().getType()))
return emitOpError("must be an integer, character, or logical");
auto cases =
getOperation()->getAttrOfType<mlir::ArrayAttr>(getCasesAttr()).getValue();
@@ -2933,9 +2934,11 @@ mlir::LogicalResult fir::SelectCaseOp::verify() {
return emitOpError("incorrect number of successor operand groups");
for (decltype(count) i = 0; i != count; ++i) {
auto &attr = cases[i];
- if (!(attr.isa<fir::PointIntervalAttr>() ||
- attr.isa<fir::LowerBoundAttr>() || attr.isa<fir::UpperBoundAttr>() ||
- attr.isa<fir::ClosedIntervalAttr>() || attr.isa<mlir::UnitAttr>()))
+ if (!(mlir::isa<fir::PointIntervalAttr>(attr) ||
+ mlir::isa<fir::LowerBoundAttr>(attr) ||
+ mlir::isa<fir::UpperBoundAttr>(attr) ||
+ mlir::isa<fir::ClosedIntervalAttr>(attr) ||
+ mlir::isa<mlir::UnitAttr>(attr)))
return emitOpError("incorrect select case attribute type");
}
return mlir::success();
@@ -3111,14 +3114,14 @@ void fir::SelectTypeOp::print(mlir::OpAsmPrinter &p) {
}
mlir::LogicalResult fir::SelectTypeOp::verify() {
- if (!(getSelector().getType().isa<fir::BaseBoxType>()))
+ if (!mlir::isa<fir::BaseBoxType>(getSelector().getType()))
return emitOpError("must be a fir.class or fir.box type");
- if (auto boxType = getSelector().getType().dyn_cast<fir::BoxType>())
- if (!boxType.getEleTy().isa<mlir::NoneType>())
+ if (auto boxType = mlir::dyn_cast<fir::BoxType>(getSelector().getType()))
+ if (!mlir::isa<mlir::NoneType>(boxType.getEleTy()))
return emitOpError("selector must be polymorphic");
auto typeGuardAttr = getCases();
for (unsigned idx = 0; idx < typeGuardAttr.size(); ++idx)
- if (typeGuardAttr[idx].isa<mlir::UnitAttr>() &&
+ if (mlir::isa<mlir::UnitAttr>(typeGuardAttr[idx]) &&
idx != typeGuardAttr.size() - 1)
return emitOpError("default must be the last attribute");
auto count = getNumDest();
@@ -3129,9 +3132,8 @@ mlir::LogicalResult fir::SelectTypeOp::verify() {
if (targetOffsetSize() != count)
return emitOpError("incorrect number of successor operand groups");
for (unsigned i = 0; i != count; ++i) {
- if (!(typeGuardAttr[i].isa<fir::ExactTypeAttr>() ||
- typeGuardAttr[i].isa<fir::SubclassAttr>() ||
- typeGuardAttr[i].isa<mlir::UnitAttr>()))
+ if (!mlir::isa<fir::ExactTypeAttr, fir::SubclassAttr, mlir::UnitAttr>(
+ typeGuardAttr[i]))
return emitOpError("invalid type-case alternative");
}
return mlir::success();
@@ -3175,7 +3177,7 @@ void fir::SelectTypeOp::build(mlir::OpBuilder &builder,
mlir::LogicalResult fir::ShapeOp::verify() {
auto size = getExtents().size();
- auto shapeTy = getType().dyn_cast<fir::ShapeType>();
+ auto shapeTy = mlir::dyn_cast<fir::ShapeType>(getType());
assert(shapeTy && "must be a shape type");
if (shapeTy.getRank() != size)
return emitOpError("shape type rank mismatch");
@@ -3198,7 +3200,7 @@ mlir::LogicalResult fir::ShapeShiftOp::verify() {
return emitOpError("incorrect number of args");
if (size % 2 != 0)
return emitOpError("requires a multiple of 2 args");
- auto shapeTy = getType().dyn_cast<fir::ShapeShiftType>();
+ auto shapeTy = mlir::dyn_cast<fir::ShapeShiftType>(getType());
assert(shapeTy && "must be a shape shift type");
if (shapeTy.getRank() * 2 != size)
return emitOpError("shape type rank mismatch");
@@ -3211,7 +3213,7 @@ mlir::LogicalResult fir::ShapeShiftOp::verify() {
mlir::LogicalResult fir::ShiftOp::verify() {
auto size = getOrigins().size();
- auto shiftTy = getType().dyn_cast<fir::ShiftType>();
+ auto shiftTy = mlir::dyn_cast<fir::ShiftType>(getType());
assert(shiftTy && "must be a shift type");
if (shiftTy.getRank() != size)
return emitOpError("shift type rank mismatch");
@@ -3251,7 +3253,7 @@ mlir::LogicalResult fir::SliceOp::verify() {
return emitOpError("incorrect number of args for triple");
if (size % 3 != 0)
return emitOpError("requires a multiple of 3 args");
- auto sliceTy = getType().dyn_cast<fir::SliceType>();
+ auto sliceTy = mlir::dyn_cast<fir::SliceType>(getType());
assert(sliceTy && "must be a slice type");
if (sliceTy.getRank() * 3 != size)
return emitOpError("slice type rank mismatch");
@@ -3309,8 +3311,8 @@ void fir::StoreOp::build(mlir::OpBuilder &builder, mlir::OperationState &result,
//===----------------------------------------------------------------------===//
inline fir::CharacterType::KindTy stringLitOpGetKind(fir::StringLitOp op) {
- auto eleTy = op.getType().cast<fir::SequenceType>().getEleTy();
- return eleTy.cast<fir::CharacterType>().getFKind();
+ auto eleTy = mlir::cast<fir::SequenceType>(op.getType()).getEleTy();
+ return mlir::cast<fir::CharacterType>(eleTy).getFKind();
}
bool fir::StringLitOp::isWideValue() { return stringLitOpGetKind(*this) != 1; }
@@ -3390,13 +3392,13 @@ mlir::ParseResult fir::StringLitOp::parse(mlir::OpAsmParser &parser,
llvm::SMLoc trailingTypeLoc;
if (parser.parseAttribute(val, "fake", attrs))
return mlir::failure();
- if (auto v = val.dyn_cast<mlir::StringAttr>())
+ if (auto v = mlir::dyn_cast<mlir::StringAttr>(val))
result.attributes.push_back(
builder.getNamedAttr(fir::StringLitOp::value(), v));
- else if (auto v = val.dyn_cast<mlir::DenseElementsAttr>())
+ else if (auto v = mlir::dyn_cast<mlir::DenseElementsAttr>(val))
result.attributes.push_back(
builder.getNamedAttr(fir::StringLitOp::xlist(), v));
- else if (auto v = val.dyn_cast<mlir::ArrayAttr>())
+ else if (auto v = mlir::dyn_cast<mlir::ArrayAttr>(val))
result.attributes.push_back(
builder.getNamedAttr(fir::StringLitOp::xlist(), v));
else
@@ -3409,7 +3411,7 @@ mlir::ParseResult fir::StringLitOp::parse(mlir::OpAsmParser &parser,
parser.parseRParen() || parser.getCurrentLocation(&trailingTypeLoc) ||
parser.parseColonType(type))
return mlir::failure();
- auto charTy = type.dyn_cast<fir::CharacterType>();
+ auto charTy = mlir::dyn_cast<fir::CharacterType>(type);
if (!charTy)
return parser.emitError(trailingTypeLoc, "must have character type");
type = fir::CharacterType::get(builder.getContext(), charTy.getFKind(),
@@ -3421,19 +3423,19 @@ mlir::ParseResult fir::StringLitOp::parse(mlir::OpAsmParser &parser,
void fir::StringLitOp::print(mlir::OpAsmPrinter &p) {
p << ' ' << getValue() << '(';
- p << getSize().cast<mlir::IntegerAttr>().getValue() << ") : ";
+ p << mlir::cast<mlir::IntegerAttr>(getSize()).getValue() << ") : ";
p.printType(getType());
}
mlir::LogicalResult fir::StringLitOp::verify() {
- if (getSize().cast<mlir::IntegerAttr>().getValue().isNegative())
+ if (mlir::cast<mlir::IntegerAttr>(getSize()).getValue().isNegative())
return emitOpError("size must be non-negative");
if (auto xl = getOperation()->getAttr(fir::StringLitOp::xlist())) {
- if (auto xList = xl.dyn_cast<mlir::ArrayAttr>()) {
+ if (auto xList = mlir::dyn_cast<mlir::ArrayAttr>(xl)) {
for (auto a : xList)
- if (!a.isa<mlir::IntegerAttr>())
+ if (!mlir::isa<mlir::IntegerAttr>(a))
return emitOpError("values in initializer must be integers");
- } else if (xl.isa<mlir::DenseElementsAttr>()) {
+ } else if (mlir::isa<mlir::DenseElementsAttr>(xl)) {
// do nothing
} else {
return emitOpError("has unexpected attribute");
@@ -3448,7 +3450,7 @@ mlir::LogicalResult fir::StringLitOp::verify() {
mlir::LogicalResult fir::UnboxProcOp::verify() {
if (auto eleTy = fir::dyn_cast_ptrEleTy(getRefTuple().getType()))
- if (eleTy.isa<mlir::TupleType>())
+ if (mlir::isa<mlir::TupleType>(eleTy))
return mlir::success();
return emitOpError("second output argument has bad type");
}
@@ -3527,7 +3529,7 @@ void fir::IfOp::getEntrySuccessorRegions(
void fir::IfOp::getRegionInvocationBounds(
llvm::ArrayRef<mlir::Attribute> operands,
llvm::SmallVectorImpl<mlir::InvocationBounds> &invocationBounds) {
- if (auto cond = operands[0].dyn_cast_or_null<mlir::BoolAttr>()) {
+ if (auto cond = mlir::dyn_cast_or_null<mlir::BoolAttr>(operands[0])) {
// If the condition is known, then one region is known to be executed once
// and the other zero times.
invocationBounds.emplace_back(0, cond.getValue() ? 1 : 0);
@@ -3646,8 +3648,8 @@ void fir::BoxOffsetOp::build(mlir::OpBuilder &builder,
//===----------------------------------------------------------------------===//
mlir::ParseResult fir::isValidCaseAttr(mlir::Attribute attr) {
- if (attr.isa<mlir::UnitAttr, fir::ClosedIntervalAttr, fir::PointIntervalAttr,
- fir::LowerBoundAttr, fir::UpperBoundAttr>())
+ if (mlir::isa<mlir::UnitAttr, fir::ClosedIntervalAttr, fir::PointIntervalAttr,
+ fir::LowerBoundAttr, fir::UpperBoundAttr>(attr))
return mlir::success();
return mlir::failure();
}
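// Note the variadic form used above: one mlir::isa call tests several
// candidate types at once, replacing chains of single-type member isa<>()
// checks. Equivalent expansion (sketch):
//   mlir::isa<A, B, C>(attr)
//     ==  mlir::isa<A>(attr) || mlir::isa<B>(attr) || mlir::isa<C>(attr)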
@@ -3657,9 +3659,9 @@ unsigned fir::getCaseArgumentOffset(llvm::ArrayRef<mlir::Attribute> cases,
unsigned o = 0;
for (unsigned i = 0; i < dest; ++i) {
auto &attr = cases[i];
- if (!attr.dyn_cast_or_null<mlir::UnitAttr>()) {
+ if (!mlir::dyn_cast_or_null<mlir::UnitAttr>(attr)) {
++o;
- if (attr.dyn_cast_or_null<fir::ClosedIntervalAttr>())
+ if (mlir::dyn_cast_or_null<fir::ClosedIntervalAttr>(attr))
++o;
}
}
@@ -3722,7 +3724,7 @@ fir::GlobalOp fir::createGlobalOp(mlir::Location loc, mlir::ModuleOp module,
bool fir::hasHostAssociationArgument(mlir::func::FuncOp func) {
if (auto allArgAttrs = func.getAllArgAttrs())
for (auto attr : allArgAttrs)
- if (auto dict = attr.template dyn_cast_or_null<mlir::DictionaryAttr>())
+ if (auto dict = mlir::dyn_cast_or_null<mlir::DictionaryAttr>(attr))
if (dict.get(fir::getHostAssocAttrName()))
return true;
return false;
@@ -3772,7 +3774,7 @@ valueCheckFirAttributes(mlir::Value value,
};
// If this is a fir.box that was loaded, the fir attributes will be on the
// related fir.ref<fir.box> creation.
- if (value.getType().isa<fir::BoxType>())
+ if (mlir::isa<fir::BoxType>(value.getType()))
if (auto definingOp = value.getDefiningOp())
if (auto loadOp = mlir::dyn_cast<fir::LoadOp>(definingOp))
value = loadOp.getMemref();
@@ -3837,10 +3839,10 @@ bool fir::anyFuncArgsHaveAttr(mlir::func::FuncOp func, llvm::StringRef attr) {
std::optional<std::int64_t> fir::getIntIfConstant(mlir::Value value) {
if (auto *definingOp = value.getDefiningOp()) {
if (auto cst = mlir::dyn_cast<mlir::arith::ConstantOp>(definingOp))
- if (auto intAttr = cst.getValue().dyn_cast<mlir::IntegerAttr>())
+ if (auto intAttr = mlir::dyn_cast<mlir::IntegerAttr>(cst.getValue()))
return intAttr.getInt();
if (auto llConstOp = mlir::dyn_cast<mlir::LLVM::ConstantOp>(definingOp))
- if (auto attr = llConstOp.getValue().dyn_cast<mlir::IntegerAttr>())
+ if (auto attr = mlir::dyn_cast<mlir::IntegerAttr>(llConstOp.getValue()))
return attr.getValue().getSExtValue();
}
return {};
@@ -4002,15 +4004,15 @@ mlir::LogicalResult fir::CUDAKernelOp::verify() {
mlir::LogicalResult fir::CUDAAllocateOp::verify() {
if (getPinned() && getStream())
return emitOpError("pinned and stream cannot appears at the same time");
- if (!fir::unwrapRefType(getBox().getType()).isa<fir::BaseBoxType>())
+ if (!mlir::isa<fir::BaseBoxType>(fir::unwrapRefType(getBox().getType())))
return emitOpError(
"expect box to be a reference to a class or box type value");
if (getSource() &&
- !fir::unwrapRefType(getSource().getType()).isa<fir::BaseBoxType>())
+ !mlir::isa<fir::BaseBoxType>(fir::unwrapRefType(getSource().getType())))
return emitOpError(
"expect source to be a reference to/or a class or box type value");
if (getErrmsg() &&
- !fir::unwrapRefType(getErrmsg().getType()).isa<fir::BoxType>())
+ !mlir::isa<fir::BoxType>(fir::unwrapRefType(getErrmsg().getType())))
return emitOpError(
"expect errmsg to be a reference to/or a box type value");
if (getErrmsg() && !getHasStat())
@@ -4019,11 +4021,11 @@ mlir::LogicalResult fir::CUDAAllocateOp::verify() {
}
mlir::LogicalResult fir::CUDADeallocateOp::verify() {
- if (!fir::unwrapRefType(getBox().getType()).isa<fir::BaseBoxType>())
+ if (!mlir::isa<fir::BaseBoxType>(fir::unwrapRefType(getBox().getType())))
return emitOpError(
"expect box to be a reference to class or box type value");
if (getErrmsg() &&
- !fir::unwrapRefType(getErrmsg().getType()).isa<fir::BoxType>())
+ !mlir::isa<fir::BoxType>(fir::unwrapRefType(getErrmsg().getType())))
return emitOpError(
"expect errmsg to be a reference to/or a box type value");
if (getErrmsg() && !getHasStat())
diff --git a/flang/lib/Optimizer/Dialect/FIRType.cpp b/flang/lib/Optimizer/Dialect/FIRType.cpp
index 5c4cad6d2083..d9c387ad950e 100644
--- a/flang/lib/Optimizer/Dialect/FIRType.cpp
+++ b/flang/lib/Optimizer/Dialect/FIRType.cpp
@@ -61,14 +61,13 @@ TYPE parseTypeSingleton(mlir::AsmParser &parser) {
/// Is `ty` a standard or FIR integer type?
static bool isaIntegerType(mlir::Type ty) {
// TODO: why aren't we using isa_integer? investigation required.
- return ty.isa<mlir::IntegerType>() || ty.isa<fir::IntegerType>();
+ return mlir::isa<mlir::IntegerType, fir::IntegerType>(ty);
}
bool verifyRecordMemberType(mlir::Type ty) {
- return !(ty.isa<BoxCharType>() || ty.isa<ShapeType>() ||
- ty.isa<ShapeShiftType>() || ty.isa<ShiftType>() ||
- ty.isa<SliceType>() || ty.isa<FieldType>() || ty.isa<LenType>() ||
- ty.isa<ReferenceType>() || ty.isa<TypeDescType>());
+ return !mlir::isa<BoxCharType, ShapeType, ShapeShiftType, ShiftType,
+ SliceType, FieldType, LenType, ReferenceType, TypeDescType>(
+ ty);
}
bool verifySameLists(llvm::ArrayRef<RecordType::TypePair> a1,
@@ -194,7 +193,7 @@ bool isa_std_type(mlir::Type t) {
}
bool isa_fir_or_std_type(mlir::Type t) {
- if (auto funcType = t.dyn_cast<mlir::FunctionType>())
+ if (auto funcType = mlir::dyn_cast<mlir::FunctionType>(t))
return llvm::all_of(funcType.getInputs(), isa_fir_or_std_type) &&
llvm::all_of(funcType.getResults(), isa_fir_or_std_type);
return isa_fir_type(t) || isa_std_type(t);
@@ -203,7 +202,7 @@ bool isa_fir_or_std_type(mlir::Type t) {
mlir::Type getDerivedType(mlir::Type ty) {
return llvm::TypeSwitch<mlir::Type, mlir::Type>(ty)
.Case<fir::PointerType, fir::HeapType, fir::SequenceType>([](auto p) {
- if (auto seq = p.getEleTy().template dyn_cast<fir::SequenceType>())
+ if (auto seq = mlir::dyn_cast<fir::SequenceType>(p.getEleTy()))
return seq.getEleTy();
return p.getEleTy();
})
@@ -228,12 +227,12 @@ mlir::Type dyn_cast_ptrOrBoxEleTy(mlir::Type t) {
static bool hasDynamicSize(fir::RecordType recTy) {
for (auto field : recTy.getTypeList()) {
- if (auto arr = field.second.dyn_cast<fir::SequenceType>()) {
+ if (auto arr = mlir::dyn_cast<fir::SequenceType>(field.second)) {
if (sequenceWithNonConstantShape(arr))
return true;
} else if (characterWithDynamicLen(field.second)) {
return true;
- } else if (auto rec = field.second.dyn_cast<fir::RecordType>()) {
+ } else if (auto rec = mlir::dyn_cast<fir::RecordType>(field.second)) {
if (hasDynamicSize(rec))
return true;
}
@@ -242,14 +241,14 @@ static bool hasDynamicSize(fir::RecordType recTy) {
}
bool hasDynamicSize(mlir::Type t) {
- if (auto arr = t.dyn_cast<fir::SequenceType>()) {
+ if (auto arr = mlir::dyn_cast<fir::SequenceType>(t)) {
if (sequenceWithNonConstantShape(arr))
return true;
t = arr.getEleTy();
}
if (characterWithDynamicLen(t))
return true;
- if (auto rec = t.dyn_cast<fir::RecordType>())
+ if (auto rec = mlir::dyn_cast<fir::RecordType>(t))
return hasDynamicSize(rec);
return false;
}
@@ -269,33 +268,33 @@ mlir::Type extractSequenceType(mlir::Type ty) {
bool isPointerType(mlir::Type ty) {
if (auto refTy = fir::dyn_cast_ptrEleTy(ty))
ty = refTy;
- if (auto boxTy = ty.dyn_cast<fir::BaseBoxType>())
- return boxTy.getEleTy().isa<fir::PointerType>();
+ if (auto boxTy = mlir::dyn_cast<fir::BaseBoxType>(ty))
+ return mlir::isa<fir::PointerType>(boxTy.getEleTy());
return false;
}
bool isAllocatableType(mlir::Type ty) {
if (auto refTy = fir::dyn_cast_ptrEleTy(ty))
ty = refTy;
- if (auto boxTy = ty.dyn_cast<fir::BaseBoxType>())
- return boxTy.getEleTy().isa<fir::HeapType>();
+ if (auto boxTy = mlir::dyn_cast<fir::BaseBoxType>(ty))
+ return mlir::isa<fir::HeapType>(boxTy.getEleTy());
return false;
}
bool isBoxNone(mlir::Type ty) {
- if (auto box = ty.dyn_cast<fir::BoxType>())
- return box.getEleTy().isa<mlir::NoneType>();
+ if (auto box = mlir::dyn_cast<fir::BoxType>(ty))
+ return mlir::isa<mlir::NoneType>(box.getEleTy());
return false;
}
bool isBoxedRecordType(mlir::Type ty) {
if (auto refTy = fir::dyn_cast_ptrEleTy(ty))
ty = refTy;
- if (auto boxTy = ty.dyn_cast<fir::BoxType>()) {
- if (boxTy.getEleTy().isa<fir::RecordType>())
+ if (auto boxTy = mlir::dyn_cast<fir::BoxType>(ty)) {
+ if (mlir::isa<fir::RecordType>(boxTy.getEleTy()))
return true;
mlir::Type innerType = boxTy.unwrapInnerType();
- return innerType && innerType.isa<fir::RecordType>();
+ return innerType && mlir::isa<fir::RecordType>(innerType);
}
return false;
}
@@ -303,13 +302,13 @@ bool isBoxedRecordType(mlir::Type ty) {
bool isScalarBoxedRecordType(mlir::Type ty) {
if (auto refTy = fir::dyn_cast_ptrEleTy(ty))
ty = refTy;
- if (auto boxTy = ty.dyn_cast<fir::BaseBoxType>()) {
- if (boxTy.getEleTy().isa<fir::RecordType>())
+ if (auto boxTy = mlir::dyn_cast<fir::BaseBoxType>(ty)) {
+ if (mlir::isa<fir::RecordType>(boxTy.getEleTy()))
return true;
- if (auto heapTy = boxTy.getEleTy().dyn_cast<fir::HeapType>())
- return heapTy.getEleTy().isa<fir::RecordType>();
- if (auto ptrTy = boxTy.getEleTy().dyn_cast<fir::PointerType>())
- return ptrTy.getEleTy().isa<fir::RecordType>();
+ if (auto heapTy = mlir::dyn_cast<fir::HeapType>(boxTy.getEleTy()))
+ return mlir::isa<fir::RecordType>(heapTy.getEleTy());
+ if (auto ptrTy = mlir::dyn_cast<fir::PointerType>(boxTy.getEleTy()))
+ return mlir::isa<fir::RecordType>(ptrTy.getEleTy());
}
return false;
}
@@ -363,10 +362,10 @@ bool isPolymorphicType(mlir::Type ty) {
bool isUnlimitedPolymorphicType(mlir::Type ty) {
// CLASS(*)
if (auto clTy = mlir::dyn_cast<fir::ClassType>(fir::unwrapRefType(ty))) {
- if (clTy.getEleTy().isa<mlir::NoneType>())
+ if (mlir::isa<mlir::NoneType>(clTy.getEleTy()))
return true;
mlir::Type innerType = clTy.unwrapInnerType();
- return innerType && innerType.isa<mlir::NoneType>();
+ return innerType && mlir::isa<mlir::NoneType>(innerType);
}
// TYPE(*)
return isAssumedType(ty);
@@ -376,7 +375,7 @@ mlir::Type unwrapInnerType(mlir::Type ty) {
return llvm::TypeSwitch<mlir::Type, mlir::Type>(ty)
.Case<fir::PointerType, fir::HeapType, fir::SequenceType>([](auto t) {
mlir::Type eleTy = t.getEleTy();
- if (auto seqTy = eleTy.dyn_cast<fir::SequenceType>())
+ if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(eleTy))
return seqTy.getEleTy();
return eleTy;
})
@@ -385,13 +384,14 @@ mlir::Type unwrapInnerType(mlir::Type ty) {
}
bool isRecordWithAllocatableMember(mlir::Type ty) {
- if (auto recTy = ty.dyn_cast<fir::RecordType>())
+ if (auto recTy = mlir::dyn_cast<fir::RecordType>(ty))
for (auto [field, memTy] : recTy.getTypeList()) {
if (fir::isAllocatableType(memTy))
return true;
// A record type cannot recursively include itself as a direct member.
// There must be an intervening `ptr` type, so recursion is safe here.
- if (memTy.isa<fir::RecordType>() && isRecordWithAllocatableMember(memTy))
+ if (mlir::isa<fir::RecordType>(memTy) &&
+ isRecordWithAllocatableMember(memTy))
return true;
}
return false;
@@ -399,11 +399,12 @@ bool isRecordWithAllocatableMember(mlir::Type ty) {
bool isRecordWithDescriptorMember(mlir::Type ty) {
ty = unwrapSequenceType(ty);
- if (auto recTy = ty.dyn_cast<fir::RecordType>())
+ if (auto recTy = mlir::dyn_cast<fir::RecordType>(ty))
for (auto [field, memTy] : recTy.getTypeList()) {
if (mlir::isa<fir::BaseBoxType>(memTy))
return true;
- if (memTy.isa<fir::RecordType>() && isRecordWithDescriptorMember(memTy))
+ if (mlir::isa<fir::RecordType>(memTy) &&
+ isRecordWithDescriptorMember(memTy))
return true;
}
return false;
@@ -412,7 +413,7 @@ bool isRecordWithDescriptorMember(mlir::Type ty) {
mlir::Type unwrapAllRefAndSeqType(mlir::Type ty) {
while (true) {
mlir::Type nt = unwrapSequenceType(unwrapRefType(ty));
- if (auto vecTy = nt.dyn_cast<fir::VectorType>())
+ if (auto vecTy = mlir::dyn_cast<fir::VectorType>(nt))
nt = vecTy.getEleTy();
if (nt == ty)
return ty;
@@ -421,11 +422,11 @@ mlir::Type unwrapAllRefAndSeqType(mlir::Type ty) {
}
mlir::Type unwrapSeqOrBoxedSeqType(mlir::Type ty) {
- if (auto seqTy = ty.dyn_cast<fir::SequenceType>())
+ if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(ty))
return seqTy.getEleTy();
- if (auto boxTy = ty.dyn_cast<fir::BaseBoxType>()) {
+ if (auto boxTy = mlir::dyn_cast<fir::BaseBoxType>(ty)) {
auto eleTy = unwrapRefType(boxTy.getEleTy());
- if (auto seqTy = eleTy.dyn_cast<fir::SequenceType>())
+ if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(eleTy))
return seqTy.getEleTy();
}
return ty;
@@ -433,7 +434,7 @@ mlir::Type unwrapSeqOrBoxedSeqType(mlir::Type ty) {
unsigned getBoxRank(mlir::Type boxTy) {
auto eleTy = fir::dyn_cast_ptrOrBoxEleTy(boxTy);
- if (auto seqTy = eleTy.dyn_cast<fir::SequenceType>())
+ if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(eleTy))
return seqTy.getDimension();
return 0;
}
@@ -441,7 +442,7 @@ unsigned getBoxRank(mlir::Type boxTy) {
/// Return the ISO_C_BINDING intrinsic module value of type \p ty.
int getTypeCode(mlir::Type ty, const fir::KindMapping &kindMap) {
unsigned width = 0;
- if (mlir::IntegerType intTy = ty.dyn_cast<mlir::IntegerType>()) {
+ if (mlir::IntegerType intTy = mlir::dyn_cast<mlir::IntegerType>(ty)) {
switch (intTy.getWidth()) {
case 8:
return CFI_type_int8_t;
@@ -456,7 +457,7 @@ int getTypeCode(mlir::Type ty, const fir::KindMapping &kindMap) {
}
llvm_unreachable("unsupported integer type");
}
- if (fir::LogicalType logicalTy = ty.dyn_cast<fir::LogicalType>()) {
+ if (fir::LogicalType logicalTy = mlir::dyn_cast<fir::LogicalType>(ty)) {
switch (kindMap.getLogicalBitsize(logicalTy.getFKind())) {
case 8:
return CFI_type_Bool;
@@ -469,7 +470,7 @@ int getTypeCode(mlir::Type ty, const fir::KindMapping &kindMap) {
}
llvm_unreachable("unsupported logical type");
}
- if (mlir::FloatType floatTy = ty.dyn_cast<mlir::FloatType>()) {
+ if (mlir::FloatType floatTy = mlir::dyn_cast<mlir::FloatType>(ty)) {
switch (floatTy.getWidth()) {
case 16:
return floatTy.isBF16() ? CFI_type_bfloat : CFI_type_half_float;
@@ -485,13 +486,14 @@ int getTypeCode(mlir::Type ty, const fir::KindMapping &kindMap) {
llvm_unreachable("unsupported real type");
}
if (fir::isa_complex(ty)) {
- if (mlir::ComplexType complexTy = ty.dyn_cast<mlir::ComplexType>()) {
+ if (mlir::ComplexType complexTy = mlir::dyn_cast<mlir::ComplexType>(ty)) {
mlir::FloatType floatTy =
- complexTy.getElementType().cast<mlir::FloatType>();
+ mlir::cast<mlir::FloatType>(complexTy.getElementType());
if (floatTy.isBF16())
return CFI_type_bfloat_Complex;
width = floatTy.getWidth();
- } else if (fir::ComplexType complexTy = ty.dyn_cast<fir::ComplexType>()) {
+ } else if (fir::ComplexType complexTy =
+ mlir::dyn_cast<fir::ComplexType>(ty)) {
auto FKind = complexTy.getFKind();
if (FKind == 3)
return CFI_type_bfloat_Complex;
@@ -511,7 +513,7 @@ int getTypeCode(mlir::Type ty, const fir::KindMapping &kindMap) {
}
llvm_unreachable("unsupported complex size");
}
- if (fir::CharacterType charTy = ty.dyn_cast<fir::CharacterType>()) {
+ if (fir::CharacterType charTy = mlir::dyn_cast<fir::CharacterType>(ty)) {
switch (kindMap.getCharacterBitsize(charTy.getFKind())) {
case 8:
return CFI_type_char;
@@ -524,7 +526,7 @@ int getTypeCode(mlir::Type ty, const fir::KindMapping &kindMap) {
}
if (fir::isa_ref_type(ty))
return CFI_type_cptr;
- if (ty.isa<fir::RecordType>())
+ if (mlir::isa<fir::RecordType>(ty))
return CFI_type_struct;
llvm_unreachable("unsupported type");
}
@@ -542,12 +544,12 @@ std::string getTypeAsString(mlir::Type ty, const fir::KindMapping &kindMap,
name << "idx";
} else if (ty.isIntOrIndex()) {
name << 'i' << ty.getIntOrFloatBitWidth();
- } else if (ty.isa<mlir::FloatType>()) {
+ } else if (mlir::isa<mlir::FloatType>(ty)) {
name << 'f' << ty.getIntOrFloatBitWidth();
} else if (fir::isa_complex(ty)) {
name << 'z';
if (auto cplxTy = mlir::dyn_cast_or_null<mlir::ComplexType>(ty)) {
- auto floatTy = cplxTy.getElementType().cast<mlir::FloatType>();
+ auto floatTy = mlir::cast<mlir::FloatType>(cplxTy.getElementType());
name << floatTy.getWidth();
} else if (auto cplxTy = mlir::dyn_cast_or_null<fir::ComplexType>(ty)) {
name << kindMap.getRealBitsize(cplxTy.getFKind());
@@ -644,7 +646,7 @@ static llvm::SmallPtrSet<detail::RecordTypeStorage const *, 4>
} // namespace
void fir::verifyIntegralType(mlir::Type type) {
- if (isaIntegerType(type) || type.isa<mlir::IndexType>())
+ if (isaIntegerType(type) || mlir::isa<mlir::IndexType>(type))
return;
llvm::report_fatal_error("expected integral type");
}
@@ -656,9 +658,9 @@ void fir::printFirType(FIROpsDialect *, mlir::Type ty,
}
bool fir::isa_unknown_size_box(mlir::Type t) {
- if (auto boxTy = t.dyn_cast<fir::BaseBoxType>()) {
+ if (auto boxTy = mlir::dyn_cast<fir::BaseBoxType>(t)) {
auto valueType = fir::unwrapPassByRefType(boxTy);
- if (auto seqTy = valueType.dyn_cast<fir::SequenceType>())
+ if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(valueType))
if (seqTy.hasUnknownShape())
return true;
}
@@ -684,18 +686,18 @@ void fir::BoxProcType::print(mlir::AsmPrinter &printer) const {
mlir::LogicalResult
BoxProcType::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
mlir::Type eleTy) {
- if (eleTy.isa<mlir::FunctionType>())
+ if (mlir::isa<mlir::FunctionType>(eleTy))
return mlir::success();
- if (auto refTy = eleTy.dyn_cast<ReferenceType>())
- if (refTy.isa<mlir::FunctionType>())
+ if (auto refTy = mlir::dyn_cast<ReferenceType>(eleTy))
+ if (mlir::isa<mlir::FunctionType>(refTy))
return mlir::success();
return emitError() << "invalid type for boxproc" << eleTy << '\n';
}
static bool cannotBePointerOrHeapElementType(mlir::Type eleTy) {
- return eleTy.isa<BoxType, BoxCharType, BoxProcType, ShapeType, ShapeShiftType,
+ return mlir::isa<BoxType, BoxCharType, BoxProcType, ShapeType, ShapeShiftType,
SliceType, FieldType, LenType, HeapType, PointerType,
- ReferenceType, TypeDescType>();
+ ReferenceType, TypeDescType>(eleTy);
}
//===----------------------------------------------------------------------===//
@@ -705,7 +707,7 @@ static bool cannotBePointerOrHeapElementType(mlir::Type eleTy) {
mlir::LogicalResult
fir::BoxType::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
mlir::Type eleTy) {
- if (eleTy.isa<fir::BaseBoxType>())
+ if (mlir::isa<fir::BaseBoxType>(eleTy))
return emitError() << "invalid element type\n";
// TODO
return mlir::success();
@@ -774,10 +776,10 @@ void fir::CharacterType::print(mlir::AsmPrinter &printer) const {
mlir::LogicalResult
fir::ClassType::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
mlir::Type eleTy) {
- if (eleTy.isa<fir::RecordType, fir::SequenceType, fir::HeapType,
+ if (mlir::isa<fir::RecordType, fir::SequenceType, fir::HeapType,
fir::PointerType, mlir::NoneType, mlir::IntegerType,
mlir::FloatType, fir::CharacterType, fir::LogicalType,
- fir::ComplexType, mlir::ComplexType>())
+ fir::ComplexType, mlir::ComplexType>(eleTy))
return mlir::success();
return emitError() << "invalid element type\n";
}
@@ -1048,8 +1050,8 @@ void fir::ReferenceType::print(mlir::AsmPrinter &printer) const {
mlir::LogicalResult fir::ReferenceType::verify(
llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
mlir::Type eleTy) {
- if (eleTy.isa<ShapeType, ShapeShiftType, SliceType, FieldType, LenType,
- ReferenceType, TypeDescType>())
+ if (mlir::isa<ShapeType, ShapeShiftType, SliceType, FieldType, LenType,
+ ReferenceType, TypeDescType>(eleTy))
return emitError() << "cannot build a reference to type: " << eleTy << '\n';
return mlir::success();
}
@@ -1124,9 +1126,9 @@ mlir::LogicalResult fir::SequenceType::verify(
llvm::ArrayRef<int64_t> shape, mlir::Type eleTy,
mlir::AffineMapAttr layoutMap) {
// DIMENSION attribute can only be applied to an intrinsic or record type
- if (eleTy.isa<BoxType, BoxCharType, BoxProcType, ShapeType, ShapeShiftType,
+ if (mlir::isa<BoxType, BoxCharType, BoxProcType, ShapeType, ShapeShiftType,
ShiftType, SliceType, FieldType, LenType, HeapType, PointerType,
- ReferenceType, TypeDescType, SequenceType>())
+ ReferenceType, TypeDescType, SequenceType>(eleTy))
return emitError() << "cannot build an array of this element type: "
<< eleTy << '\n';
return mlir::success();
@@ -1197,9 +1199,9 @@ void fir::TypeDescType::print(mlir::AsmPrinter &printer) const {
mlir::LogicalResult fir::TypeDescType::verify(
llvm::function_ref<mlir::InFlightDiagnostic()> emitError,
mlir::Type eleTy) {
- if (eleTy.isa<BoxType, BoxCharType, BoxProcType, ShapeType, ShapeShiftType,
+ if (mlir::isa<BoxType, BoxCharType, BoxProcType, ShapeType, ShapeShiftType,
ShiftType, SliceType, FieldType, LenType, ReferenceType,
- TypeDescType>())
+ TypeDescType>(eleTy))
return emitError() << "cannot build a type descriptor of type: " << eleTy
<< '\n';
return mlir::success();
@@ -1236,10 +1238,10 @@ bool fir::VectorType::isValidElementType(mlir::Type t) {
}
bool fir::isCharacterProcedureTuple(mlir::Type ty, bool acceptRawFunc) {
- mlir::TupleType tuple = ty.dyn_cast<mlir::TupleType>();
+ mlir::TupleType tuple = mlir::dyn_cast<mlir::TupleType>(ty);
return tuple && tuple.size() == 2 &&
- (tuple.getType(0).isa<fir::BoxProcType>() ||
- (acceptRawFunc && tuple.getType(0).isa<mlir::FunctionType>())) &&
+ (mlir::isa<fir::BoxProcType>(tuple.getType(0)) ||
+ (acceptRawFunc && mlir::isa<mlir::FunctionType>(tuple.getType(0)))) &&
fir::isa_integer(tuple.getType(1));
}
@@ -1247,7 +1249,8 @@ bool fir::hasAbstractResult(mlir::FunctionType ty) {
if (ty.getNumResults() == 0)
return false;
auto resultType = ty.getResult(0);
- return resultType.isa<fir::SequenceType, fir::BaseBoxType, fir::RecordType>();
+ return mlir::isa<fir::SequenceType, fir::BaseBoxType, fir::RecordType>(
+ resultType);
}
/// Convert llvm::Type::TypeID to mlir::Type. \p kind is provided for error
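
For reference, a minimal sketch (assumed helper, not in the patch) of the variadic form that collapses the old isa chains in verifyRecordMemberType and cannotBePointerOrHeapElementType above:

#include "mlir/IR/BuiltinTypes.h"

// Hypothetical helper: one variadic mlir::isa call replaces a disjunction
// of member-function isa calls.
static bool isIntOrFloat(mlir::Type ty) {
  // Old style: ty.isa<mlir::IntegerType>() || ty.isa<mlir::FloatType>()
  return mlir::isa<mlir::IntegerType, mlir::FloatType>(ty);
}
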
diff --git a/flang/lib/Optimizer/Dialect/FortranVariableInterface.cpp b/flang/lib/Optimizer/Dialect/FortranVariableInterface.cpp
index 94f1689dfb05..70b1a2f3d844 100644
--- a/flang/lib/Optimizer/Dialect/FortranVariableInterface.cpp
+++ b/flang/lib/Optimizer/Dialect/FortranVariableInterface.cpp
@@ -18,7 +18,7 @@ mlir::LogicalResult
fir::FortranVariableOpInterface::verifyDeclareLikeOpImpl(mlir::Value memref) {
const unsigned numExplicitTypeParams = getExplicitTypeParams().size();
mlir::Type memType = memref.getType();
- const bool sourceIsBoxValue = memType.isa<fir::BaseBoxType>();
+ const bool sourceIsBoxValue = mlir::isa<fir::BaseBoxType>(memType);
const bool sourceIsBoxAddress = fir::isBoxAddress(memType);
const bool sourceIsBox = sourceIsBoxValue || sourceIsBoxAddress;
if (isCharacter()) {
@@ -29,7 +29,8 @@ fir::FortranVariableOpInterface::verifyDeclareLikeOpImpl(mlir::Value memref) {
return emitOpError("must be provided exactly one type parameter when its "
"base is a character that is not a box");
- } else if (auto recordType = getElementType().dyn_cast<fir::RecordType>()) {
+ } else if (auto recordType =
+ mlir::dyn_cast<fir::RecordType>(getElementType())) {
if (numExplicitTypeParams < recordType.getNumLenParams() && !sourceIsBox)
return emitOpError("must be provided all the derived type length "
"parameters when the base is not a box");
@@ -45,16 +46,16 @@ fir::FortranVariableOpInterface::verifyDeclareLikeOpImpl(mlir::Value memref) {
if (sourceIsBoxAddress)
return emitOpError("for box address must not have a shape operand");
unsigned shapeRank = 0;
- if (auto shapeType = shape.getType().dyn_cast<fir::ShapeType>()) {
+ if (auto shapeType = mlir::dyn_cast<fir::ShapeType>(shape.getType())) {
shapeRank = shapeType.getRank();
} else if (auto shapeShiftType =
- shape.getType().dyn_cast<fir::ShapeShiftType>()) {
+ mlir::dyn_cast<fir::ShapeShiftType>(shape.getType())) {
shapeRank = shapeShiftType.getRank();
} else {
if (!sourceIsBoxValue)
emitOpError("of array entity with a raw address base must have a "
"shape operand that is a shape or shapeshift");
- shapeRank = shape.getType().cast<fir::ShiftType>().getRank();
+ shapeRank = mlir::cast<fir::ShiftType>(shape.getType()).getRank();
}
std::optional<unsigned> rank = getRank();
diff --git a/flang/lib/Optimizer/HLFIR/IR/HLFIRDialect.cpp b/flang/lib/Optimizer/HLFIR/IR/HLFIRDialect.cpp
index 08b2b0538c73..0b61c0edce62 100644
--- a/flang/lib/Optimizer/HLFIR/IR/HLFIRDialect.cpp
+++ b/flang/lib/Optimizer/HLFIR/IR/HLFIRDialect.cpp
@@ -84,7 +84,8 @@ bool hlfir::isFortranVariableType(mlir::Type type) {
return llvm::TypeSwitch<mlir::Type, bool>(type)
.Case<fir::ReferenceType, fir::PointerType, fir::HeapType>([](auto p) {
mlir::Type eleType = p.getEleTy();
- return eleType.isa<fir::BaseBoxType>() || !fir::hasDynamicSize(eleType);
+ return mlir::isa<fir::BaseBoxType>(eleType) ||
+ !fir::hasDynamicSize(eleType);
})
.Case<fir::BaseBoxType, fir::BoxCharType>([](auto) { return true; })
.Case<fir::VectorType>([](auto) { return true; })
@@ -93,15 +94,15 @@ bool hlfir::isFortranVariableType(mlir::Type type) {
bool hlfir::isFortranScalarCharacterType(mlir::Type type) {
return isFortranScalarCharacterExprType(type) ||
- type.isa<fir::BoxCharType>() ||
- fir::unwrapPassByRefType(fir::unwrapRefType(type))
- .isa<fir::CharacterType>();
+ mlir::isa<fir::BoxCharType>(type) ||
+ mlir::isa<fir::CharacterType>(
+ fir::unwrapPassByRefType(fir::unwrapRefType(type)));
}
bool hlfir::isFortranScalarCharacterExprType(mlir::Type type) {
- if (auto exprType = type.dyn_cast<hlfir::ExprType>())
+ if (auto exprType = mlir::dyn_cast<hlfir::ExprType>(type))
return exprType.isScalar() &&
- exprType.getElementType().isa<fir::CharacterType>();
+ mlir::isa<fir::CharacterType>(exprType.getElementType());
return false;
}
@@ -121,8 +122,8 @@ bool hlfir::isFortranScalarNumericalType(mlir::Type type) {
bool hlfir::isFortranNumericalArrayObject(mlir::Type type) {
if (isBoxAddressType(type))
return false;
- if (auto arrayTy =
- getFortranElementOrSequenceType(type).dyn_cast<fir::SequenceType>())
+ if (auto arrayTy = mlir::dyn_cast<fir::SequenceType>(
+ getFortranElementOrSequenceType(type)))
return isFortranScalarNumericalType(arrayTy.getEleTy());
return false;
}
@@ -130,8 +131,8 @@ bool hlfir::isFortranNumericalArrayObject(mlir::Type type) {
bool hlfir::isFortranNumericalOrLogicalArrayObject(mlir::Type type) {
if (isBoxAddressType(type))
return false;
- if (auto arrayTy =
- getFortranElementOrSequenceType(type).dyn_cast<fir::SequenceType>()) {
+ if (auto arrayTy = mlir::dyn_cast<fir::SequenceType>(
+ getFortranElementOrSequenceType(type))) {
mlir::Type eleTy = arrayTy.getEleTy();
return isFortranScalarNumericalType(eleTy) ||
mlir::isa<fir::LogicalType>(eleTy);
@@ -142,7 +143,8 @@ bool hlfir::isFortranNumericalOrLogicalArrayObject(mlir::Type type) {
bool hlfir::isFortranArrayObject(mlir::Type type) {
if (isBoxAddressType(type))
return false;
- return !!getFortranElementOrSequenceType(type).dyn_cast<fir::SequenceType>();
+ return !!mlir::dyn_cast<fir::SequenceType>(
+ getFortranElementOrSequenceType(type));
}
bool hlfir::isPassByRefOrIntegerType(mlir::Type type) {
@@ -151,7 +153,7 @@ bool hlfir::isPassByRefOrIntegerType(mlir::Type type) {
}
bool hlfir::isI1Type(mlir::Type type) {
- if (mlir::IntegerType integer = type.dyn_cast<mlir::IntegerType>())
+ if (mlir::IntegerType integer = mlir::dyn_cast<mlir::IntegerType>(type))
if (integer.getWidth() == 1)
return true;
return false;
@@ -160,8 +162,8 @@ bool hlfir::isI1Type(mlir::Type type) {
bool hlfir::isFortranLogicalArrayObject(mlir::Type type) {
if (isBoxAddressType(type))
return false;
- if (auto arrayTy =
- getFortranElementOrSequenceType(type).dyn_cast<fir::SequenceType>()) {
+ if (auto arrayTy = mlir::dyn_cast<fir::SequenceType>(
+ getFortranElementOrSequenceType(type))) {
mlir::Type eleTy = arrayTy.getEleTy();
return mlir::isa<fir::LogicalType>(eleTy);
}
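
The HLFIR predicates above all follow the same dyn_cast-then-query shape; a standalone sketch with core MLIR types (the helper name is illustrative, not from the patch):

#include "mlir/IR/BuiltinTypes.h"

// Hypothetical helper: mlir::dyn_cast yields a null type on mismatch, so
// the rank query only runs when the cast succeeded.
static unsigned rankOrZero(mlir::Type ty) {
  if (auto shaped = mlir::dyn_cast<mlir::ShapedType>(ty))
    return shaped.hasRank() ? static_cast<unsigned>(shaped.getRank()) : 0;
  return 0;
}
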
diff --git a/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp b/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp
index 8bad4e445082..0d62ca4954e6 100644
--- a/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp
+++ b/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp
@@ -74,8 +74,8 @@ getIntrinsicEffects(mlir::Operation *self,
/// Is this a fir.[ref/ptr/heap]<fir.[box/class]<fir.heap<T>>> type?
static bool isAllocatableBoxRef(mlir::Type type) {
fir::BaseBoxType boxType =
- fir::dyn_cast_ptrEleTy(type).dyn_cast_or_null<fir::BaseBoxType>();
- return boxType && boxType.getEleTy().isa<fir::HeapType>();
+ mlir::dyn_cast_or_null<fir::BaseBoxType>(fir::dyn_cast_ptrEleTy(type));
+ return boxType && mlir::isa<fir::HeapType>(boxType.getEleTy());
}
mlir::LogicalResult hlfir::AssignOp::verify() {
@@ -84,7 +84,7 @@ mlir::LogicalResult hlfir::AssignOp::verify() {
return emitOpError("lhs must be an allocatable when `realloc` is set");
if (mustKeepLhsLengthInAllocatableAssignment() &&
!(isAllocatableAssignment() &&
- hlfir::getFortranElementType(lhsType).isa<fir::CharacterType>()))
+ mlir::isa<fir::CharacterType>(hlfir::getFortranElementType(lhsType))))
return emitOpError("`realloc` must be set and lhs must be a character "
"allocatable when `keep_lhs_length_if_realloc` is set");
return mlir::success();
@@ -99,13 +99,13 @@ mlir::LogicalResult hlfir::AssignOp::verify() {
mlir::Type hlfir::DeclareOp::getHLFIRVariableType(mlir::Type inputType,
bool hasExplicitLowerBounds) {
mlir::Type type = fir::unwrapRefType(inputType);
- if (type.isa<fir::BaseBoxType>())
+ if (mlir::isa<fir::BaseBoxType>(type))
return inputType;
- if (auto charType = type.dyn_cast<fir::CharacterType>())
+ if (auto charType = mlir::dyn_cast<fir::CharacterType>(type))
if (charType.hasDynamicLen())
return fir::BoxCharType::get(charType.getContext(), charType.getFKind());
- auto seqType = type.dyn_cast<fir::SequenceType>();
+ auto seqType = mlir::dyn_cast<fir::SequenceType>(type);
bool hasDynamicExtents =
seqType && fir::sequenceWithNonConstantShape(seqType);
mlir::Type eleType = seqType ? seqType.getEleTy() : type;
@@ -117,7 +117,8 @@ mlir::Type hlfir::DeclareOp::getHLFIRVariableType(mlir::Type inputType,
}
static bool hasExplicitLowerBounds(mlir::Value shape) {
- return shape && shape.getType().isa<fir::ShapeShiftType, fir::ShiftType>();
+ return shape &&
+ mlir::isa<fir::ShapeShiftType, fir::ShiftType>(shape.getType());
}
void hlfir::DeclareOp::build(mlir::OpBuilder &builder,
@@ -288,7 +289,7 @@ mlir::LogicalResult hlfir::DesignateOp::verify() {
bool hasBoxComponent;
if (getComponent()) {
auto component = getComponent().value();
- auto recType = baseElementType.dyn_cast<fir::RecordType>();
+ auto recType = mlir::dyn_cast<fir::RecordType>(baseElementType);
if (!recType)
return emitOpError(
"component must be provided only when the memref is a derived type");
@@ -300,14 +301,14 @@ mlir::LogicalResult hlfir::DesignateOp::verify() {
}
mlir::Type fieldType = recType.getType(fieldIdx);
mlir::Type componentBaseType = getFortranElementOrSequenceType(fieldType);
- hasBoxComponent = fieldType.isa<fir::BaseBoxType>();
- if (componentBaseType.isa<fir::SequenceType>() &&
- baseType.isa<fir::SequenceType>() &&
+ hasBoxComponent = mlir::isa<fir::BaseBoxType>(fieldType);
+ if (mlir::isa<fir::SequenceType>(componentBaseType) &&
+ mlir::isa<fir::SequenceType>(baseType) &&
(numSubscripts == 0 || subscriptsRank > 0))
return emitOpError("indices must be provided and must not contain "
"triplets when both memref and component are arrays");
if (numSubscripts != 0) {
- if (!componentBaseType.isa<fir::SequenceType>())
+ if (!mlir::isa<fir::SequenceType>(componentBaseType))
return emitOpError("indices must not be provided if component appears "
"and is not an array component");
if (!getComponentShape())
@@ -315,9 +316,9 @@ mlir::LogicalResult hlfir::DesignateOp::verify() {
"component_shape must be provided when indexing a component");
mlir::Type compShapeType = getComponentShape().getType();
unsigned componentRank =
- componentBaseType.cast<fir::SequenceType>().getDimension();
- auto shapeType = compShapeType.dyn_cast<fir::ShapeType>();
- auto shapeShiftType = compShapeType.dyn_cast<fir::ShapeShiftType>();
+ mlir::cast<fir::SequenceType>(componentBaseType).getDimension();
+ auto shapeType = mlir::dyn_cast<fir::ShapeType>(compShapeType);
+ auto shapeShiftType = mlir::dyn_cast<fir::ShapeShiftType>(compShapeType);
if (!((shapeType && shapeType.getRank() == componentRank) ||
(shapeShiftType && shapeShiftType.getRank() == componentRank)))
return emitOpError("component_shape must be a fir.shape or "
@@ -325,33 +326,33 @@ mlir::LogicalResult hlfir::DesignateOp::verify() {
if (numSubscripts > componentRank)
return emitOpError("indices number must match array component rank");
}
- if (auto baseSeqType = baseType.dyn_cast<fir::SequenceType>())
+ if (auto baseSeqType = mlir::dyn_cast<fir::SequenceType>(baseType))
// This case must come first to cover "array%array_comp(i, j)" that has
// subscripts for the component but whose rank comes from the base.
outputRank = baseSeqType.getDimension();
else if (numSubscripts != 0)
outputRank = subscriptsRank;
else if (auto componentSeqType =
- componentBaseType.dyn_cast<fir::SequenceType>())
+ mlir::dyn_cast<fir::SequenceType>(componentBaseType))
outputRank = componentSeqType.getDimension();
outputElementType = fir::unwrapSequenceType(componentBaseType);
} else {
outputElementType = baseElementType;
unsigned baseTypeRank =
- baseType.isa<fir::SequenceType>()
- ? baseType.cast<fir::SequenceType>().getDimension()
+ mlir::isa<fir::SequenceType>(baseType)
+ ? mlir::cast<fir::SequenceType>(baseType).getDimension()
: 0;
if (numSubscripts != 0) {
if (baseTypeRank != numSubscripts)
return emitOpError("indices number must match memref rank");
outputRank = subscriptsRank;
- } else if (auto baseSeqType = baseType.dyn_cast<fir::SequenceType>()) {
+ } else if (auto baseSeqType = mlir::dyn_cast<fir::SequenceType>(baseType)) {
outputRank = baseSeqType.getDimension();
}
}
if (!getSubstring().empty()) {
- if (!outputElementType.isa<fir::CharacterType>())
+ if (!mlir::isa<fir::CharacterType>(outputElementType))
return emitOpError("memref or component must have character type if "
"substring indices are provided");
if (getSubstring().size() != 2)
@@ -361,16 +362,16 @@ mlir::LogicalResult hlfir::DesignateOp::verify() {
if (!fir::isa_complex(outputElementType))
return emitOpError("memref or component must have complex type if "
"complex_part is provided");
- if (auto firCplx = outputElementType.dyn_cast<fir::ComplexType>())
+ if (auto firCplx = mlir::dyn_cast<fir::ComplexType>(outputElementType))
outputElementType = firCplx.getElementType();
else
outputElementType =
- outputElementType.cast<mlir::ComplexType>().getElementType();
+ mlir::cast<mlir::ComplexType>(outputElementType).getElementType();
}
mlir::Type resultBaseType =
getFortranElementOrSequenceType(getResult().getType());
unsigned resultRank = 0;
- if (auto resultSeqType = resultBaseType.dyn_cast<fir::SequenceType>())
+ if (auto resultSeqType = mlir::dyn_cast<fir::SequenceType>(resultBaseType))
resultRank = resultSeqType.getDimension();
if (resultRank != outputRank)
return emitOpError("result type rank is not consistent with operands, "
@@ -380,10 +381,10 @@ mlir::LogicalResult hlfir::DesignateOp::verify() {
// result type must match the one that was inferred here, except the character
// length may differ because of substrings.
if (resultElementType != outputElementType &&
- !(resultElementType.isa<fir::CharacterType>() &&
- outputElementType.isa<fir::CharacterType>()) &&
- !(resultElementType.isa<mlir::FloatType>() &&
- outputElementType.isa<fir::RealType>()))
+ !(mlir::isa<fir::CharacterType>(resultElementType) &&
+ mlir::isa<fir::CharacterType>(outputElementType)) &&
+ !(mlir::isa<mlir::FloatType>(resultElementType) &&
+ mlir::isa<fir::RealType>(outputElementType)))
return emitOpError(
"result element type is not consistent with operands, expected ")
<< outputElementType;
@@ -401,22 +402,22 @@ mlir::LogicalResult hlfir::DesignateOp::verify() {
return emitOpError("shape must be provided if and only if the result is "
"an array that is not a box address");
if (resultRank != 0) {
- auto shapeType = getShape().getType().dyn_cast<fir::ShapeType>();
+ auto shapeType = mlir::dyn_cast<fir::ShapeType>(getShape().getType());
auto shapeShiftType =
- getShape().getType().dyn_cast<fir::ShapeShiftType>();
+ mlir::dyn_cast<fir::ShapeShiftType>(getShape().getType());
if (!((shapeType && shapeType.getRank() == resultRank) ||
(shapeShiftType && shapeShiftType.getRank() == resultRank)))
return emitOpError("shape must be a fir.shape or fir.shapeshift with "
"the rank of the result");
}
auto numLenParam = getTypeparams().size();
- if (outputElementType.isa<fir::CharacterType>()) {
+ if (mlir::isa<fir::CharacterType>(outputElementType)) {
if (numLenParam != 1)
return emitOpError("must be provided one length parameter when the "
"result is a character");
} else if (fir::isRecordWithTypeParameters(outputElementType)) {
if (numLenParam !=
- outputElementType.cast<fir::RecordType>().getNumLenParams())
+ mlir::cast<fir::RecordType>(outputElementType).getNumLenParams())
return emitOpError("must be provided the same number of length "
"parameters as in the result derived type");
} else if (numLenParam != 0) {
@@ -434,18 +435,18 @@ mlir::LogicalResult hlfir::DesignateOp::verify() {
mlir::LogicalResult hlfir::ParentComponentOp::verify() {
mlir::Type baseType =
hlfir::getFortranElementOrSequenceType(getMemref().getType());
- auto maybeInputSeqType = baseType.dyn_cast<fir::SequenceType>();
+ auto maybeInputSeqType = mlir::dyn_cast<fir::SequenceType>(baseType);
unsigned inputTypeRank =
maybeInputSeqType ? maybeInputSeqType.getDimension() : 0;
unsigned shapeRank = 0;
if (mlir::Value shape = getShape())
- if (auto shapeType = shape.getType().dyn_cast<fir::ShapeType>())
+ if (auto shapeType = mlir::dyn_cast<fir::ShapeType>(shape.getType()))
shapeRank = shapeType.getRank();
if (inputTypeRank != shapeRank)
return emitOpError(
"must be provided a shape if and only if the base is an array");
mlir::Type outputBaseType = hlfir::getFortranElementOrSequenceType(getType());
- auto maybeOutputSeqType = outputBaseType.dyn_cast<fir::SequenceType>();
+ auto maybeOutputSeqType = mlir::dyn_cast<fir::SequenceType>(outputBaseType);
unsigned outputTypeRank =
maybeOutputSeqType ? maybeOutputSeqType.getDimension() : 0;
if (inputTypeRank != outputTypeRank)
@@ -459,23 +460,23 @@ mlir::LogicalResult hlfir::ParentComponentOp::verify() {
return emitOpError(
"result type extents are inconsistent with memref type");
fir::RecordType baseRecType =
- hlfir::getFortranElementType(baseType).dyn_cast<fir::RecordType>();
- fir::RecordType outRecType =
- hlfir::getFortranElementType(outputBaseType).dyn_cast<fir::RecordType>();
+ mlir::dyn_cast<fir::RecordType>(hlfir::getFortranElementType(baseType));
+ fir::RecordType outRecType = mlir::dyn_cast<fir::RecordType>(
+ hlfir::getFortranElementType(outputBaseType));
if (!baseRecType || !outRecType)
return emitOpError("result type and input type must be derived types");
// Note: result should not be a fir.class: its dynamic type is being set to
// the parent type and allowing fir.class would break the operation codegen:
// it would keep the input dynamic type.
- if (getType().isa<fir::ClassType>())
+ if (mlir::isa<fir::ClassType>(getType()))
return emitOpError("result type must not be polymorphic");
// The array results are known to not be dis-contiguous in most cases (the
// exception being if the parent type was extended by a type without any
// components): require a fir.box to be used for the result to carry the
// strides.
- if (!getType().isa<fir::BoxType>() &&
+ if (!mlir::isa<fir::BoxType>(getType()) &&
(outputTypeRank != 0 || fir::isRecordWithTypeParameters(outRecType)))
return emitOpError("result type must be a fir.box if the result is an "
"array or has length parameters");
@@ -496,9 +497,8 @@ verifyLogicalReductionOp(LogicalReductionOp reductionOp) {
mlir::Value mask = reductionOp->getMask();
mlir::Value dim = reductionOp->getDim();
- fir::SequenceType maskTy =
- hlfir::getFortranElementOrSequenceType(mask.getType())
- .cast<fir::SequenceType>();
+ fir::SequenceType maskTy = mlir::cast<fir::SequenceType>(
+ hlfir::getFortranElementOrSequenceType(mask.getType()));
mlir::Type logicalTy = maskTy.getEleTy();
llvm::ArrayRef<int64_t> maskShape = maskTy.getShape();
@@ -576,9 +576,8 @@ mlir::LogicalResult hlfir::CountOp::verify() {
mlir::Value mask = getMask();
mlir::Value dim = getDim();
- fir::SequenceType maskTy =
- hlfir::getFortranElementOrSequenceType(mask.getType())
- .cast<fir::SequenceType>();
+ fir::SequenceType maskTy = mlir::cast<fir::SequenceType>(
+ hlfir::getFortranElementOrSequenceType(mask.getType()));
llvm::ArrayRef<int64_t> maskShape = maskTy.getShape();
mlir::Type resultType = results[0];
@@ -613,13 +612,14 @@ void hlfir::CountOp::getEffects(
//===----------------------------------------------------------------------===//
static unsigned getCharacterKind(mlir::Type t) {
- return hlfir::getFortranElementType(t).cast<fir::CharacterType>().getFKind();
+ return mlir::cast<fir::CharacterType>(hlfir::getFortranElementType(t))
+ .getFKind();
}
static std::optional<fir::CharacterType::LenType>
getCharacterLengthIfStatic(mlir::Type t) {
if (auto charType =
- hlfir::getFortranElementType(t).dyn_cast<fir::CharacterType>())
+ mlir::dyn_cast<fir::CharacterType>(hlfir::getFortranElementType(t)))
if (charType.hasConstantLen())
return charType.getLen();
return std::nullopt;
@@ -672,15 +672,13 @@ verifyArrayAndMaskForReductionOp(NumericalReductionOp reductionOp) {
mlir::Value array = reductionOp->getArray();
mlir::Value mask = reductionOp->getMask();
- fir::SequenceType arrayTy =
- hlfir::getFortranElementOrSequenceType(array.getType())
- .cast<fir::SequenceType>();
+ fir::SequenceType arrayTy = mlir::cast<fir::SequenceType>(
+ hlfir::getFortranElementOrSequenceType(array.getType()));
llvm::ArrayRef<int64_t> arrayShape = arrayTy.getShape();
if (mask) {
- fir::SequenceType maskSeq =
- hlfir::getFortranElementOrSequenceType(mask.getType())
- .dyn_cast<fir::SequenceType>();
+ fir::SequenceType maskSeq = mlir::dyn_cast<fir::SequenceType>(
+ hlfir::getFortranElementOrSequenceType(mask.getType()));
llvm::ArrayRef<int64_t> maskShape;
if (maskSeq)
@@ -720,9 +718,8 @@ verifyNumericalReductionOp(NumericalReductionOp reductionOp) {
mlir::Value array = reductionOp->getArray();
mlir::Value dim = reductionOp->getDim();
- fir::SequenceType arrayTy =
- hlfir::getFortranElementOrSequenceType(array.getType())
- .cast<fir::SequenceType>();
+ fir::SequenceType arrayTy = mlir::cast<fir::SequenceType>(
+ hlfir::getFortranElementOrSequenceType(array.getType()));
mlir::Type numTy = arrayTy.getEleTy();
llvm::ArrayRef<int64_t> arrayShape = arrayTy.getShape();
@@ -790,13 +787,12 @@ verifyCharacterReductionOp(CharacterReductionOp reductionOp) {
mlir::Value array = reductionOp->getArray();
mlir::Value dim = reductionOp->getDim();
- fir::SequenceType arrayTy =
- hlfir::getFortranElementOrSequenceType(array.getType())
- .cast<fir::SequenceType>();
+ fir::SequenceType arrayTy = mlir::cast<fir::SequenceType>(
+ hlfir::getFortranElementOrSequenceType(array.getType()));
mlir::Type numTy = arrayTy.getEleTy();
llvm::ArrayRef<int64_t> arrayShape = arrayTy.getShape();
- auto resultExpr = results[0].cast<hlfir::ExprType>();
+ auto resultExpr = mlir::cast<hlfir::ExprType>(results[0]);
mlir::Type resultType = resultExpr.getEleTy();
assert(mlir::isa<fir::CharacterType>(resultType) &&
"result must be character");
@@ -881,9 +877,8 @@ verifyResultForMinMaxLoc(NumericalReductionOp reductionOp) {
mlir::Value array = reductionOp->getArray();
mlir::Value dim = reductionOp->getDim();
- fir::SequenceType arrayTy =
- hlfir::getFortranElementOrSequenceType(array.getType())
- .cast<fir::SequenceType>();
+ fir::SequenceType arrayTy = mlir::cast<fir::SequenceType>(
+ hlfir::getFortranElementOrSequenceType(array.getType()));
llvm::ArrayRef<int64_t> arrayShape = arrayTy.getShape();
mlir::Type resultType = results[0];
@@ -993,12 +988,10 @@ void hlfir::SumOp::getEffects(
mlir::LogicalResult hlfir::DotProductOp::verify() {
mlir::Value lhs = getLhs();
mlir::Value rhs = getRhs();
- fir::SequenceType lhsTy =
- hlfir::getFortranElementOrSequenceType(lhs.getType())
- .cast<fir::SequenceType>();
- fir::SequenceType rhsTy =
- hlfir::getFortranElementOrSequenceType(rhs.getType())
- .cast<fir::SequenceType>();
+ fir::SequenceType lhsTy = mlir::cast<fir::SequenceType>(
+ hlfir::getFortranElementOrSequenceType(lhs.getType()));
+ fir::SequenceType rhsTy = mlir::cast<fir::SequenceType>(
+ hlfir::getFortranElementOrSequenceType(rhs.getType()));
llvm::ArrayRef<int64_t> lhsShape = lhsTy.getShape();
llvm::ArrayRef<int64_t> rhsShape = rhsTy.getShape();
std::size_t lhsRank = lhsShape.size();
@@ -1051,19 +1044,17 @@ void hlfir::DotProductOp::getEffects(
mlir::LogicalResult hlfir::MatmulOp::verify() {
mlir::Value lhs = getLhs();
mlir::Value rhs = getRhs();
- fir::SequenceType lhsTy =
- hlfir::getFortranElementOrSequenceType(lhs.getType())
- .cast<fir::SequenceType>();
- fir::SequenceType rhsTy =
- hlfir::getFortranElementOrSequenceType(rhs.getType())
- .cast<fir::SequenceType>();
+ fir::SequenceType lhsTy = mlir::cast<fir::SequenceType>(
+ hlfir::getFortranElementOrSequenceType(lhs.getType()));
+ fir::SequenceType rhsTy = mlir::cast<fir::SequenceType>(
+ hlfir::getFortranElementOrSequenceType(rhs.getType()));
llvm::ArrayRef<int64_t> lhsShape = lhsTy.getShape();
llvm::ArrayRef<int64_t> rhsShape = rhsTy.getShape();
std::size_t lhsRank = lhsShape.size();
std::size_t rhsRank = rhsShape.size();
mlir::Type lhsEleTy = lhsTy.getEleTy();
mlir::Type rhsEleTy = rhsTy.getEleTy();
- hlfir::ExprType resultTy = getResult().getType().cast<hlfir::ExprType>();
+ hlfir::ExprType resultTy = mlir::cast<hlfir::ExprType>(getResult().getType());
llvm::ArrayRef<int64_t> resultShape = resultTy.getShape();
mlir::Type resultEleTy = resultTy.getEleTy();
@@ -1180,13 +1171,12 @@ void hlfir::MatmulOp::getEffects(
mlir::LogicalResult hlfir::TransposeOp::verify() {
mlir::Value array = getArray();
- fir::SequenceType arrayTy =
- hlfir::getFortranElementOrSequenceType(array.getType())
- .cast<fir::SequenceType>();
+ fir::SequenceType arrayTy = mlir::cast<fir::SequenceType>(
+ hlfir::getFortranElementOrSequenceType(array.getType()));
llvm::ArrayRef<int64_t> inShape = arrayTy.getShape();
std::size_t rank = inShape.size();
mlir::Type eleTy = arrayTy.getEleTy();
- hlfir::ExprType resultTy = getResult().getType().cast<hlfir::ExprType>();
+ hlfir::ExprType resultTy = mlir::cast<hlfir::ExprType>(getResult().getType());
llvm::ArrayRef<int64_t> resultShape = resultTy.getShape();
std::size_t resultRank = resultShape.size();
mlir::Type resultEleTy = resultTy.getEleTy();
@@ -1224,19 +1214,17 @@ void hlfir::TransposeOp::getEffects(
mlir::LogicalResult hlfir::MatmulTransposeOp::verify() {
mlir::Value lhs = getLhs();
mlir::Value rhs = getRhs();
- fir::SequenceType lhsTy =
- hlfir::getFortranElementOrSequenceType(lhs.getType())
- .cast<fir::SequenceType>();
- fir::SequenceType rhsTy =
- hlfir::getFortranElementOrSequenceType(rhs.getType())
- .cast<fir::SequenceType>();
+ fir::SequenceType lhsTy = mlir::cast<fir::SequenceType>(
+ hlfir::getFortranElementOrSequenceType(lhs.getType()));
+ fir::SequenceType rhsTy = mlir::cast<fir::SequenceType>(
+ hlfir::getFortranElementOrSequenceType(rhs.getType()));
llvm::ArrayRef<int64_t> lhsShape = lhsTy.getShape();
llvm::ArrayRef<int64_t> rhsShape = rhsTy.getShape();
std::size_t lhsRank = lhsShape.size();
std::size_t rhsRank = rhsShape.size();
mlir::Type lhsEleTy = lhsTy.getEleTy();
mlir::Type rhsEleTy = rhsTy.getEleTy();
- hlfir::ExprType resultTy = getResult().getType().cast<hlfir::ExprType>();
+ hlfir::ExprType resultTy = mlir::cast<hlfir::ExprType>(getResult().getType());
llvm::ArrayRef<int64_t> resultShape = resultTy.getShape();
mlir::Type resultEleTy = resultTy.getEleTy();
@@ -1381,7 +1369,7 @@ void hlfir::AsExprOp::build(mlir::OpBuilder &builder,
hlfir::ExprType::Shape typeShape;
bool isPolymorphic = fir::isPolymorphicType(var.getType());
mlir::Type type = getFortranElementOrSequenceType(var.getType());
- if (auto seqType = type.dyn_cast<fir::SequenceType>()) {
+ if (auto seqType = mlir::dyn_cast<fir::SequenceType>(type)) {
typeShape.append(seqType.getShape().begin(), seqType.getShape().end());
type = seqType.getEleTy();
}
@@ -1427,7 +1415,7 @@ static void buildElemental(mlir::OpBuilder &builder,
isUnordered ? builder.getUnitAttr() : nullptr);
mlir::Region *bodyRegion = odsState.addRegion();
bodyRegion->push_back(new mlir::Block{});
- if (auto shapeType = shape.getType().dyn_cast<fir::ShapeType>()) {
+ if (auto shapeType = mlir::dyn_cast<fir::ShapeType>(shape.getType())) {
unsigned dim = shapeType.getRank();
mlir::Type indexType = builder.getIndexType();
for (unsigned d = 0; d < dim; ++d)
@@ -1468,7 +1456,7 @@ void hlfir::ApplyOp::build(mlir::OpBuilder &builder,
mlir::ValueRange indices,
mlir::ValueRange typeparams) {
mlir::Type resultType = expr.getType();
- if (auto exprType = resultType.dyn_cast<hlfir::ExprType>())
+ if (auto exprType = mlir::dyn_cast<hlfir::ExprType>(resultType))
resultType = exprType.getElementExprType();
build(builder, odsState, resultType, expr, indices, typeparams);
}
@@ -1517,20 +1505,20 @@ void hlfir::CopyInOp::build(mlir::OpBuilder &builder,
void hlfir::ShapeOfOp::build(mlir::OpBuilder &builder,
mlir::OperationState &result, mlir::Value expr) {
- hlfir::ExprType exprTy = expr.getType().cast<hlfir::ExprType>();
+ hlfir::ExprType exprTy = mlir::cast<hlfir::ExprType>(expr.getType());
mlir::Type type = fir::ShapeType::get(builder.getContext(), exprTy.getRank());
build(builder, result, type, expr);
}
std::size_t hlfir::ShapeOfOp::getRank() {
mlir::Type resTy = getResult().getType();
- fir::ShapeType shape = resTy.cast<fir::ShapeType>();
+ fir::ShapeType shape = mlir::cast<fir::ShapeType>(resTy);
return shape.getRank();
}
mlir::LogicalResult hlfir::ShapeOfOp::verify() {
mlir::Value expr = getExpr();
- hlfir::ExprType exprTy = expr.getType().cast<hlfir::ExprType>();
+ hlfir::ExprType exprTy = mlir::cast<hlfir::ExprType>(expr.getType());
std::size_t exprRank = exprTy.getShape().size();
if (exprRank == 0)
@@ -1549,7 +1537,8 @@ hlfir::ShapeOfOp::canonicalize(ShapeOfOp shapeOf,
// if extent information is available at compile time, immediately fold the
// hlfir.shape_of into a fir.shape
mlir::Location loc = shapeOf.getLoc();
- hlfir::ExprType expr = shapeOf.getExpr().getType().cast<hlfir::ExprType>();
+ hlfir::ExprType expr =
+ mlir::cast<hlfir::ExprType>(shapeOf.getExpr().getType());
mlir::Value shape = hlfir::genExprShape(rewriter, loc, expr);
if (!shape)
@@ -1574,7 +1563,7 @@ void hlfir::GetExtentOp::build(mlir::OpBuilder &builder,
}
mlir::LogicalResult hlfir::GetExtentOp::verify() {
- fir::ShapeType shapeTy = getShape().getType().cast<fir::ShapeType>();
+ fir::ShapeType shapeTy = mlir::cast<fir::ShapeType>(getShape().getType());
std::uint64_t rank = shapeTy.getRank();
llvm::APInt dim = getDim();
if (dim.sge(rank))
@@ -1709,10 +1698,11 @@ mlir::LogicalResult hlfir::ElementalAddrOp::verify() {
return emitOpError("body region must be terminated by an hlfir.yield");
mlir::Type elementAddrType = yieldOp.getEntity().getType();
if (!hlfir::isFortranVariableType(elementAddrType) ||
- hlfir::getFortranElementOrSequenceType(elementAddrType)
- .isa<fir::SequenceType>())
+ mlir::isa<fir::SequenceType>(
+ hlfir::getFortranElementOrSequenceType(elementAddrType)))
return emitOpError("body must compute the address of a scalar entity");
- unsigned shapeRank = getShape().getType().cast<fir::ShapeType>().getRank();
+ unsigned shapeRank =
+ mlir::cast<fir::ShapeType>(getShape().getType()).getRank();
if (shapeRank != getIndices().size())
return emitOpError("body number of indices must match shape rank");
return mlir::success();
@@ -1817,8 +1807,8 @@ static bool yieldsLogical(mlir::Region &region, bool mustBeScalarI1) {
if (mustBeScalarI1)
return hlfir::isI1Type(yieldType);
return hlfir::isMaskArgument(yieldType) &&
- hlfir::getFortranElementOrSequenceType(yieldType)
- .isa<fir::SequenceType>();
+ mlir::isa<fir::SequenceType>(
+ hlfir::getFortranElementOrSequenceType(yieldType));
}
mlir::LogicalResult hlfir::ForallMaskOp::verify() {
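
Where the hunks above switch to mlir::cast rather than mlir::dyn_cast, the type has already been established by a verifier; a sketch of the asserting form under that assumption (hypothetical helper, core MLIR types):

#include "mlir/IR/BuiltinTypes.h"

// Hypothetical helper: mlir::cast aborts in assertion-enabled builds if the
// cast fails, so it is reserved for types that were already validated.
static int64_t leadingExtent(mlir::Type ty) {
  auto tensorTy = mlir::cast<mlir::RankedTensorType>(ty);
  return tensorTy.getDimSize(0);
}
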
diff --git a/flang/lib/Optimizer/HLFIR/Transforms/BufferizeHLFIR.cpp b/flang/lib/Optimizer/HLFIR/Transforms/BufferizeHLFIR.cpp
index 1c4f82e2de81..d4e4835ee726 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/BufferizeHLFIR.cpp
+++ b/flang/lib/Optimizer/HLFIR/Transforms/BufferizeHLFIR.cpp
@@ -77,7 +77,7 @@ static mlir::Value packageBufferizedExpr(mlir::Location loc,
/// currently enforced by the verifiers that only accept HLFIR value or
/// variable types which do not include tuples.
static hlfir::Entity getBufferizedExprStorage(mlir::Value bufferizedExpr) {
- auto tupleType = bufferizedExpr.getType().dyn_cast<mlir::TupleType>();
+ auto tupleType = mlir::dyn_cast<mlir::TupleType>(bufferizedExpr.getType());
if (!tupleType)
return hlfir::Entity{bufferizedExpr};
assert(tupleType.size() == 2 && "unexpected tuple type");
@@ -90,7 +90,7 @@ static hlfir::Entity getBufferizedExprStorage(mlir::Value bufferizedExpr) {
/// Helper to extract the clean-up flag from a tuple created by
/// packageBufferizedExpr.
static mlir::Value getBufferizedExprMustFreeFlag(mlir::Value bufferizedExpr) {
- auto tupleType = bufferizedExpr.getType().dyn_cast<mlir::TupleType>();
+ auto tupleType = mlir::dyn_cast<mlir::TupleType>(bufferizedExpr.getType());
if (!tupleType)
return bufferizedExpr;
assert(tupleType.size() == 2 && "unexpected tuple type");
@@ -218,7 +218,7 @@ struct ShapeOfOpConversion
} else {
// everything else failed so try to create a shape from static type info
hlfir::ExprType exprTy =
- adaptor.getExpr().getType().dyn_cast_or_null<hlfir::ExprType>();
+ mlir::dyn_cast_or_null<hlfir::ExprType>(adaptor.getExpr().getType());
if (exprTy)
shape = hlfir::genExprShape(builder, loc, exprTy);
}
@@ -480,10 +480,10 @@ struct AssociateOpConversion
assert(mlir::isa<fir::ClassType>(sourceVar.getType()) &&
fir::isAllocatableType(sourceVar.getType()));
assert(sourceVar.getType() == assocType);
- } else if ((sourceVar.getType().isa<fir::BaseBoxType>() &&
- !assocType.isa<fir::BaseBoxType>()) ||
- ((sourceVar.getType().isa<fir::BoxCharType>() &&
- !assocType.isa<fir::BoxCharType>()))) {
+ } else if ((mlir::isa<fir::BaseBoxType>(sourceVar.getType()) &&
+ !mlir::isa<fir::BaseBoxType>(assocType)) ||
+ ((mlir::isa<fir::BoxCharType>(sourceVar.getType()) &&
+ !mlir::isa<fir::BoxCharType>(assocType)))) {
sourceVar = builder.create<fir::BoxAddrOp>(loc, assocType, sourceVar);
} else {
sourceVar = builder.createConvert(loc, assocType, sourceVar);
@@ -590,13 +590,13 @@ static void genBufferDestruction(mlir::Location loc, fir::FirOpBuilder &builder,
// for MERGE with polymorphic results.
if (mustFinalize)
TODO(loc, "finalizing polymorphic temporary in HLFIR");
- } else if (var.getType().isa<fir::BaseBoxType, fir::BoxCharType>()) {
+ } else if (mlir::isa<fir::BaseBoxType, fir::BoxCharType>(var.getType())) {
if (mustFinalize && !mlir::isa<fir::BaseBoxType>(var.getType()))
fir::emitFatalError(loc, "non-finalizable variable");
addr = builder.create<fir::BoxAddrOp>(loc, heapType, var);
} else {
- if (!var.getType().isa<fir::HeapType>())
+ if (!mlir::isa<fir::HeapType>(var.getType()))
addr = builder.create<fir::ConvertOp>(loc, heapType, var);
if (mustFinalize || deallocComponents) {
@@ -831,7 +831,7 @@ struct ElementalOpConversion
// the assign, insert an hlfir.destroy to mark the expression end-of-life.
// If the expression creation allocated a buffer on the heap inside the
// loop, this will ensure the buffer is properly deallocated.
- if (elementValue.getType().isa<hlfir::ExprType>() &&
+ if (mlir::isa<hlfir::ExprType>(elementValue.getType()) &&
wasCreatedInCurrentBlock(elementValue, builder))
builder.create<hlfir::DestroyOp>(loc, elementValue);
}
@@ -926,11 +926,12 @@ public:
hlfir::EndAssociateOp, hlfir::SetLengthOp>();
target.markUnknownOpDynamicallyLegal([](mlir::Operation *op) {
- return llvm::all_of(
- op->getResultTypes(),
- [](mlir::Type ty) { return !ty.isa<hlfir::ExprType>(); }) &&
+ return llvm::all_of(op->getResultTypes(),
+ [](mlir::Type ty) {
+ return !mlir::isa<hlfir::ExprType>(ty);
+ }) &&
llvm::all_of(op->getOperandTypes(), [](mlir::Type ty) {
- return !ty.isa<hlfir::ExprType>();
+ return !mlir::isa<hlfir::ExprType>(ty);
});
});
if (mlir::failed(
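
The legality predicate rewritten above composes the free-function isa with llvm::all_of; a standalone sketch of the same pattern (hypothetical helper):

#include "llvm/ADT/STLExtras.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/Operation.h"

// Hypothetical helper: an operation passes only if every result type
// satisfies the free-function isa check.
static bool hasOnlyIndexResults(mlir::Operation *op) {
  return llvm::all_of(op->getResultTypes(), [](mlir::Type ty) {
    return mlir::isa<mlir::IndexType>(ty);
  });
}
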
diff --git a/flang/lib/Optimizer/HLFIR/Transforms/ConvertToFIR.cpp b/flang/lib/Optimizer/HLFIR/Transforms/ConvertToFIR.cpp
index cd534bae4ad2..517285dce133 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/ConvertToFIR.cpp
+++ b/flang/lib/Optimizer/HLFIR/Transforms/ConvertToFIR.cpp
@@ -34,7 +34,7 @@ using namespace mlir;
static mlir::Value genAllocatableTempFromSourceBox(mlir::Location loc,
fir::FirOpBuilder &builder,
mlir::Value sourceBox) {
- assert(sourceBox.getType().isa<fir::BaseBoxType>() &&
+ assert(mlir::isa<fir::BaseBoxType>(sourceBox.getType()) &&
"must be a base box type");
// Use the runtime to make a quick and dirty temp with the rhs value.
// Overkill for scalar rhs that could be done in much more clever ways.
@@ -44,7 +44,7 @@ static mlir::Value genAllocatableTempFromSourceBox(mlir::Location loc,
// This has the huge benefit of dealing with all cases, including
// polymorphic entities.
mlir::Type fromHeapType = fir::HeapType::get(fir::unwrapRefType(
- sourceBox.getType().cast<fir::BaseBoxType>().getEleTy()));
+ mlir::cast<fir::BaseBoxType>(sourceBox.getType()).getEleTy()));
mlir::Type fromBoxHeapType = fir::BoxType::get(fromHeapType);
mlir::Value fromMutableBox =
fir::factory::genNullBoxStorage(builder, loc, fromBoxHeapType);
@@ -69,7 +69,7 @@ public:
auto module = assignOp->getParentOfType<mlir::ModuleOp>();
fir::FirOpBuilder builder(rewriter, module);
- if (rhs.getType().isa<hlfir::ExprType>()) {
+ if (mlir::isa<hlfir::ExprType>(rhs.getType())) {
mlir::emitError(loc, "hlfir must be bufferized with --bufferize-hlfir "
"pass before being converted to FIR");
return mlir::failure();
@@ -343,16 +343,15 @@ public:
auto firBase = firDeclareOp.getResult();
mlir::Value hlfirBase;
mlir::Type hlfirBaseType = declareOp.getBase().getType();
- if (hlfirBaseType.isa<fir::BaseBoxType>()) {
+ if (mlir::isa<fir::BaseBoxType>(hlfirBaseType)) {
fir::FirOpBuilder builder(rewriter, declareOp.getOperation());
// Helper to generate the hlfir fir.box with the local lower bounds and
// type parameters.
auto genHlfirBox = [&]() -> mlir::Value {
- if (!firBase.getType().isa<fir::BaseBoxType>()) {
+ if (!mlir::isa<fir::BaseBoxType>(firBase.getType())) {
llvm::SmallVector<mlir::Value> typeParams;
- auto maybeCharType =
- fir::unwrapSequenceType(fir::unwrapPassByRefType(hlfirBaseType))
- .dyn_cast<fir::CharacterType>();
+ auto maybeCharType = mlir::dyn_cast<fir::CharacterType>(
+ fir::unwrapSequenceType(fir::unwrapPassByRefType(hlfirBaseType)));
if (!maybeCharType || maybeCharType.hasDynamicLen())
typeParams.append(declareOp.getTypeparams().begin(),
declareOp.getTypeparams().end());
@@ -399,7 +398,7 @@ public:
})
.getResults()[0];
}
- } else if (hlfirBaseType.isa<fir::BoxCharType>()) {
+ } else if (mlir::isa<fir::BoxCharType>(hlfirBaseType)) {
assert(declareOp.getTypeparams().size() == 1 &&
"must contain character length");
hlfirBase = rewriter.create<fir::EmboxCharOp>(
@@ -480,11 +479,12 @@ public:
// - scalar%scalar_component [substring|complex_part] or
// - scalar%static_size_array_comp
// - scalar%array(indices) [substring| complex part]
- mlir::Type componentType = baseEleTy.cast<fir::RecordType>().getType(
- designate.getComponent().value());
+ mlir::Type componentType =
+ mlir::cast<fir::RecordType>(baseEleTy).getType(
+ designate.getComponent().value());
mlir::Type coorTy = fir::ReferenceType::get(componentType);
base = builder.create<fir::CoordinateOp>(loc, coorTy, base, fieldIndex);
- if (componentType.isa<fir::BaseBoxType>()) {
+ if (mlir::isa<fir::BaseBoxType>(componentType)) {
auto variableInterface = mlir::cast<fir::FortranVariableOpInterface>(
designate.getOperation());
if (variableInterface.isAllocatable() ||
@@ -500,14 +500,14 @@ public:
} else {
// array%component[(indices) substring|complex part] cases.
// Component ref of array bases are dealt with below in embox/rebox.
- assert(designateResultType.isa<fir::BaseBoxType>());
+ assert(mlir::isa<fir::BaseBoxType>(designateResultType));
}
}
- if (designateResultType.isa<fir::BaseBoxType>()) {
+ if (mlir::isa<fir::BaseBoxType>(designateResultType)) {
// Generate embox or rebox.
mlir::Type eleTy = fir::unwrapPassByRefType(designateResultType);
- bool isScalarDesignator = !eleTy.isa<fir::SequenceType>();
+ bool isScalarDesignator = !mlir::isa<fir::SequenceType>(eleTy);
mlir::Value sourceBox;
if (isScalarDesignator) {
// The base box will be used for emboxing the scalar element.
@@ -583,7 +583,7 @@ public:
assert(sliceFields.empty() && substring.empty());
llvm::SmallVector<mlir::Type> resultType{designateResultType};
mlir::Value resultBox;
- if (base.getType().isa<fir::BaseBoxType>())
+ if (mlir::isa<fir::BaseBoxType>(base.getType()))
resultBox =
builder.create<fir::ReboxOp>(loc, resultType, base, shape, slice);
else
@@ -598,7 +598,8 @@ public:
// first element of a contiguous array section with compile time constant
// shape. The base may be an array, or a scalar.
mlir::Type resultAddressType = designateResultType;
- if (auto boxCharType = designateResultType.dyn_cast<fir::BoxCharType>())
+ if (auto boxCharType =
+ mlir::dyn_cast<fir::BoxCharType>(designateResultType))
resultAddressType = fir::ReferenceType::get(boxCharType.getEleTy());
// Array element indexing.
@@ -620,7 +621,7 @@ public:
// Scalar complex part ref
if (designate.getComplexPart()) {
// Sequence types should have already been handled by this point
- assert(!designateResultType.isa<fir::SequenceType>());
+ assert(!mlir::isa<fir::SequenceType>(designateResultType));
auto index = builder.createIntegerConstant(loc, builder.getIndexType(),
*designate.getComplexPart());
auto coorTy = fir::ReferenceType::get(resultEleTy);
@@ -628,7 +629,7 @@ public:
}
// Cast/embox the computed scalar address if needed.
- if (designateResultType.isa<fir::BoxCharType>()) {
+ if (mlir::isa<fir::BoxCharType>(designateResultType)) {
assert(designate.getTypeparams().size() == 1 &&
"must have character length");
auto emboxChar = builder.create<fir::EmboxCharOp>(
@@ -671,13 +672,13 @@ public:
mlir::PatternRewriter &rewriter) const override {
mlir::Location loc = parentComponent.getLoc();
mlir::Type resultType = parentComponent.getType();
- if (!parentComponent.getType().isa<fir::BoxType>()) {
+ if (!mlir::isa<fir::BoxType>(parentComponent.getType())) {
mlir::Value baseAddr = parentComponent.getMemref();
// Scalar parent component ref without any length type parameters. The
// input may be a fir.class if it is polymorphic; since this is a scalar
// and the output will be monomorphic, the base address can be extracted
// from the fir.class.
- if (baseAddr.getType().isa<fir::BaseBoxType>())
+ if (mlir::isa<fir::BaseBoxType>(baseAddr.getType()))
baseAddr = rewriter.create<fir::BoxAddrOp>(loc, baseAddr);
rewriter.replaceOpWithNewOp<fir::ConvertOp>(parentComponent, resultType,
baseAddr);
@@ -686,7 +687,7 @@ public:
// Array parent component ref or PDTs.
hlfir::Entity base{parentComponent.getMemref()};
mlir::Value baseAddr = base.getBase();
- if (!baseAddr.getType().isa<fir::BaseBoxType>()) {
+ if (!mlir::isa<fir::BaseBoxType>(baseAddr.getType())) {
// Embox cannot directly be used to address parent components: it expects
// the output type to match the input type when there are no slices. When
// the types have at least one component, a slice to the first element can
@@ -748,7 +749,7 @@ public:
// the hlfir.shape_of operation which led to the creation of this get_extent
// operation should now have been lowered to a fir.shape operation
if (auto s = mlir::dyn_cast_or_null<fir::ShapeOp>(shapeOp)) {
- fir::ShapeType shapeTy = shape.getType().cast<fir::ShapeType>();
+ fir::ShapeType shapeTy = mlir::cast<fir::ShapeType>(shape.getType());
llvm::APInt dim = getExtentOp.getDim();
uint64_t dimVal = dim.getLimitedValue(shapeTy.getRank());
mlir::Value extent = s.getExtents()[dimVal];
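
Nearly every hunk in this patch applies the same mechanical migration: the member-function casts on mlir::Type and mlir::Attribute (value.isa<T>(), value.cast<T>(), value.dyn_cast<T>()) are deprecated upstream in favor of the free functions mlir::isa/mlir::cast/mlir::dyn_cast. A minimal before/after sketch; the helper name is illustrative, not from this patch:

    #include "mlir/IR/BuiltinTypes.h"

    bool isWideInteger(mlir::Type ty) {
      // Before: ty.isa<mlir::IntegerType>() &&
      //         ty.cast<mlir::IntegerType>().getWidth() > 32
      // After, the free-function style used throughout this patch:
      if (auto intTy = mlir::dyn_cast<mlir::IntegerType>(ty))
        return intTy.getWidth() > 32;
      return false;
    }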
diff --git a/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIRIntrinsics.cpp b/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIRIntrinsics.cpp
index 0142fb0cfb0b..e9dbb7095d0e 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIRIntrinsics.cpp
+++ b/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIRIntrinsics.cpp
@@ -185,7 +185,7 @@ protected:
// the width for use in runtime intrinsic calls.
static unsigned getKindForType(mlir::Type ty) {
mlir::Type eltty = hlfir::getFortranElementType(ty);
- unsigned width = eltty.cast<mlir::IntegerType>().getWidth();
+ unsigned width = mlir::cast<mlir::IntegerType>(eltty).getWidth();
return width / 8;
}
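
As a worked example for getKindForType above: INTEGER(4) lowers to an i32 element type, so getWidth() returns 32 and the function yields 32 / 8 == 4, the Fortran kind in bytes. A sketch, assuming only an MLIRContext:

    #include "mlir/IR/BuiltinTypes.h"
    #include "mlir/IR/MLIRContext.h"

    unsigned kindOfInteger4() {
      mlir::MLIRContext ctx;
      mlir::Type ty = mlir::IntegerType::get(&ctx, 32);
      return mlir::cast<mlir::IntegerType>(ty).getWidth() / 8; // == 4
    }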
diff --git a/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIROrderedAssignments.cpp b/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIROrderedAssignments.cpp
index 84101353a740..63b52c0cd0bc 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIROrderedAssignments.cpp
+++ b/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIROrderedAssignments.cpp
@@ -1090,7 +1090,7 @@ void OrderedAssignmentRewriter::generateSaveEntity(
mlir::Value loopExtent =
computeLoopNestIterationNumber(loc, builder, loopNest);
auto sequenceType =
- builder.getVarLenSeqTy(entityType).cast<fir::SequenceType>();
+ mlir::cast<fir::SequenceType>(builder.getVarLenSeqTy(entityType));
temp = insertSavedEntity(region,
fir::factory::HomogeneousScalarStack{
loc, builder, sequenceType, loopExtent,
diff --git a/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp b/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp
index 685c73d67625..8d68c7021608 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp
+++ b/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp
@@ -249,7 +249,7 @@ static bool areIdenticalOrDisjointSlices(mlir::Value ref1, mlir::Value ref2) {
auto isPositiveConstant = [](mlir::Value v) -> bool {
if (auto conOp =
mlir::dyn_cast<mlir::arith::ConstantOp>(v.getDefiningOp()))
- if (auto iattr = conOp.getValue().dyn_cast<mlir::IntegerAttr>())
+ if (auto iattr = mlir::dyn_cast<mlir::IntegerAttr>(conOp.getValue()))
return iattr.getInt() > 0;
return false;
};
@@ -601,7 +601,7 @@ mlir::LogicalResult VariableAssignBufferization::matchAndRewrite(
// TODO: ExprType check is here to avoid conflicts with
// ElementalAssignBufferization pattern. We need to combine
// these matchers into a single one that applies to AssignOp.
- if (rhs.getType().isa<hlfir::ExprType>())
+ if (mlir::isa<hlfir::ExprType>(rhs.getType()))
return rewriter.notifyMatchFailure(assign, "RHS is not in memory");
if (!rhs.isArray())
@@ -834,7 +834,7 @@ public:
unsigned rank = mlir::cast<hlfir::ExprType>(mloc.getType()).getShape()[0];
mlir::Type arrayType = array.getType();
- if (!arrayType.isa<fir::BoxType>())
+ if (!mlir::isa<fir::BoxType>(arrayType))
return rewriter.notifyMatchFailure(
mloc, "Currently requires a boxed type input");
mlir::Type elementType = hlfir::getFortranElementType(arrayType);
@@ -850,7 +850,7 @@ public:
auto init = [isMax](fir::FirOpBuilder builder, mlir::Location loc,
mlir::Type elementType) {
- if (auto ty = elementType.dyn_cast<mlir::FloatType>()) {
+ if (auto ty = mlir::dyn_cast<mlir::FloatType>(elementType)) {
const llvm::fltSemantics &sem = ty.getFloatSemantics();
llvm::APFloat limit = llvm::APFloat::getInf(sem, /*Negative=*/isMax);
return builder.createRealConstant(loc, elementType, limit);
@@ -901,7 +901,7 @@ public:
// Compare with the max reduction value
mlir::Value cmp;
- if (elementType.isa<mlir::FloatType>()) {
+ if (mlir::isa<mlir::FloatType>(elementType)) {
// For FP reductions we want the first smallest value to be used, that
// is not NaN. An OGT/OLT condition will usually work for this unless all
// the values are NaN or Inf. This follows the same logic as
@@ -918,7 +918,7 @@ public:
loc, mlir::arith::CmpFPredicate::OEQ, elem, elem);
cmpNan = builder.create<mlir::arith::AndIOp>(loc, cmpNan, cmpNan2);
cmp = builder.create<mlir::arith::OrIOp>(loc, cmp, cmpNan);
- } else if (elementType.isa<mlir::IntegerType>()) {
+ } else if (mlir::isa<mlir::IntegerType>(elementType)) {
cmp = builder.create<mlir::arith::CmpIOp>(
loc,
isMax ? mlir::arith::CmpIPredicate::sgt
diff --git a/flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp b/flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp
index 2751575ce982..b761563eba0f 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp
+++ b/flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp
@@ -103,7 +103,8 @@ public:
// by hlfir.elemental)
target.addDynamicallyLegalOp<hlfir::TransposeOp>(
[](hlfir::TransposeOp transpose) {
- return transpose.getType().cast<hlfir::ExprType>().isPolymorphic();
+ return mlir::cast<hlfir::ExprType>(transpose.getType())
+ .isPolymorphic();
});
target.markUnknownOpDynamicallyLegal(
[](mlir::Operation *) { return true; });
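
The hunk above tweaks the dynamic-legality callback of a dialect-conversion target: an op is declared legal exactly when no rewrite should fire for it, and everything else is left alone. A sketch of the surrounding driver boilerplate under that convention; MyTransposeLowering is a hypothetical pattern:

    #include "mlir/Transforms/DialectConversion.h"

    mlir::LogicalResult lowerTransposes(mlir::Operation *root,
                                        mlir::MLIRContext &ctx) {
      mlir::ConversionTarget target(ctx);
      target.addDynamicallyLegalOp<hlfir::TransposeOp>(
          [](hlfir::TransposeOp op) {
            // Polymorphic results stay legal, i.e. are not rewritten here.
            return mlir::cast<hlfir::ExprType>(op.getType()).isPolymorphic();
          });
      target.markUnknownOpDynamicallyLegal(
          [](mlir::Operation *) { return true; });
      mlir::RewritePatternSet patterns(&ctx);
      patterns.insert<MyTransposeLowering>(&ctx); // hypothetical pattern
      return mlir::applyFullConversion(root, target, std::move(patterns));
    }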
diff --git a/flang/lib/Optimizer/Transforms/AbstractResult.cpp b/flang/lib/Optimizer/Transforms/AbstractResult.cpp
index eb4dd637bb16..85472cdc5103 100644
--- a/flang/lib/Optimizer/Transforms/AbstractResult.cpp
+++ b/flang/lib/Optimizer/Transforms/AbstractResult.cpp
@@ -65,14 +65,14 @@ static mlir::FunctionType getCPtrFunctionType(mlir::FunctionType funcTy) {
auto resultType = funcTy.getResult(0);
assert(fir::isa_builtin_cptr_type(resultType));
llvm::SmallVector<mlir::Type> outputTypes;
- auto recTy = resultType.dyn_cast<fir::RecordType>();
+ auto recTy = mlir::dyn_cast<fir::RecordType>(resultType);
outputTypes.emplace_back(recTy.getTypeList()[0].second);
return mlir::FunctionType::get(funcTy.getContext(), funcTy.getInputs(),
outputTypes);
}
static bool mustEmboxResult(mlir::Type resultType, bool shouldBoxResult) {
- return resultType.isa<fir::SequenceType, fir::RecordType>() &&
+ return mlir::isa<fir::SequenceType, fir::RecordType>(resultType) &&
shouldBoxResult;
}
@@ -114,7 +114,7 @@ public:
bool isResultBuiltinCPtr = fir::isa_builtin_cptr_type(result.getType());
Op newOp;
if (isResultBuiltinCPtr) {
- auto recTy = result.getType().template dyn_cast<fir::RecordType>();
+ auto recTy = mlir::dyn_cast<fir::RecordType>(result.getType());
newResultTypes.emplace_back(recTy.getTypeList()[0].second);
}
@@ -261,7 +261,7 @@ public:
mlir::LogicalResult
matchAndRewrite(fir::AddrOfOp addrOf,
mlir::PatternRewriter &rewriter) const override {
- auto oldFuncTy = addrOf.getType().cast<mlir::FunctionType>();
+ auto oldFuncTy = mlir::cast<mlir::FunctionType>(addrOf.getType());
mlir::FunctionType newFuncTy;
// TODO: This should be generalized for derived types, and it is
// architecture and OS dependent.
@@ -296,7 +296,7 @@ public:
auto loc = func.getLoc();
auto *context = &getContext();
// Convert function type itself if it has an abstract result.
- auto funcTy = func.getFunctionType().cast<mlir::FunctionType>();
+ auto funcTy = mlir::cast<mlir::FunctionType>(func.getFunctionType());
if (hasAbstractResult(funcTy)) {
// TODO: This should be generalized for derived types, and it is
// architecture and OS dependent.
@@ -343,11 +343,11 @@ public:
return mlir::TypeSwitch<mlir::Type, bool>(type)
.Case([](fir::BoxProcType boxProc) {
return fir::hasAbstractResult(
- boxProc.getEleTy().cast<mlir::FunctionType>());
+ mlir::cast<mlir::FunctionType>(boxProc.getEleTy()));
})
.Case([](fir::PointerType pointer) {
return fir::hasAbstractResult(
- pointer.getEleTy().cast<mlir::FunctionType>());
+ mlir::cast<mlir::FunctionType>(pointer.getEleTy()));
})
.Default([](auto &&) { return false; });
}
@@ -411,7 +411,7 @@ public:
return !hasAbstractResult(call.getFunctionType());
});
target.addDynamicallyLegalOp<fir::AddrOfOp>([](fir::AddrOfOp addrOf) {
- if (auto funTy = addrOf.getType().dyn_cast<mlir::FunctionType>())
+ if (auto funTy = mlir::dyn_cast<mlir::FunctionType>(addrOf.getType()))
return !hasAbstractResult(funTy);
return true;
});
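
hasAbstractResult above dispatches on the concrete type with llvm::TypeSwitch, where each Case deduces its type from the lambda parameter. A self-contained sketch of the idiom; describe is an illustrative helper, not part of the patch:

    #include "llvm/ADT/TypeSwitch.h"
    #include "mlir/IR/BuiltinTypes.h"

    llvm::StringRef describe(mlir::Type type) {
      return llvm::TypeSwitch<mlir::Type, llvm::StringRef>(type)
          .Case([](mlir::FunctionType) { return "function"; })
          .Case([](mlir::IntegerType) { return "integer"; })
          .Default([](mlir::Type) { return "something else"; });
    }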
diff --git a/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp b/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp
index 68584bef055b..18d98a11ef3c 100644
--- a/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp
+++ b/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp
@@ -69,7 +69,7 @@ void AddDebugInfoPass::runOnOperation() {
// In that case, 'inputFilename' may be empty. Location embedded in the
// module will be used to get file name and its directory.
if (inputFilename.empty()) {
- if (auto fileLoc = module.getLoc().dyn_cast<mlir::FileLineColLoc>()) {
+ if (auto fileLoc = mlir::dyn_cast<mlir::FileLineColLoc>(module.getLoc())) {
fileName = llvm::sys::path::filename(fileLoc.getFilename().getValue());
filePath = llvm::sys::path::parent_path(fileLoc.getFilename().getValue());
} else
@@ -94,14 +94,14 @@ void AddDebugInfoPass::runOnOperation() {
mlir::Location l = funcOp->getLoc();
// If a fused location has already been created then there is nothing to do.
// Otherwise, create a fused location.
- if (l.dyn_cast<mlir::FusedLoc>())
+ if (mlir::dyn_cast<mlir::FusedLoc>(l))
return;
unsigned int CC = (funcOp.getName() == fir::NameUniquer::doProgramEntry())
? llvm::dwarf::getCallingConvention("DW_CC_program")
: llvm::dwarf::getCallingConvention("DW_CC_normal");
- if (auto funcLoc = l.dyn_cast<mlir::FileLineColLoc>()) {
+ if (auto funcLoc = mlir::dyn_cast<mlir::FileLineColLoc>(l)) {
fileName = llvm::sys::path::filename(funcLoc.getFilename().getValue());
filePath = llvm::sys::path::parent_path(funcLoc.getFilename().getValue());
}
diff --git a/flang/lib/Optimizer/Transforms/AffineDemotion.cpp b/flang/lib/Optimizer/Transforms/AffineDemotion.cpp
index da29ae880700..b4523a060f5a 100644
--- a/flang/lib/Optimizer/Transforms/AffineDemotion.cpp
+++ b/flang/lib/Optimizer/Transforms/AffineDemotion.cpp
@@ -98,14 +98,15 @@ public:
mlir::LogicalResult
matchAndRewrite(fir::ConvertOp op,
mlir::PatternRewriter &rewriter) const override {
- if (op.getRes().getType().isa<mlir::MemRefType>()) {
+ if (mlir::isa<mlir::MemRefType>(op.getRes().getType())) {
// due to index calculation moving to affine maps, we still need to
// add converts for sequence types; this has a side effect of losing
// some information about arrays with known dimensions by creating:
// fir.convert %arg0 : (!fir.ref<!fir.array<5xi32>>) ->
// !fir.ref<!fir.array<?xi32>>
- if (auto refTy = op.getValue().getType().dyn_cast<fir::ReferenceType>())
- if (auto arrTy = refTy.getEleTy().dyn_cast<fir::SequenceType>()) {
+ if (auto refTy =
+ mlir::dyn_cast<fir::ReferenceType>(op.getValue().getType()))
+ if (auto arrTy = mlir::dyn_cast<fir::SequenceType>(refTy.getEleTy())) {
fir::SequenceType::Shape flatShape = {
fir::SequenceType::getUnknownExtent()};
auto flatArrTy = fir::SequenceType::get(flatShape, arrTy.getEleTy());
@@ -158,7 +159,7 @@ public:
mlir::ConversionTarget target(*context);
target.addIllegalOp<memref::AllocOp>();
target.addDynamicallyLegalOp<fir::ConvertOp>([](fir::ConvertOp op) {
- if (op.getRes().getType().isa<mlir::MemRefType>())
+ if (mlir::isa<mlir::MemRefType>(op.getRes().getType()))
return false;
return true;
});
diff --git a/flang/lib/Optimizer/Transforms/AffinePromotion.cpp b/flang/lib/Optimizer/Transforms/AffinePromotion.cpp
index 64531cb1868e..7d0131ac6fa4 100644
--- a/flang/lib/Optimizer/Transforms/AffinePromotion.cpp
+++ b/flang/lib/Optimizer/Transforms/AffinePromotion.cpp
@@ -111,7 +111,7 @@ private:
bool analyzeReference(mlir::Value memref, mlir::Operation *op) {
if (auto acoOp = memref.getDefiningOp<ArrayCoorOp>()) {
- if (acoOp.getMemref().getType().isa<fir::BoxType>()) {
+ if (mlir::isa<fir::BoxType>(acoOp.getMemref().getType())) {
// TODO: Look if and how fir.box can be promoted to affine.
LLVM_DEBUG(llvm::dbgs() << "AffineLoopAnalysis: cannot promote loop, "
"array memory operation uses fir.box\n";
@@ -222,7 +222,7 @@ private:
return affineBinaryOp(mlir::AffineExprKind::Mod, op.getLhs(),
op.getRhs());
if (auto op = value.getDefiningOp<mlir::arith::ConstantOp>())
- if (auto intConstant = op.getValue().dyn_cast<IntegerAttr>())
+ if (auto intConstant = mlir::dyn_cast<IntegerAttr>(op.getValue()))
return toAffineExpr(intConstant.getInt());
if (auto blockArg = mlir::dyn_cast<mlir::BlockArgument>(value)) {
affineArgs.push_back(value);
@@ -331,15 +331,16 @@ static mlir::AffineMap createArrayIndexAffineMap(unsigned dimensions,
static std::optional<int64_t> constantIntegerLike(const mlir::Value value) {
if (auto definition = value.getDefiningOp<mlir::arith::ConstantOp>())
- if (auto stepAttr = definition.getValue().dyn_cast<IntegerAttr>())
+ if (auto stepAttr = mlir::dyn_cast<IntegerAttr>(definition.getValue()))
return stepAttr.getInt();
return {};
}
static mlir::Type coordinateArrayElement(fir::ArrayCoorOp op) {
if (auto refType =
- op.getMemref().getType().dyn_cast_or_null<ReferenceType>()) {
- if (auto seqType = refType.getEleTy().dyn_cast_or_null<SequenceType>()) {
+ mlir::dyn_cast_or_null<ReferenceType>(op.getMemref().getType())) {
+ if (auto seqType =
+ mlir::dyn_cast_or_null<SequenceType>(refType.getEleTy())) {
return seqType.getEleTy();
}
}
diff --git a/flang/lib/Optimizer/Transforms/ArrayValueCopy.cpp b/flang/lib/Optimizer/Transforms/ArrayValueCopy.cpp
index a08d58383d3a..ebc186222525 100644
--- a/flang/lib/Optimizer/Transforms/ArrayValueCopy.cpp
+++ b/flang/lib/Optimizer/Transforms/ArrayValueCopy.cpp
@@ -461,9 +461,9 @@ void ArrayCopyAnalysisBase::arrayMentions(
}
static bool hasPointerType(mlir::Type type) {
- if (auto boxTy = type.dyn_cast<BoxType>())
+ if (auto boxTy = mlir::dyn_cast<BoxType>(type))
type = boxTy.getEleTy();
- return type.isa<fir::PointerType>();
+ return mlir::isa<fir::PointerType>(type);
}
// This is an NF performance hack. It makes a simple test that the slices of the
@@ -512,7 +512,7 @@ static bool mutuallyExclusiveSliceRange(ArrayLoadOp ld, ArrayMergeStoreOp st) {
auto isPositiveConstant = [](mlir::Value v) -> bool {
if (auto conOp =
mlir::dyn_cast<mlir::arith::ConstantOp>(v.getDefiningOp()))
- if (auto iattr = conOp.getValue().dyn_cast<mlir::IntegerAttr>())
+ if (auto iattr = mlir::dyn_cast<mlir::IntegerAttr>(conOp.getValue()))
return iattr.getInt() > 0;
return false;
};
@@ -725,8 +725,8 @@ static bool
conservativeCallConflict(llvm::ArrayRef<mlir::Operation *> reaches) {
return llvm::any_of(reaches, [](mlir::Operation *op) {
if (auto call = mlir::dyn_cast<fir::CallOp>(op))
- if (auto callee =
- call.getCallableForCallee().dyn_cast<mlir::SymbolRefAttr>()) {
+ if (auto callee = mlir::dyn_cast<mlir::SymbolRefAttr>(
+ call.getCallableForCallee())) {
auto module = op->getParentOfType<mlir::ModuleOp>();
return isInternalProcedure(
module.lookupSymbol<mlir::func::FuncOp>(callee));
@@ -891,9 +891,9 @@ static mlir::Value getOrReadExtentsAndShapeOp(
if (arrLoad->hasAttr(fir::getOptionalAttrName()))
fir::emitFatalError(
loc, "shapes from array load of OPTIONAL arrays must not be used");
- if (auto boxTy = arrLoad.getMemref().getType().dyn_cast<BoxType>()) {
+ if (auto boxTy = mlir::dyn_cast<BoxType>(arrLoad.getMemref().getType())) {
auto rank =
- dyn_cast_ptrOrBoxEleTy(boxTy).cast<SequenceType>().getDimension();
+ mlir::cast<SequenceType>(dyn_cast_ptrOrBoxEleTy(boxTy)).getDimension();
auto idxTy = rewriter.getIndexType();
for (decltype(rank) dim = 0; dim < rank; ++dim) {
auto dimVal = rewriter.create<mlir::arith::ConstantIndexOp>(loc, dim);
@@ -929,7 +929,7 @@ static mlir::Type toRefType(mlir::Type ty) {
static llvm::SmallVector<mlir::Value>
getTypeParamsIfRawData(mlir::Location loc, FirOpBuilder &builder,
ArrayLoadOp arrLoad, mlir::Type ty) {
- if (ty.isa<BoxType>())
+ if (mlir::isa<BoxType>(ty))
return {};
return fir::factory::getTypeParams(loc, builder, arrLoad);
}
@@ -947,8 +947,8 @@ static mlir::Value genCoorOp(mlir::PatternRewriter &rewriter,
originated = factory::originateIndices(loc, rewriter, alloc.getType(),
shape, indices);
auto seqTy = dyn_cast_ptrOrBoxEleTy(alloc.getType());
- assert(seqTy && seqTy.isa<SequenceType>());
- const auto dimension = seqTy.cast<SequenceType>().getDimension();
+ assert(seqTy && mlir::isa<SequenceType>(seqTy));
+ const auto dimension = mlir::cast<SequenceType>(seqTy).getDimension();
auto module = load->getParentOfType<mlir::ModuleOp>();
FirOpBuilder builder(rewriter, module);
auto typeparams = getTypeParamsIfRawData(loc, builder, load, alloc.getType());
@@ -967,7 +967,7 @@ static mlir::Value getCharacterLen(mlir::Location loc, FirOpBuilder &builder,
ArrayLoadOp load, CharacterType charTy) {
auto charLenTy = builder.getCharacterLengthType();
if (charTy.hasDynamicLen()) {
- if (load.getMemref().getType().isa<BoxType>()) {
+ if (mlir::isa<BoxType>(load.getMemref().getType())) {
// The loaded array is an emboxed value. Get the CHARACTER length from
// the box value.
auto eleSzInBytes =
@@ -1027,7 +1027,7 @@ void genArrayCopy(mlir::Location loc, mlir::PatternRewriter &rewriter,
getTypeParamsIfRawData(loc, builder, arrLoad, dst.getType()));
auto eleTy = unwrapSequenceType(unwrapPassByRefType(dst.getType()));
// Copy from (to) object to (from) temp copy of same object.
- if (auto charTy = eleTy.dyn_cast<CharacterType>()) {
+ if (auto charTy = mlir::dyn_cast<CharacterType>(eleTy)) {
auto len = getCharacterLen(loc, builder, arrLoad, charTy);
CharBoxValue toChar(toAddr, len);
CharBoxValue fromChar(fromAddr, len);
@@ -1049,8 +1049,8 @@ genArrayLoadTypeParameters(mlir::Location loc, mlir::PatternRewriter &rewriter,
auto eleTy =
unwrapSequenceType(unwrapPassByRefType(load.getMemref().getType()));
if (hasDynamicSize(eleTy)) {
- if (auto charTy = eleTy.dyn_cast<CharacterType>()) {
- assert(load.getMemref().getType().isa<BoxType>());
+ if (auto charTy = mlir::dyn_cast<CharacterType>(eleTy)) {
+ assert(mlir::isa<BoxType>(load.getMemref().getType()));
auto module = load->getParentOfType<mlir::ModuleOp>();
FirOpBuilder builder(rewriter, module);
return {getCharacterLen(loc, builder, load, charTy)};
@@ -1067,7 +1067,7 @@ findNonconstantExtents(mlir::Type memrefTy,
llvm::ArrayRef<mlir::Value> extents) {
llvm::SmallVector<mlir::Value> nce;
auto arrTy = unwrapPassByRefType(memrefTy);
- auto seqTy = arrTy.cast<SequenceType>();
+ auto seqTy = mlir::cast<SequenceType>(arrTy);
for (auto [s, x] : llvm::zip(seqTy.getShape(), extents))
if (s == SequenceType::getUnknownExtent())
nce.emplace_back(x);
diff --git a/flang/lib/Optimizer/Transforms/CharacterConversion.cpp b/flang/lib/Optimizer/Transforms/CharacterConversion.cpp
index 87ea72dbca9b..44baad73aa25 100644
--- a/flang/lib/Optimizer/Transforms/CharacterConversion.cpp
+++ b/flang/lib/Optimizer/Transforms/CharacterConversion.cpp
@@ -60,8 +60,8 @@ public:
// For each code point in the `from` string, convert naively to the `to`
// string code point. Conversion is done blindly on size only, not value.
auto getCharBits = [&](mlir::Type t) {
- auto chrTy = fir::unwrapSequenceType(fir::dyn_cast_ptrEleTy(t))
- .cast<fir::CharacterType>();
+ auto chrTy = mlir::cast<fir::CharacterType>(
+ fir::unwrapSequenceType(fir::dyn_cast_ptrEleTy(t)));
return kindMap.getCharacterBitsize(chrTy.getFKind());
};
auto fromBits = getCharBits(conv.getFrom().getType());
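
As the comment above says, this conversion is done blindly on size only, not value: each code unit is integrally widened or truncated, with no character-set remapping. A plain-C++ sketch of that behavior; the function name is illustrative:

    #include <vector>

    std::vector<char32_t> widenBlindly(const std::vector<char> &from) {
      std::vector<char32_t> to;
      to.reserve(from.size());
      for (char c : from) // size change only, values are not remapped
        to.push_back(static_cast<char32_t>(static_cast<unsigned char>(c)));
      return to;
    }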
diff --git a/flang/lib/Optimizer/Transforms/LoopVersioning.cpp b/flang/lib/Optimizer/Transforms/LoopVersioning.cpp
index 7cbd2dd1f897..38cdc2b1388d 100644
--- a/flang/lib/Optimizer/Transforms/LoopVersioning.cpp
+++ b/flang/lib/Optimizer/Transforms/LoopVersioning.cpp
@@ -147,7 +147,7 @@ struct ArgsUsageInLoop {
static fir::SequenceType getAsSequenceType(mlir::Value *v) {
mlir::Type argTy = fir::unwrapPassByRefType(fir::unwrapRefType(v->getType()));
- return argTy.dyn_cast<fir::SequenceType>();
+ return mlir::dyn_cast<fir::SequenceType>(argTy);
}
/// if a value comes from a fir.declare, follow it to the original source,
@@ -556,7 +556,3 @@ void LoopVersioningPass::runOnOperation() {
LLVM_DEBUG(llvm::dbgs() << "=== End " DEBUG_TYPE " ===\n");
}
-
-std::unique_ptr<mlir::Pass> fir::createLoopVersioningPass() {
- return std::make_unique<LoopVersioningPass>();
-}
diff --git a/flang/lib/Optimizer/Transforms/MemoryAllocation.cpp b/flang/lib/Optimizer/Transforms/MemoryAllocation.cpp
index 166a6b10def2..ada67b4201e1 100644
--- a/flang/lib/Optimizer/Transforms/MemoryAllocation.cpp
+++ b/flang/lib/Optimizer/Transforms/MemoryAllocation.cpp
@@ -28,17 +28,6 @@ namespace fir {
static constexpr std::size_t unlimitedArraySize = ~static_cast<std::size_t>(0);
namespace {
-struct MemoryAllocationOptions {
- // Always move dynamic array allocations to the heap. This may result in more
- // heap fragmentation, so may impact performance negatively.
- bool dynamicArrayOnHeap = false;
-
- // Number of elements in array threshold for moving to heap. In environments
- // with limited stack size, moving large arrays to the heap can avoid running
- // out of stack space.
- std::size_t maxStackArraySize = unlimitedArraySize;
-};
-
class ReturnAnalysis {
public:
MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(ReturnAnalysis)
@@ -68,14 +57,15 @@ private:
/// Return `true` if this allocation is to remain on the stack (`fir.alloca`).
/// Otherwise the allocation should be moved to the heap (`fir.allocmem`).
-static inline bool keepStackAllocation(fir::AllocaOp alloca, mlir::Block *entry,
- const MemoryAllocationOptions &options) {
+static inline bool
+keepStackAllocation(fir::AllocaOp alloca, mlir::Block *entry,
+ const fir::MemoryAllocationOptOptions &options) {
// Limitation: only arrays allocated on the stack in the entry block are
// considered for now.
// TODO: Generalize the algorithm and placement of the freemem nodes.
if (alloca->getBlock() != entry)
return true;
- if (auto seqTy = alloca.getInType().dyn_cast<fir::SequenceType>()) {
+ if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(alloca.getInType())) {
if (fir::hasDynamicSize(seqTy)) {
// Move all arrays with runtime determined size to the heap.
if (options.dynamicArrayOnHeap)
@@ -168,6 +158,9 @@ public:
options = {dynOnHeap, maxStackSize};
}
+ MemoryAllocationOpt(const fir::MemoryAllocationOptOptions &options)
+ : options{options} {}
+
/// Override `options` if command-line options have been set.
inline void useCommandLineOptions() {
if (dynamicArrayOnHeap)
@@ -211,15 +204,6 @@ public:
}
private:
- MemoryAllocationOptions options;
+ fir::MemoryAllocationOptOptions options;
};
} // namespace
-
-std::unique_ptr<mlir::Pass> fir::createMemoryAllocationPass() {
- return std::make_unique<MemoryAllocationOpt>();
-}
-
-std::unique_ptr<mlir::Pass>
-fir::createMemoryAllocationPass(bool dynOnHeap, std::size_t maxStackSize) {
- return std::make_unique<MemoryAllocationOpt>(dynOnHeap, maxStackSize);
-}
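
With the hand-written createMemoryAllocationPass factories removed, a pipeline presumably constructs the pass through its options struct instead. A sketch under that assumption: the factory name below is assumed to be the tablegen-generated one, while the field names come from the aggregate initialization visible above:

    fir::MemoryAllocationOptOptions options;
    options.dynamicArrayOnHeap = true; // accept heap fragmentation to save stack
    options.maxStackArraySize = 1024;  // arrays above this many elements go to the heap
    pm.addPass(fir::createMemoryAllocationOpt(options)); // pm: an mlir::PassManager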
diff --git a/flang/lib/Optimizer/Transforms/PolymorphicOpConversion.cpp b/flang/lib/Optimizer/Transforms/PolymorphicOpConversion.cpp
index 93efea434cb1..d933dc58f375 100644
--- a/flang/lib/Optimizer/Transforms/PolymorphicOpConversion.cpp
+++ b/flang/lib/Optimizer/Transforms/PolymorphicOpConversion.cpp
@@ -97,8 +97,8 @@ struct DispatchOpConv : public OpConversionPattern<fir::DispatchOp> {
// Get derived type information.
mlir::Type declaredType =
fir::getDerivedType(dispatch.getObject().getType().getEleTy());
- assert(declaredType.isa<fir::RecordType>() && "expecting fir.type");
- auto recordType = declaredType.dyn_cast<fir::RecordType>();
+ assert(mlir::isa<fir::RecordType>(declaredType) && "expecting fir.type");
+ auto recordType = mlir::dyn_cast<fir::RecordType>(declaredType);
// Lookup for the binding table.
auto bindingsIter = bindingTables.find(recordType.getName());
@@ -157,7 +157,7 @@ struct DispatchOpConv : public OpConversionPattern<fir::DispatchOp> {
// Load the bindings descriptor.
auto bindingsCompName = Fortran::semantics::bindingDescCompName;
- fir::RecordType typeDescRecTy = typeDescTy.cast<fir::RecordType>();
+ fir::RecordType typeDescRecTy = mlir::cast<fir::RecordType>(typeDescTy);
mlir::Value field = rewriter.create<fir::FieldIndexOp>(
loc, fieldTy, bindingsCompName, typeDescRecTy, mlir::ValueRange{});
mlir::Type coorTy =
@@ -168,8 +168,8 @@ struct DispatchOpConv : public OpConversionPattern<fir::DispatchOp> {
// Load the correct binding.
mlir::Value bindings = rewriter.create<fir::BoxAddrOp>(loc, bindingBox);
- fir::RecordType bindingTy =
- fir::unwrapIfDerived(bindingBox.getType().cast<fir::BaseBoxType>());
+ fir::RecordType bindingTy = fir::unwrapIfDerived(
+ mlir::cast<fir::BaseBoxType>(bindingBox.getType()));
mlir::Type bindingAddrTy = fir::ReferenceType::get(bindingTy);
mlir::Value bindingIdxVal = rewriter.create<mlir::arith::ConstantOp>(
loc, rewriter.getIndexType(), rewriter.getIndexAttr(bindingIdx));
@@ -181,7 +181,7 @@ struct DispatchOpConv : public OpConversionPattern<fir::DispatchOp> {
mlir::Value procField = rewriter.create<fir::FieldIndexOp>(
loc, fieldTy, procCompName, bindingTy, mlir::ValueRange{});
fir::RecordType procTy =
- bindingTy.getType(procCompName).cast<fir::RecordType>();
+ mlir::cast<fir::RecordType>(bindingTy.getType(procCompName));
mlir::Type procRefTy = fir::ReferenceType::get(procTy);
mlir::Value procRef = rewriter.create<fir::CoordinateOp>(
loc, procRefTy, bindingAddr, procField);
@@ -298,13 +298,13 @@ mlir::LogicalResult SelectTypeConv::matchAndRewrite(
// before in the list to respect point 3. above. Otherwise it is just
// added in order at the end.
for (unsigned t = 0; t < typeGuardNum; ++t) {
- if (auto a = typeGuards[t].dyn_cast<fir::ExactTypeAttr>()) {
+ if (auto a = mlir::dyn_cast<fir::ExactTypeAttr>(typeGuards[t])) {
orderedTypeGuards.push_back(t);
continue;
}
- if (auto a = typeGuards[t].dyn_cast<fir::SubclassAttr>()) {
- if (auto recTy = a.getType().dyn_cast<fir::RecordType>()) {
+ if (auto a = mlir::dyn_cast<fir::SubclassAttr>(typeGuards[t])) {
+ if (auto recTy = mlir::dyn_cast<fir::RecordType>(a.getType())) {
auto dt = mod.lookupSymbol<fir::TypeInfoOp>(recTy.getName());
assert(dt && "dispatch table not found");
llvm::SmallSet<llvm::StringRef, 4> ancestors =
@@ -313,8 +313,8 @@ mlir::LogicalResult SelectTypeConv::matchAndRewrite(
auto it = orderedClassIsGuards.begin();
while (it != orderedClassIsGuards.end()) {
fir::SubclassAttr sAttr =
- typeGuards[*it].dyn_cast<fir::SubclassAttr>();
- if (auto ty = sAttr.getType().dyn_cast<fir::RecordType>()) {
+ mlir::dyn_cast<fir::SubclassAttr>(typeGuards[*it]);
+ if (auto ty = mlir::dyn_cast<fir::RecordType>(sAttr.getType())) {
if (ancestors.contains(ty.getName()))
break;
}
@@ -339,7 +339,7 @@ mlir::LogicalResult SelectTypeConv::matchAndRewrite(
auto *dest = selectType.getSuccessor(idx);
std::optional<mlir::ValueRange> destOps =
selectType.getSuccessorOperands(operands, idx);
- if (typeGuards[idx].dyn_cast<mlir::UnitAttr>())
+ if (mlir::dyn_cast<mlir::UnitAttr>(typeGuards[idx]))
rewriter.replaceOpWithNewOp<mlir::cf::BranchOp>(
selectType, dest, destOps.value_or(mlir::ValueRange{}));
else if (mlir::failed(genTypeLadderStep(loc, selector, typeGuards[idx],
@@ -357,9 +357,9 @@ mlir::LogicalResult SelectTypeConv::genTypeLadderStep(
fir::KindMapping &kindMap) const {
mlir::Value cmp;
// TYPE IS type guard comparison are all done inlined.
- if (auto a = attr.dyn_cast<fir::ExactTypeAttr>()) {
+ if (auto a = mlir::dyn_cast<fir::ExactTypeAttr>(attr)) {
if (fir::isa_trivial(a.getType()) ||
- a.getType().isa<fir::CharacterType>()) {
+ mlir::isa<fir::CharacterType>(a.getType())) {
// For type guard statement with Intrinsic type spec the type code of
// the descriptor is compared.
int code = fir::getTypeCode(a.getType(), kindMap);
@@ -383,10 +383,10 @@ mlir::LogicalResult SelectTypeConv::genTypeLadderStep(
cmp = res;
}
// CLASS IS type guard statement is done with a runtime call.
- } else if (auto a = attr.dyn_cast<fir::SubclassAttr>()) {
+ } else if (auto a = mlir::dyn_cast<fir::SubclassAttr>(attr)) {
// Retrieve the type descriptor from the type guard statement record type.
- assert(a.getType().isa<fir::RecordType>() && "expect fir.record type");
- fir::RecordType recTy = a.getType().dyn_cast<fir::RecordType>();
+ assert(mlir::isa<fir::RecordType>(a.getType()) && "expect fir.record type");
+ fir::RecordType recTy = mlir::dyn_cast<fir::RecordType>(a.getType());
std::string typeDescName =
fir::NameUniquer::getTypeDescriptorName(recTy.getName());
auto typeDescGlobal = mod.lookupSymbol<fir::GlobalOp>(typeDescName);
@@ -438,8 +438,8 @@ mlir::Value
SelectTypeConv::genTypeDescCompare(mlir::Location loc, mlir::Value selector,
mlir::Type ty, mlir::ModuleOp mod,
mlir::PatternRewriter &rewriter) const {
- assert(ty.isa<fir::RecordType>() && "expect fir.record type");
- fir::RecordType recTy = ty.dyn_cast<fir::RecordType>();
+ assert(mlir::isa<fir::RecordType>(ty) && "expect fir.record type");
+ fir::RecordType recTy = mlir::dyn_cast<fir::RecordType>(ty);
std::string typeDescName =
fir::NameUniquer::getTypeDescriptorName(recTy.getName());
auto typeDescGlobal = mod.lookupSymbol<fir::GlobalOp>(typeDescName);
diff --git a/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp b/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp
index a4f2f5238e40..c61179a7460e 100644
--- a/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp
+++ b/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp
@@ -215,8 +215,8 @@ static unsigned getDimCount(mlir::Value val) {
// the first ConvertOp that has non-opaque box type that we meet
// going through the ConvertOp chain.
if (mlir::Value emboxVal = findBoxDef(val))
- if (auto boxTy = emboxVal.getType().dyn_cast<fir::BoxType>())
- if (auto seqTy = boxTy.getEleTy().dyn_cast<fir::SequenceType>())
+ if (auto boxTy = mlir::dyn_cast<fir::BoxType>(emboxVal.getType()))
+ if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(boxTy.getEleTy()))
return seqTy.getDimension();
return 0;
}
@@ -237,9 +237,9 @@ static std::optional<mlir::Type> getArgElementType(mlir::Value val) {
val = defOp->getOperand(0);
// The convert operation is expected to convert from one
// box type to another box type.
- auto boxType = val.getType().cast<fir::BoxType>();
+ auto boxType = mlir::cast<fir::BoxType>(val.getType());
auto elementType = fir::unwrapSeqOrBoxedSeqType(boxType);
- if (!elementType.isa<mlir::NoneType>())
+ if (!mlir::isa<mlir::NoneType>(elementType))
return elementType;
} while (true);
}
@@ -381,7 +381,7 @@ static void genRuntimeSumBody(fir::FirOpBuilder &builder,
// end function RTNAME(Sum)<T>x<rank>_simplified
auto zero = [](fir::FirOpBuilder builder, mlir::Location loc,
mlir::Type elementType) {
- if (auto ty = elementType.dyn_cast<mlir::FloatType>()) {
+ if (auto ty = mlir::dyn_cast<mlir::FloatType>(elementType)) {
const llvm::fltSemantics &sem = ty.getFloatSemantics();
return builder.createRealConstant(loc, elementType,
llvm::APFloat::getZero(sem));
@@ -392,9 +392,9 @@ static void genRuntimeSumBody(fir::FirOpBuilder &builder,
auto genBodyOp = [](fir::FirOpBuilder builder, mlir::Location loc,
mlir::Type elementType, mlir::Value elem1,
mlir::Value elem2) -> mlir::Value {
- if (elementType.isa<mlir::FloatType>())
+ if (mlir::isa<mlir::FloatType>(elementType))
return builder.create<mlir::arith::AddFOp>(loc, elem1, elem2);
- if (elementType.isa<mlir::IntegerType>())
+ if (mlir::isa<mlir::IntegerType>(elementType))
return builder.create<mlir::arith::AddIOp>(loc, elem1, elem2);
llvm_unreachable("unsupported type");
@@ -414,7 +414,7 @@ static void genRuntimeMaxvalBody(fir::FirOpBuilder &builder,
mlir::Type elementType) {
auto init = [](fir::FirOpBuilder builder, mlir::Location loc,
mlir::Type elementType) {
- if (auto ty = elementType.dyn_cast<mlir::FloatType>()) {
+ if (auto ty = mlir::dyn_cast<mlir::FloatType>(elementType)) {
const llvm::fltSemantics &sem = ty.getFloatSemantics();
return builder.createRealConstant(
loc, elementType, llvm::APFloat::getLargest(sem, /*Negative=*/true));
@@ -427,7 +427,7 @@ static void genRuntimeMaxvalBody(fir::FirOpBuilder &builder,
auto genBodyOp = [](fir::FirOpBuilder builder, mlir::Location loc,
mlir::Type elementType, mlir::Value elem1,
mlir::Value elem2) -> mlir::Value {
- if (elementType.isa<mlir::FloatType>()) {
+ if (mlir::isa<mlir::FloatType>(elementType)) {
// arith.maxf later converted to llvm.intr.maxnum does not work
// correctly for NaNs and -0.0 (see maxnum/minnum pattern matching
// in LLVM's InstCombine pass). Moreover, llvm.intr.maxnum
@@ -439,7 +439,7 @@ static void genRuntimeMaxvalBody(fir::FirOpBuilder &builder,
loc, mlir::arith::CmpFPredicate::OGT, elem1, elem2);
return builder.create<mlir::arith::SelectOp>(loc, compare, elem1, elem2);
}
- if (elementType.isa<mlir::IntegerType>())
+ if (mlir::isa<mlir::IntegerType>(elementType))
return builder.create<mlir::arith::MaxSIOp>(loc, elem1, elem2);
llvm_unreachable("unsupported type");
@@ -662,7 +662,7 @@ static void genRuntimeMinMaxlocBody(fir::FirOpBuilder &builder,
mlir::Type resultElemTy, bool isDim) {
auto init = [isMax](fir::FirOpBuilder builder, mlir::Location loc,
mlir::Type elementType) {
- if (auto ty = elementType.dyn_cast<mlir::FloatType>()) {
+ if (auto ty = mlir::dyn_cast<mlir::FloatType>(elementType)) {
const llvm::fltSemantics &sem = ty.getFloatSemantics();
llvm::APFloat limit = llvm::APFloat::getInf(sem, /*Negative=*/isMax);
return builder.createRealConstant(loc, elementType, limit);
@@ -744,7 +744,7 @@ static void genRuntimeMinMaxlocBody(fir::FirOpBuilder &builder,
mlir::Value elem = builder.create<fir::LoadOp>(loc, addr);
mlir::Value cmp;
- if (elementType.isa<mlir::FloatType>()) {
+ if (mlir::isa<mlir::FloatType>(elementType)) {
// For FP reductions we want the first smallest value to be used, that
// is not NaN. An OGT/OLT condition will usually work for this unless all
// the values are NaN or Inf. This follows the same logic as
@@ -761,7 +761,7 @@ static void genRuntimeMinMaxlocBody(fir::FirOpBuilder &builder,
loc, mlir::arith::CmpFPredicate::OEQ, elem, elem);
cmpNan = builder.create<mlir::arith::AndIOp>(loc, cmpNan, cmpNan2);
cmp = builder.create<mlir::arith::OrIOp>(loc, cmp, cmpNan);
- } else if (elementType.isa<mlir::IntegerType>()) {
+ } else if (mlir::isa<mlir::IntegerType>(elementType)) {
cmp = builder.create<mlir::arith::CmpIOp>(
loc,
isMax ? mlir::arith::CmpIPredicate::sgt
@@ -839,7 +839,7 @@ static void genRuntimeMinMaxlocBody(fir::FirOpBuilder &builder,
builder.setInsertionPointToStart(&ifOp.getElseRegion().front());
mlir::Value basicValue;
- if (elementType.isa<mlir::IntegerType>()) {
+ if (mlir::isa<mlir::IntegerType>(elementType)) {
basicValue = builder.createIntegerConstant(loc, elementType, 0);
} else {
basicValue = builder.createRealConstant(loc, elementType, 0);
@@ -921,7 +921,7 @@ static void genRuntimeDotBody(fir::FirOpBuilder &builder,
mlir::IndexType idxTy = builder.getIndexType();
mlir::Value zero =
- resultElementType.isa<mlir::FloatType>()
+ mlir::isa<mlir::FloatType>(resultElementType)
? builder.createRealConstant(loc, resultElementType, 0.0)
: builder.createIntegerConstant(loc, resultElementType, 0);
@@ -978,10 +978,10 @@ static void genRuntimeDotBody(fir::FirOpBuilder &builder,
// Convert to the result type.
elem2 = builder.create<fir::ConvertOp>(loc, resultElementType, elem2);
- if (resultElementType.isa<mlir::FloatType>())
+ if (mlir::isa<mlir::FloatType>(resultElementType))
sumVal = builder.create<mlir::arith::AddFOp>(
loc, builder.create<mlir::arith::MulFOp>(loc, elem1, elem2), sumVal);
- else if (resultElementType.isa<mlir::IntegerType>())
+ else if (mlir::isa<mlir::IntegerType>(resultElementType))
sumVal = builder.create<mlir::arith::AddIOp>(
loc, builder.create<mlir::arith::MulIOp>(loc, elem1, elem2), sumVal);
else
@@ -1056,8 +1056,8 @@ void SimplifyIntrinsicsPass::simplifyIntOrFloatReduction(
mlir::Type resultType = call.getResult(0).getType();
- if (!resultType.isa<mlir::FloatType>() &&
- !resultType.isa<mlir::IntegerType>())
+ if (!mlir::isa<mlir::FloatType>(resultType) &&
+ !mlir::isa<mlir::IntegerType>(resultType))
return;
auto argType = getArgElementType(args[0]);
@@ -1103,7 +1103,8 @@ void SimplifyIntrinsicsPass::simplifyLogicalDim0Reduction(
fir::FirOpBuilder builder{getSimplificationBuilder(call, kindMap)};
// Treating logicals as integers makes things a lot easier
- fir::LogicalType logicalType = {elementType.dyn_cast<fir::LogicalType>()};
+ fir::LogicalType logicalType = {
+ mlir::dyn_cast<fir::LogicalType>(elementType)};
fir::KindTy kind = logicalType.getFKind();
mlir::Type intElementType = builder.getIntegerType(kind * 8);
@@ -1138,7 +1139,8 @@ void SimplifyIntrinsicsPass::simplifyLogicalDim1Reduction(
fir::FirOpBuilder builder{getSimplificationBuilder(call, kindMap)};
// Treating logicals as integers makes things a lot easier
- fir::LogicalType logicalType = {elementType.dyn_cast<fir::LogicalType>()};
+ fir::LogicalType logicalType = {
+ mlir::dyn_cast<fir::LogicalType>(elementType)};
fir::KindTy kind = logicalType.getFKind();
mlir::Type intElementType = builder.getIntegerType(kind * 8);
@@ -1182,7 +1184,7 @@ void SimplifyIntrinsicsPass::simplifyMinMaxlocReduction(
auto inputBox = findBoxDef(args[1]);
mlir::Type inputType = hlfir::getFortranElementType(inputBox.getType());
- if (inputType.isa<fir::CharacterType>())
+ if (mlir::isa<fir::CharacterType>(inputType))
return;
int maskRank;
@@ -1193,7 +1195,8 @@ void SimplifyIntrinsicsPass::simplifyMinMaxlocReduction(
} else {
maskRank = getDimCount(mask);
mlir::Type maskElemTy = hlfir::getFortranElementType(maskDef.getType());
- fir::LogicalType logicalFirType = {maskElemTy.dyn_cast<fir::LogicalType>()};
+ fir::LogicalType logicalFirType = {
+ mlir::dyn_cast<fir::LogicalType>(maskElemTy)};
kind = logicalFirType.getFKind();
// Convert fir::LogicalType to mlir::Type
logicalElemType = logicalFirType;
@@ -1302,7 +1305,8 @@ void SimplifyIntrinsicsPass::runOnOperation() {
std::string fmfString{builder.getFastMathFlagsString()};
mlir::Type type = call.getResult(0).getType();
- if (!type.isa<mlir::FloatType>() && !type.isa<mlir::IntegerType>())
+ if (!mlir::isa<mlir::FloatType>(type) &&
+ !mlir::isa<mlir::IntegerType>(type))
return;
// Try to find the element types of the boxed arguments.
@@ -1314,11 +1318,9 @@ void SimplifyIntrinsicsPass::runOnOperation() {
// Support only floating point and integer arguments
// now (e.g. logical is skipped here).
- if (!arg1Type->isa<mlir::FloatType>() &&
- !arg1Type->isa<mlir::IntegerType>())
+ if (!mlir::isa<mlir::FloatType, mlir::IntegerType>(*arg1Type))
return;
- if (!arg2Type->isa<mlir::FloatType>() &&
- !arg2Type->isa<mlir::IntegerType>())
+ if (!mlir::isa<mlir::FloatType, mlir::IntegerType>(*arg2Type))
return;
auto typeGenerator = [&type](fir::FirOpBuilder &builder) {
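
The last two rewritten checks rely on the variadic form of mlir::isa, which succeeds when the type matches any of the listed candidates. A minimal sketch:

    #include "mlir/IR/BuiltinTypes.h"

    bool isIntOrFloat(mlir::Type ty) {
      // Same as mlir::isa<mlir::FloatType>(ty) ||
      //         mlir::isa<mlir::IntegerType>(ty)
      return mlir::isa<mlir::FloatType, mlir::IntegerType>(ty);
    }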
diff --git a/flang/lib/Optimizer/Transforms/StackArrays.cpp b/flang/lib/Optimizer/Transforms/StackArrays.cpp
index 1c213abefe6f..16bbb1c35646 100644
--- a/flang/lib/Optimizer/Transforms/StackArrays.cpp
+++ b/flang/lib/Optimizer/Transforms/StackArrays.cpp
@@ -351,7 +351,7 @@ void AllocationAnalysis::visitOperation(mlir::Operation *op,
}
auto retTy = allocmem.getAllocatedType();
- if (!retTy.isa<fir::SequenceType>()) {
+ if (!mlir::isa<fir::SequenceType>(retTy)) {
LLVM_DEBUG(llvm::dbgs()
<< "--Allocation is not for an array: skipping\n");
return;
@@ -776,7 +776,3 @@ void StackArraysPass::runOnFunc(mlir::Operation *func) {
signalPassFailure();
}
}
-
-std::unique_ptr<mlir::Pass> fir::createStackArraysPass() {
- return std::make_unique<StackArraysPass>();
-}
diff --git a/flang/runtime/CMakeLists.txt b/flang/runtime/CMakeLists.txt
index bdd0e07bbfd4..bc81e1b1887b 100644
--- a/flang/runtime/CMakeLists.txt
+++ b/flang/runtime/CMakeLists.txt
@@ -103,7 +103,6 @@ append(${NO_LTO_FLAGS} CMAKE_CXX_FLAGS)
add_definitions(-U_GLIBCXX_ASSERTIONS)
add_definitions(-U_LIBCPP_ENABLE_ASSERTIONS)
-add_subdirectory(FortranMain)
add_subdirectory(Float128Math)
set(sources
@@ -193,6 +192,7 @@ set(supported_files
environment.cpp
extrema.cpp
external-unit.cpp
+ file.cpp
findloc.cpp
format.cpp
inquiry.cpp
diff --git a/flang/runtime/FortranMain/CMakeLists.txt b/flang/runtime/FortranMain/CMakeLists.txt
deleted file mode 100644
index deb7bd10acf5..000000000000
--- a/flang/runtime/FortranMain/CMakeLists.txt
+++ /dev/null
@@ -1,23 +0,0 @@
-add_flang_library(Fortran_main STATIC INSTALL_WITH_TOOLCHAIN
- Fortran_main.c
-)
-if (DEFINED MSVC)
- set(CMAKE_MSVC_RUNTIME_LIBRARY MultiThreaded)
- add_flang_library(Fortran_main.static STATIC INSTALL_WITH_TOOLCHAIN
- Fortran_main.c
- )
- set(CMAKE_MSVC_RUNTIME_LIBRARY MultiThreadedDLL)
- add_flang_library(Fortran_main.dynamic STATIC INSTALL_WITH_TOOLCHAIN
- Fortran_main.c
- )
- set(CMAKE_MSVC_RUNTIME_LIBRARY MultiThreadedDebug)
- add_flang_library(Fortran_main.static_dbg STATIC INSTALL_WITH_TOOLCHAIN
- Fortran_main.c
- )
- set(CMAKE_MSVC_RUNTIME_LIBRARY MultiThreadedDebugDLL)
- add_flang_library(Fortran_main.dynamic_dbg STATIC INSTALL_WITH_TOOLCHAIN
- Fortran_main.c
- )
- add_dependencies(Fortran_main Fortran_main.static Fortran_main.dynamic
- Fortran_main.static_dbg Fortran_main.dynamic_dbg)
-endif()
diff --git a/flang/runtime/FortranMain/Fortran_main.c b/flang/runtime/FortranMain/Fortran_main.c
deleted file mode 100644
index 5d3eaced001e..000000000000
--- a/flang/runtime/FortranMain/Fortran_main.c
+++ /dev/null
@@ -1,23 +0,0 @@
-//===-- runtime/FortranMain/Fortran_main.c --------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "flang/Runtime/main.h"
-#include "flang/Runtime/stop.h"
-
-/* main entry into PROGRAM */
-void _QQmain(void);
-
-extern const struct EnvironmentDefaultList *_QQEnvironmentDefaults;
-
-/* C main stub */
-int main(int argc, const char *argv[], const char *envp[]) {
- RTNAME(ProgramStart)(argc, argv, envp, _QQEnvironmentDefaults);
- _QQmain();
- RTNAME(ProgramEndStatement)();
- return 0;
-}
diff --git a/flang/runtime/environment.cpp b/flang/runtime/environment.cpp
index b2c9665a28df..52b1d99ba536 100644
--- a/flang/runtime/environment.cpp
+++ b/flang/runtime/environment.cpp
@@ -23,9 +23,11 @@ extern char **environ;
namespace Fortran::runtime {
+#ifndef FLANG_RUNTIME_NO_GLOBAL_VAR_DEFS
RT_OFFLOAD_VAR_GROUP_BEGIN
RT_VAR_ATTRS ExecutionEnvironment executionEnvironment;
RT_OFFLOAD_VAR_GROUP_END
+#endif // FLANG_RUNTIME_NO_GLOBAL_VAR_DEFS
static void SetEnvironmentDefaults(const EnvironmentDefaultList *envDefaults) {
if (!envDefaults) {
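
A sketch of how a build might use the new FLANG_RUNTIME_NO_GLOBAL_VAR_DEFS guard; this usage is an assumption, not shown in the patch. The runtime sources are compiled with the macro defined so these definitions are suppressed, and the globals are then defined exactly once in a dedicated translation unit:

    // Compile the runtime with -DFLANG_RUNTIME_NO_GLOBAL_VAR_DEFS, then in
    // one dedicated TU (hypothetical) provide the single definition:
    #include "environment.h" // assumed header location
    namespace Fortran::runtime {
    RT_OFFLOAD_VAR_GROUP_BEGIN
    RT_VAR_ATTRS ExecutionEnvironment executionEnvironment;
    RT_OFFLOAD_VAR_GROUP_END
    } // namespace Fortran::runtime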
diff --git a/flang/runtime/file.cpp b/flang/runtime/file.cpp
index acd5d33d4bb8..79db17e70acd 100644
--- a/flang/runtime/file.cpp
+++ b/flang/runtime/file.cpp
@@ -457,22 +457,22 @@ std::int64_t SizeInBytes(const char *path) {
return -1;
}
#else // defined(RT_DEVICE_COMPILATION)
-bool IsATerminal(int fd) {
+RT_API_ATTRS bool IsATerminal(int fd) {
Terminator{__FILE__, __LINE__}.Crash("%s: unsupported", RT_PRETTY_FUNCTION);
}
-bool IsExtant(const char *path) {
+RT_API_ATTRS bool IsExtant(const char *path) {
Terminator{__FILE__, __LINE__}.Crash("%s: unsupported", RT_PRETTY_FUNCTION);
}
-bool MayRead(const char *path) {
+RT_API_ATTRS bool MayRead(const char *path) {
Terminator{__FILE__, __LINE__}.Crash("%s: unsupported", RT_PRETTY_FUNCTION);
}
-bool MayWrite(const char *path) {
+RT_API_ATTRS bool MayWrite(const char *path) {
Terminator{__FILE__, __LINE__}.Crash("%s: unsupported", RT_PRETTY_FUNCTION);
}
-bool MayReadAndWrite(const char *path) {
+RT_API_ATTRS bool MayReadAndWrite(const char *path) {
Terminator{__FILE__, __LINE__}.Crash("%s: unsupported", RT_PRETTY_FUNCTION);
}
-std::int64_t SizeInBytes(const char *path) {
+RT_API_ATTRS std::int64_t SizeInBytes(const char *path) {
Terminator{__FILE__, __LINE__}.Crash("%s: unsupported", RT_PRETTY_FUNCTION);
}
#endif // defined(RT_DEVICE_COMPILATION)
diff --git a/flang/runtime/namelist.cpp b/flang/runtime/namelist.cpp
index b9eed2101ecf..af092de70f78 100644
--- a/flang/runtime/namelist.cpp
+++ b/flang/runtime/namelist.cpp
@@ -596,7 +596,7 @@ bool IODEF(InputNamelist)(Cookie cookie, const NamelistGroup &group) {
return true;
}
-bool IsNamelistNameOrSlash(IoStatementState &io) {
+RT_API_ATTRS bool IsNamelistNameOrSlash(IoStatementState &io) {
if (auto *listInput{
io.get_if<ListDirectedStatementState<Direction::Input>>()}) {
if (listInput->inNamelistSequence()) {
diff --git a/flang/runtime/unit.cpp b/flang/runtime/unit.cpp
index 0e38cffdf907..3b42f45d5588 100644
--- a/flang/runtime/unit.cpp
+++ b/flang/runtime/unit.cpp
@@ -19,11 +19,13 @@
namespace Fortran::runtime::io {
+#ifndef FLANG_RUNTIME_NO_GLOBAL_VAR_DEFS
RT_OFFLOAD_VAR_GROUP_BEGIN
RT_VAR_ATTRS ExternalFileUnit *defaultInput{nullptr}; // unit 5
RT_VAR_ATTRS ExternalFileUnit *defaultOutput{nullptr}; // unit 6
RT_VAR_ATTRS ExternalFileUnit *errorOutput{nullptr}; // unit 0 extension
RT_OFFLOAD_VAR_GROUP_END
+#endif // FLANG_RUNTIME_NO_GLOBAL_VAR_DEFS
RT_OFFLOAD_API_GROUP_BEGIN
diff --git a/flang/runtime/utf.cpp b/flang/runtime/utf.cpp
index 9945dc6509ec..f4b38d5225ce 100644
--- a/flang/runtime/utf.cpp
+++ b/flang/runtime/utf.cpp
@@ -10,6 +10,7 @@
namespace Fortran::runtime {
+#ifndef FLANG_RUNTIME_NO_GLOBAL_VAR_DEFS
// clang-format off
RT_OFFLOAD_VAR_GROUP_BEGIN
const RT_CONST_VAR_ATTRS std::uint8_t UTF8FirstByteTable[256]{
@@ -40,6 +41,7 @@ const RT_CONST_VAR_ATTRS std::uint8_t UTF8FirstByteTable[256]{
};
RT_OFFLOAD_VAR_GROUP_END
// clang-format on
+#endif // FLANG_RUNTIME_NO_GLOBAL_VAR_DEFS
RT_OFFLOAD_API_GROUP_BEGIN
// Non-minimal encodings are accepted.
diff --git a/flang/test/CMakeLists.txt b/flang/test/CMakeLists.txt
index 7d96a72e5f36..7e036ad539df 100644
--- a/flang/test/CMakeLists.txt
+++ b/flang/test/CMakeLists.txt
@@ -62,7 +62,6 @@ set(FLANG_TEST_DEPENDS
llvm-readobj
split-file
FortranRuntime
- Fortran_main
FortranDecimal
)
if (LLVM_ENABLE_PLUGINS AND NOT WIN32)
diff --git a/flang/test/Driver/bbc-mlir-pass-pipeline.f90 b/flang/test/Driver/bbc-mlir-pass-pipeline.f90
index 7a35e26dc478..caa86e66e62b 100644
--- a/flang/test/Driver/bbc-mlir-pass-pipeline.f90
+++ b/flang/test/Driver/bbc-mlir-pass-pipeline.f90
@@ -17,7 +17,7 @@ end program
! CHECK-NEXT: (S) 0 num-cse'd - Number of operations CSE'd
! CHECK-NEXT: (S) 0 num-dce'd - Number of operations DCE'd
-! CHECK-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction']
+! CHECK-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction', 'omp.private']
! CHECK-NEXT: 'fir.global' Pipeline
! CHECK-NEXT: CharacterConversion
! CHECK-NEXT: 'func.func' Pipeline
@@ -25,6 +25,8 @@ end program
! CHECK-NEXT: CharacterConversion
! CHECK-NEXT: 'omp.declare_reduction' Pipeline
! CHECK-NEXT: CharacterConversion
+! CHECK-NEXT: 'omp.private' Pipeline
+! CHECK-NEXT: CharacterConversion
! CHECK-NEXT: Canonicalizer
! CHECK-NEXT: SimplifyRegionLite
@@ -43,7 +45,7 @@ end program
! CHECK-NEXT: (S) 0 num-cse'd - Number of operations CSE'd
! CHECK-NEXT: (S) 0 num-dce'd - Number of operations DCE'd
-! CHECK-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction']
+! CHECK-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction', 'omp.private']
! CHECK-NEXT: 'fir.global' Pipeline
! CHECK-NEXT: CFGConversion
! CHECK-NEXT: 'func.func' Pipeline
@@ -51,6 +53,8 @@ end program
! CHECK-NEXT: CFGConversion
! CHECK-NEXT: 'omp.declare_reduction' Pipeline
! CHECK-NEXT: CFGConversion
+! CHECK-NEXT: 'omp.private' Pipeline
+! CHECK-NEXT: CFGConversion
! CHECK-NEXT: SCFToControlFlow
! CHECK-NEXT: Canonicalizer
diff --git a/flang/test/Driver/driver-help-hidden.f90 b/flang/test/Driver/driver-help-hidden.f90
new file mode 100644
index 000000000000..706b2cb6c245
--- /dev/null
+++ b/flang/test/Driver/driver-help-hidden.f90
@@ -0,0 +1,172 @@
+
+!--------------------------
+! FLANG DRIVER (flang-new)
+!--------------------------
+! RUN: %flang --help-hidden 2>&1 | FileCheck %s
+! RUN: not %flang -help-hidden 2>&1 | FileCheck %s --check-prefix=ERROR-FLANG
+
+!----------------------------------------
+! FLANG FRONTEND DRIVER (flang-new -fc1)
+!----------------------------------------
+! RUN: not %flang_fc1 --help-hidden 2>&1 | FileCheck %s --check-prefix=ERROR-FLANG-FC1
+! RUN: not %flang_fc1 -help-hidden 2>&1 | FileCheck %s --check-prefix=ERROR-FLANG-FC1
+
+! CHECK:USAGE: flang-new
+! CHECK-EMPTY:
+! CHECK-NEXT: DRIVER OPTIONS:
+! CHECK-NEXT: --driver-mode=<value> Set the driver mode to either 'gcc', 'g++', 'cpp', 'cl' or 'flang'
+! CHECK-EMPTY:
+! CHECK-NEXT:OPTIONS:
+! CHECK-NEXT: -### Print (but do not run) the commands to run for this compilation
+! CHECK-NEXT: -ccc-print-phases Dump list of actions to perform
+! CHECK-NEXT: -cpp Enable predefined and command line preprocessor macros
+! CHECK-NEXT: -c Only run preprocess, compile, and assemble steps
+! CHECK-NEXT: -dM Print macro definitions in -E mode instead of normal output
+! CHECK-NEXT: -dumpmachine Display the compiler's target processor
+! CHECK-NEXT: -dumpversion Display the version of the compiler
+! CHECK-NEXT: -D <macro>=<value> Define <macro> to <value> (or 1 if <value> omitted)
+! CHECK-NEXT: -emit-llvm Use the LLVM representation for assembler and object files
+! CHECK-NEXT: -E Only run the preprocessor
+! CHECK-NEXT: -falternative-parameter-statement
+! CHECK-NEXT: Enable the old style PARAMETER statement
+! CHECK-NEXT: -fapprox-func Allow certain math function calls to be replaced with an approximately equivalent calculation
+! CHECK-NEXT: -fbackslash Specify that backslash in string introduces an escape character
+! CHECK-NEXT: -fcolor-diagnostics Enable colors in diagnostics
+! CHECK-NEXT: -fconvert=<value> Set endian conversion of data for unformatted files
+! CHECK-NEXT: -fdefault-double-8 Set the default double precision kind to an 8 byte wide type
+! CHECK-NEXT: -fdefault-integer-8 Set the default integer and logical kind to an 8 byte wide type
+! CHECK-NEXT: -fdefault-real-8 Set the default real kind to an 8 byte wide type
+! CHECK-NEXT: -ffast-math Allow aggressive, lossy floating-point optimizations
+! CHECK-NEXT: -ffixed-form Process source files in fixed form
+! CHECK-NEXT: -ffixed-line-length=<value>
+! CHECK-NEXT: Use <value> as character line width in fixed mode
+! CHECK-NEXT: -ffp-contract=<value> Form fused FP ops (e.g. FMAs)
+! CHECK-NEXT: -ffree-form Process source files in free form
+! CHECK-NEXT: -fhonor-infinities Specify that floating-point optimizations are not allowed that assume arguments and results are not +-inf.
+! CHECK-NEXT: -fhonor-nans Specify that floating-point optimizations are not allowed that assume arguments and results are not NANs.
+! CHECK-NEXT: -fimplicit-none No implicit typing allowed unless overridden by IMPLICIT statements
+! CHECK-NEXT: -finput-charset=<value> Specify the default character set for source files
+! CHECK-NEXT: -fintegrated-as Enable the integrated assembler
+! CHECK-NEXT: -fintrinsic-modules-path <dir>
+! CHECK-NEXT: Specify where to find the compiled intrinsic modules
+! CHECK-NEXT: -flang-deprecated-no-hlfir
+! CHECK-NEXT: Do not use HLFIR lowering (deprecated)
+! CHECK-NEXT: -flang-experimental-hlfir
+! CHECK-NEXT: Use HLFIR lowering (experimental)
+! CHECK-NEXT: -flarge-sizes Use INTEGER(KIND=8) for the result type in size-related intrinsics
+! CHECK-NEXT: -flogical-abbreviations Enable logical abbreviations
+! CHECK-NEXT: -flto=auto Enable LTO in 'full' mode
+! CHECK-NEXT: -flto=jobserver Enable LTO in 'full' mode
+! CHECK-NEXT: -flto=<value> Set LTO mode
+! CHECK-NEXT: -flto Enable LTO in 'full' mode
+! CHECK-NEXT: -fms-runtime-lib=<value>
+! CHECK-NEXT: Select Windows run-time library
+! CHECK-NEXT: -fno-automatic Implies the SAVE attribute for non-automatic local objects in subprograms unless RECURSIVE
+! CHECK-NEXT: -fno-color-diagnostics Disable colors in diagnostics
+! CHECK-NEXT: -fno-integrated-as Disable the integrated assembler
+! CHECK-NEXT: -fno-lto Disable LTO mode (default)
+! CHECK-NEXT: -fno-ppc-native-vector-element-order
+! CHECK-NEXT: Specifies PowerPC non-native vector element order
+! CHECK-NEXT: -fno-rtlib-add-rpath Do not add -rpath with architecture-specific resource directory to the linker flags. When --hip-link is specified, do not add -rpath with HIP runtime library directory to the linker flags
+! CHECK-NEXT: -fno-signed-zeros Allow optimizations that ignore the sign of floating point zeros
+! CHECK-NEXT: -fno-stack-arrays Allocate array temporaries on the heap (default)
+! CHECK-NEXT: -fno-version-loops-for-stride
+! CHECK-NEXT: Do not create unit-strided loops (default)
+! CHECK-NEXT: -fomit-frame-pointer Omit the frame pointer from functions that don't need it. Some stack unwinding cases, such as profilers and sanitizers, may prefer specifying -fno-omit-frame-pointer. On many targets, -O1 and higher omit the frame pointer by default. -m[no-]omit-leaf-frame-pointer takes precedence for leaf functions
+! CHECK-NEXT: -fopenacc Enable OpenACC
+! CHECK-NEXT: -fopenmp-assume-no-nested-parallelism
+! CHECK-NEXT: Assert no nested parallel regions in the GPU
+! CHECK-NEXT: -fopenmp-assume-no-thread-state
+! CHECK-NEXT: Assert no thread in a parallel region modifies an ICV
+! CHECK-NEXT: -fopenmp-target-debug Enable debugging in the OpenMP offloading device RTL
+! CHECK-NEXT: -fopenmp-targets=<value>
+! CHECK-NEXT: Specify comma-separated list of triples OpenMP offloading targets to be supported
+! CHECK-NEXT: -fopenmp-version=<value>
+! CHECK-NEXT: Set OpenMP version (e.g. 45 for OpenMP 4.5, 51 for OpenMP 5.1). Default value is 11 for Flang
+! CHECK-NEXT: -fopenmp Parse OpenMP pragmas and generate parallel code.
+! CHECK-NEXT: -foptimization-record-file=<file>
+! CHECK-NEXT: Specify the output name of the file containing the optimization remarks. Implies -fsave-optimization-record. On Darwin platforms, this cannot be used with multiple -arch <arch> options.
+! CHECK-NEXT: -foptimization-record-passes=<regex>
+! CHECK-NEXT: Only include passes which match a specified regular expression in the generated optimization record (by default, include all passes)
+! CHECK-NEXT: -fpass-plugin=<dsopath> Load pass plugin from a dynamic shared object file (only with new pass manager).
+! CHECK-NEXT: -fppc-native-vector-element-order
+! CHECK-NEXT: Specifies PowerPC native vector element order (default)
+! CHECK-NEXT: -freciprocal-math Allow division operations to be reassociated
+! CHECK-NEXT: -fropi Generate read-only position independent code (ARM only)
+! CHECK-NEXT: -frtlib-add-rpath Add -rpath with architecture-specific resource directory to the linker flags. When --hip-link is specified, also add -rpath with HIP runtime library directory to the linker flags
+! CHECK-NEXT: -frwpi Generate read-write position independent code (ARM only)
+! CHECK-NEXT: -fsave-optimization-record=<format>
+! CHECK-NEXT: Generate an optimization record file in a specific format
+! CHECK-NEXT: -fsave-optimization-record
+! CHECK-NEXT: Generate a YAML optimization record file
+! CHECK-NEXT: -fstack-arrays Attempt to allocate array temporaries on the stack, no matter their size
+! CHECK-NEXT: -fsyntax-only Run the preprocessor, parser and semantic analysis stages
+! CHECK-NEXT: -funderscoring Appends one trailing underscore to external names
+! CHECK-NEXT: -fveclib=<value> Use the given vector functions library
+! CHECK-NEXT: -fversion-loops-for-stride
+! CHECK-NEXT: Create unit-strided versions of loops
+! CHECK-NEXT: -fxor-operator Enable .XOR. as a synonym of .NEQV.
+! CHECK-NEXT: --gcc-install-dir=<value>
+! CHECK-NEXT: Use GCC installation in the specified directory. The directory ends with path components like 'lib{,32,64}/gcc{,-cross}/$triple/$version'. Note: executables (e.g. ld) used by the compiler are not overridden by the selected GCC installation
+! CHECK-NEXT: --gcc-toolchain=<value> Specify a directory where Flang can find 'lib{,32,64}/gcc{,-cross}/$triple/$version'. Flang will use the GCC installation with the largest version
+! CHECK-NEXT: -gline-directives-only Emit debug line info directives only
+! CHECK-NEXT: -gline-tables-only Emit debug line number tables only
+! CHECK-NEXT: -gpulibc Link the LLVM C Library for GPUs
+! CHECK-NEXT: -g Generate source-level debug information
+! CHECK-NEXT: --help-hidden Display help for hidden options
+! CHECK-NEXT: -help Display available options
+! CHECK-NEXT: -isysroot <dir> Set the system root directory (usually /)
+! CHECK-NEXT: -I <dir> Add directory to the end of the list of include search paths
+! CHECK-NEXT: -L <dir> Add directory to library search path
+! CHECK-NEXT: -march=<value> For a list of available architectures for the target use '-mcpu=help'
+! CHECK-NEXT: -mcode-object-version=<value>
+! CHECK-NEXT: Specify code object ABI version. Defaults to 5. (AMDGPU only)
+! CHECK-NEXT: -mcpu=<value> For a list of available CPUs for the target use '-mcpu=help'
+! CHECK-NEXT: -mllvm=<arg> Alias for -mllvm
+! CHECK-NEXT: -mllvm <value> Additional arguments to forward to LLVM's option processing
+! CHECK-NEXT: -mmlir <value> Additional arguments to forward to MLIR's option processing
+! CHECK-NEXT: -mno-outline-atomics Don't generate local calls to out-of-line atomic operations
+! CHECK-NEXT: -module-dir <dir> Put MODULE files in <dir>
+! CHECK-NEXT: -moutline-atomics Generate local calls to out-of-line atomic operations
+! CHECK-NEXT: -mrvv-vector-bits=<value>
+! CHECK-NEXT: Specify the size in bits of an RVV vector register
+! CHECK-NEXT: -msve-vector-bits=<value>
+! CHECK-NEXT: Specify the size in bits of an SVE vector register. Defaults to the vector length agnostic value of "scalable". (AArch64 only)
+! CHECK-NEXT: --no-offload-arch=<value>
+! CHECK-NEXT: Remove CUDA/HIP offloading device architecture (e.g. sm_35, gfx906) from the list of devices to compile for. 'all' resets the list to its default value.
+! CHECK-NEXT: -nocpp Disable predefined and command line preprocessor macros
+! CHECK-NEXT: -nogpulib Do not link device library for CUDA/HIP device compilation
+! CHECK-NEXT: --offload-arch=<value> Specify an offloading device architecture for CUDA, HIP, or OpenMP. (e.g. sm_35). If 'native' is used the compiler will detect locally installed architectures. For HIP offloading, the device architecture can be followed by target ID features delimited by a colon (e.g. gfx908:xnack+:sramecc-). May be specified more than once.
+! CHECK-NEXT: --offload-device-only Only compile for the offloading device.
+! CHECK-NEXT: --offload-host-device Compile for both the offloading host and device (default).
+! CHECK-NEXT: --offload-host-only Only compile for the offloading host.
+! CHECK-NEXT: -o <file> Write output to <file>
+! CHECK-NEXT: -pedantic Warn on language extensions
+! CHECK-NEXT: -print-effective-triple Print the effective target triple
+! CHECK-NEXT: -print-target-triple Print the normalized target triple
+! CHECK-NEXT: -pthread Support POSIX threads in generated code
+! CHECK-NEXT: -P Disable linemarker output in -E mode
+! CHECK-NEXT: -resource-dir <value> The directory which holds the compiler resource files
+! CHECK-NEXT: --rocm-path=<value> ROCm installation path, used for finding and automatically linking required bitcode libraries.
+! CHECK-NEXT: -Rpass-analysis=<value> Report transformation analysis from optimization passes whose name matches the given POSIX regular expression
+! CHECK-NEXT: -Rpass-missed=<value> Report missed transformations by optimization passes whose name matches the given POSIX regular expression
+! CHECK-NEXT: -Rpass=<value> Report transformations performed by optimization passes whose name matches the given POSIX regular expression
+! CHECK-NEXT: -R<remark> Enable the specified remark
+! CHECK-NEXT: -save-temps=<value> Save intermediate compilation results.
+! CHECK-NEXT: -save-temps Alias for --save-temps=cwd
+! CHECK-NEXT: -std=<value> Language standard to compile for
+! CHECK-NEXT: -S Only run preprocess and compilation steps
+! CHECK-NEXT: --target=<value> Generate code for the given target
+! CHECK-NEXT: -U <macro> Undefine macro <macro>
+! CHECK-NEXT: --version Print version information
+! CHECK-NEXT: -v Show commands to run and use verbose output
+! CHECK-NEXT: -Wl,<arg> Pass the comma separated arguments in <arg> to the linker
+! CHECK-NEXT: -W<warning> Enable the specified warning
+! CHECK-NEXT: -Xflang <arg> Pass <arg> to the flang compiler
+! CHECK-NEXT: -x <language> Treat subsequent input files as having type <language>
+
+
+! ERROR-FLANG: error: unknown argument '-help-hidden'; did you mean '--help-hidden'?
+
+! Frontend driver -help-hidden is not supported
+! ERROR-FLANG-FC1: error: unknown argument: '{{.*}}'
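+!
+! Usage note (an inference from the errors checked above, not exhaustively
+! verified): only the double-dash spelling `flang --help-hidden` is accepted;
+! both `flang -help-hidden` and `flang -fc1 -help-hidden` are rejected.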
diff --git a/flang/test/Driver/dynamic-linker.f90 b/flang/test/Driver/dynamic-linker.f90
index 7c3f1b5a53fe..6d5c443ab75c 100644
--- a/flang/test/Driver/dynamic-linker.f90
+++ b/flang/test/Driver/dynamic-linker.f90
@@ -16,7 +16,6 @@
! GNU-LINKER-OPTIONS-SAME: "-shared"
! GNU-LINKER-OPTIONS-SAME: "-static"
! GNU-LINKER-OPTIONS-SAME: "-rpath" "/path/to/dir"
-! GNU-LINKER-OPTIONS-NOT: "-lFortran_main.a"
! RDYNAMIC-LINKER-OPTION: "{{.*}}ld"
! RDYNAMIC-LINKER-OPTION-SAME: "-export-dynamic"
@@ -25,4 +24,3 @@
! MSVC-LINKER-OPTIONS: "{{.*}}link{{(.exe)?}}"
! MSVC-LINKER-OPTIONS-SAME: "-dll"
! MSVC-LINKER-OPTIONS-SAME: "-rpath" "/path/to/dir"
-! MSVC-LINKER-OPTIONS-NOT: "/WHOLEARCHIVE:Fortran_main"
diff --git a/flang/test/Driver/emit-mlir.f90 b/flang/test/Driver/emit-mlir.f90
index 191ee13396ef..83bb8fc1eddc 100644
--- a/flang/test/Driver/emit-mlir.f90
+++ b/flang/test/Driver/emit-mlir.f90
@@ -19,6 +19,16 @@
! CHECK-NEXT: %[[VAL_0:.*]] = fir.zero_bits !fir.ref<tuple<i[[int_size]], !fir.ref<!fir.array<0xtuple<!fir.ref<i8>, !fir.ref<i8>>>>>>
! CHECK-NEXT: fir.has_value %[[VAL_0]] : !fir.ref<tuple<i[[int_size]], !fir.ref<!fir.array<0xtuple<!fir.ref<i8>, !fir.ref<i8>>>>>>
! CHECK-NEXT: }
+! CHECK-NEXT: func.func private @_FortranAProgramStart(i32, !llvm.ptr, !llvm.ptr, !llvm.ptr)
+! CHECK-NEXT: func.func private @_FortranAProgramEndStatement()
+! CHECK-NEXT: func.func @main(%arg0: i32, %arg1: !llvm.ptr, %arg2: !llvm.ptr) -> i32 {
+! CHECK-NEXT: %c0_i32 = arith.constant 0 : i32
+! CHECK-NEXT: %0 = fir.address_of(@_QQEnvironmentDefaults) : !fir.ref<tuple<i32, !fir.ref<!fir.array<0xtuple<!fir.ref<i8>, !fir.ref<i8>>>>>>
+! CHECK-NEXT: fir.call @_FortranAProgramStart(%arg0, %arg1, %arg2, %0) {{.*}} : (i32, !llvm.ptr, !llvm.ptr, !fir.ref<tuple<i32, !fir.ref<!fir.array<0xtuple<!fir.ref<i8>, !fir.ref<i8>>>>>>)
+! CHECK-NEXT: fir.call @_QQmain() fastmath<contract> : () -> ()
+! CHECK-NEXT: fir.call @_FortranAProgramEndStatement() {{.*}} : () -> ()
+! CHECK-NEXT: return %c0_i32 : i32
+! CHECK-NEXT: }
! CHECK-NEXT: }
end program
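
! For context: with the Fortran_main library removed, the driver now emits the
! program entry point itself. A minimal C sketch of what the MLIR checked above
! amounts to (signatures inferred from the func.func declarations in this test;
! treat them as assumptions rather than the exact runtime ABI):
!
!   /* Entry points provided by the Fortran runtime. */
!   void _FortranAProgramStart(int argc, char **argv, char **envp,
!                              const void *env_defaults);
!   void _FortranAProgramEndStatement(void);
!   void _QQmain(void); /* the lowered Fortran PROGRAM body */
!
!   int main(int argc, char **argv, char **envp) {
!     /* The generated code passes the address of @_QQEnvironmentDefaults,
!        not a null pointer. */
!     _FortranAProgramStart(argc, argv, envp, 0);
!     _QQmain();
!     _FortranAProgramEndStatement();
!     return 0;
!   }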
diff --git a/flang/test/Driver/linker-flags.f90 b/flang/test/Driver/linker-flags.f90
index 4d3d528b5e99..02e217494f81 100644
--- a/flang/test/Driver/linker-flags.f90
+++ b/flang/test/Driver/linker-flags.f90
@@ -11,7 +11,6 @@
! RUN: %flang -### --target=x86_64-unknown-dragonfly %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,UNIX,UNIX-F128%f128-lib
! RUN: %flang -### --target=x86_64-unknown-haiku %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,HAIKU,HAIKU-F128%f128-lib
! RUN: %flang -### --target=x86_64-windows-gnu %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,MINGW,MINGW-F128%f128-lib
-! RUN: %flang -### --target=aarch64-unknown-linux-gnu %S/Inputs/hello.f90 -lFortran_main 2>&1 | FileCheck %s --check-prefixes=DEPRECATED
! NOTE: Clang's driver library, clangDriver, usually adds 'oldnames' on Windows,
! but it is not needed when compiling Fortran code and they might bring in
@@ -29,7 +28,6 @@
! executable and may find the GNU linker from MinGW or Cygwin.
! UNIX-LABEL: "{{.*}}ld{{(\.exe)?}}"
! UNIX-SAME: "[[object_file]]"
-! UNIX-SAME: "--whole-archive" "-lFortran_main" "--no-whole-archive"
! UNIX-F128NONE-NOT: FortranFloat128Math
! SOLARIS-F128NONE-NOT: FortranFloat128Math
! UNIX-F128LIBQUADMATH-SAME: "-lFortranFloat128Math" "--as-needed" "-lquadmath" "--no-as-needed"
@@ -38,7 +36,6 @@
! DARWIN-LABEL: "{{.*}}ld{{(\.exe)?}}"
! DARWIN-SAME: "[[object_file]]"
-! DARWIN-SAME: -lFortran_main
! DARWIN-F128NONE-NOT: FortranFloat128Math
! DARWIN-F128LIBQUADMATH-SAME: "-lFortranFloat128Math" "--as-needed" "-lquadmath" "--no-as-needed"
! DARWIN-SAME: -lFortranRuntime
@@ -46,14 +43,12 @@
! HAIKU-LABEL: "{{.*}}ld{{(\.exe)?}}"
! HAIKU-SAME: "[[object_file]]"
-! HAIKU-SAME: "--whole-archive" "-lFortran_main" "--no-whole-archive"
! HAIKU-F128NONE-NOT: FortranFloat128Math
! HAIKU-F128LIBQUADMATH-SAME: "-lFortranFloat128Math" "--as-needed" "-lquadmath" "--no-as-needed"
! HAIKU-SAME: "-lFortranRuntime" "-lFortranDecimal"
! MINGW-LABEL: "{{.*}}ld{{(\.exe)?}}"
! MINGW-SAME: "[[object_file]]"
-! MINGW-SAME: -lFortran_main
! MINGW-F128NONE-NOT: FortranFloat128Math
! MINGW-F128LIBQUADMATH-SAME: "-lFortranFloat128Math" "--as-needed" "-lquadmath" "--no-as-needed"
! MINGW-SAME: -lFortranRuntime
@@ -66,6 +61,3 @@
! MSVC-LABEL: link
! MSVC-SAME: /subsystem:console
! MSVC-SAME: "[[object_file]]"
-
-! Check that we warn when using -lFortran_main
-! DEPRECATED: warning: argument '-lFortran_main' is deprecated, see the Flang driver documentation for correct usage [-Wdeprecated]
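
! With Fortran_main gone, a sketch of the resulting UNIX link step (paths and
! exact ordering are illustrative assumptions): the object file is handed
! straight to ld together with the runtime libraries, e.g.
!   ld ... hello.o -lFortranRuntime -lFortranDecimal ...
! with no "--whole-archive -lFortran_main --no-whole-archive" bracket.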
diff --git a/flang/test/Driver/mlir-debug-pass-pipeline.f90 b/flang/test/Driver/mlir-debug-pass-pipeline.f90
index 28d70bc15264..2c81441e7ec9 100644
--- a/flang/test/Driver/mlir-debug-pass-pipeline.f90
+++ b/flang/test/Driver/mlir-debug-pass-pipeline.f90
@@ -39,7 +39,7 @@ end program
! ALL-NEXT: (S) 0 num-cse'd - Number of operations CSE'd
! ALL-NEXT: (S) 0 num-dce'd - Number of operations DCE'd
-! ALL-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction']
+! ALL-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction', 'omp.private']
! ALL-NEXT: 'fir.global' Pipeline
! ALL-NEXT: CharacterConversion
! ALL-NEXT: 'func.func' Pipeline
@@ -47,6 +47,8 @@ end program
! ALL-NEXT: CharacterConversion
! ALL-NEXT: 'omp.declare_reduction' Pipeline
! ALL-NEXT: CharacterConversion
+! ALL-NEXT: 'omp.private' Pipeline
+! ALL-NEXT: CharacterConversion
! ALL-NEXT: Canonicalizer
! ALL-NEXT: SimplifyRegionLite
@@ -63,7 +65,7 @@ end program
! ALL-NEXT: (S) 0 num-cse'd - Number of operations CSE'd
! ALL-NEXT: (S) 0 num-dce'd - Number of operations DCE'd
-! ALL-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction']
+! ALL-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction', 'omp.private']
! ALL-NEXT: 'fir.global' Pipeline
! ALL-NEXT: CFGConversion
! ALL-NEXT: 'func.func' Pipeline
@@ -71,6 +73,8 @@ end program
! ALL-NEXT: CFGConversion
! ALL-NEXT: 'omp.declare_reduction' Pipeline
! ALL-NEXT: CFGConversion
+! ALL-NEXT: 'omp.private' Pipeline
+! ALL-NEXT: CFGConversion
! ALL-NEXT: SCFToControlFlow
! ALL-NEXT: Canonicalizer
! ALL-NEXT: SimplifyRegionLite
@@ -79,13 +83,15 @@ end program
! ALL-NEXT: (S) 0 num-dce'd - Number of operations DCE'd
! ALL-NEXT: BoxedProcedurePass
-! ALL-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction']
+! ALL-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction', 'omp.private']
! ALL-NEXT: 'fir.global' Pipeline
! ALL-NEXT: AbstractResultOpt
! ALL-NEXT: 'func.func' Pipeline
! ALL-NEXT: AbstractResultOpt
! ALL-NEXT: 'omp.declare_reduction' Pipeline
! ALL-NEXT: AbstractResultOpt
+! ALL-NEXT: 'omp.private' Pipeline
+! ALL-NEXT: AbstractResultOpt
! ALL-NEXT: CodeGenRewrite
! ALL-NEXT: (S) 0 num-dce'd - Number of operations eliminated
diff --git a/flang/test/Driver/mlir-pass-pipeline.f90 b/flang/test/Driver/mlir-pass-pipeline.f90
index 41f3c203e435..320467a2ac2a 100644
--- a/flang/test/Driver/mlir-pass-pipeline.f90
+++ b/flang/test/Driver/mlir-pass-pipeline.f90
@@ -28,7 +28,7 @@ end program
! ALL-NEXT: (S) 0 num-cse'd - Number of operations CSE'd
! ALL-NEXT: (S) 0 num-dce'd - Number of operations DCE'd
-! ALL-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction']
+! ALL-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction', 'omp.private']
! ALL-NEXT: 'fir.global' Pipeline
! ALL-NEXT: CharacterConversion
! ALL-NEXT: 'func.func' Pipeline
@@ -36,6 +36,8 @@ end program
! ALL-NEXT: CharacterConversion
! ALL-NEXT: 'omp.declare_reduction' Pipeline
! ALL-NEXT: CharacterConversion
+! ALL-NEXT: 'omp.private' Pipeline
+! ALL-NEXT: CharacterConversion
! ALL-NEXT: Canonicalizer
! ALL-NEXT: SimplifyRegionLite
@@ -57,7 +59,7 @@ end program
! O2-NEXT: 'func.func' Pipeline
! O2-NEXT: PolymorphicOpConversion
! O2-NEXT: AddAliasTags
-! ALL-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction']
+! ALL-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction', 'omp.private']
! ALL-NEXT: 'fir.global' Pipeline
! ALL-NEXT: CFGConversion
! ALL-NEXT: 'func.func' Pipeline
@@ -65,6 +67,8 @@ end program
! ALL-NEXT: CFGConversion
! ALL-NEXT: 'omp.declare_reduction' Pipeline
! ALL-NEXT: CFGConversion
+! ALL-NEXT: 'omp.private' Pipeline
+! ALL-NEXT: CFGConversion
! ALL-NEXT: SCFToControlFlow
! ALL-NEXT: Canonicalizer
@@ -74,13 +78,15 @@ end program
! ALL-NEXT: (S) 0 num-dce'd - Number of operations DCE'd
! ALL-NEXT: BoxedProcedurePass
-! ALL-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction']
+! ALL-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction', 'omp.private']
! ALL-NEXT: 'fir.global' Pipeline
! ALL-NEXT: AbstractResultOpt
! ALL-NEXT: 'func.func' Pipeline
! ALL-NEXT: AbstractResultOpt
! ALL-NEXT: 'omp.declare_reduction' Pipeline
! ALL-NEXT: AbstractResultOpt
+! ALL-NEXT: 'omp.private' Pipeline
+! ALL-NEXT: AbstractResultOpt
! ALL-NEXT: CodeGenRewrite
! ALL-NEXT: (S) 0 num-dce'd - Number of operations eliminated
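
! Rationale for the new entries (inferred from this diff alone): 'omp.private'
! is, like 'omp.declare_reduction', a module-level OpenMP container op, so it
! needs its own slot in each per-op pipeline collection and must run
! CharacterConversion, CFGConversion and AbstractResultOpt over its regions.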
diff --git a/flang/test/Driver/msvc-dependent-lib-flags.f90 b/flang/test/Driver/msvc-dependent-lib-flags.f90
index 6cfc969e92b2..765917f07d8e 100644
--- a/flang/test/Driver/msvc-dependent-lib-flags.f90
+++ b/flang/test/Driver/msvc-dependent-lib-flags.f90
@@ -7,7 +7,6 @@
! MSVC-SAME: --dependent-lib=clang_rt.builtins.lib
! MSVC-SAME: -D_MT
! MSVC-SAME: --dependent-lib=libcmt
-! MSVC-SAME: --dependent-lib=Fortran_main.static.lib
! MSVC-SAME: --dependent-lib=FortranRuntime.static.lib
! MSVC-SAME: --dependent-lib=FortranDecimal.static.lib
@@ -16,7 +15,6 @@
! MSVC-DEBUG-SAME: -D_MT
! MSVC-DEBUG-SAME: -D_DEBUG
! MSVC-DEBUG-SAME: --dependent-lib=libcmtd
-! MSVC-DEBUG-SAME: --dependent-lib=Fortran_main.static_dbg.lib
! MSVC-DEBUG-SAME: --dependent-lib=FortranRuntime.static_dbg.lib
! MSVC-DEBUG-SAME: --dependent-lib=FortranDecimal.static_dbg.lib
@@ -25,7 +23,6 @@
! MSVC-DLL-SAME: -D_MT
! MSVC-DLL-SAME: -D_DLL
! MSVC-DLL-SAME: --dependent-lib=msvcrt
-! MSVC-DLL-SAME: --dependent-lib=Fortran_main.dynamic.lib
! MSVC-DLL-SAME: --dependent-lib=FortranRuntime.dynamic.lib
! MSVC-DLL-SAME: --dependent-lib=FortranDecimal.dynamic.lib
@@ -35,6 +32,5 @@
! MSVC-DLL-DEBUG-SAME: -D_DEBUG
! MSVC-DLL-DEBUG-SAME: -D_DLL
! MSVC-DLL-DEBUG-SAME: --dependent-lib=msvcrtd
-! MSVC-DLL-DEBUG-SAME: --dependent-lib=Fortran_main.dynamic_dbg.lib
! MSVC-DLL-DEBUG-SAME: --dependent-lib=FortranRuntime.dynamic_dbg.lib
! MSVC-DLL-DEBUG-SAME: --dependent-lib=FortranDecimal.dynamic_dbg.lib
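
! Naming scheme implied by the checks above: each runtime library comes in
! static, static_dbg, dynamic and dynamic_dbg flavors matching the selected
! MSVC CRT (libcmt, libcmtd, msvcrt, msvcrtd); only the Fortran_main variants
! are dropped by this change.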
diff --git a/flang/test/Driver/no-duplicate-main.f90 b/flang/test/Driver/no-duplicate-main.f90
index 88f4430828e0..b0bb6c2a2fef 100644
--- a/flang/test/Driver/no-duplicate-main.f90
+++ b/flang/test/Driver/no-duplicate-main.f90
@@ -4,8 +4,6 @@
! RUN: %flang -o %t -c %s
! RUN: not %flang -o %t.exe %t %t.c-object 2>&1
-! RUN: %flang -fno-fortran-main -o %t.exe %t %t.c-object 2>&1
-
! TODO: potentially extend this test so that the expected
! duplicate-symbol linker error messages are verified via
! FileCheck.
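
! The conflict being exercised, sketched in C (file contents hypothetical):
!   /* the C object: */ int main(void) { return 0; }
! Since flang now emits its own `main` wrapper for the Fortran PROGRAM,
! linking %t (Fortran main) against %t.c-object (C main) defines `main`
! twice, which the link step above must reject.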
diff --git a/flang/test/Fir/basic-program.fir b/flang/test/Fir/basic-program.fir
index 7508963a3d51..d54b0895cc33 100644
--- a/flang/test/Fir/basic-program.fir
+++ b/flang/test/Fir/basic-program.fir
@@ -34,7 +34,7 @@ func.func @_QQmain() {
// PASSES-NEXT: (S) 0 num-cse'd - Number of operations CSE'd
// PASSES-NEXT: (S) 0 num-dce'd - Number of operations DCE'd
-// PASSES-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction']
+// PASSES-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction', 'omp.private']
// PASSES-NEXT: 'fir.global' Pipeline
// PASSES-NEXT: CharacterConversion
// PASSES-NEXT: 'func.func' Pipeline
@@ -42,6 +42,8 @@ func.func @_QQmain() {
// PASSES-NEXT: CharacterConversion
// PASSES-NEXT: 'omp.declare_reduction' Pipeline
// PASSES-NEXT: CharacterConversion
+// PASSES-NEXT: 'omp.private' Pipeline
+// PASSES-NEXT: CharacterConversion
// PASSES-NEXT: Canonicalizer
// PASSES-NEXT: SimplifyRegionLite
@@ -65,13 +67,15 @@ func.func @_QQmain() {
// PASSES-NEXT: AddAliasTags
-// PASSES-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction']
+// PASSES-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction', 'omp.private']
// PASSES-NEXT: 'fir.global' Pipeline
// PASSES-NEXT: CFGConversion
// PASSES-NEXT: 'func.func' Pipeline
// PASSES-NEXT: CFGConversion
// PASSES-NEXT: 'omp.declare_reduction' Pipeline
// PASSES-NEXT: CFGConversion
+// PASSES-NEXT: 'omp.private' Pipeline
+// PASSES-NEXT: CFGConversion
// PASSES-NEXT: SCFToControlFlow
// PASSES-NEXT: Canonicalizer
@@ -81,13 +85,15 @@ func.func @_QQmain() {
// PASSES-NEXT: (S) 0 num-dce'd - Number of operations DCE'd
// PASSES-NEXT: BoxedProcedurePass
-// PASSES-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction']
-// PASSES-NEXT: 'fir.global' Pipeline
+// PASSES-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction', 'omp.private']
+// PASSES-NEXT: 'fir.global' Pipeline
// PASSES-NEXT: AbstractResultOpt
// PASSES-NEXT: 'func.func' Pipeline
// PASSES-NEXT: AbstractResultOpt
// PASSES-NEXT: 'omp.declare_reduction' Pipeline
// PASSES-NEXT: AbstractResultOpt
+// PASSES-NEXT: 'omp.private' Pipeline
+// PASSES-NEXT: AbstractResultOpt
// PASSES-NEXT: CodeGenRewrite
// PASSES-NEXT: (S) 0 num-dce'd - Number of operations eliminated
diff --git a/flang/test/Lower/CUDA/cuda-data-transfer.cuf b/flang/test/Lower/CUDA/cuda-data-transfer.cuf
index 70483685d200..0a2608639bce 100644
--- a/flang/test/Lower/CUDA/cuda-data-transfer.cuf
+++ b/flang/test/Lower/CUDA/cuda-data-transfer.cuf
@@ -141,3 +141,21 @@ end subroutine
! CHECK: fir.cuda_kernel<<<*, *>>>
! CHECK-NOT: fir.cuda_data_transfer
! CHECK: hlfir.assign
+
+attributes(global) subroutine sub5(a)
+ integer, device :: a
+ integer :: i
+ a = i
+end subroutine
+
+! CHECK-LABEL: func.func @_QPsub5
+! CHECK-NOT: fir.cuda_data_transfer
+
+attributes(host,device) subroutine sub6(a)
+ integer, device :: a
+ integer :: i
+ a = i
+end subroutine
+
+! CHECK-LABEL: func.func @_QPsub6
+! CHECK: fir.cuda_data_transfer
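+!
+! Note (inferred from the checks above): sub5 is compiled only as a device
+! kernel, so `a = i` is an ordinary device-side assignment and no implicit
+! host<->device transfer is materialized; sub6 also gets a host version,
+! where assigning to the device-resident dummy requires a
+! fir.cuda_data_transfer.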
diff --git a/flang/test/Lower/OpenMP/FIR/array-bounds.f90 b/flang/test/Lower/OpenMP/FIR/array-bounds.f90
deleted file mode 100644
index c2bb7a94712b..000000000000
--- a/flang/test/Lower/OpenMP/FIR/array-bounds.f90
+++ /dev/null
@@ -1,121 +0,0 @@
-!RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp %s -o - | FileCheck %s --check-prefixes=HOST,ALL
-!RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp -fopenmp-is-target-device %s -o - | FileCheck %s --check-prefixes=DEVICE,ALL
-
-!ALL-LABEL: func.func @_QPread_write_section(
-!ALL: %[[ITER:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFread_write_sectionEi"}
-!ALL: %[[READ:.*]] = fir.address_of(@_QFread_write_sectionEsp_read) : !fir.ref<!fir.array<10xi32>>
-!ALL: %[[C10:.*]] = arith.constant 10 : index
-!ALL: %[[WRITE:.*]] = fir.address_of(@_QFread_write_sectionEsp_write) : !fir.ref<!fir.array<10xi32>>
-!ALL: %[[C10_0:.*]] = arith.constant 10 : index
-!ALL: %[[C1:.*]] = arith.constant 1 : index
-!ALL: %[[C2:.*]] = arith.constant 1 : index
-!ALL: %[[C3:.*]] = arith.constant 4 : index
-!ALL: %[[BOUNDS0:.*]] = omp.map.bounds lower_bound(%[[C2]] : index) upper_bound(%[[C3]] : index) extent(%[[C10]] : index) stride(%[[C1]] : index) start_idx(%[[C1]] : index)
-!ALL: %[[MAP0:.*]] = omp.map.info var_ptr(%[[READ]] : !fir.ref<!fir.array<10xi32>>, !fir.array<10xi32>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS0]]) -> !fir.ref<!fir.array<10xi32>> {name = "sp_read(2:5)"}
-!ALL: %[[C4:.*]] = arith.constant 1 : index
-!ALL: %[[C5:.*]] = arith.constant 1 : index
-!ALL: %[[C6:.*]] = arith.constant 4 : index
-!ALL: %[[BOUNDS1:.*]] = omp.map.bounds lower_bound(%[[C5]] : index) upper_bound(%[[C6]] : index) extent(%[[C10_0]] : index) stride(%[[C4]] : index) start_idx(%[[C4]] : index)
-!ALL: %[[MAP1:.*]] = omp.map.info var_ptr(%[[WRITE]] : !fir.ref<!fir.array<10xi32>>, !fir.array<10xi32>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS1]]) -> !fir.ref<!fir.array<10xi32>> {name = "sp_write(2:5)"}
-!ALL: %[[MAP2:.*]] = omp.map.info var_ptr(%[[ITER]] : !fir.ref<i32>, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !fir.ref<i32> {name = "i"}
-!ALL: omp.target map_entries(%[[MAP0]] -> %{{.*}}, %[[MAP1]] -> %{{.*}}, %[[MAP2]] -> %{{.*}} : !fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>, !fir.ref<i32>) {
-
-subroutine read_write_section()
- integer :: sp_read(10) = (/1,2,3,4,5,6,7,8,9,10/)
- integer :: sp_write(10) = (/0,0,0,0,0,0,0,0,0,0/)
-
-!$omp target map(tofrom:sp_read(2:5)) map(tofrom:sp_write(2:5))
- do i = 2, 5
- sp_write(i) = sp_read(i)
- end do
-!$omp end target
-end subroutine read_write_section
-
-module assumed_array_routines
-contains
-!ALL-LABEL: func.func @_QMassumed_array_routinesPassumed_shape_array(
-!ALL-SAME: %[[ARG0:.*]]: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "arr_read_write"})
-!ALL: %[[INTERMEDIATE_ALLOCA:.*]] = fir.alloca !fir.box<!fir.array<?xi32>>
-!ALL: %[[ALLOCA:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QMassumed_array_routinesFassumed_shape_arrayEi"}
-!ALL: %[[C0:.*]] = arith.constant 1 : index
-!ALL: %[[C1:.*]] = arith.constant 0 : index
-!ALL: %[[DIMS0:.*]]:3 = fir.box_dims %arg0, %[[C1]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
-!ALL: %[[C3:.*]] = arith.constant 1 : index
-!ALL: %[[C4:.*]] = arith.constant 4 : index
-!ALL: %[[C0_1:.*]] = arith.constant 0 : index
-!ALL: %[[DIMS1:.*]]:3 = fir.box_dims %arg0, %[[C0_1]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
-!ALL: %[[BOUNDS:.*]] = omp.map.bounds lower_bound(%[[C3]] : index) upper_bound(%[[C4]] : index) extent(%[[DIMS1]]#1 : index) stride(%[[DIMS0]]#2 : index) start_idx(%[[C0]] : index) {stride_in_bytes = true}
-!ALL: %[[BOXADDRADDR:.*]] = fir.box_offset %0 base_addr : (!fir.ref<!fir.box<!fir.array<?xi32>>>) -> !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>
-!ALL: %[[MAP_MEMBER:.*]] = omp.map.info var_ptr(%0 : !fir.ref<!fir.box<!fir.array<?xi32>>>, !fir.array<?xi32>) var_ptr_ptr(%[[BOXADDRADDR]] : !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>> {name = ""}
-!ALL: %[[MAP:.*]] = omp.map.info var_ptr(%0 : !fir.ref<!fir.box<!fir.array<?xi32>>>, !fir.box<!fir.array<?xi32>>) map_clauses(tofrom) capture(ByRef) members(%[[MAP_MEMBER]] : !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>) -> !fir.ref<!fir.array<?xi32>> {name = "arr_read_write(2:5)"}
-!ALL: %[[MAP2:.*]] = omp.map.info var_ptr(%[[ALLOCA]] : !fir.ref<i32>, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !fir.ref<i32> {name = "i"}
-!ALL: omp.target map_entries(%[[MAP_MEMBER]] -> %{{.*}}, %[[MAP]] -> %{{.*}}, %[[MAP2]] -> %{{.*}} : !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>, !fir.ref<!fir.array<?xi32>>, !fir.ref<i32>) {
- subroutine assumed_shape_array(arr_read_write)
- integer, intent(inout) :: arr_read_write(:)
-
- !$omp target map(tofrom:arr_read_write(2:5))
- do i = 2, 5
- arr_read_write(i) = i
- end do
- !$omp end target
- end subroutine assumed_shape_array
-
-!ALL-LABEL: func.func @_QMassumed_array_routinesPassumed_size_array(
-!ALL-SAME: %[[ARG0:.*]]: !fir.ref<!fir.array<?xi32>> {fir.bindc_name = "arr_read_write"})
-!ALL: %[[ALLOCA:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QMassumed_array_routinesFassumed_size_arrayEi"}
-!ALL: %[[C0:.*]] = arith.constant 1 : index
-!ALL: %[[C1:.*]] = arith.constant 1 : index
-!ALL: %[[C2:.*]] = arith.constant 4 : index
-!ALL: %[[DIFF:.*]] = arith.subi %[[C2]], %[[C1]] : index
-!ALL: %[[EXT:.*]] = arith.addi %[[DIFF]], %[[C0]] : index
-!ALL: %[[BOUNDS:.*]] = omp.map.bounds lower_bound(%[[C1]] : index) upper_bound(%[[C2]] : index) extent(%[[EXT]] : index) stride(%[[C0]] : index) start_idx(%[[C0]] : index)
-!ALL: %[[MAP:.*]] = omp.map.info var_ptr(%[[ARG0]] : !fir.ref<!fir.array<?xi32>>, !fir.array<?xi32>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.ref<!fir.array<?xi32>> {name = "arr_read_write(2:5)"}
-!ALL: %[[MAP2:.*]] = omp.map.info var_ptr(%[[ALLOCA]] : !fir.ref<i32>, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !fir.ref<i32> {name = "i"}
-!ALL: omp.target map_entries(%[[MAP]] -> %{{.*}}, %[[MAP2]] -> %{{.*}} : !fir.ref<!fir.array<?xi32>>, !fir.ref<i32>) {
- subroutine assumed_size_array(arr_read_write)
- integer, intent(inout) :: arr_read_write(*)
-
- !$omp target map(tofrom:arr_read_write(2:5))
- do i = 2, 5
- arr_read_write(i) = i
- end do
- !$omp end target
- end subroutine assumed_size_array
-end module assumed_array_routines
-
-!DEVICE-NOT:func.func @_QPcall_assumed_shape_and_size_array() {
-
-!HOST-LABEL:func.func @_QPcall_assumed_shape_and_size_array() {
-!HOST:%{{.*}} = arith.constant 20 : index
-!HOST:%[[ALLOCA:.*]] = fir.alloca !fir.array<20xi32> {bindc_name = "arr_read_write", uniq_name = "_QFcall_assumed_shape_and_size_arrayEarr_read_write"}
-!HOST:%{{.*}} = arith.constant 1 : i64
-!HOST:%{{.*}} = fir.convert %{{.*}} : (i64) -> index
-!HOST:%{{.*}} = arith.constant 1 : i64
-!HOST:%{{.*}} = fir.convert %{{.*}} : (i64) -> index
-!HOST:%{{.*}} = arith.constant 10 : i64
-!HOST:%{{.*}} = fir.convert %{{.*}} : (i64) -> index
-!HOST:%[[SHAPE0:.*]] = fir.shape %{{.*}} : (index) -> !fir.shape<1>
-!HOST:%[[SLICE0:.*]] = fir.slice %{{.*}}, %{{.*}}, %{{.*}} : (index, index, index) -> !fir.slice<1>
-!HOST:%[[ARG0EMB:.*]] = fir.embox %[[ALLOCA]](%[[SHAPE0]]) [%[[SLICE0]]] : (!fir.ref<!fir.array<20xi32>>, !fir.shape<1>, !fir.slice<1>) -> !fir.box<!fir.array<10xi32>>
-!HOST:%[[ARG0:.*]] = fir.convert %[[ARG0EMB]] : (!fir.box<!fir.array<10xi32>>) -> !fir.box<!fir.array<?xi32>>
-!HOST:fir.call @_QMassumed_array_routinesPassumed_shape_array(%[[ARG0]]) fastmath<contract> : (!fir.box<!fir.array<?xi32>>) -> ()
-!HOST:%{{.*}} = arith.constant 10 : i64
-!HOST:%{{.*}} = fir.convert %{{.*}} : (i64) -> index
-!HOST:%{{.*}} = arith.constant 1 : i64
-!HOST:%{{.*}} = fir.convert %{{.*}} : (i64) -> index
-!HOST:%{{.*}} = arith.constant 20 : i64
-!HOST:%{{.*}} = fir.convert %{{.*}} : (i64) -> index
-!HOST:%[[SHAPE1:.*]] = fir.shape %{{.*}} : (index) -> !fir.shape<1>
-!HOST:%[[SLICE1:.*]] = fir.slice %{{.*}}, %{{.*}}, %{{.*}} : (index, index, index) -> !fir.slice<1>
-!HOST:%[[ARG1EMB:.*]] = fir.embox %[[ALLOCA]](%[[SHAPE1]]) [%[[SLICE1]]] : (!fir.ref<!fir.array<20xi32>>, !fir.shape<1>, !fir.slice<1>) -> !fir.box<!fir.array<11xi32>>
-!HOST:%[[ADDROF:.*]] = fir.box_addr %[[ARG1EMB]] : (!fir.box<!fir.array<11xi32>>) -> !fir.ref<!fir.array<11xi32>>
-!HOST:%[[ARG1:.*]] = fir.convert %[[ADDROF]] : (!fir.ref<!fir.array<11xi32>>) -> !fir.ref<!fir.array<?xi32>>
-!HOST:fir.call @_QMassumed_array_routinesPassumed_size_array(%[[ARG1]]) fastmath<contract> : (!fir.ref<!fir.array<?xi32>>) -> ()
-!HOST:return
-!HOST:}
-subroutine call_assumed_shape_and_size_array
- use assumed_array_routines
- integer :: arr_read_write(20)
- call assumed_shape_array(arr_read_write(1:10))
- call assumed_size_array(arr_read_write(10:20))
-end subroutine call_assumed_shape_and_size_array
diff --git a/flang/test/Lower/OpenMP/FIR/atomic-capture.f90 b/flang/test/Lower/OpenMP/FIR/atomic-capture.f90
deleted file mode 100644
index 9b94214b9da8..000000000000
--- a/flang/test/Lower/OpenMP/FIR/atomic-capture.f90
+++ /dev/null
@@ -1,119 +0,0 @@
-! REQUIRES: openmp_runtime
-
-! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp %s -o - | FileCheck %s
-
-! This test checks the lowering of atomic capture
-
-program OmpAtomicCapture
- use omp_lib
- integer :: x, y
-
-!CHECK: %[[X:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFEx"}
-!CHECK: %[[Y:.*]] = fir.alloca i32 {bindc_name = "y", uniq_name = "_QFEy"}
-!CHECK: %[[temp:.*]] = fir.load %[[X]] : !fir.ref<i32>
-!CHECK: omp.atomic.capture memory_order(release) {
-!CHECK: omp.atomic.read %[[X]] = %[[Y]] : !fir.ref<i32>
-!CHECK: omp.atomic.update %[[Y]] : !fir.ref<i32> {
-!CHECK: ^bb0(%[[ARG:.*]]: i32):
-!CHECK: %[[result:.*]] = arith.addi %[[temp]], %[[ARG]] : i32
-!CHECK: omp.yield(%[[result]] : i32)
-!CHECK: }
-!CHECK: }
-
- !$omp atomic capture release
- x = y
- y = x + y
- !$omp end atomic
-
-
-!CHECK: %[[temp:.*]] = fir.load %[[X]] : !fir.ref<i32>
-!CHECK: omp.atomic.capture hint(uncontended) {
-!CHECK: omp.atomic.update %[[Y]] : !fir.ref<i32> {
-!CHECK: ^bb0(%[[ARG:.*]]: i32):
-!CHECK: %[[result:.*]] = arith.muli %[[temp]], %[[ARG]] : i32
-!CHECK: omp.yield(%[[result]] : i32)
-!CHECK: }
-!CHECK: omp.atomic.read %[[X]] = %[[Y]] : !fir.ref<i32>
-!CHECK: }
-
- !$omp atomic hint(omp_sync_hint_uncontended) capture
- y = x * y
- x = y
- !$omp end atomic
-
-!CHECK: %[[constant_20:.*]] = arith.constant 20 : i32
-!CHECK: %[[constant_8:.*]] = arith.constant 8 : i32
-!CHECK: %[[temp:.*]] = fir.load %[[X]] : !fir.ref<i32>
-!CHECK: %[[result:.*]] = arith.subi %[[constant_8]], %[[temp]] : i32
-!CHECK: %[[result_noreassoc:.*]] = fir.no_reassoc %[[result]] : i32
-!CHECK: %[[result:.*]] = arith.addi %[[constant_20]], %[[result_noreassoc]] : i32
-!CHECK: omp.atomic.capture memory_order(acquire) hint(nonspeculative) {
-!CHECK: omp.atomic.read %[[X]] = %[[Y]] : !fir.ref<i32>
-!CHECK: omp.atomic.write %[[Y]] = %[[result]] : !fir.ref<i32>, i32
-!CHECK: }
-
- !$omp atomic hint(omp_lock_hint_nonspeculative) capture acquire
- x = y
- y = 2 * 10 + (8 - x)
- !$omp end atomic
-
-
-!CHECK: %[[constant_20:.*]] = arith.constant 20 : i32
-!CHECK: %[[constant_8:.*]] = arith.constant 8 : i32
-!CHECK: %[[temp:.*]] = fir.load %[[X]] : !fir.ref<i32>
-!CHECK: %[[result:.*]] = arith.subi %[[constant_8]], %[[temp]] : i32
-!CHECK: %[[result_noreassoc:.*]] = fir.no_reassoc %[[result]] : i32
-!CHECK: %[[result:.*]] = arith.addi %[[constant_20]], %[[result_noreassoc]] : i32
-!CHECK: omp.atomic.capture {
-!CHECK: omp.atomic.read %[[X]] = %[[Y]] : !fir.ref<i32>
-!CHECK: omp.atomic.write %[[Y]] = %[[result]] : !fir.ref<i32>, i32
-!CHECK: }
-
- !$omp atomic capture
- x = y
- y = 2 * 10 + (8 - x)
- !$omp end atomic
-end program
-
-
-
-subroutine pointers_in_atomic_capture()
-!CHECK: %[[A:.*]] = fir.alloca !fir.box<!fir.ptr<i32>> {bindc_name = "a", uniq_name = "_QFpointers_in_atomic_captureEa"}
-!CHECK: {{.*}} = fir.zero_bits !fir.ptr<i32>
-!CHECK: {{.*}} = fir.embox {{.*}} : (!fir.ptr<i32>) -> !fir.box<!fir.ptr<i32>>
-!CHECK: fir.store {{.*}} to %[[A]] : !fir.ref<!fir.box<!fir.ptr<i32>>>
-!CHECK: %[[B:.*]] = fir.alloca !fir.box<!fir.ptr<i32>> {bindc_name = "b", uniq_name = "_QFpointers_in_atomic_captureEb"}
-!CHECK: {{.*}} = fir.zero_bits !fir.ptr<i32>
-!CHECK: {{.*}} = fir.embox {{.*}} : (!fir.ptr<i32>) -> !fir.box<!fir.ptr<i32>>
-!CHECK: fir.store {{.*}} to %[[B]] : !fir.ref<!fir.box<!fir.ptr<i32>>>
-!CHECK: %[[C:.*]] = fir.alloca i32 {bindc_name = "c", fir.target, uniq_name = "_QFpointers_in_atomic_captureEc"}
-!CHECK: %[[D:.*]] = fir.alloca i32 {bindc_name = "d", fir.target, uniq_name = "_QFpointers_in_atomic_captureEd"}
-!CHECK: {{.*}} = fir.embox {{.*}} : (!fir.ref<i32>) -> !fir.box<!fir.ptr<i32>>
-!CHECK: fir.store {{.*}} to %[[A]] : !fir.ref<!fir.box<!fir.ptr<i32>>>
-!CHECK: {{.*}} = fir.embox {{.*}} : (!fir.ref<i32>) -> !fir.box<!fir.ptr<i32>>
-!CHECK: fir.store {{.*}} to %[[B]] : !fir.ref<!fir.box<!fir.ptr<i32>>>
-!CHECK: %[[loaded_A:.*]] = fir.load %[[A]] : !fir.ref<!fir.box<!fir.ptr<i32>>>
-!CHECK: %[[loaded_A_addr:.*]] = fir.box_addr %[[loaded_A]] : (!fir.box<!fir.ptr<i32>>) -> !fir.ptr<i32>
-!CHECK: %[[loaded_B:.*]] = fir.load %[[B]] : !fir.ref<!fir.box<!fir.ptr<i32>>>
-!CHECK: %[[loaded_B_addr:.*]] = fir.box_addr %[[loaded_B]] : (!fir.box<!fir.ptr<i32>>) -> !fir.ptr<i32>
-!CHECK: %[[PRIVATE_LOADED_B:.*]] = fir.load %[[B]] : !fir.ref<!fir.box<!fir.ptr<i32>>>
-!CHECK: %[[PRIVATE_LOADED_B_addr:.*]] = fir.box_addr %[[PRIVATE_LOADED_B]] : (!fir.box<!fir.ptr<i32>>) -> !fir.ptr<i32>
-!CHECK: %[[loaded_value:.*]] = fir.load %[[PRIVATE_LOADED_B_addr]] : !fir.ptr<i32>
-!CHECK: omp.atomic.capture {
-!CHECK: omp.atomic.update %[[loaded_A_addr]] : !fir.ptr<i32> {
-!CHECK: ^bb0(%[[ARG:.*]]: i32):
-!CHECK: %[[result:.*]] = arith.addi %[[ARG]], %[[loaded_value]] : i32
-!CHECK: omp.yield(%[[result]] : i32)
-!CHECK: }
-!CHECK: omp.atomic.read %[[loaded_B_addr]] = %[[loaded_A_addr]] : !fir.ptr<i32>, i32
-!CHECK: }
- integer, pointer :: a, b
- integer, target :: c, d
- a=>c
- b=>d
-
- !$omp atomic capture
- a = a + b
- b = a
- !$omp end atomic
-end subroutine
diff --git a/flang/test/Lower/OpenMP/FIR/atomic-read.f90 b/flang/test/Lower/OpenMP/FIR/atomic-read.f90
deleted file mode 100644
index 7698c3d7490f..000000000000
--- a/flang/test/Lower/OpenMP/FIR/atomic-read.f90
+++ /dev/null
@@ -1,80 +0,0 @@
-! REQUIRES: openmp_runtime
-
-! RUN: bbc --use-desc-for-alloc=false -fopenmp -emit-fir -hlfir=false %s -o - | FileCheck %s
-
-! This test checks the lowering of atomic read
-
-!CHECK: func @_QQmain() attributes {fir.bindc_name = "ompatomic"} {
-!CHECK: %[[VAR_A:.*]] = fir.alloca !fir.char<1> {bindc_name = "a", uniq_name = "_QFEa"}
-!CHECK: %[[VAR_B:.*]] = fir.alloca !fir.char<1> {bindc_name = "b", uniq_name = "_QFEb"}
-!CHECK: %[[VAR_C:.*]] = fir.alloca !fir.logical<4> {bindc_name = "c", uniq_name = "_QFEc"}
-!CHECK: %[[VAR_D:.*]] = fir.alloca !fir.logical<4> {bindc_name = "d", uniq_name = "_QFEd"}
-!CHECK: %[[VAR_E:.*]] = fir.alloca !fir.char<1,8> {bindc_name = "e", uniq_name = "_QFEe"}
-!CHECK: %[[VAR_F:.*]] = fir.alloca !fir.char<1,8> {bindc_name = "f", uniq_name = "_QFEf"}
-!CHECK: %[[VAR_G:.*]] = fir.alloca f32 {bindc_name = "g", uniq_name = "_QFEg"}
-!CHECK: %[[VAR_H:.*]] = fir.alloca f32 {bindc_name = "h", uniq_name = "_QFEh"}
-!CHECK: %[[VAR_X:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFEx"}
-!CHECK: %[[VAR_Y:.*]] = fir.alloca i32 {bindc_name = "y", uniq_name = "_QFEy"}
-!CHECK: omp.atomic.read %[[VAR_X]] = %[[VAR_Y]] memory_order(acquire) hint(uncontended) : !fir.ref<i32>, i32
-!CHECK: omp.atomic.read %[[VAR_A]] = %[[VAR_B]] memory_order(relaxed) : !fir.ref<!fir.char<1>>, !fir.char<1>
-!CHECK: omp.atomic.read %[[VAR_C]] = %[[VAR_D]] memory_order(seq_cst) hint(contended) : !fir.ref<!fir.logical<4>>, !fir.logical<4>
-!CHECK: omp.atomic.read %[[VAR_E]] = %[[VAR_F]] hint(speculative) : !fir.ref<!fir.char<1,8>>, !fir.char<1,8>
-!CHECK: omp.atomic.read %[[VAR_G]] = %[[VAR_H]] hint(nonspeculative) : !fir.ref<f32>, f32
-!CHECK: omp.atomic.read %[[VAR_G]] = %[[VAR_H]] : !fir.ref<f32>, f32
-!CHECK: return
-!CHECK: }
-
-program OmpAtomic
-
- use omp_lib
- integer :: x, y
- character :: a, b
- logical :: c, d
- character(8) :: e, f
- real g, h
- !$omp atomic acquire read hint(omp_sync_hint_uncontended)
- x = y
- !$omp atomic relaxed read hint(omp_sync_hint_none)
- a = b
- !$omp atomic read seq_cst hint(omp_sync_hint_contended)
- c = d
- !$omp atomic read hint(omp_sync_hint_speculative)
- e = f
- !$omp atomic read hint(omp_sync_hint_nonspeculative)
- g = h
- !$omp atomic read
- g = h
-end program OmpAtomic
-
-! Test lowering atomic read for pointer variables.
-! Please notice to use %[[VAL_4]] and %[[VAL_1]] for operands of atomic
-! operation, instead of %[[VAL_3]] and %[[VAL_0]].
-
-!CHECK-LABEL: func.func @_QPatomic_read_pointer() {
-!CHECK: %[[VAL_0:.*]] = fir.alloca !fir.box<!fir.ptr<i32>> {bindc_name = "x", uniq_name = "_QFatomic_read_pointerEx"}
-!CHECK: %[[VAL_1:.*]] = fir.alloca !fir.ptr<i32> {uniq_name = "_QFatomic_read_pointerEx.addr"}
-!CHECK: %[[VAL_2:.*]] = fir.zero_bits !fir.ptr<i32>
-!CHECK: fir.store %[[VAL_2]] to %[[VAL_1]] : !fir.ref<!fir.ptr<i32>>
-!CHECK: %[[VAL_3:.*]] = fir.alloca !fir.box<!fir.ptr<i32>> {bindc_name = "y", uniq_name = "_QFatomic_read_pointerEy"}
-!CHECK: %[[VAL_4:.*]] = fir.alloca !fir.ptr<i32> {uniq_name = "_QFatomic_read_pointerEy.addr"}
-!CHECK: %[[VAL_5:.*]] = fir.zero_bits !fir.ptr<i32>
-!CHECK: fir.store %[[VAL_5]] to %[[VAL_4]] : !fir.ref<!fir.ptr<i32>>
-!CHECK: %[[VAL_6:.*]] = fir.load %[[VAL_1]] : !fir.ref<!fir.ptr<i32>>
-!CHECK: %[[VAL_7:.*]] = fir.load %[[VAL_4]] : !fir.ref<!fir.ptr<i32>>
-!CHECK: omp.atomic.read %[[VAL_7]] = %[[VAL_6]] : !fir.ptr<i32>, i32
-!CHECK: %[[VAL_8:.*]] = fir.load %[[VAL_4]] : !fir.ref<!fir.ptr<i32>>
-!CHECK: %[[VAL_9:.*]] = fir.load %[[VAL_8]] : !fir.ptr<i32>
-!CHECK: %[[VAL_10:.*]] = fir.load %[[VAL_1]] : !fir.ref<!fir.ptr<i32>>
-!CHECK: fir.store %[[VAL_9]] to %[[VAL_10]] : !fir.ptr<i32>
-!CHECK: return
-!CHECK: }
-
-subroutine atomic_read_pointer()
- integer, pointer :: x, y
-
- !$omp atomic read
- y = x
-
- x = y
-end
-
diff --git a/flang/test/Lower/OpenMP/FIR/atomic-update.f90 b/flang/test/Lower/OpenMP/FIR/atomic-update.f90
deleted file mode 100644
index ae201807c337..000000000000
--- a/flang/test/Lower/OpenMP/FIR/atomic-update.f90
+++ /dev/null
@@ -1,141 +0,0 @@
-! REQUIRES: openmp_runtime
-
-! This test checks lowering of atomic and atomic update constructs
-! RUN: bbc --use-desc-for-alloc=false -fopenmp -emit-fir -hlfir=false %s -o - | FileCheck %s
-! RUN: %flang_fc1 -mllvm --use-desc-for-alloc=false -emit-fir -flang-deprecated-no-hlfir -fopenmp %s -o - | FileCheck %s
-
-program OmpAtomicUpdate
- use omp_lib
- integer :: x, y, z
- integer, pointer :: a, b
- integer, target :: c, d
- integer(1) :: i1
-
- a=>c
- b=>d
-
-!CHECK: func.func @_QQmain() attributes {fir.bindc_name = "ompatomicupdate"} {
-!CHECK: %[[A:.*]] = fir.alloca !fir.box<!fir.ptr<i32>> {bindc_name = "a", uniq_name = "_QFEa"}
-!CHECK: %[[A_ADDR:.*]] = fir.alloca !fir.ptr<i32> {uniq_name = "_QFEa.addr"}
-!CHECK: %{{.*}} = fir.zero_bits !fir.ptr<i32>
-!CHECK: fir.store %{{.*}} to %[[A_ADDR]] : !fir.ref<!fir.ptr<i32>>
-!CHECK: %[[B:.*]] = fir.alloca !fir.box<!fir.ptr<i32>> {bindc_name = "b", uniq_name = "_QFEb"}
-!CHECK: %[[B_ADDR:.*]] = fir.alloca !fir.ptr<i32> {uniq_name = "_QFEb.addr"}
-!CHECK: %{{.*}} = fir.zero_bits !fir.ptr<i32>
-!CHECK: fir.store %{{.*}} to %[[B_ADDR]] : !fir.ref<!fir.ptr<i32>>
-!CHECK: %[[C_ADDR:.*]] = fir.address_of(@_QFEc) : !fir.ref<i32>
-!CHECK: %[[D_ADDR:.*]] = fir.address_of(@_QFEd) : !fir.ref<i32>
-!CHECK: %[[I1:.*]] = fir.alloca i8 {bindc_name = "i1", uniq_name = "_QFEi1"}
-!CHECK: %[[X:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFEx"}
-!CHECK: %[[Y:.*]] = fir.alloca i32 {bindc_name = "y", uniq_name = "_QFEy"}
-!CHECK: %[[Z:.*]] = fir.alloca i32 {bindc_name = "z", uniq_name = "_QFEz"}
-!CHECK: %{{.*}} = fir.convert %[[C_ADDR]] : (!fir.ref<i32>) -> !fir.ptr<i32>
-!CHECK: fir.store %{{.*}} to %[[A_ADDR]] : !fir.ref<!fir.ptr<i32>>
-!CHECK: %{{.*}} = fir.convert %[[D_ADDR]] : (!fir.ref<i32>) -> !fir.ptr<i32>
-!CHECK: fir.store {{.*}} to %[[B_ADDR]] : !fir.ref<!fir.ptr<i32>>
-!CHECK: %[[LOADED_A:.*]] = fir.load %[[A_ADDR]] : !fir.ref<!fir.ptr<i32>>
-!CHECK: %[[LOADED_B:.*]] = fir.load %[[B_ADDR]] : !fir.ref<!fir.ptr<i32>>
-!CHECK: %{{.*}} = fir.load %[[LOADED_B]] : !fir.ptr<i32>
-!CHECK: omp.atomic.update %[[LOADED_A]] : !fir.ptr<i32> {
-!CHECK: ^bb0(%[[ARG:.*]]: i32):
-!CHECK: %[[RESULT:.*]] = arith.addi %[[ARG]], %{{.*}} : i32
-!CHECK: omp.yield(%[[RESULT]] : i32)
-!CHECK: }
- !$omp atomic update
- a = a + b
-
-!CHECK: {{.*}} = arith.constant 1 : i32
-!CHECK: omp.atomic.update %[[Y]] : !fir.ref<i32> {
-!CHECK: ^bb0(%[[ARG:.*]]: i32):
-!CHECK: %[[RESULT:.*]] = arith.addi %[[ARG]], {{.*}} : i32
-!CHECK: omp.yield(%[[RESULT]] : i32)
-!CHECK: }
-!CHECK: %[[LOADED_X:.*]] = fir.load %[[X]] : !fir.ref<i32>
-!CHECK: omp.atomic.update %[[Z]] : !fir.ref<i32> {
-!CHECK: ^bb0(%[[ARG:.*]]: i32):
-!CHECK: %[[RESULT:.*]] = arith.muli %[[LOADED_X]], %[[ARG]] : i32
-!CHECK: omp.yield(%[[RESULT]] : i32)
-!CHECK: }
- !$omp atomic
- y = y + 1
- !$omp atomic update
- z = x * z
-
-!CHECK: %{{.*}} = arith.constant 1 : i32
-!CHECK: omp.atomic.update memory_order(relaxed) hint(uncontended) %[[X]] : !fir.ref<i32> {
-!CHECK: ^bb0(%[[ARG:.*]]: i32):
-!CHECK: %[[RESULT:.*]] = arith.subi %[[ARG]], {{.*}} : i32
-!CHECK: omp.yield(%[[RESULT]] : i32)
-!CHECK: }
-!CHECK: %[[LOADED_X:.*]] = fir.load %[[X]] : !fir.ref<i32>
-!CHECK: %[[LOADED_Z:.*]] = fir.load %[[Z]] : !fir.ref<i32>
-!CHECK: omp.atomic.update memory_order(relaxed) %[[Y]] : !fir.ref<i32> {
-!CHECK: ^bb0(%[[ARG:.*]]: i32):
-!CHECK: %{{.*}} = arith.cmpi sgt, %[[ARG]], %[[LOADED_X]] : i32
-!CHECK: %{{.*}} = arith.select %{{.*}}, %[[ARG]], %[[LOADED_X]] : i32
-!CHECK: %{{.*}} = arith.cmpi sgt, %{{.*}}, %[[LOADED_Z]] : i32
-!CHECK: %[[RESULT:.*]] = arith.select %{{.*}}, %{{.*}}, %[[LOADED_Z]] : i32
-!CHECK: omp.yield(%[[RESULT]] : i32)
-!CHECK: }
-!CHECK: %[[LOADED_X:.*]] = fir.load %[[X]] : !fir.ref<i32>
-!CHECK: omp.atomic.update memory_order(relaxed) hint(contended) %[[Z]] : !fir.ref<i32> {
-!CHECK: ^bb0(%[[ARG:.*]]: i32):
-!CHECK: %[[RESULT:.*]] = arith.addi %[[ARG]], %[[LOADED_X]] : i32
-!CHECK: omp.yield(%[[RESULT]] : i32)
-!CHECK: }
- !$omp atomic relaxed update hint(omp_sync_hint_uncontended)
- x = x - 1
- !$omp atomic update relaxed
- y = max(y, x, z)
- !$omp atomic relaxed hint(omp_sync_hint_contended)
- z = z + x
-
-!CHECK: %{{.*}} = arith.constant 10 : i32
-!CHECK: omp.atomic.update memory_order(release) hint(contended) %[[Z]] : !fir.ref<i32> {
-!CHECK: ^bb0(%[[ARG:.*]]: i32):
-!CHECK: %[[RESULT:.*]] = arith.muli {{.*}}, %[[ARG]] : i32
-!CHECK: omp.yield(%[[RESULT]] : i32)
-!CHECK: }
-!CHECK: %[[LOADED_Z:.*]] = fir.load %[[Z]] : !fir.ref<i32>
-!CHECK: omp.atomic.update memory_order(release) hint(speculative) %[[X]] : !fir.ref<i32> {
-!CHECK: ^bb0(%[[ARG:.*]]: i32):
-!CHECK: %[[RESULT:.*]] = arith.divsi %[[ARG]], %[[LOADED_Z]] : i32
-!CHECK: omp.yield(%[[RESULT]] : i32)
-!CHECK: }
-
- !$omp atomic release update hint(omp_lock_hint_contended)
- z = z * 10
- !$omp atomic hint(omp_lock_hint_speculative) update release
- x = x / z
-
-!CHECK: %{{.*}} = arith.constant 10 : i32
-!CHECK: omp.atomic.update memory_order(seq_cst) hint(nonspeculative) %[[Y]] : !fir.ref<i32> {
-!CHECK: ^bb0(%[[ARG:.*]]: i32):
-!CHECK: %[[RESULT:.*]] = arith.addi %{{.*}}, %[[ARG]] : i32
-!CHECK: omp.yield(%[[RESULT]] : i32)
-!CHECK: }
-!CHECK: %[[LOADED_Y:.*]] = fir.load %[[Y]] : !fir.ref<i32>
-!CHECK: omp.atomic.update memory_order(seq_cst) %[[Z]] : !fir.ref<i32> {
-!CHECK: ^bb0(%[[ARG:.*]]: i32):
-!CHECK: %[[RESULT:.*]] = arith.addi %[[LOADED_Y]], %[[ARG]] : i32
-!CHECK: omp.yield(%[[RESULT]] : i32)
-!CHECK: }
- !$omp atomic hint(omp_sync_hint_nonspeculative) seq_cst
- y = 10 + y
- !$omp atomic seq_cst update
- z = y + z
-
-!CHECK: %[[C1_VAL:.*]] = arith.constant 1 : i32
-!CHECK: omp.atomic.update %[[I1]] : !fir.ref<i8> {
-!CHECK: ^bb0(%[[VAL:.*]]: i8):
-!CHECK: %[[CVT_VAL:.*]] = fir.convert %[[VAL]] : (i8) -> i32
-!CHECK: %[[ADD_VAL:.*]] = arith.addi %[[CVT_VAL]], %[[C1_VAL]] : i32
-!CHECK: %[[UPDATED_VAL:.*]] = fir.convert %[[ADD_VAL]] : (i32) -> i8
-!CHECK: omp.yield(%[[UPDATED_VAL]] : i8)
-!CHECK: }
- !$omp atomic
- i1 = i1 + 1
- !$omp end atomic
-!CHECK: return
-!CHECK: }
-end program OmpAtomicUpdate
diff --git a/flang/test/Lower/OpenMP/FIR/atomic-write.f90 b/flang/test/Lower/OpenMP/FIR/atomic-write.f90
deleted file mode 100644
index 142481b7a1d2..000000000000
--- a/flang/test/Lower/OpenMP/FIR/atomic-write.f90
+++ /dev/null
@@ -1,77 +0,0 @@
-! REQUIRES: openmp_runtime
-
-! RUN: bbc --use-desc-for-alloc=false -fopenmp -emit-fir -hlfir=false %s -o - | FileCheck %s
-
-! This test checks the lowering of atomic write
-
-!CHECK: func @_QQmain() attributes {fir.bindc_name = "ompatomicwrite"} {
-!CHECK: %[[VAR_X:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFEx"}
-!CHECK: %[[VAR_Y:.*]] = fir.alloca i32 {bindc_name = "y", uniq_name = "_QFEy"}
-!CHECK: %[[VAR_Z:.*]] = fir.alloca i32 {bindc_name = "z", uniq_name = "_QFEz"}
-!CHECK: %[[CONST_44:.*]] = arith.constant 44 : i32
-!CHECK: omp.atomic.write %[[VAR_X]] = %[[CONST_44]] hint(uncontended) memory_order(seq_cst) : !fir.ref<i32>, i32
-!CHECK: %[[CONST_7:.*]] = arith.constant 7 : i32
-!CHECK: {{.*}} = fir.load %[[VAR_Y]] : !fir.ref<i32>
-!CHECK: %[[VAR_7y:.*]] = arith.muli %[[CONST_7]], {{.*}} : i32
-!CHECK: omp.atomic.write %[[VAR_X]] = %[[VAR_7y]] memory_order(relaxed) : !fir.ref<i32>, i32
-!CHECK: %[[CONST_10:.*]] = arith.constant 10 : i32
-!CHECK: {{.*}} = fir.load %[[VAR_X]] : !fir.ref<i32>
-!CHECK: {{.*}} = arith.muli %[[CONST_10]], {{.*}} : i32
-!CHECK: {{.*}} = fir.load %[[VAR_Z]] : !fir.ref<i32>
-!CHECK: %[[CONST_2:.*]] = arith.constant 2 : i32
-!CHECK: {{.*}} = arith.divsi {{.*}}, %[[CONST_2]] : i32
-!CHECK: {{.*}} = arith.addi {{.*}}, {{.*}} : i32
-!CHECK: omp.atomic.write %[[VAR_Y]] = {{.*}} hint(speculative) memory_order(release) : !fir.ref<i32>, i32
-!CHECK: return
-!CHECK: }
-
-program OmpAtomicWrite
- use omp_lib
- integer :: x, y, z
- !$omp atomic seq_cst write hint(omp_sync_hint_uncontended)
- x = 8*4 + 12
-
- !$omp atomic write relaxed
- x = 7 * y
-
- !$omp atomic write release hint(omp_sync_hint_speculative)
- y = 10*x + z/2
-end program OmpAtomicWrite
-
-! Test lowering atomic read for pointer variables.
-! Please notice to use %[[VAL_1]] for operands of atomic operation, instead
-! of %[[VAL_0]].
-
-!CHECK-LABEL: func.func @_QPatomic_write_pointer() {
-!CHECK: %[[VAL_0:.*]] = fir.alloca !fir.box<!fir.ptr<i32>> {bindc_name = "x", uniq_name = "_QFatomic_write_pointerEx"}
-!CHECK: %[[VAL_1:.*]] = fir.alloca !fir.ptr<i32> {uniq_name = "_QFatomic_write_pointerEx.addr"}
-!CHECK: %[[VAL_2:.*]] = fir.zero_bits !fir.ptr<i32>
-!CHECK: fir.store %[[VAL_2]] to %[[VAL_1]] : !fir.ref<!fir.ptr<i32>>
-!CHECK: %[[VAL_3:.*]] = arith.constant 1 : i32
-!CHECK: %[[VAL_4:.*]] = fir.load %[[VAL_1]] : !fir.ref<!fir.ptr<i32>>
-!CHECK: omp.atomic.write %[[VAL_4]] = %[[VAL_3]] : !fir.ptr<i32>, i32
-!CHECK: %[[VAL_5:.*]] = arith.constant 2 : i32
-!CHECK: %[[VAL_6:.*]] = fir.load %[[VAL_1]] : !fir.ref<!fir.ptr<i32>>
-!CHECK: fir.store %[[VAL_5]] to %[[VAL_6]] : !fir.ptr<i32>
-!CHECK: return
-!CHECK: }
-
-subroutine atomic_write_pointer()
- integer, pointer :: x
-
- !$omp atomic write
- x = 1
-
- x = 2
-end
-
-!CHECK-LABEL: func.func @_QPatomic_write_typed_assign
-!CHECK: %[[VAR:.*]] = fir.alloca f32 {bindc_name = "r2", uniq_name = "{{.*}}r2"}
-!CHECK: %[[CST:.*]] = arith.constant 0.000000e+00 : f32
-!CHECK: omp.atomic.write %[[VAR]] = %[[CST]] : !fir.ref<f32>, f32
-
-subroutine atomic_write_typed_assign
- real :: r2
- !$omp atomic write
- r2 = 0
-end subroutine
diff --git a/flang/test/Lower/OpenMP/FIR/copyin.f90 b/flang/test/Lower/OpenMP/FIR/copyin.f90
deleted file mode 100644
index e256404d3d55..000000000000
--- a/flang/test/Lower/OpenMP/FIR/copyin.f90
+++ /dev/null
@@ -1,358 +0,0 @@
-! This test checks lowering of `COPYIN` clause.
-! RUN: bbc -fopenmp -emit-fir -hlfir=false %s -o - | FileCheck %s
-! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp %s -o - | FileCheck %s
-
-! CHECK-LABEL: func.func @_QPcopyin_scalar_array() {
-! CHECK: %[[VAL_0:.*]] = fir.address_of(@_QFcopyin_scalar_arrayEx1) : !fir.ref<i32>
-! CHECK: %[[VAL_1:.*]] = omp.threadprivate %[[VAL_0]] : !fir.ref<i32> -> !fir.ref<i32>
-! CHECK: %[[VAL_2:.*]] = fir.address_of(@_QFcopyin_scalar_arrayEx2) : !fir.ref<!fir.array<10xi64>>
-! CHECK: %[[VAL_3:.*]] = arith.constant 10 : index
-! CHECK: %[[VAL_4:.*]] = omp.threadprivate %[[VAL_2]] : !fir.ref<!fir.array<10xi64>> -> !fir.ref<!fir.array<10xi64>>
-! CHECK: omp.parallel {
-! CHECK: %[[VAL_5:.*]] = omp.threadprivate %[[VAL_0]] : !fir.ref<i32> -> !fir.ref<i32>
-! CHECK: %[[VAL_6:.*]] = fir.load %[[VAL_1]] : !fir.ref<i32>
-! CHECK: fir.store %[[VAL_6]] to %[[VAL_5]] : !fir.ref<i32>
-! CHECK: %[[VAL_7:.*]] = omp.threadprivate %[[VAL_2]] : !fir.ref<!fir.array<10xi64>> -> !fir.ref<!fir.array<10xi64>>
-! CHECK: %[[VAL_8:.*]] = fir.shape %[[VAL_3]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_9:.*]] = fir.array_load %[[VAL_7]](%[[VAL_8]]) : (!fir.ref<!fir.array<10xi64>>, !fir.shape<1>) -> !fir.array<10xi64>
-! CHECK: %[[VAL_10:.*]] = fir.shape %[[VAL_3]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_11:.*]] = fir.array_load %[[VAL_4]](%[[VAL_10]]) : (!fir.ref<!fir.array<10xi64>>, !fir.shape<1>) -> !fir.array<10xi64>
-! CHECK: %[[VAL_12:.*]] = arith.constant 1 : index
-! CHECK: %[[VAL_13:.*]] = arith.constant 0 : index
-! CHECK: %[[VAL_14:.*]] = arith.subi %[[VAL_3]], %[[VAL_12]] : index
-! CHECK: %[[VAL_15:.*]] = fir.do_loop %[[VAL_16:.*]] = %[[VAL_13]] to %[[VAL_14]] step %[[VAL_12]] unordered iter_args(%[[VAL_17:.*]] = %[[VAL_9]]) -> (!fir.array<10xi64>) {
-! CHECK: %[[VAL_18:.*]] = fir.array_fetch %[[VAL_11]], %[[VAL_16]] : (!fir.array<10xi64>, index) -> i64
-! CHECK: %[[VAL_19:.*]] = fir.array_update %[[VAL_17]], %[[VAL_18]], %[[VAL_16]] : (!fir.array<10xi64>, i64, index) -> !fir.array<10xi64>
-! CHECK: fir.result %[[VAL_19]] : !fir.array<10xi64>
-! CHECK: }
-! CHECK: fir.array_merge_store %[[VAL_9]], %[[VAL_20:.*]] to %[[VAL_7]] : !fir.array<10xi64>, !fir.array<10xi64>, !fir.ref<!fir.array<10xi64>>
-! CHECK: omp.barrier
-! CHECK: fir.call @_QPsub1(%[[VAL_5]], %[[VAL_7]]) {{.*}}: (!fir.ref<i32>, !fir.ref<!fir.array<10xi64>>) -> ()
-! CHECK: omp.terminator
-! CHECK: }
-! CHECK: return
-! CHECK: }
-
-subroutine copyin_scalar_array()
- integer(kind=4), save :: x1
- integer(kind=8), save :: x2(10)
- !$omp threadprivate(x1, x2)
-
- !$omp parallel copyin(x1) copyin(x2)
- call sub1(x1, x2)
- !$omp end parallel
-
-end
-
-! CHECK-LABEL: func.func @_QPcopyin_char_chararray() {
-! CHECK: %[[VAL_0:.*]] = fir.address_of(@_QFcopyin_char_chararrayEx3) : !fir.ref<!fir.char<1,5>>
-! CHECK: %[[VAL_1:.*]] = arith.constant 5 : index
-! CHECK: %[[VAL_2:.*]] = omp.threadprivate %[[VAL_0]] : !fir.ref<!fir.char<1,5>> -> !fir.ref<!fir.char<1,5>>
-! CHECK: %[[VAL_3:.*]] = fir.address_of(@_QFcopyin_char_chararrayEx4) : !fir.ref<!fir.array<10x!fir.char<1,5>>>
-! CHECK: %[[VAL_4:.*]] = arith.constant 5 : index
-! CHECK: %[[VAL_5:.*]] = arith.constant 10 : index
-! CHECK: %[[VAL_6:.*]] = omp.threadprivate %[[VAL_3]] : !fir.ref<!fir.array<10x!fir.char<1,5>>> -> !fir.ref<!fir.array<10x!fir.char<1,5>>>
-! CHECK: omp.parallel {
-! CHECK: %[[VAL_7:.*]] = omp.threadprivate %[[VAL_0]] : !fir.ref<!fir.char<1,5>> -> !fir.ref<!fir.char<1,5>>
-! CHECK: %[[VAL_8:.*]] = arith.constant 1 : i64
-! CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_1]] : (index) -> i64
-! CHECK: %[[VAL_10:.*]] = arith.muli %[[VAL_8]], %[[VAL_9]] : i64
-! CHECK: %[[VAL_11:.*]] = arith.constant false
-! CHECK: %[[VAL_12:.*]] = fir.convert %[[VAL_7]] : (!fir.ref<!fir.char<1,5>>) -> !fir.ref<i8>
-! CHECK: %[[VAL_13:.*]] = fir.convert %[[VAL_2]] : (!fir.ref<!fir.char<1,5>>) -> !fir.ref<i8>
-! CHECK: fir.call @llvm.memmove.p0.p0.i64(%[[VAL_12]], %[[VAL_13]], %[[VAL_10]], %[[VAL_11]]) {{.*}}: (!fir.ref<i8>, !fir.ref<i8>, i64, i1) -> ()
-! CHECK: %[[VAL_14:.*]] = omp.threadprivate %[[VAL_3]] : !fir.ref<!fir.array<10x!fir.char<1,5>>> -> !fir.ref<!fir.array<10x!fir.char<1,5>>>
-! CHECK: %[[VAL_15:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_16:.*]] = fir.array_load %[[VAL_14]](%[[VAL_15]]) : (!fir.ref<!fir.array<10x!fir.char<1,5>>>, !fir.shape<1>) -> !fir.array<10x!fir.char<1,5>>
-! CHECK: %[[VAL_17:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_18:.*]] = fir.array_load %[[VAL_6]](%[[VAL_17]]) : (!fir.ref<!fir.array<10x!fir.char<1,5>>>, !fir.shape<1>) -> !fir.array<10x!fir.char<1,5>>
-! CHECK: %[[VAL_19:.*]] = arith.constant 1 : index
-! CHECK: %[[VAL_20:.*]] = arith.constant 0 : index
-! CHECK: %[[VAL_21:.*]] = arith.subi %[[VAL_5]], %[[VAL_19]] : index
-! CHECK: %[[VAL_22:.*]] = fir.do_loop %[[VAL_23:.*]] = %[[VAL_20]] to %[[VAL_21]] step %[[VAL_19]] unordered iter_args(%[[VAL_24:.*]] = %[[VAL_16]]) -> (!fir.array<10x!fir.char<1,5>>) {
-! CHECK: %[[VAL_25:.*]] = fir.array_access %[[VAL_18]], %[[VAL_23]] : (!fir.array<10x!fir.char<1,5>>, index) -> !fir.ref<!fir.char<1,5>>
-! CHECK: %[[VAL_26:.*]] = fir.array_access %[[VAL_24]], %[[VAL_23]] : (!fir.array<10x!fir.char<1,5>>, index) -> !fir.ref<!fir.char<1,5>>
-! CHECK: %[[VAL_27:.*]] = arith.constant 5 : index
-! CHECK: %[[VAL_28:.*]] = arith.constant 1 : i64
-! CHECK: %[[VAL_29:.*]] = fir.convert %[[VAL_27]] : (index) -> i64
-! CHECK: %[[VAL_30:.*]] = arith.muli %[[VAL_28]], %[[VAL_29]] : i64
-! CHECK: %[[VAL_31:.*]] = arith.constant false
-! CHECK: %[[VAL_32:.*]] = fir.convert %[[VAL_26]] : (!fir.ref<!fir.char<1,5>>) -> !fir.ref<i8>
-! CHECK: %[[VAL_33:.*]] = fir.convert %[[VAL_25]] : (!fir.ref<!fir.char<1,5>>) -> !fir.ref<i8>
-! CHECK: fir.call @llvm.memmove.p0.p0.i64(%[[VAL_32]], %[[VAL_33]], %[[VAL_30]], %[[VAL_31]]) {{.*}}: (!fir.ref<i8>, !fir.ref<i8>, i64, i1) -> ()
-! CHECK: %[[VAL_34:.*]] = fir.array_amend %[[VAL_24]], %[[VAL_26]] : (!fir.array<10x!fir.char<1,5>>, !fir.ref<!fir.char<1,5>>) -> !fir.array<10x!fir.char<1,5>>
-! CHECK: fir.result %[[VAL_34]] : !fir.array<10x!fir.char<1,5>>
-! CHECK: }
-! CHECK: fir.array_merge_store %[[VAL_16]], %[[VAL_35:.*]] to %[[VAL_14]] : !fir.array<10x!fir.char<1,5>>, !fir.array<10x!fir.char<1,5>>, !fir.ref<!fir.array<10x!fir.char<1,5>>>
-! CHECK: omp.barrier
-! CHECK: %[[VAL_37:.*]] = fir.emboxchar %[[VAL_7]], %[[VAL_1]] : (!fir.ref<!fir.char<1,5>>, index) -> !fir.boxchar<1>
-! CHECK: %[[VAL_38:.*]] = fir.convert %[[VAL_14]] : (!fir.ref<!fir.array<10x!fir.char<1,5>>>) -> !fir.ref<!fir.char<1,?>>
-! CHECK: %[[VAL_39:.*]] = fir.emboxchar %[[VAL_38]], %[[VAL_4]] : (!fir.ref<!fir.char<1,?>>, index) -> !fir.boxchar<1>
-! CHECK: fir.call @_QPsub2(%[[VAL_37]], %[[VAL_39]]) {{.*}}: (!fir.boxchar<1>, !fir.boxchar<1>) -> ()
-! CHECK: omp.terminator
-! CHECK: }
-! CHECK: return
-! CHECK: }
-
-subroutine copyin_char_chararray()
- character(5), save :: x3, x4(10)
- !$omp threadprivate(x3, x4)
-
- !$omp parallel copyin(x3) copyin(x4)
- call sub2(x3, x4)
- !$omp end parallel
-
-end
-
-! CHECK-LABEL: func.func @_QPcopyin_derived_type() {
-! CHECK: %[[VAL_0:.*]] = fir.address_of(@_QFcopyin_derived_typeEx5) : !fir.ref<!fir.type<_QFcopyin_derived_typeTmy_type{t_i:i32,t_arr:!fir.array<5xi32>}>>
-! CHECK: %[[VAL_1:.*]] = omp.threadprivate %[[VAL_0]] : !fir.ref<!fir.type<_QFcopyin_derived_typeTmy_type{t_i:i32,t_arr:!fir.array<5xi32>}>> -> !fir.ref<!fir.type<_QFcopyin_derived_typeTmy_type{t_i:i32,t_arr:!fir.array<5xi32>}>>
-! CHECK: omp.parallel {
-! CHECK: %[[VAL_2:.*]] = omp.threadprivate %[[VAL_0]] : !fir.ref<!fir.type<_QFcopyin_derived_typeTmy_type{t_i:i32,t_arr:!fir.array<5xi32>}>> -> !fir.ref<!fir.type<_QFcopyin_derived_typeTmy_type{t_i:i32,t_arr:!fir.array<5xi32>}>>
-! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_1]] : !fir.ref<!fir.type<_QFcopyin_derived_typeTmy_type{t_i:i32,t_arr:!fir.array<5xi32>}>>
-! CHECK: fir.store %[[VAL_3]] to %[[VAL_2]] : !fir.ref<!fir.type<_QFcopyin_derived_typeTmy_type{t_i:i32,t_arr:!fir.array<5xi32>}>>
-! CHECK: omp.barrier
-! CHECK: fir.call @_QPsub3(%[[VAL_2]]) {{.*}}: (!fir.ref<!fir.type<_QFcopyin_derived_typeTmy_type{t_i:i32,t_arr:!fir.array<5xi32>}>>) -> ()
-! CHECK: omp.terminator
-! CHECK: }
-! CHECK: return
-! CHECK: }
-
-subroutine copyin_derived_type()
- type my_type
- integer :: t_i
- integer :: t_arr(5)
- end type my_type
- type(my_type), save :: x5
- !$omp threadprivate(x5)
-
- !$omp parallel copyin(x5)
- call sub3(x5)
- !$omp end parallel
-
-end
-
-! CHECK-LABEL: func.func @_QPcombined_parallel_worksharing_loop() {
-! CHECK: %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFcombined_parallel_worksharing_loopEi"}
-! CHECK: %[[VAL_1:.*]] = fir.address_of(@_QFcombined_parallel_worksharing_loopEx6) : !fir.ref<i32>
-! CHECK: %[[VAL_2:.*]] = omp.threadprivate %[[VAL_1]] : !fir.ref<i32> -> !fir.ref<i32>
-! CHECK: omp.parallel {
-! CHECK: %[[VAL_3:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
-! CHECK: %[[VAL_4:.*]] = omp.threadprivate %[[VAL_1]] : !fir.ref<i32> -> !fir.ref<i32>
-! CHECK: %[[VAL_5:.*]] = fir.load %[[VAL_2]] : !fir.ref<i32>
-! CHECK: fir.store %[[VAL_5]] to %[[VAL_4]] : !fir.ref<i32>
-! CHECK: omp.barrier
-! CHECK: %[[VAL_6:.*]] = arith.constant 1 : i32
-! CHECK: %[[VAL_7:.*]] = fir.load %[[VAL_4]] : !fir.ref<i32>
-! CHECK: %[[VAL_8:.*]] = arith.constant 1 : i32
-! CHECK: omp.wsloop {
-! CHECK-NEXT: omp.loop_nest (%[[VAL_9:.*]]) : i32 = (%[[VAL_6]]) to (%[[VAL_7]]) inclusive step (%[[VAL_8]]) {
-! CHECK: fir.store %[[VAL_9]] to %[[VAL_3]] : !fir.ref<i32>
-! CHECK: fir.call @_QPsub4(%[[VAL_4]]) {{.*}}: (!fir.ref<i32>) -> ()
-! CHECK: omp.yield
-! CHECK: }
-! CHECK: omp.terminator
-! CHECK: }
-! CHECK: omp.terminator
-! CHECK: }
-! CHECK: return
-! CHECK: }
-
-subroutine combined_parallel_worksharing_loop()
- integer, save :: x6
- !$omp threadprivate(x6)
-
- !$omp parallel do copyin(x6)
- do i=1, x6
- call sub4(x6)
- end do
- !$omp end parallel do
-
-end
-
-! CHECK-LABEL: func.func @_QPcombined_parallel_sections() {
-! CHECK: %[[VAL_0:.*]] = fir.address_of(@_QFcombined_parallel_sectionsEx7) : !fir.ref<i32>
-! CHECK: %[[VAL_1:.*]] = omp.threadprivate %[[VAL_0]] : !fir.ref<i32> -> !fir.ref<i32>
-! CHECK: omp.parallel {
-! CHECK: %[[VAL_2:.*]] = omp.threadprivate %[[VAL_0]] : !fir.ref<i32> -> !fir.ref<i32>
-! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_1]] : !fir.ref<i32>
-! CHECK: fir.store %[[VAL_3]] to %[[VAL_2]] : !fir.ref<i32>
-! CHECK: omp.barrier
-! CHECK: omp.sections {
-! CHECK: omp.section {
-! CHECK: fir.call @_QPsub5(%[[VAL_2]]) {{.*}}: (!fir.ref<i32>) -> ()
-! CHECK: omp.terminator
-! CHECK: }
-! CHECK: omp.section {
-! CHECK: fir.call @_QPsub6(%[[VAL_2]]) {{.*}}: (!fir.ref<i32>) -> ()
-! CHECK: omp.terminator
-! CHECK: }
-! CHECK: omp.terminator
-! CHECK: }
-! CHECK: omp.terminator
-! CHECK: }
-! CHECK: return
-! CHECK: }
-
-subroutine combined_parallel_sections()
- integer, save :: x7
- !$omp threadprivate(x7)
-
- !$omp parallel sections copyin(x7)
- !$omp section
- call sub5(x7)
- !$omp section
- call sub6(x7)
- !$omp end parallel sections
-
-end
-
-
-!CHECK: func.func @_QPcommon_1() {
-!CHECK: %[[val_0:.*]] = fir.address_of(@c_) : !fir.ref<!fir.array<4xi8>>
-!CHECK: %[[val_1:.*]] = omp.threadprivate %[[val_0]] : !fir.ref<!fir.array<4xi8>> -> !fir.ref<!fir.array<4xi8>>
-!CHECK: %[[val_2:.*]] = fir.convert %[[val_1]] : (!fir.ref<!fir.array<4xi8>>) -> !fir.ref<!fir.array<?xi8>>
-!CHECK: %[[val_c0:.*]] = arith.constant 0 : index
-!CHECK: %[[val_3:.*]] = fir.coordinate_of %[[val_2]], %[[val_c0]] : (!fir.ref<!fir.array<?xi8>>, index) -> !fir.ref<i8>
-!CHECK: %[[val_4:.*]] = fir.convert %[[val_3]] : (!fir.ref<i8>) -> !fir.ref<i32>
-!CHECK: %[[val_5:.*]] = fir.alloca i32 {bindc_name = "y", uniq_name = "_QFcommon_1Ey"}
-!CHECK: omp.parallel {
-!CHECK: %[[val_6:.*]] = omp.threadprivate %[[val_0]] : !fir.ref<!fir.array<4xi8>> -> !fir.ref<!fir.array<4xi8>>
-!CHECK: %[[val_7:.*]] = fir.convert %[[val_6]] : (!fir.ref<!fir.array<4xi8>>) -> !fir.ref<!fir.array<?xi8>>
-!CHECK: %[[val_c0_0:.*]] = arith.constant 0 : index
-!CHECK: %[[val_8:.*]] = fir.coordinate_of %[[val_7]], %[[val_c0_0]] : (!fir.ref<!fir.array<?xi8>>, index) -> !fir.ref<i8>
-!CHECK: %[[val_9:.*]] = fir.convert %[[val_8]] : (!fir.ref<i8>) -> !fir.ref<i32>
-!CHECK: %[[val_10:.*]] = fir.load %[[val_4]] : !fir.ref<i32>
-!CHECK: fir.store %[[val_10]] to %[[val_9]] : !fir.ref<i32>
-!CHECK: omp.barrier
-!CHECK: omp.sections {
-!CHECK: omp.section {
-!CHECK: %[[val_11:.*]] = fir.load %[[val_9]] : !fir.ref<i32>
-!CHECK: %[[val_c1_i32:.*]] = arith.constant 1 : i32
-!CHECK: %[[val_12:.*]] = arith.addi %[[val_11]], %[[val_c1_i32]] : i32
-!CHECK: fir.store %[[val_12]] to %[[val_5]] : !fir.ref<i32>
-!CHECK: omp.terminator
-!CHECK: }
-!CHECK: omp.section {
-!CHECK: %[[val_11:.*]] = fir.load %[[val_5]] : !fir.ref<i32>
-!CHECK: %[[val_12:.*]] = fir.load %[[val_5]] : !fir.ref<i32>
-!CHECK: %[[val_13:.*]] = arith.muli %[[val_11]], %[[val_12]] : i32
-!CHECK: fir.store %[[val_13]] to %[[val_9]] : !fir.ref<i32>
-!CHECK: omp.terminator
-!CHECK: }
-!CHECK: omp.terminator
-!CHECK: }
-!CHECK: return
-!CHECK: }
-subroutine common_1()
- integer :: x
- integer :: y
- common /c/ x
- !$omp threadprivate(/c/)
-
- !$omp parallel sections copyin(/c/)
- !$omp section
- y = x + 1
- !$omp section
- x = y * y
- !$omp end parallel sections
-end subroutine
-
-!CHECK: func.func @_QPcommon_2() {
-!CHECK: %[[val_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFcommon_2Ei"}
-!CHECK: %[[val_1:.*]] = fir.address_of(@d_) : !fir.ref<!fir.array<8xi8>>
-!CHECK: %[[val_2:.*]] = omp.threadprivate %[[val_1]] : !fir.ref<!fir.array<8xi8>> -> !fir.ref<!fir.array<8xi8>>
-!CHECK: %[[val_3:.*]] = fir.convert %[[val_2]] : (!fir.ref<!fir.array<8xi8>>) -> !fir.ref<!fir.array<?xi8>>
-!CHECK: %[[val_c0:.*]] = arith.constant 0 : index
-!CHECK: %[[val_4:.*]] = fir.coordinate_of %[[val_3]], %[[val_c0]] : (!fir.ref<!fir.array<?xi8>>, index) -> !fir.ref<i8>
-!CHECK: %[[val_5:.*]] = fir.convert %[[val_4]] : (!fir.ref<i8>) -> !fir.ref<i32>
-!CHECK: %[[val_6:.*]] = fir.convert %[[val_2]] : (!fir.ref<!fir.array<8xi8>>) -> !fir.ref<!fir.array<?xi8>>
-!CHECK: %[[val_c4:.*]] = arith.constant 4 : index
-!CHECK: %[[val_7:.*]] = fir.coordinate_of %[[val_6]], %[[val_c4]] : (!fir.ref<!fir.array<?xi8>>, index) -> !fir.ref<i8>
-!CHECK: %[[val_8:.*]] = fir.convert %[[val_7]] : (!fir.ref<i8>) -> !fir.ref<i32>
-!CHECK: omp.parallel {
-!CHECK: %[[val_9:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
-!CHECK: %[[val_10:.*]] = omp.threadprivate %[[val_1]] : !fir.ref<!fir.array<8xi8>> -> !fir.ref<!fir.array<8xi8>>
-!CHECK: %[[val_11:.*]] = fir.convert %[[val_10]] : (!fir.ref<!fir.array<8xi8>>) -> !fir.ref<!fir.array<?xi8>>
-!CHECK: %[[val_c0_0:.*]] = arith.constant 0 : index
-!CHECK: %[[val_12:.*]] = fir.coordinate_of %[[val_11]], %[[val_c0_0]] : (!fir.ref<!fir.array<?xi8>>, index) -> !fir.ref<i8>
-!CHECK: %[[val_13:.*]] = fir.convert %[[val_12]] : (!fir.ref<i8>) -> !fir.ref<i32>
-!CHECK: %[[val_14:.*]] = fir.convert %[[val_10]] : (!fir.ref<!fir.array<8xi8>>) -> !fir.ref<!fir.array<?xi8>>
-!CHECK: %[[val_c4_1:.*]] = arith.constant 4 : index
-!CHECK: %[[val_15:.*]] = fir.coordinate_of %[[val_14]], %[[val_c4_1]] : (!fir.ref<!fir.array<?xi8>>, index) -> !fir.ref<i8>
-!CHECK: %[[val_16:.*]] = fir.convert %[[val_15]] : (!fir.ref<i8>) -> !fir.ref<i32>
-!CHECK: %[[val_17:.*]] = fir.load %[[val_5]] : !fir.ref<i32>
-!CHECK: fir.store %[[val_17]] to %[[val_13]] : !fir.ref<i32>
-!CHECK: %[[val_18:.*]] = fir.load %[[val_8]] : !fir.ref<i32>
-!CHECK: fir.store %[[val_18]] to %[[val_16]] : !fir.ref<i32>
-!CHECK: omp.barrier
-!CHECK: %[[val_c1_i32:.*]] = arith.constant 1 : i32
-!CHECK: %[[val_19:.*]] = fir.load %[[val_13]] : !fir.ref<i32>
-!CHECK: %[[val_c1_i32_2:.*]] = arith.constant 1 : i32
-!CHECK: omp.wsloop {
-!CHECK-NEXT: omp.loop_nest (%[[arg:.*]]) : i32 = (%[[val_c1_i32]]) to (%[[val_19]]) inclusive step (%[[val_c1_i32_2]]) {
-!CHECK: fir.store %[[arg]] to %[[val_9]] : !fir.ref<i32>
-!CHECK: %[[val_20:.*]] = fir.load %[[val_16]] : !fir.ref<i32>
-!CHECK: %[[val_21:.*]] = fir.load %[[val_9]] : !fir.ref<i32>
-!CHECK: %[[val_22:.*]] = arith.addi %[[val_20]], %[[val_21]] : i32
-!CHECK: fir.store %[[val_22]] to %[[val_16]] : !fir.ref<i32>
-!CHECK: omp.yield
-!CHECK: }
-!CHECK: omp.terminator
-!CHECK: }
-!CHECK: omp.terminator
-!CHECK: }
-!CHECK: return
-!CHECK: }
-subroutine common_2()
- integer :: x
- integer :: y
- common /d/ x, y
- !$omp threadprivate(/d/)
-
- !$omp parallel do copyin(/d/)
- do i = 1, x
- y = y + i
- end do
- !$omp end parallel do
-end subroutine
-
-!CHECK: func.func @_QPcommon_3() {
-!CHECK: %[[val_0:.*]] = fir.address_of(@blk_) : !fir.ref<!fir.array<8xi8>>
-!CHECK: %[[val_1:.*]] = omp.threadprivate %[[val_0]] : !fir.ref<!fir.array<8xi8>> -> !fir.ref<!fir.array<8xi8>>
-!CHECK: %[[val_2:.*]] = fir.convert %[[val_1]] : (!fir.ref<!fir.array<8xi8>>) -> !fir.ref<!fir.array<?xi8>>
-!CHECK: %[[val_c4:.*]] = arith.constant 4 : index
-!CHECK: %[[val_3:.*]] = fir.coordinate_of %[[val_2]], %[[val_c4]] : (!fir.ref<!fir.array<?xi8>>, index) -> !fir.ref<i8>
-!CHECK: %[[val_4:.*]] = fir.convert %[[val_3]] : (!fir.ref<i8>) -> !fir.ref<i32>
-!CHECK: omp.parallel {
-!CHECK: %[[val_5:.*]] = omp.threadprivate %[[val_0]] : !fir.ref<!fir.array<8xi8>> -> !fir.ref<!fir.array<8xi8>>
-!CHECK: %[[val_6:.*]] = fir.convert %[[val_5]] : (!fir.ref<!fir.array<8xi8>>) -> !fir.ref<!fir.array<?xi8>>
-!CHECK: %[[val_c4_0:.*]] = arith.constant 4 : index
-!CHECK: %[[val_7:.*]] = fir.coordinate_of %[[val_6]], %[[val_c4_0]] : (!fir.ref<!fir.array<?xi8>>, index) -> !fir.ref<i8>
-!CHECK: %[[val_8:.*]] = fir.convert %[[val_7]] : (!fir.ref<i8>) -> !fir.ref<i32>
-!CHECK: %[[val_9:.*]] = fir.load %[[val_4]] : !fir.ref<i32>
-!CHECK: fir.store %[[val_9]] to %[[val_8]] : !fir.ref<i32>
-!CHECK: omp.barrier
-!CHECK: omp.sections {
-!CHECK: omp.section {
-!CHECK: %[[val_10:.*]] = fir.load %[[val_8]] : !fir.ref<i32>
-!CHECK: %[[val_c3_i32:.*]] = arith.constant 3 : i32
-!CHECK: %[[val_11:.*]] = arith.addi %[[val_10]], %[[val_c3_i32]] : i32
-!CHECK: fir.store %[[val_11]] to %[[val_8]] : !fir.ref<i32>
-!CHECK: omp.terminator
-!CHECK: }
-!CHECK: omp.terminator
-!CHECK: }
-!CHECK: return
-!CHECK: }
-subroutine common_3()
- integer :: x
- integer :: y
- common /blk/ x, y
- !$omp threadprivate (/blk/)
-
- !$omp parallel sections copyin(/blk/)
- !$omp section
- y = y + 3
- !$omp end parallel sections
-end subroutine
diff --git a/flang/test/Lower/OpenMP/FIR/critical.f90 b/flang/test/Lower/OpenMP/FIR/critical.f90
deleted file mode 100644
index fa33fb0fe58b..000000000000
--- a/flang/test/Lower/OpenMP/FIR/critical.f90
+++ /dev/null
@@ -1,38 +0,0 @@
-! REQUIRES: openmp_runtime
-
-!RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp %s -o - | FileCheck %s --check-prefix="OMPDialect"
-!RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp %s -o - | fir-opt --fir-to-llvm-ir | FileCheck %s --check-prefix="OMPDialect"
-!RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp %s -o - | fir-opt --fir-to-llvm-ir | tco | FileCheck %s --check-prefix="LLVMIR"
-
-!OMPDialect: omp.critical.declare @help2
-!OMPDialect: omp.critical.declare @help1 hint(contended)
-
-subroutine omp_critical()
- use omp_lib
- integer :: x, y
-!OMPDialect: omp.critical(@help1)
-!LLVMIR: call void @__kmpc_critical_with_hint({{.*}}, {{.*}}, {{.*}} @{{.*}}help1.var, i32 2)
-!$OMP CRITICAL(help1) HINT(omp_lock_hint_contended)
- x = x + y
-!OMPDialect: omp.terminator
-!LLVMIR: call void @__kmpc_end_critical({{.*}}, {{.*}}, {{.*}} @{{.*}}help1.var)
-!$OMP END CRITICAL(help1)
-
-! Test that the same name can be used again
-! Also test with the zero hint expression
-!OMPDialect: omp.critical(@help2)
-!LLVMIR: call void @__kmpc_critical_with_hint({{.*}}, {{.*}}, {{.*}} @{{.*}}help2.var, i32 0)
-!$OMP CRITICAL(help2) HINT(omp_lock_hint_none)
- x = x - y
-!OMPDialect: omp.terminator
-!LLVMIR: call void @__kmpc_end_critical({{.*}}, {{.*}}, {{.*}} @{{.*}}help2.var)
-!$OMP END CRITICAL(help2)
-
-!OMPDialect: omp.critical
-!LLVMIR: call void @__kmpc_critical({{.*}}, {{.*}}, {{.*}} @{{.*}}_.var)
-!$OMP CRITICAL
- y = x + y
-!OMPDialect: omp.terminator
-!LLVMIR: call void @__kmpc_end_critical({{.*}}, {{.*}}, {{.*}} @{{.*}}_.var)
-!$OMP END CRITICAL
-end subroutine omp_critical
diff --git a/flang/test/Lower/OpenMP/FIR/declare-target-data.f90 b/flang/test/Lower/OpenMP/FIR/declare-target-data.f90
deleted file mode 100644
index bb3bbc8dfa83..000000000000
--- a/flang/test/Lower/OpenMP/FIR/declare-target-data.f90
+++ /dev/null
@@ -1,88 +0,0 @@
-!RUN: %flang_fc1 -emit-fir -fopenmp %s -o - | FileCheck %s
-!RUN: %flang_fc1 -emit-fir -fopenmp -fopenmp-is-target-device %s -o - | FileCheck %s
-
-module test_0
- implicit none
-
-!CHECK-DAG: fir.global @_QMtest_0Edata_int {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (link)>} : i32
-INTEGER :: data_int = 10
-!$omp declare target link(data_int)
-
-!CHECK-DAG: fir.global @_QMtest_0Earray_1d({{.*}}) {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (link)>} : !fir.array<3xi32>
-INTEGER :: array_1d(3) = (/1,2,3/)
-!$omp declare target link(array_1d)
-
-!CHECK-DAG: fir.global @_QMtest_0Earray_2d({{.*}}) {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (link)>} : !fir.array<2x2xi32>
-INTEGER :: array_2d(2,2) = reshape((/1,2,3,4/), (/2,2/))
-!$omp declare target link(array_2d)
-
-!CHECK-DAG: fir.global @_QMtest_0Ept1 {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (link)>} : !fir.box<!fir.ptr<i32>>
-INTEGER, POINTER :: pt1
-!$omp declare target link(pt1)
-
-!CHECK-DAG: fir.global @_QMtest_0Ept2_tar {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (link)>} target : i32
-INTEGER, TARGET :: pt2_tar = 5
-!$omp declare target link(pt2_tar)
-
-!CHECK-DAG: fir.global @_QMtest_0Ept2 {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (link)>} : !fir.box<!fir.ptr<i32>>
-INTEGER, POINTER :: pt2 => pt2_tar
-!$omp declare target link(pt2)
-
-!CHECK-DAG: fir.global @_QMtest_0Edata_int_to {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (to)>} : i32
-INTEGER :: data_int_to = 5
-!$omp declare target to(data_int_to)
-
-!CHECK-DAG: fir.global @_QMtest_0Edata_int_enter {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (enter)>} : i32
-INTEGER :: data_int_enter = 5
-!$omp declare target enter(data_int_enter)
-
-!CHECK-DAG: fir.global @_QMtest_0Edata_int_clauseless {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (to)>} : i32
-INTEGER :: data_int_clauseless = 1
-!$omp declare target(data_int_clauseless)
-
-!CHECK-DAG: fir.global @_QMtest_0Edata_extended_to_1 {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (to)>} : f32
-!CHECK-DAG: fir.global @_QMtest_0Edata_extended_to_2 {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (to)>} : f32
-REAL :: data_extended_to_1 = 2
-REAL :: data_extended_to_2 = 3
-!$omp declare target to(data_extended_to_1, data_extended_to_2)
-
-!CHECK-DAG: fir.global @_QMtest_0Edata_extended_enter_1 {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (enter)>} : f32
-!CHECK-DAG: fir.global @_QMtest_0Edata_extended_enter_2 {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (enter)>} : f32
-REAL :: data_extended_enter_1 = 2
-REAL :: data_extended_enter_2 = 3
-!$omp declare target enter(data_extended_enter_1, data_extended_enter_2)
-
-!CHECK-DAG: fir.global @_QMtest_0Edata_extended_link_1 {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (link)>} : f32
-!CHECK-DAG: fir.global @_QMtest_0Edata_extended_link_2 {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (link)>} : f32
-REAL :: data_extended_link_1 = 2
-REAL :: data_extended_link_2 = 3
-!$omp declare target link(data_extended_link_1, data_extended_link_2)
-
-contains
-end module test_0
-
-PROGRAM commons
- !CHECK-DAG: fir.global @numbers_ {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (to)>} : tuple<f32, f32> {
- REAL :: one = 1
- REAL :: two = 2
- COMMON /numbers/ one, two
- !$omp declare target(/numbers/)
-
- !CHECK-DAG: fir.global @numbers_link_ {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (link)>} : tuple<f32, f32> {
- REAL :: one_link = 1
- REAL :: two_link = 2
- COMMON /numbers_link/ one_link, two_link
- !$omp declare target link(/numbers_link/)
-
- !CHECK-DAG: fir.global @numbers_to_ {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (to)>} : tuple<f32, f32> {
- REAL :: one_to = 1
- REAL :: two_to = 2
- COMMON /numbers_to/ one_to, two_to
- !$omp declare target to(/numbers_to/)
-
- !CHECK-DAG: fir.global @numbers_enter_ {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (enter)>} : tuple<f32, f32> {
- REAL :: one_enter = 1
- REAL :: two_enter = 2
- COMMON /numbers_enter/ one_enter, two_enter
- !$omp declare target enter(/numbers_enter/)
-END
diff --git a/flang/test/Lower/OpenMP/FIR/declare-target-func-and-subr.f90 b/flang/test/Lower/OpenMP/FIR/declare-target-func-and-subr.f90
deleted file mode 100644
index 36d4d7db64e5..000000000000
--- a/flang/test/Lower/OpenMP/FIR/declare-target-func-and-subr.f90
+++ /dev/null
@@ -1,178 +0,0 @@
-!RUN: %flang_fc1 -emit-fir -fopenmp %s -o - | FileCheck %s --check-prefixes ALL,HOST
-!RUN: %flang_fc1 -emit-fir -fopenmp -fopenmp-is-device %s -o - | FileCheck %s --check-prefixes ALL,DEVICE
-
-! Check valid specification forms of declare target with functions,
-! utilising the device_type, to and enter clauses as well as the
-! default zero-clause declare target
-
-! DEVICE-LABEL: func.func @_QPfunc_t_device()
-! DEVICE-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (nohost), capture_clause = (to)>{{.*}}
-FUNCTION FUNC_T_DEVICE() RESULT(I)
-!$omp declare target to(FUNC_T_DEVICE) device_type(nohost)
- INTEGER :: I
- I = 1
-END FUNCTION FUNC_T_DEVICE
-
-! DEVICE-LABEL: func.func @_QPfunc_enter_device()
-! DEVICE-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (nohost), capture_clause = (enter)>{{.*}}
-FUNCTION FUNC_ENTER_DEVICE() RESULT(I)
-!$omp declare target enter(FUNC_ENTER_DEVICE) device_type(nohost)
- INTEGER :: I
- I = 1
-END FUNCTION FUNC_ENTER_DEVICE
-
-! HOST-LABEL: func.func @_QPfunc_t_host()
-! HOST-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (host), capture_clause = (to)>{{.*}}
-FUNCTION FUNC_T_HOST() RESULT(I)
-!$omp declare target to(FUNC_T_HOST) device_type(host)
- INTEGER :: I
- I = 1
-END FUNCTION FUNC_T_HOST
-
-! HOST-LABEL: func.func @_QPfunc_enter_host()
-! HOST-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (host), capture_clause = (enter)>{{.*}}
-FUNCTION FUNC_ENTER_HOST() RESULT(I)
-!$omp declare target enter(FUNC_ENTER_HOST) device_type(host)
- INTEGER :: I
- I = 1
-END FUNCTION FUNC_ENTER_HOST
-
-! ALL-LABEL: func.func @_QPfunc_t_any()
-! ALL-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (to)>{{.*}}
-FUNCTION FUNC_T_ANY() RESULT(I)
-!$omp declare target to(FUNC_T_ANY) device_type(any)
- INTEGER :: I
- I = 1
-END FUNCTION FUNC_T_ANY
-
-! ALL-LABEL: func.func @_QPfunc_enter_any()
-! ALL-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (enter)>{{.*}}
-FUNCTION FUNC_ENTER_ANY() RESULT(I)
-!$omp declare target enter(FUNC_ENTER_ANY) device_type(any)
- INTEGER :: I
- I = 1
-END FUNCTION FUNC_ENTER_ANY
-
-! ALL-LABEL: func.func @_QPfunc_default_t_any()
-! ALL-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (to)>{{.*}}
-FUNCTION FUNC_DEFAULT_T_ANY() RESULT(I)
-!$omp declare target to(FUNC_DEFAULT_T_ANY)
- INTEGER :: I
- I = 1
-END FUNCTION FUNC_DEFAULT_T_ANY
-
-! ALL-LABEL: func.func @_QPfunc_default_enter_any()
-! ALL-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (enter)>{{.*}}
-FUNCTION FUNC_DEFAULT_ENTER_ANY() RESULT(I)
-!$omp declare target enter(FUNC_DEFAULT_ENTER_ANY)
- INTEGER :: I
- I = 1
-END FUNCTION FUNC_DEFAULT_ENTER_ANY
-
-! ALL-LABEL: func.func @_QPfunc_default_any()
-! ALL-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (to)>{{.*}}
-FUNCTION FUNC_DEFAULT_ANY() RESULT(I)
-!$omp declare target
- INTEGER :: I
- I = 1
-END FUNCTION FUNC_DEFAULT_ANY
-
-! ALL-LABEL: func.func @_QPfunc_default_extendedlist()
-! ALL-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (to)>{{.*}}
-FUNCTION FUNC_DEFAULT_EXTENDEDLIST() RESULT(I)
-!$omp declare target(FUNC_DEFAULT_EXTENDEDLIST)
- INTEGER :: I
- I = 1
-END FUNCTION FUNC_DEFAULT_EXTENDEDLIST
-
-!! -----
-
-! Check valid specification forms of declare target with subroutines,
-! utilising the device_type, to and enter clauses as well as the
-! default zero-clause declare target
-
-! DEVICE-LABEL: func.func @_QPsubr_t_device()
-! DEVICE-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (nohost), capture_clause = (to)>{{.*}}
-SUBROUTINE SUBR_T_DEVICE()
-!$omp declare target to(SUBR_T_DEVICE) device_type(nohost)
-END
-
-! DEVICE-LABEL: func.func @_QPsubr_enter_device()
-! DEVICE-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (nohost), capture_clause = (enter)>{{.*}}
-SUBROUTINE SUBR_ENTER_DEVICE()
-!$omp declare target enter(SUBR_ENTER_DEVICE) device_type(nohost)
-END
-
-! HOST-LABEL: func.func @_QPsubr_t_host()
-! HOST-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (host), capture_clause = (to)>{{.*}}
-SUBROUTINE SUBR_T_HOST()
-!$omp declare target to(SUBR_T_HOST) device_type(host)
-END
-
-! HOST-LABEL: func.func @_QPsubr_enter_host()
-! HOST-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (host), capture_clause = (enter)>{{.*}}
-SUBROUTINE SUBR_ENTER_HOST()
-!$omp declare target enter(SUBR_ENTER_HOST) device_type(host)
-END
-
-! ALL-LABEL: func.func @_QPsubr_t_any()
-! ALL-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (to)>{{.*}}
-SUBROUTINE SUBR_T_ANY()
-!$omp declare target to(SUBR_T_ANY) device_type(any)
-END
-
-! ALL-LABEL: func.func @_QPsubr_enter_any()
-! ALL-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (enter)>{{.*}}
-SUBROUTINE SUBR_ENTER_ANY()
-!$omp declare target enter(SUBR_ENTER_ANY) device_type(any)
-END
-
-! ALL-LABEL: func.func @_QPsubr_default_t_any()
-! ALL-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (to)>{{.*}}
-SUBROUTINE SUBR_DEFAULT_T_ANY()
-!$omp declare target to(SUBR_DEFAULT_T_ANY)
-END
-
-! ALL-LABEL: func.func @_QPsubr_default_enter_any()
-! ALL-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (enter)>{{.*}}
-SUBROUTINE SUBR_DEFAULT_ENTER_ANY()
-!$omp declare target enter(SUBR_DEFAULT_ENTER_ANY)
-END
-
-! ALL-LABEL: func.func @_QPsubr_default_any()
-! ALL-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (to)>{{.*}}
-SUBROUTINE SUBR_DEFAULT_ANY()
-!$omp declare target
-END
-
-! ALL-LABEL: func.func @_QPsubr_default_extendedlist()
-! ALL-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (to)>{{.*}}
-SUBROUTINE SUBR_DEFAULT_EXTENDEDLIST()
-!$omp declare target(SUBR_DEFAULT_EXTENDEDLIST)
-END
-
-!! -----
-
-! DEVICE-LABEL: func.func @_QPrecursive_declare_target
-! DEVICE-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (nohost), capture_clause = (to)>{{.*}}
-RECURSIVE FUNCTION RECURSIVE_DECLARE_TARGET(INCREMENT) RESULT(K)
-!$omp declare target to(RECURSIVE_DECLARE_TARGET) device_type(nohost)
- INTEGER :: INCREMENT, K
- IF (INCREMENT == 10) THEN
- K = INCREMENT
- ELSE
- K = RECURSIVE_DECLARE_TARGET(INCREMENT + 1)
- END IF
-END FUNCTION RECURSIVE_DECLARE_TARGET
-
-! DEVICE-LABEL: func.func @_QPrecursive_declare_target_enter
-! DEVICE-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (nohost), capture_clause = (enter)>{{.*}}
-RECURSIVE FUNCTION RECURSIVE_DECLARE_TARGET_ENTER(INCREMENT) RESULT(K)
-!$omp declare target enter(RECURSIVE_DECLARE_TARGET_ENTER) device_type(nohost)
- INTEGER :: INCREMENT, K
- IF (INCREMENT == 10) THEN
- K = INCREMENT
- ELSE
- K = RECURSIVE_DECLARE_TARGET_ENTER(INCREMENT + 1)
- END IF
-END FUNCTION RECURSIVE_DECLARE_TARGET_ENTER
diff --git a/flang/test/Lower/OpenMP/FIR/declare-target-implicit-func-and-subr-cap-enter.f90 b/flang/test/Lower/OpenMP/FIR/declare-target-implicit-func-and-subr-cap-enter.f90
deleted file mode 100644
index 8e88d1b0f52a..000000000000
--- a/flang/test/Lower/OpenMP/FIR/declare-target-implicit-func-and-subr-cap-enter.f90
+++ /dev/null
@@ -1,192 +0,0 @@
-!RUN: %flang_fc1 -emit-fir -fopenmp %s -o - | FileCheck %s
-!RUN: %flang_fc1 -emit-fir -fopenmp -fopenmp-is-target-device %s -o - | FileCheck %s --check-prefix=DEVICE
-!RUN: bbc -emit-fir -fopenmp %s -o - | FileCheck %s
-!RUN: bbc -emit-fir -fopenmp -fopenmp-is-target-device %s -o - | FileCheck %s --check-prefix=DEVICE
-
-! CHECK-LABEL: func.func @_QPimplicitly_captured_twice
-! CHECK-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (enter)>{{.*}}}
-function implicitly_captured_twice() result(k)
- integer :: i
- i = 10
- k = i
-end function implicitly_captured_twice
-
-! CHECK-LABEL: func.func @_QPtarget_function_twice_host
-! CHECK-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (host), capture_clause = (enter)>{{.*}}}
-function target_function_twice_host() result(i)
-!$omp declare target enter(target_function_twice_host) device_type(host)
- integer :: i
- i = implicitly_captured_twice()
-end function target_function_twice_host
-
-! DEVICE-LABEL: func.func @_QPtarget_function_twice_device
-! DEVICE-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (nohost), capture_clause = (enter)>{{.*}}}
-function target_function_twice_device() result(i)
-!$omp declare target enter(target_function_twice_device) device_type(nohost)
- integer :: i
- i = implicitly_captured_twice()
-end function target_function_twice_device
-
-!! -----
-
-! DEVICE-LABEL: func.func @_QPimplicitly_captured_nest
-! DEVICE-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (nohost), capture_clause = (enter)>{{.*}}}
-function implicitly_captured_nest() result(k)
- integer :: i
- i = 10
- k = i
-end function implicitly_captured_nest
-
-! DEVICE-LABEL: func.func @_QPimplicitly_captured_one
-! DEVICE-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (nohost), capture_clause = (enter)>{{.*}}}
-function implicitly_captured_one() result(k)
- k = implicitly_captured_nest()
-end function implicitly_captured_one
-
-! DEVICE-LABEL: func.func @_QPimplicitly_captured_two
-! DEVICE-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (nohost), capture_clause = (enter)>{{.*}}}
-function implicitly_captured_two() result(k)
- integer :: i
- i = 10
- k = i
-end function implicitly_captured_two
-
-! DEVICE-LABEL: func.func @_QPtarget_function_test
-! DEVICE-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (nohost), capture_clause = (enter)>{{.*}}}
-function target_function_test() result(j)
-!$omp declare target enter(target_function_test) device_type(nohost)
- integer :: i, j
- i = implicitly_captured_one()
- j = implicitly_captured_two() + i
-end function target_function_test
-
-!! -----
-
-! CHECK-LABEL: func.func @_QPimplicitly_captured_nest_twice
-! CHECK-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (enter)>{{.*}}}
-function implicitly_captured_nest_twice() result(k)
- integer :: i
- i = 10
- k = i
-end function implicitly_captured_nest_twice
-
-! CHECK-LABEL: func.func @_QPimplicitly_captured_one_twice
-! CHECK-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (enter)>{{.*}}}
-function implicitly_captured_one_twice() result(k)
- k = implicitly_captured_nest_twice()
-end function implicitly_captured_one_twice
-
-! CHECK-LABEL: func.func @_QPimplicitly_captured_two_twice
-! CHECK-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (enter)>{{.*}}}
-function implicitly_captured_two_twice() result(k)
- integer :: i
- i = 10
- k = i
-end function implicitly_captured_two_twice
-
-! DEVICE-LABEL: func.func @_QPtarget_function_test_device
-! DEVICE-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (nohost), capture_clause = (enter)>{{.*}}}
-function target_function_test_device() result(j)
- !$omp declare target enter(target_function_test_device) device_type(nohost)
- integer :: i, j
- i = implicitly_captured_one_twice()
- j = implicitly_captured_two_twice() + i
-end function target_function_test_device
-
-! CHECK-LABEL: func.func @_QPtarget_function_test_host
-! CHECK-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (host), capture_clause = (enter)>{{.*}}}
-function target_function_test_host() result(j)
- !$omp declare target enter(target_function_test_host) device_type(host)
- integer :: i, j
- i = implicitly_captured_one_twice()
- j = implicitly_captured_two_twice() + i
-end function target_function_test_host
-
-!! -----
-
-! DEVICE-LABEL: func.func @_QPimplicitly_captured_with_dev_type_recursive
-! DEVICE-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (enter)>{{.*}}}
-recursive function implicitly_captured_with_dev_type_recursive(increment) result(k)
-!$omp declare target enter(implicitly_captured_with_dev_type_recursive) device_type(host)
- integer :: increment, k
- if (increment == 10) then
- k = increment
- else
- k = implicitly_captured_with_dev_type_recursive(increment + 1)
- end if
-end function implicitly_captured_with_dev_type_recursive
-
-! DEVICE-LABEL: func.func @_QPtarget_function_with_dev_type_recurse
-! DEVICE-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (nohost), capture_clause = (enter)>{{.*}}}
-function target_function_with_dev_type_recurse() result(i)
-!$omp declare target enter(target_function_with_dev_type_recurse) device_type(nohost)
- integer :: i
- i = implicitly_captured_with_dev_type_recursive(0)
-end function target_function_with_dev_type_recurse
-
-!! -----
-
-module test_module
-contains
-! CHECK-LABEL: func.func @_QMtest_modulePimplicitly_captured_nest_twice
-! CHECK-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (enter)>{{.*}}}
- function implicitly_captured_nest_twice() result(i)
- integer :: i
- i = 10
- end function implicitly_captured_nest_twice
-
-! CHECK-LABEL: func.func @_QMtest_modulePimplicitly_captured_one_twice
-! CHECK-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (enter)>{{.*}}}
- function implicitly_captured_one_twice() result(k)
- !$omp declare target enter(implicitly_captured_one_twice) device_type(host)
- k = implicitly_captured_nest_twice()
- end function implicitly_captured_one_twice
-
-! DEVICE-LABEL: func.func @_QMtest_modulePimplicitly_captured_two_twice
-! DEVICE-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (nohost), capture_clause = (enter)>{{.*}}}
- function implicitly_captured_two_twice() result(y)
- integer :: y
- y = 5
- end function implicitly_captured_two_twice
-
-! DEVICE-LABEL: func.func @_QMtest_modulePtarget_function_test_device
-! DEVICE-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (nohost), capture_clause = (enter)>{{.*}}}
- function target_function_test_device() result(j)
- !$omp declare target enter(target_function_test_device) device_type(nohost)
- integer :: i, j
- i = implicitly_captured_one_twice()
- j = implicitly_captured_two_twice() + i
- end function target_function_test_device
-end module test_module
-
-!! -----
-
-program mb
- interface
- subroutine caller_recursive
- !$omp declare target enter(caller_recursive) device_type(nohost)
- end subroutine
-
- recursive subroutine implicitly_captured_recursive(increment)
- integer :: increment
- end subroutine
- end interface
-end program
-
-! DEVICE-LABEL: func.func @_QPimplicitly_captured_recursive
-! DEVICE-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (nohost), capture_clause = (enter)>{{.*}}}
-recursive subroutine implicitly_captured_recursive(increment)
- integer :: increment
- if (increment == 10) then
- return
- else
- call implicitly_captured_recursive(increment + 1)
- end if
-end subroutine
-
-! DEVICE-LABEL: func.func @_QPcaller_recursive
-! DEVICE-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (nohost), capture_clause = (enter)>{{.*}}}
-subroutine caller_recursive
-!$omp declare target enter(caller_recursive) device_type(nohost)
- call implicitly_captured_recursive(0)
-end subroutine
diff --git a/flang/test/Lower/OpenMP/FIR/declare-target-implicit-func-and-subr-cap.f90 b/flang/test/Lower/OpenMP/FIR/declare-target-implicit-func-and-subr-cap.f90
deleted file mode 100644
index a90b04246e6d..000000000000
--- a/flang/test/Lower/OpenMP/FIR/declare-target-implicit-func-and-subr-cap.f90
+++ /dev/null
@@ -1,218 +0,0 @@
-!RUN: %flang_fc1 -emit-fir -fopenmp %s -o - | FileCheck %s
-!RUN: %flang_fc1 -emit-fir -fopenmp -fopenmp-is-target-device %s -o - | FileCheck %s --check-prefix=DEVICE
-!RUN: bbc -emit-fir -fopenmp %s -o - | FileCheck %s
-!RUN: bbc -emit-fir -fopenmp -fopenmp-is-target-device %s -o - | FileCheck %s --check-prefix=DEVICE
-
-! CHECK-LABEL: func.func @_QPimplicitly_captured
-! CHECK-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (to)>{{.*}}}
-function implicitly_captured(toggle) result(k)
- integer :: i, j, k
- logical :: toggle
- i = 10
- j = 5
- if (toggle) then
- k = i
- else
- k = j
- end if
-end function implicitly_captured
-
-
-! CHECK-LABEL: func.func @_QPtarget_function
-! CHECK-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (to)>{{.*}}}
-function target_function(toggle) result(i)
-!$omp declare target
- integer :: i
- logical :: toggle
- i = implicitly_captured(toggle)
-end function target_function
-
-!! -----
-
-! CHECK-LABEL: func.func @_QPimplicitly_captured_twice
-! CHECK-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (to)>{{.*}}}
-function implicitly_captured_twice() result(k)
- integer :: i
- i = 10
- k = i
-end function implicitly_captured_twice
-
-! CHECK-LABEL: func.func @_QPtarget_function_twice_host
-! CHECK-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (host), capture_clause = (to)>{{.*}}}
-function target_function_twice_host() result(i)
-!$omp declare target to(target_function_twice_host) device_type(host)
- integer :: i
- i = implicitly_captured_twice()
-end function target_function_twice_host
-
-! DEVICE-LABEL: func.func @_QPtarget_function_twice_device
-! DEVICE-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (nohost), capture_clause = (to)>{{.*}}}
-function target_function_twice_device() result(i)
-!$omp declare target to(target_function_twice_device) device_type(nohost)
- integer :: i
- i = implicitly_captured_twice()
-end function target_function_twice_device
-
-!! -----
-
-! DEVICE-LABEL: func.func @_QPimplicitly_captured_nest
-! DEVICE-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (nohost), capture_clause = (to)>{{.*}}}
-function implicitly_captured_nest() result(k)
- integer :: i
- i = 10
- k = i
-end function implicitly_captured_nest
-
-! DEVICE-LABEL: func.func @_QPimplicitly_captured_one
-! DEVICE-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (nohost), capture_clause = (to)>{{.*}}}
-function implicitly_captured_one() result(k)
- k = implicitly_captured_nest()
-end function implicitly_captured_one
-
-! DEVICE-LABEL: func.func @_QPimplicitly_captured_two
-! DEVICE-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (nohost), capture_clause = (to)>{{.*}}}
-function implicitly_captured_two() result(k)
- integer :: i
- i = 10
- k = i
-end function implicitly_captured_two
-
-! DEVICE-LABEL: func.func @_QPtarget_function_test
-! DEVICE-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (nohost), capture_clause = (to)>{{.*}}}
-function target_function_test() result(j)
-!$omp declare target to(target_function_test) device_type(nohost)
- integer :: i, j
- i = implicitly_captured_one()
- j = implicitly_captured_two() + i
-end function target_function_test
-
-!! -----
-
-! CHECK-LABEL: func.func @_QPimplicitly_captured_nest_twice
-! CHECK-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (to)>{{.*}}}
-function implicitly_captured_nest_twice() result(k)
- integer :: i
- i = 10
- k = i
-end function implicitly_captured_nest_twice
-
-! CHECK-LABEL: func.func @_QPimplicitly_captured_one_twice
-! CHECK-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (to)>{{.*}}}
-function implicitly_captured_one_twice() result(k)
- k = implicitly_captured_nest_twice()
-end function implicitly_captured_one_twice
-
-! CHECK-LABEL: func.func @_QPimplicitly_captured_two_twice
-! CHECK-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (to)>{{.*}}}
-function implicitly_captured_two_twice() result(k)
- integer :: i
- i = 10
- k = i
-end function implicitly_captured_two_twice
-
-! DEVICE-LABEL: func.func @_QPtarget_function_test_device
-! DEVICE-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (nohost), capture_clause = (to)>{{.*}}}
-function target_function_test_device() result(j)
- !$omp declare target to(target_function_test_device) device_type(nohost)
- integer :: i, j
- i = implicitly_captured_one_twice()
- j = implicitly_captured_two_twice() + i
-end function target_function_test_device
-
-! CHECK-LABEL: func.func @_QPtarget_function_test_host
-! CHECK-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (host), capture_clause = (to)>{{.*}}}
-function target_function_test_host() result(j)
- !$omp declare target to(target_function_test_host) device_type(host)
- integer :: i, j
- i = implicitly_captured_one_twice()
- j = implicitly_captured_two_twice() + i
-end function target_function_test_host
-
-!! -----
-
-! DEVICE-LABEL: func.func @_QPimplicitly_captured_with_dev_type_recursive
-! DEVICE-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (to)>{{.*}}}
-recursive function implicitly_captured_with_dev_type_recursive(increment) result(k)
-!$omp declare target to(implicitly_captured_with_dev_type_recursive) device_type(host)
- integer :: increment, k
- if (increment == 10) then
- k = increment
- else
- k = implicitly_captured_with_dev_type_recursive(increment + 1)
- end if
-end function implicitly_captured_with_dev_type_recursive
-
-! DEVICE-LABEL: func.func @_QPtarget_function_with_dev_type_recurse
-! DEVICE-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (nohost), capture_clause = (to)>{{.*}}}
-function target_function_with_dev_type_recurse() result(i)
-!$omp declare target to(target_function_with_dev_type_recurse) device_type(nohost)
- integer :: i
- i = implicitly_captured_with_dev_type_recursive(0)
-end function target_function_with_dev_type_recurse
-
-!! -----
-
-module test_module
-contains
-! CHECK-LABEL: func.func @_QMtest_modulePimplicitly_captured_nest_twice
-! CHECK-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (to)>{{.*}}}
- function implicitly_captured_nest_twice() result(i)
- integer :: i
- i = 10
- end function implicitly_captured_nest_twice
-
-! CHECK-LABEL: func.func @_QMtest_modulePimplicitly_captured_one_twice
-! CHECK-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (to)>{{.*}}}
- function implicitly_captured_one_twice() result(k)
- !$omp declare target to(implicitly_captured_one_twice) device_type(host)
- k = implicitly_captured_nest_twice()
- end function implicitly_captured_one_twice
-
-! DEVICE-LABEL: func.func @_QMtest_modulePimplicitly_captured_two_twice
-! DEVICE-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (nohost), capture_clause = (to)>{{.*}}}
- function implicitly_captured_two_twice() result(y)
- integer :: y
- y = 5
- end function implicitly_captured_two_twice
-
-! DEVICE-LABEL: func.func @_QMtest_modulePtarget_function_test_device
-! DEVICE-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (nohost), capture_clause = (to)>{{.*}}}
- function target_function_test_device() result(j)
- !$omp declare target to(target_function_test_device) device_type(nohost)
- integer :: i, j
- i = implicitly_captured_one_twice()
- j = implicitly_captured_two_twice() + i
- end function target_function_test_device
-end module test_module
-
-!! -----
-
-program mb
- interface
- subroutine caller_recursive
- !$omp declare target to(caller_recursive) device_type(nohost)
- end subroutine
-
- recursive subroutine implicitly_captured_recursive(increment)
- integer :: increment
- end subroutine
- end interface
-end program
-
-! DEVICE-LABEL: func.func @_QPimplicitly_captured_recursive
-! DEVICE-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (nohost), capture_clause = (to)>{{.*}}}
-recursive subroutine implicitly_captured_recursive(increment)
- integer :: increment
- if (increment == 10) then
- return
- else
- call implicitly_captured_recursive(increment + 1)
- end if
-end subroutine
-
-! DEVICE-LABEL: func.func @_QPcaller_recursive
-! DEVICE-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget<device_type = (nohost), capture_clause = (to)>{{.*}}}
-subroutine caller_recursive
-!$omp declare target to(caller_recursive) device_type(nohost)
- call implicitly_captured_recursive(0)
-end subroutine
diff --git a/flang/test/Lower/OpenMP/FIR/default-clause.f90 b/flang/test/Lower/OpenMP/FIR/default-clause.f90
deleted file mode 100644
index 14c0d375896a..000000000000
--- a/flang/test/Lower/OpenMP/FIR/default-clause.f90
+++ /dev/null
@@ -1,281 +0,0 @@
-! This test checks lowering of the OpenMP parallel directive
-! with the `DEFAULT` clause present.
-
-! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp %s -o - | FileCheck %s
-! RUN: bbc -fopenmp -emit-fir -hlfir=false %s -o - | FileCheck %s
-
-
-!CHECK: func @_QQmain() attributes {fir.bindc_name = "default_clause_lowering"} {
-!CHECK: %[[W:.*]] = fir.alloca i32 {bindc_name = "w", uniq_name = "_QFEw"}
-!CHECK: %[[X:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFEx"}
-!CHECK: %[[Y:.*]] = fir.alloca i32 {bindc_name = "y", uniq_name = "_QFEy"}
-!CHECK: %[[Z:.*]] = fir.alloca i32 {bindc_name = "z", uniq_name = "_QFEz"}
-!CHECK: omp.parallel {
-!CHECK: %[[PRIVATE_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFEx"}
-!CHECK: %[[const:.*]] = fir.load %[[X]] : !fir.ref<i32>
-!CHECK: fir.store %[[const]] to %[[PRIVATE_X]] : !fir.ref<i32>
-!CHECK: %[[PRIVATE_Y:.*]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFEy"}
-!CHECK: %[[PRIVATE_W:.*]] = fir.alloca i32 {bindc_name = "w", pinned, uniq_name = "_QFEw"}
-!CHECK: %[[const:.*]] = arith.constant 2 : i32
-!CHECK: %[[temp:.*]] = fir.load %[[PRIVATE_Y]] : !fir.ref<i32>
-!CHECK: %[[result:.*]] = arith.muli %[[const]], %[[temp]] : i32
-!CHECK: fir.store %[[result]] to %[[PRIVATE_X]] : !fir.ref<i32>
-!CHECK: %[[temp:.*]] = fir.load %[[PRIVATE_W]] : !fir.ref<i32>
-!CHECK: %[[const:.*]] = arith.constant 45 : i32
-!CHECK: %[[result:.*]] = arith.addi %[[temp]], %[[const]] : i32
-!CHECK: fir.store %[[result]] to %[[Z]] : !fir.ref<i32>
-!CHECK: omp.terminator
-!CHECK: }
-
-program default_clause_lowering
- integer :: x, y, z, w
-
- !$omp parallel default(private) firstprivate(x) shared(z)
- x = y * 2
- z = w + 45
- !$omp end parallel
-
-!CHECK: omp.parallel {
-!CHECK: %[[temp:.*]] = fir.load %[[Y]] : !fir.ref<i32>
-!CHECK: fir.store %[[temp]] to %[[X]] : !fir.ref<i32>
-!CHECK: omp.terminator
-!CHECK: }
-
- !$omp parallel default(shared)
- x = y
- !$omp end parallel
-
-!CHECK: omp.parallel {
-!CHECK: %[[PRIVATE_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFEx"}
-!CHECK: %[[PRIVATE_Y:.*]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFEy"}
-!CHECK: %[[temp:.*]] = fir.load %[[PRIVATE_Y]] : !fir.ref<i32>
-!CHECK: fir.store %[[temp]] to %[[PRIVATE_X]] : !fir.ref<i32>
-!CHECK: omp.terminator
-!CHECK: }
-
- !$omp parallel default(none) private(x, y)
- x = y
- !$omp end parallel
-
-!CHECK: omp.parallel {
-!CHECK: %[[PRIVATE_Y:.*]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFEy"}
-!CHECK: %[[temp:.*]] = fir.load %[[Y]] : !fir.ref<i32>
-!CHECK: fir.store %[[temp]] to %[[PRIVATE_Y]] : !fir.ref<i32>
-!CHECK: %[[PRIVATE_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFEx"}
-!CHECK: %[[temp:.*]] = fir.load %[[X]] : !fir.ref<i32>
-!CHECK: fir.store %[[temp]] to %[[PRIVATE_X]] : !fir.ref<i32>
-!CHECK: %[[temp:.*]] = fir.load %[[PRIVATE_Y]] : !fir.ref<i32>
-!CHECK: fir.store %[[temp]] to %[[PRIVATE_X]] : !fir.ref<i32>
-!CHECK: omp.terminator
-!CHECK: }
-
- !$omp parallel default(firstprivate) firstprivate(y)
- x = y
- !$omp end parallel
-
-!CHECK: omp.parallel {
-!CHECK: %[[PRIVATE_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFEx"}
-!CHECK: %[[PRIVATE_Y:.*]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFEy"}
-!CHECK: %[[temp:.*]] = fir.load %[[Y]] : !fir.ref<i32>
-!CHECK: fir.store %[[temp]] to %[[PRIVATE_Y]] : !fir.ref<i32>
-!CHECK: %[[PRIVATE_W:.*]] = fir.alloca i32 {bindc_name = "w", pinned, uniq_name = "_QFEw"}
-!CHECK: %[[temp:.*]] = fir.load %[[W]] : !fir.ref<i32>
-!CHECK: fir.store %[[temp]] to %[[PRIVATE_W]] : !fir.ref<i32>
-!CHECK: %[[const:.*]] = arith.constant 2 : i32
-!CHECK: %[[temp:.*]] = fir.load %[[PRIVATE_Y]] : !fir.ref<i32>
-!CHECK: %[[result:.*]] = arith.muli %[[const]], %[[temp]] : i32
-!CHECK: fir.store %[[result]] to %[[PRIVATE_X]] : !fir.ref<i32>
-!CHECK: %[[temp:.*]] = fir.load %[[PRIVATE_W]] : !fir.ref<i32>
-!CHECK: %[[const:.*]] = arith.constant 45 : i32
-!CHECK: %[[result:.*]] = arith.addi %[[temp]], %[[const]] : i32
-!CHECK: fir.store %[[result]] to %[[Z]] : !fir.ref<i32>
-!CHECK: omp.terminator
-!CHECK: }
-
- !$omp parallel default(firstprivate) private(x) shared(z)
- x = y * 2
- z = w + 45
- !$omp end parallel
-
-!CHECK: omp.parallel {
-!CHECK: omp.parallel {
-!CHECK: %[[PRIVATE_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFEx"}
-!CHECK: %[[PRIVATE_Y:.*]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFEy"}
-!CHECK: %[[temp:.*]] = fir.load %[[PRIVATE_Y]] : !fir.ref<i32>
-!CHECK: fir.store %[[temp]] to %[[PRIVATE_X]] : !fir.ref<i32>
-!CHECK: omp.terminator
-!CHECK: }
-!CHECK: omp.parallel {
-!CHECK: %[[PRIVATE_W:.*]] = fir.alloca i32 {bindc_name = "w", pinned, uniq_name = "_QFEw"}
-!CHECK: %[[temp:.*]] = fir.load %[[W]] : !fir.ref<i32>
-!CHECK: fir.store %[[temp]] to %[[PRIVATE_W]] : !fir.ref<i32>
-!CHECK: %[[PRIVATE_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFEx"}
-!CHECK: %[[temp:.*]] = fir.load %[[X]] : !fir.ref<i32>
-!CHECK: fir.store %[[temp]] to %[[PRIVATE_X]] : !fir.ref<i32>
-!CHECK: %[[temp:.*]] = fir.load %[[PRIVATE_X]] : !fir.ref<i32>
-!CHECK: fir.store %[[temp]] to %[[PRIVATE_W]] : !fir.ref<i32>
-!CHECK: omp.terminator
-!CHECK: }
-!CHECK: omp.terminator
-!CHECK: }
- !$omp parallel
- !$omp parallel default(private)
- x = y
- !$omp end parallel
-
- !$omp parallel default(firstprivate)
- w = x
- !$omp end parallel
- !$omp end parallel
-
-end program default_clause_lowering
-
-subroutine nested_default_clause_tests
- integer :: x, y, z, w, k, a
-
-!CHECK: %[[K:.*]] = fir.alloca i32 {bindc_name = "k", uniq_name = "_QFnested_default_clause_testsEk"}
-!CHECK: %[[W:.*]] = fir.alloca i32 {bindc_name = "w", uniq_name = "_QFnested_default_clause_testsEw"}
-!CHECK: %[[X:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFnested_default_clause_testsEx"}
-!CHECK: %[[Y:.*]] = fir.alloca i32 {bindc_name = "y", uniq_name = "_QFnested_default_clause_testsEy"}
-!CHECK: %[[Z:.*]] = fir.alloca i32 {bindc_name = "z", uniq_name = "_QFnested_default_clause_testsEz"}
-!CHECK: omp.parallel {
-!CHECK: %[[PRIVATE_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFnested_default_clause_testsEx"}
-!CHECK: %[[temp:.*]] = fir.load %[[X]] : !fir.ref<i32>
-!CHECK: fir.store %[[temp]] to %[[PRIVATE_X]] : !fir.ref<i32>
-!CHECK: %[[PRIVATE_Y:.*]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFnested_default_clause_testsEy"}
-!CHECK: %[[PRIVATE_Z:.*]] = fir.alloca i32 {bindc_name = "z", pinned, uniq_name = "_QFnested_default_clause_testsEz"}
-!CHECK: %[[PRIVATE_K:.*]] = fir.alloca i32 {bindc_name = "k", pinned, uniq_name = "_QFnested_default_clause_testsEk"}
-!CHECK: omp.parallel {
-!CHECK: %[[INNER_PRIVATE_Y:.*]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFnested_default_clause_testsEy"}
-!CHECK: %[[INNER_PRIVATE_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFnested_default_clause_testsEx"}
-!CHECK: %[[const:.*]] = arith.constant 20 : i32
-!CHECK: fir.store %[[const]] to %[[INNER_PRIVATE_Y]] : !fir.ref<i32>
-!CHECK: %[[const:.*]] = arith.constant 10 : i32
-!CHECK: fir.store %[[const]] to %[[INNER_PRIVATE_X]] : !fir.ref<i32>
-!CHECK: omp.terminator
-!CHECK: }
-!CHECK: omp.parallel {
-!CHECK: %[[INNER_PRIVATE_W:.*]] = fir.alloca i32 {bindc_name = "w", pinned, uniq_name = "_QFnested_default_clause_testsEw"}
-!CHECK: %[[INNER_PRIVATE_Z:.*]] = fir.alloca i32 {bindc_name = "z", pinned, uniq_name = "_QFnested_default_clause_testsEz"}
-!CHECK: %[[temp:.*]] = fir.load %[[PRIVATE_Z]]
-!CHECK: fir.store %[[temp]] to %[[INNER_PRIVATE_Z]] : !fir.ref<i32>
-!CHECK: %[[INNER_PRIVATE_K:.*]] = fir.alloca i32 {bindc_name = "k", pinned, uniq_name = "_QFnested_default_clause_testsEk"}
-!CHECK: %[[temp:.*]] = fir.load %[[PRIVATE_K]] : !fir.ref<i32>
-!CHECK: fir.store %[[temp]] to %[[INNER_PRIVATE_K]] : !fir.ref<i32>
-!CHECK: %[[const:.*]] = arith.constant 30 : i32
-!CHECK: fir.store %[[const]] to %[[PRIVATE_Y]] : !fir.ref<i32>
-!CHECK: %[[const:.*]] = arith.constant 40 : i32
-!CHECK: fir.store %[[const]] to %[[INNER_PRIVATE_W]] : !fir.ref<i32>
-!CHECK: %[[const:.*]] = arith.constant 50 : i32
-!CHECK: fir.store %[[const]] to %[[INNER_PRIVATE_Z]] : !fir.ref<i32>
-!CHECK: %[[const:.*]] = arith.constant 40 : i32
-!CHECK: fir.store %[[const]] to %[[INNER_PRIVATE_K]] : !fir.ref<i32>
-!CHECK: omp.terminator
-!CHECK: }
-!CHECK: omp.terminator
-!CHECK: }
- !$omp parallel firstprivate(x) private(y) shared(w) default(private)
- !$omp parallel default(private)
- y = 20
- x = 10
- !$omp end parallel
-
- !$omp parallel default(firstprivate) shared(y) private(w)
- y = 30
- w = 40
- z = 50
- k = 40
- !$omp end parallel
- !$omp end parallel
-
-
-!CHECK: omp.parallel {
-!CHECK: %[[PRIVATE_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFnested_default_clause_testsEx"}
-!CHECK: %[[PRIVATE_Y:.*]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFnested_default_clause_testsEy"}
-!CHECK: %[[PRIVATE_Z:.*]] = fir.alloca i32 {bindc_name = "z", pinned, uniq_name = "_QFnested_default_clause_testsEz"}
-!CHECK: %[[PRIVATE_W:.*]] = fir.alloca i32 {bindc_name = "w", pinned, uniq_name = "_QFnested_default_clause_testsEw"}
-!CHECK: omp.parallel {
-!CHECK: %[[PRIVATE_INNER_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFnested_default_clause_testsEx"}
-!CHECK: %[[temp:.*]] = fir.load %[[PRIVATE_X]] : !fir.ref<i32>
-!CHECK: fir.store %[[temp]] to %[[PRIVATE_INNER_X]] : !fir.ref<i32>
-!CHECK: %[[INNER_PRIVATE_Y:.*]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFnested_default_clause_testsEy"}
-!CHECK: %[[temp:.*]] = fir.load %[[PRIVATE_Y]] : !fir.ref<i32>
-!CHECK: fir.store %[[temp]] to %[[INNER_PRIVATE_Y]] : !fir.ref<i32>
-!CHECK: %[[temp:.*]] = fir.load %[[INNER_PRIVATE_Y]] : !fir.ref<i32>
-!CHECK: fir.store %[[temp]] to %[[PRIVATE_INNER_X]] : !fir.ref<i32>
-!CHECK: omp.terminator
-!CHECK: }
-!CHECK: omp.parallel {
-!CHECK: %[[PRIVATE_INNER_W:.*]] = fir.alloca i32 {bindc_name = "w", pinned, uniq_name = "_QFnested_default_clause_testsEw"}
-!CHECK: %[[PRIVATE_INNER_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFnested_default_clause_testsEx"}
-!CHECK: %[[temp_1:.*]] = fir.load %[[PRIVATE_INNER_X]] : !fir.ref<i32>
-!CHECK: %[[temp_2:.*]] = fir.load %[[PRIVATE_Z]] : !fir.ref<i32>
-!CHECK: %[[result:.*]] = arith.addi %{{.*}}, %{{.*}} : i32
-!CHECK: fir.store %[[result]] to %[[PRIVATE_INNER_W]] : !fir.ref<i32>
-!CHECK: omp.terminator
-!CHECK: }
- !$omp parallel default(private)
- !$omp parallel default(firstprivate)
- x = y
- !$omp end parallel
-
- !$omp parallel default(private) shared(z)
- w = x + z
- !$omp end parallel
- !$omp end parallel
-
-!CHECK: omp.parallel {
-!CHECK: %[[PRIVATE_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFnested_default_clause_testsEx"}
-!CHECK: %[[PRIVATE_Y:.*]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFnested_default_clause_testsEy"}
-!CHECK: %[[PRIVATE_W:.*]] = fir.alloca i32 {bindc_name = "w", pinned, uniq_name = "_QFnested_default_clause_testsEw"}
-!CHECK: %[[PRIVATE_Z:.*]] = fir.alloca i32 {bindc_name = "z", pinned, uniq_name = "_QFnested_default_clause_testsEz"}
-!CHECK: omp.parallel {
-!CHECK: %[[INNER_PRIVATE_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFnested_default_clause_testsEx"}
-!CHECK: %[[temp:.*]] = fir.load %[[PRIVATE_X]] : !fir.ref<i32>
-!CHECK: fir.store %[[temp]] to %[[INNER_PRIVATE_X]] : !fir.ref<i32>
-!CHECK: %[[INNER_PRIVATE_Y:.*]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFnested_default_clause_testsEy"}
-!CHECK: %[[temp:.*]] = fir.load %[[PRIVATE_Y]] : !fir.ref<i32>
-!CHECK: fir.store %[[temp]] to %[[INNER_PRIVATE_Y]] : !fir.ref<i32>
-!CHECK: %[[temp:.*]] = fir.load %[[INNER_PRIVATE_Y]] : !fir.ref<i32>
-!CHECK: fir.store %[[temp]] to %[[INNER_PRIVATE_X]] : !fir.ref<i32>
-!CHECK: omp.terminator
-!CHECK: }
-!CHECK: omp.parallel {
-!CHECK: %[[temp_1:.*]] = fir.load %[[PRIVATE_X]] : !fir.ref<i32>
-!CHECK: %[[temp_2:.*]] = fir.load %[[PRIVATE_Z]] : !fir.ref<i32>
-!CHECK: %[[temp_3:.*]] = arith.addi %[[temp_1]], %[[temp_2]] : i32
-!CHECK: fir.store %[[temp_3]] to %[[PRIVATE_W]] : !fir.ref<i32>
-!CHECK: omp.terminator
-!CHECK: }
-!CHECK: }
- !$omp parallel default(private)
- !$omp parallel default(firstprivate)
- x = y
- !$omp end parallel
-
- !$omp parallel default(shared)
- w = x + z
- !$omp end parallel
- !$omp end parallel
-
-!CHECK: omp.parallel {
-!CHECK: %[[PRIVATE_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFnested_default_clause_testsEx"}
-!CHECK: %[[temp:.*]] = fir.load %[[X]] : !fir.ref<i32>
-!CHECK: fir.store %[[temp]] to %[[PRIVATE_X]] : !fir.ref<i32>
-!CHECK: %[[PRIVATE_Y:.*]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFnested_default_clause_testsEy"}
-!CHECK: %[[temp:.*]] = fir.load %[[Y]] : !fir.ref<i32>
-!CHECK: fir.store %[[temp]] to %[[PRIVATE_Y]] : !fir.ref<i32>
-!CHECK: omp.single {
-!CHECK: %[[temp:.*]] = fir.load %[[PRIVATE_Y]] : !fir.ref<i32>
-!CHECK: fir.store %[[temp]] to %[[PRIVATE_X]] : !fir.ref<i32>
-!CHECK: omp.terminator
-!CHECK: }
-!CHECK: omp.terminator
-!CHECK: }
-!CHECK: }
- !$omp parallel default(firstprivate)
- !$omp single
- x = y
- !$omp end single
- !$omp end parallel
-end subroutine
diff --git a/flang/test/Lower/OpenMP/FIR/delayed-privatization-firstprivate.f90 b/flang/test/Lower/OpenMP/FIR/delayed-privatization-firstprivate.f90
deleted file mode 100644
index 50938342dee7..000000000000
--- a/flang/test/Lower/OpenMP/FIR/delayed-privatization-firstprivate.f90
+++ /dev/null
@@ -1,32 +0,0 @@
-! Test delayed privatization for the `firstprivate` clause.
-
-! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp -mmlir \
-! RUN: --openmp-enable-delayed-privatization -o - %s 2>&1 | FileCheck %s
-! RUN: bbc -emit-fir -hlfir=false -fopenmp --openmp-enable-delayed-privatization \
-! RUN: -o - %s 2>&1 | FileCheck %s
-
-subroutine delayed_privatization_firstprivate
- implicit none
- integer :: var1
-
-!$OMP PARALLEL FIRSTPRIVATE(var1)
- var1 = 10
-!$OMP END PARALLEL
-end subroutine
-
-! CHECK-LABEL: omp.private {type = firstprivate}
-! CHECK-SAME: @[[VAR1_PRIVATIZER_SYM:.*]] : !fir.ref<i32> alloc {
-! CHECK-NEXT: ^bb0(%[[PRIV_ARG:.*]]: !fir.ref<i32>):
-! CHECK-NEXT: %[[PRIV_ALLOC:.*]] = fir.alloca i32 {bindc_name = "var1", pinned, uniq_name = "_QFdelayed_privatization_firstprivateEvar1"}
-! CHECK-NEXT: omp.yield(%[[PRIV_ALLOC]] : !fir.ref<i32>)
-! CHECK: } copy {
-! CHECK: ^bb0(%[[PRIV_ORIG_ARG:.*]]: !fir.ref<i32>, %[[PRIV_PRIV_ARG:.*]]: !fir.ref<i32>):
-! CHECK: %[[ORIG_VAL:.*]] = fir.load %[[PRIV_ORIG_ARG]] : !fir.ref<i32>
-! CHECK: fir.store %[[ORIG_VAL]] to %[[PRIV_PRIV_ARG]] : !fir.ref<i32>
-! CHECK: omp.yield(%[[PRIV_PRIV_ARG]] : !fir.ref<i32>)
-! CHECK: }
-
-! CHECK-LABEL: @_QPdelayed_privatization_firstprivate
-! CHECK: omp.parallel private(@[[VAR1_PRIVATIZER_SYM]] %{{.*}} -> %{{.*}} : !fir.ref<i32>) {
-! CHECK: omp.terminator
-
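
As a rough mental model for the privatizer above, the copy region performs the initialization that eager lowering would otherwise emit inline at the start of the parallel region. In plain Fortran terms it amounts to something like the following (the helper name and dummy arguments are hypothetical, for illustration only):

    subroutine firstprivate_copy(orig, priv)
      integer, intent(in)  :: orig   ! corresponds to %[[PRIV_ORIG_ARG]]
      integer, intent(out) :: priv   ! corresponds to %[[PRIV_PRIV_ARG]]
      priv = orig                    ! the fir.load/fir.store pair in the copy region
    end subroutine
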
diff --git a/flang/test/Lower/OpenMP/FIR/delayed-privatization-private.f90 b/flang/test/Lower/OpenMP/FIR/delayed-privatization-private.f90
deleted file mode 100644
index b13687faa3f2..000000000000
--- a/flang/test/Lower/OpenMP/FIR/delayed-privatization-private.f90
+++ /dev/null
@@ -1,41 +0,0 @@
-! Test delayed privatization for the `private` clause.
-
-! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp -mmlir \
-! RUN: --openmp-enable-delayed-privatization -o - %s 2>&1 | FileCheck %s
-! RUN: bbc -emit-fir -hlfir=false -fopenmp --openmp-enable-delayed-privatization \
-! RUN: -o - %s 2>&1 | FileCheck %s
-
-subroutine delayed_privatization_private
- implicit none
- integer :: var1
-
-!$OMP PARALLEL PRIVATE(var1)
- var1 = 10
-!$OMP END PARALLEL
-
-!$OMP PARALLEL PRIVATE(var1)
- var1 = 20
-!$OMP END PARALLEL
-
-end subroutine
-
-! CHECK-LABEL: omp.private {type = private}
-! CHECK-SAME: @[[PRIVATIZER_SYM:.*]] : !fir.ref<i32> alloc {
-! CHECK-NEXT: ^bb0(%[[PRIV_ARG:.*]]: !fir.ref<i32>):
-! CHECK-NEXT: %[[PRIV_ALLOC:.*]] = fir.alloca i32 {bindc_name = "var1", pinned, uniq_name = "_QFdelayed_privatization_privateEvar1"}
-! CHECK-NEXT: omp.yield(%[[PRIV_ALLOC]] : !fir.ref<i32>)
-! CHECK-NOT: } copy {
-
-! CHECK-LABEL: @_QPdelayed_privatization_private
-! CHECK: %[[ORIG_ALLOC:.*]] = fir.alloca i32 {bindc_name = "var1", uniq_name = "_QFdelayed_privatization_privateEvar1"}
-! CHECK: omp.parallel private(@[[PRIVATIZER_SYM]] %[[ORIG_ALLOC]] -> %[[PAR_ARG:.*]] : !fir.ref<i32>) {
-! CHECK: %[[C10:.*]] = arith.constant 10 : i32
-! CHECK: fir.store %[[C10]] to %[[PAR_ARG]] : !fir.ref<i32>
-! CHECK: omp.terminator
-
-! Test that the same privatizer is used if a variable with the same type and
-! name was previously privatized.
-! CHECK: omp.parallel private(@[[PRIVATIZER_SYM]] %[[ORIG_ALLOC]] -> %[[PAR_ARG:.*]] : !fir.ref<i32>) {
-! CHECK: %[[C20:.*]] = arith.constant 20 : i32
-! CHECK: fir.store %[[C20]] to %[[PAR_ARG]] : !fir.ref<i32>
-! CHECK: omp.terminator
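
The reuse check above keys privatizers to the privatized variable; presumably a variable with a different name or type would receive its own omp.private op, though that is an assumption here rather than something this test verifies. Illustrative sketch (names invented):

    subroutine two_privatizers_sketch
      integer :: var1
      real    :: var2
      !$omp parallel private(var1)   ! expected to reuse one privatizer for var1
      var1 = 1
      !$omp end parallel
      !$omp parallel private(var2)   ! a distinct variable would get its own
      var2 = 1.0                     ! privatizer symbol (assumption)
      !$omp end parallel
    end subroutine
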
diff --git a/flang/test/Lower/OpenMP/FIR/firstprivate-commonblock.f90 b/flang/test/Lower/OpenMP/FIR/firstprivate-commonblock.f90
deleted file mode 100644
index 6adc7d9f6c82..000000000000
--- a/flang/test/Lower/OpenMP/FIR/firstprivate-commonblock.f90
+++ /dev/null
@@ -1,30 +0,0 @@
-! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s
-
-!CHECK: func.func @_QPfirstprivate_common() {
-!CHECK: %[[val_0:.*]] = fir.address_of(@c_) : !fir.ref<!fir.array<8xi8>>
-!CHECK: %[[val_1:.*]] = fir.convert %[[val_0]] : (!fir.ref<!fir.array<8xi8>>) -> !fir.ref<!fir.array<?xi8>>
-!CHECK: %[[val_c0:.*]] = arith.constant 0 : index
-!CHECK: %[[val_2:.*]] = fir.coordinate_of %[[val_1]], %[[val_c0]] : (!fir.ref<!fir.array<?xi8>>, index) -> !fir.ref<i8>
-!CHECK: %[[val_3:.*]] = fir.convert %[[val_2]] : (!fir.ref<i8>) -> !fir.ref<f32>
-!CHECK: %[[val_4:.*]] = fir.convert %[[val_0]] : (!fir.ref<!fir.array<8xi8>>) -> !fir.ref<!fir.array<?xi8>>
-!CHECK: %[[val_c4:.*]] = arith.constant 4 : index
-!CHECK: %[[val_5:.*]] = fir.coordinate_of %[[val_4]], %[[val_c4]] : (!fir.ref<!fir.array<?xi8>>, index) -> !fir.ref<i8>
-!CHECK: %[[val_6:.*]] = fir.convert %[[val_5]] : (!fir.ref<i8>) -> !fir.ref<f32>
-!CHECK: omp.parallel {
-!CHECK: %[[val_7:.*]] = fir.alloca f32 {bindc_name = "x", pinned, uniq_name = "_QFfirstprivate_commonEx"}
-!CHECK: %[[val_8:.*]] = fir.load %[[val_3]] : !fir.ref<f32>
-!CHECK: fir.store %[[val_8]] to %[[val_7]] : !fir.ref<f32>
-!CHECK: %[[val_9:.*]] = fir.alloca f32 {bindc_name = "y", pinned, uniq_name = "_QFfirstprivate_commonEy"}
-!CHECK: %[[val_10:.*]] = fir.load %[[val_6]] : !fir.ref<f32>
-!CHECK: fir.store %[[val_10]] to %[[val_9]] : !fir.ref<f32>
-!CHECK: omp.terminator
-!CHECK: }
-!CHECK: return
-!CHECK: }
-
-subroutine firstprivate_common
- common /c/ x, y
- real x, y
- !$omp parallel firstprivate(/c/)
- !$omp end parallel
-end subroutine
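
Naming a common block in the clause privatizes each of its members, so the directive in the deleted test behaves like an explicit member list. Equivalent illustrative form (not taken from the test):

    subroutine firstprivate_common_equiv
      common /c/ x, y
      real x, y
      !$omp parallel firstprivate(x, y)   ! same effect as firstprivate(/c/)
      !$omp end parallel
    end subroutine
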
diff --git a/flang/test/Lower/OpenMP/FIR/flush.f90 b/flang/test/Lower/OpenMP/FIR/flush.f90
deleted file mode 100644
index 2c281632b85c..000000000000
--- a/flang/test/Lower/OpenMP/FIR/flush.f90
+++ /dev/null
@@ -1,45 +0,0 @@
-! This test checks lowering of the OpenMP flush directive.
-
-!RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp %s -o - | FileCheck %s --check-prefixes="FIRDialect,OMPDialect"
-!RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp %s -o - | fir-opt --cfg-conversion | fir-opt --fir-to-llvm-ir | FileCheck %s --check-prefixes="LLVMIRDialect,OMPDialect"
-
-subroutine flush_standalone(a, b, c)
- integer, intent(inout) :: a, b, c
-
-!$omp flush(a,b,c)
-!$omp flush
-!OMPDialect: omp.flush(%{{.*}}, %{{.*}}, %{{.*}} :
-!FIRDialect: !fir.ref<i32>, !fir.ref<i32>, !fir.ref<i32>)
-!LLVMIRDialect: !llvm.ptr, !llvm.ptr, !llvm.ptr)
-!OMPDialect: omp.flush
-
-end subroutine flush_standalone
-
-subroutine flush_parallel(a, b, c)
- integer, intent(inout) :: a, b, c
-
-!$omp parallel
-!OMPDialect: omp.parallel {
-
-!OMPDialect: omp.flush(%{{.*}}, %{{.*}}, %{{.*}} :
-!FIRDialect: !fir.ref<i32>, !fir.ref<i32>, !fir.ref<i32>)
-!LLVMIRDialect: !llvm.ptr, !llvm.ptr, !llvm.ptr)
-!OMPDialect: omp.flush
-!$omp flush(a,b,c)
-!$omp flush
-
-!FIRDialect: %{{.*}} = fir.load %{{.*}} : !fir.ref<i32>
-!FIRDialect: %{{.*}} = fir.load %{{.*}} : !fir.ref<i32>
-!FIRDialect: %{{.*}} = arith.addi %{{.*}}, %{{.*}} : i32
-!FIRDialect: fir.store %{{.*}} to %{{.*}} : !fir.ref<i32>
-
-!LLVMIRDialect: %{{.*}} = llvm.load %{{.*}} : !llvm.ptr -> i32
-!LLVMIRDialect: %{{.*}} = llvm.load %{{.*}} : !llvm.ptr -> i32
-!LLVMIRDialect: %{{.*}} = llvm.add %{{.*}}, %{{.*}} : i32
-!LLVMIRDialect: llvm.store %{{.*}}, %{{.*}} : i32, !llvm.ptr
- c = a + b
-
-!OMPDialect: omp.terminator
-!$omp END parallel
-
-end subroutine flush_parallel
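
A flush with a list restricts the memory-consistency operation to the listed variables, while a bare flush applies to all thread-visible data, matching the two omp.flush forms checked above. A classic illustrative usage pattern (names invented for this sketch):

    subroutine flush_publish(payload, flag)
      integer, intent(inout) :: payload, flag
      payload = 42
      !$omp flush(payload)   ! publish the payload before raising the flag
      flag = 1
      !$omp flush(flag)      ! make the flag update visible to other threads
    end subroutine
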
diff --git a/flang/test/Lower/OpenMP/FIR/if-clause.f90 b/flang/test/Lower/OpenMP/FIR/if-clause.f90
deleted file mode 100644
index 683d9f7ef972..000000000000
--- a/flang/test/Lower/OpenMP/FIR/if-clause.f90
+++ /dev/null
@@ -1,498 +0,0 @@
-! This test checks lowering of OpenMP IF clauses.
-
-! The "if" clause was added to the "simd" directive in OpenMP 5.0, and
-! to the "teams" directive in OpenMP 5.2.
-! RUN: bbc -fopenmp -fopenmp-version=52 -emit-fir %s -o - | FileCheck %s
-! RUN: %flang_fc1 -fopenmp -fopenmp-version=52 -emit-fir %s -o - | FileCheck %s
-
-program main
- integer :: i
-
- ! TODO When they are supported, add tests for:
- ! - DISTRIBUTE PARALLEL DO
- ! - DISTRIBUTE PARALLEL DO SIMD
- ! - DISTRIBUTE SIMD
- ! - PARALLEL SECTIONS
- ! - PARALLEL WORKSHARE
- ! - TARGET PARALLEL
- ! - TARGET TEAMS DISTRIBUTE
- ! - TARGET TEAMS DISTRIBUTE PARALLEL DO
- ! - TARGET TEAMS DISTRIBUTE PARALLEL DO SIMD
- ! - TARGET TEAMS DISTRIBUTE SIMD
- ! - TARGET UPDATE
- ! - TASKLOOP
- ! - TASKLOOP SIMD
- ! - TEAMS DISTRIBUTE
- ! - TEAMS DISTRIBUTE PARALLEL DO
- ! - TEAMS DISTRIBUTE PARALLEL DO SIMD
- ! - TEAMS DISTRIBUTE SIMD
-
- ! ----------------------------------------------------------------------------
- ! DO SIMD
- ! ----------------------------------------------------------------------------
- ! CHECK: omp.wsloop
- ! CHECK-NOT: if({{.*}})
- ! CHECK-SAME: {
- !$omp do simd
- do i = 1, 10
- end do
- !$omp end do simd
-
- ! CHECK: omp.wsloop
- !$omp do simd if(.true.)
- do i = 1, 10
- end do
- !$omp end do simd
-
- ! CHECK: omp.wsloop
- !$omp do simd if(simd: .true.)
- do i = 1, 10
- end do
- !$omp end do simd
-
- ! ----------------------------------------------------------------------------
- ! PARALLEL
- ! ----------------------------------------------------------------------------
- ! CHECK: omp.parallel
- ! CHECK-NOT: if({{.*}})
- ! CHECK-SAME: {
- !$omp parallel
- i = 10
- !$omp end parallel
-
- ! CHECK: omp.parallel
- ! CHECK-SAME: if({{.*}})
- !$omp parallel if(.true.)
- i = 10
- !$omp end parallel
-
- ! CHECK: omp.parallel
- ! CHECK-SAME: if({{.*}})
- !$omp parallel if(parallel: .true.)
- i = 10
- !$omp end parallel
-
- ! ----------------------------------------------------------------------------
- ! PARALLEL DO
- ! ----------------------------------------------------------------------------
- ! CHECK: omp.parallel
- ! CHECK-NOT: if({{.*}})
- ! CHECK-SAME: {
- !$omp parallel do
- do i = 1, 10
- end do
- !$omp end parallel do
-
- ! CHECK: omp.parallel
- ! CHECK-SAME: if({{.*}})
- !$omp parallel do if(.true.)
- do i = 1, 10
- end do
- !$omp end parallel do
-
- ! CHECK: omp.parallel
- ! CHECK-SAME: if({{.*}})
- !$omp parallel do if(parallel: .true.)
- do i = 1, 10
- end do
- !$omp end parallel do
-
- ! ----------------------------------------------------------------------------
- ! PARALLEL DO SIMD
- ! ----------------------------------------------------------------------------
- ! CHECK: omp.parallel
- ! CHECK-NOT: if({{.*}})
- ! CHECK-SAME: {
- ! CHECK: omp.wsloop
- ! CHECK-NOT: if({{.*}})
- ! CHECK-SAME: {
- !$omp parallel do simd
- do i = 1, 10
- end do
- !$omp end parallel do simd
-
- ! CHECK: omp.parallel
- ! CHECK-SAME: if({{.*}})
- ! CHECK: omp.wsloop
- !$omp parallel do simd if(.true.)
- do i = 1, 10
- end do
- !$omp end parallel do simd
-
- ! CHECK: omp.parallel
- ! CHECK-SAME: if({{.*}})
- ! CHECK: omp.wsloop
- !$omp parallel do simd if(parallel: .true.) if(simd: .false.)
- do i = 1, 10
- end do
- !$omp end parallel do simd
-
- ! CHECK: omp.parallel
- ! CHECK-SAME: if({{.*}})
- ! CHECK: omp.wsloop
- ! CHECK-NOT: if({{.*}})
- ! CHECK-SAME: {
- !$omp parallel do simd if(parallel: .true.)
- do i = 1, 10
- end do
- !$omp end parallel do simd
-
- ! CHECK: omp.parallel
- ! CHECK-NOT: if({{.*}})
- ! CHECK-SAME: {
- ! CHECK: omp.wsloop
- !$omp parallel do simd if(simd: .true.)
- do i = 1, 10
- end do
- !$omp end parallel do simd
-
- ! ----------------------------------------------------------------------------
- ! SIMD
- ! ----------------------------------------------------------------------------
- ! CHECK: omp.simd
- ! CHECK-NOT: if({{.*}})
- ! CHECK-SAME: {
- !$omp simd
- do i = 1, 10
- end do
- !$omp end simd
-
- ! CHECK: omp.simd
- ! CHECK-SAME: if({{.*}})
- !$omp simd if(.true.)
- do i = 1, 10
- end do
- !$omp end simd
-
- ! CHECK: omp.simd
- ! CHECK-SAME: if({{.*}})
- !$omp simd if(simd: .true.)
- do i = 1, 10
- end do
- !$omp end simd
-
- ! ----------------------------------------------------------------------------
- ! TARGET
- ! ----------------------------------------------------------------------------
- ! CHECK: omp.target
- ! CHECK-NOT: if({{.*}})
- ! CHECK-SAME: {
- !$omp target
- !$omp end target
-
- ! CHECK: omp.target
- ! CHECK-SAME: if({{.*}})
- !$omp target if(.true.)
- !$omp end target
-
- ! CHECK: omp.target
- ! CHECK-SAME: if({{.*}})
- !$omp target if(target: .true.)
- !$omp end target
-
- ! ----------------------------------------------------------------------------
- ! TARGET DATA
- ! ----------------------------------------------------------------------------
- ! CHECK: omp.target_data
- ! CHECK-NOT: if({{.*}})
- ! CHECK-SAME: {
- !$omp target data map(tofrom: i)
- !$omp end target data
-
- ! CHECK: omp.target_data
- ! CHECK-SAME: if({{.*}})
- !$omp target data map(tofrom: i) if(.true.)
- !$omp end target data
-
- ! CHECK: omp.target_data
- ! CHECK-SAME: if({{.*}})
- !$omp target data map(tofrom: i) if(target data: .true.)
- !$omp end target data
-
- ! ----------------------------------------------------------------------------
- ! TARGET ENTER DATA
- ! ----------------------------------------------------------------------------
- ! CHECK: omp.target_enter_data
- ! CHECK-NOT: if({{.*}})
- ! CHECK-SAME: map
- !$omp target enter data map(to: i)
-
- ! CHECK: omp.target_enter_data
- ! CHECK-SAME: if({{.*}})
- !$omp target enter data map(to: i) if(.true.)
-
- ! CHECK: omp.target_enter_data
- ! CHECK-SAME: if({{.*}})
- !$omp target enter data map(to: i) if(target enter data: .true.)
-
- ! ----------------------------------------------------------------------------
- ! TARGET EXIT DATA
- ! ----------------------------------------------------------------------------
- ! CHECK: omp.target_exit_data
- ! CHECK-NOT: if({{.*}})
- ! CHECK-SAME: map
- !$omp target exit data map(from: i)
-
- ! CHECK: omp.target_exit_data
- ! CHECK-SAME: if({{.*}})
- !$omp target exit data map(from: i) if(.true.)
-
- ! CHECK: omp.target_exit_data
- ! CHECK-SAME: if({{.*}})
- !$omp target exit data map(from: i) if(target exit data: .true.)
-
- ! ----------------------------------------------------------------------------
- ! TARGET PARALLEL DO
- ! ----------------------------------------------------------------------------
- ! CHECK: omp.target
- ! CHECK-NOT: if({{.*}})
- ! CHECK-SAME: {
- ! CHECK: omp.parallel
- ! CHECK-NOT: if({{.*}})
- ! CHECK-SAME: {
- !$omp target parallel do
- do i = 1, 10
- end do
- !$omp end target parallel do
-
- ! CHECK: omp.target
- ! CHECK-SAME: if({{.*}})
- ! CHECK: omp.parallel
- ! CHECK-SAME: if({{.*}})
- !$omp target parallel do if(.true.)
- do i = 1, 10
- end do
- !$omp end target parallel do
-
- ! CHECK: omp.target
- ! CHECK-SAME: if({{.*}})
- ! CHECK: omp.parallel
- ! CHECK-SAME: if({{.*}})
- !$omp target parallel do if(target: .true.) if(parallel: .false.)
- do i = 1, 10
- end do
- !$omp end target parallel do
-
- ! CHECK: omp.target
- ! CHECK-SAME: if({{.*}})
- ! CHECK: omp.parallel
- ! CHECK-NOT: if({{.*}})
- ! CHECK-SAME: {
- !$omp target parallel do if(target: .true.)
- do i = 1, 10
- end do
- !$omp end target parallel do
-
- ! CHECK: omp.target
- ! CHECK-NOT: if({{.*}})
- ! CHECK-SAME: {
- ! CHECK: omp.parallel
- ! CHECK-SAME: if({{.*}})
- !$omp target parallel do if(parallel: .true.)
- do i = 1, 10
- end do
- !$omp end target parallel do
-
- ! ----------------------------------------------------------------------------
- ! TARGET PARALLEL DO SIMD
- ! ----------------------------------------------------------------------------
- ! CHECK: omp.target
- ! CHECK-NOT: if({{.*}})
- ! CHECK-SAME: {
- ! CHECK: omp.parallel
- ! CHECK-NOT: if({{.*}})
- ! CHECK-SAME: {
- ! CHECK: omp.wsloop
- ! CHECK-NOT: if({{.*}})
- ! CHECK-SAME: {
- !$omp target parallel do simd
- do i = 1, 10
- end do
- !$omp end target parallel do simd
-
- ! CHECK: omp.target
- ! CHECK-SAME: if({{.*}})
- ! CHECK: omp.parallel
- ! CHECK-SAME: if({{.*}})
- ! CHECK: omp.wsloop
- !$omp target parallel do simd if(.true.)
- do i = 1, 10
- end do
- !$omp end target parallel do simd
-
- ! CHECK: omp.target
- ! CHECK-SAME: if({{.*}})
- ! CHECK: omp.parallel
- ! CHECK-SAME: if({{.*}})
- ! CHECK: omp.wsloop
- !$omp target parallel do simd if(target: .true.) if(parallel: .false.) &
- !$omp& if(simd: .true.)
- do i = 1, 10
- end do
- !$omp end target parallel do simd
-
- ! CHECK: omp.target
- ! CHECK-SAME: if({{.*}})
- ! CHECK: omp.parallel
- ! CHECK-NOT: if({{.*}})
- ! CHECK-SAME: {
- ! CHECK: omp.wsloop
- ! CHECK-NOT: if({{.*}})
- ! CHECK-SAME: {
- !$omp target parallel do simd if(target: .true.)
- do i = 1, 10
- end do
- !$omp end target parallel do simd
-
- ! CHECK: omp.target
- ! CHECK-NOT: if({{.*}})
- ! CHECK-SAME: {
- ! CHECK: omp.parallel
- ! CHECK-SAME: if({{.*}})
- ! CHECK: omp.wsloop
- !$omp target parallel do simd if(parallel: .true.) if(simd: .false.)
- do i = 1, 10
- end do
- !$omp end target parallel do simd
-
- ! ----------------------------------------------------------------------------
- ! TARGET SIMD
- ! ----------------------------------------------------------------------------
- ! CHECK: omp.target
- ! CHECK-NOT: if({{.*}})
- ! CHECK-SAME: {
- ! CHECK: omp.simd
- ! CHECK-NOT: if({{.*}})
- ! CHECK-SAME: {
- !$omp target simd
- do i = 1, 10
- end do
- !$omp end target simd
-
- ! CHECK: omp.target
- ! CHECK-SAME: if({{.*}})
- ! CHECK: omp.simd
- ! CHECK-SAME: if({{.*}})
- !$omp target simd if(.true.)
- do i = 1, 10
- end do
- !$omp end target simd
-
- ! CHECK: omp.target
- ! CHECK-SAME: if({{.*}})
- ! CHECK: omp.simd
- ! CHECK-SAME: if({{.*}})
- !$omp target simd if(target: .true.) if(simd: .false.)
- do i = 1, 10
- end do
- !$omp end target simd
-
- ! CHECK: omp.target
- ! CHECK-SAME: if({{.*}})
- ! CHECK: omp.simd
- ! CHECK-NOT: if({{.*}})
- ! CHECK-SAME: {
- !$omp target simd if(target: .true.)
- do i = 1, 10
- end do
- !$omp end target simd
-
- ! CHECK: omp.target
- ! CHECK-NOT: if({{.*}})
- ! CHECK-SAME: {
- ! CHECK: omp.simd
- ! CHECK-SAME: if({{.*}})
- !$omp target simd if(simd: .true.)
- do i = 1, 10
- end do
- !$omp end target simd
-
- ! ----------------------------------------------------------------------------
- ! TARGET TEAMS
- ! ----------------------------------------------------------------------------
-
- ! CHECK: omp.target
- ! CHECK-NOT: if({{.*}})
- ! CHECK-SAME: {
- ! CHECK: omp.teams
- ! CHECK-NOT: if({{.*}})
- ! CHECK-SAME: {
- !$omp target teams
- i = 1
- !$omp end target teams
-
- ! CHECK: omp.target
- ! CHECK-SAME: if({{.*}})
- ! CHECK: omp.teams
- ! CHECK-SAME: if({{.*}})
- !$omp target teams if(.true.)
- i = 1
- !$omp end target teams
-
- ! CHECK: omp.target
- ! CHECK-SAME: if({{.*}})
- ! CHECK: omp.teams
- ! CHECK-SAME: if({{.*}})
- !$omp target teams if(target: .true.) if(teams: .false.)
- i = 1
- !$omp end target teams
-
- ! CHECK: omp.target
- ! CHECK-SAME: if({{.*}})
- ! CHECK: omp.teams
- ! CHECK-NOT: if({{.*}})
- ! CHECK-SAME: {
- !$omp target teams if(target: .true.)
- i = 1
- !$omp end target teams
-
- ! CHECK: omp.target
- ! CHECK-NOT: if({{.*}})
- ! CHECK-SAME: {
- ! CHECK: omp.teams
- ! CHECK-SAME: if({{.*}})
- !$omp target teams if(teams: .true.)
- i = 1
- !$omp end target teams
-
- ! ----------------------------------------------------------------------------
- ! TASK
- ! ----------------------------------------------------------------------------
- ! CHECK: omp.task
- ! CHECK-NOT: if({{.*}})
- ! CHECK-SAME: {
- !$omp task
- !$omp end task
-
- ! CHECK: omp.task
- ! CHECK-SAME: if({{.*}})
- !$omp task if(.true.)
- !$omp end task
-
- ! CHECK: omp.task
- ! CHECK-SAME: if({{.*}})
- !$omp task if(task: .true.)
- !$omp end task
-
- ! ----------------------------------------------------------------------------
- ! TEAMS
- ! ----------------------------------------------------------------------------
- ! CHECK: omp.teams
- ! CHECK-NOT: if({{.*}})
- ! CHECK-SAME: {
- !$omp teams
- i = 1
- !$omp end teams
-
- ! CHECK: omp.teams
- ! CHECK-SAME: if({{.*}})
- !$omp teams if(.true.)
- i = 1
- !$omp end teams
-
- ! CHECK: omp.teams
- ! CHECK-SAME: if({{.*}})
- !$omp teams if(teams: .true.)
- i = 1
- !$omp end teams
-end program main
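
On a combined construct, a directive-name modifier ties the if expression to a single leaf construct, while an unmodified if applies to every leaf that accepts one; that pairing is exactly what the CHECK/CHECK-NOT lines above verify. Illustrative sketch (names and conditions invented):

    subroutine if_modifier_sketch(use_device, n)
      logical :: use_device
      integer :: n, i
      ! the target construct tests use_device, the parallel construct tests
      ! n > 1000, and the worksharing loop receives no if clause
      !$omp target parallel do if(target: use_device) if(parallel: n > 1000)
      do i = 1, n
      end do
      !$omp end target parallel do
    end subroutine
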
diff --git a/flang/test/Lower/OpenMP/FIR/is-device.f90 b/flang/test/Lower/OpenMP/FIR/is-device.f90
deleted file mode 100644
index 79e0ee506c5f..000000000000
--- a/flang/test/Lower/OpenMP/FIR/is-device.f90
+++ /dev/null
@@ -1,14 +0,0 @@
-!RUN: %flang_fc1 -emit-fir -fopenmp -fopenmp-is-target-device %s -o - | FileCheck %s --check-prefix=DEVICE
-!RUN: %flang_fc1 -emit-fir -fopenmp %s -o - | FileCheck %s --check-prefix=HOST
-!RUN: %flang_fc1 -emit-fir -fopenmp-is-target-device %s -o - | FileCheck %s --check-prefix=DEVICE-FLAG-ONLY
-!RUN: bbc -fopenmp -fopenmp-is-target-device -emit-fir -o - %s | FileCheck %s --check-prefix=DEVICE
-!RUN: bbc -fopenmp -emit-fir -o - %s | FileCheck %s --check-prefix=HOST
-!RUN: bbc -fopenmp-is-target-device -emit-fir -o - %s | FileCheck %s --check-prefix=DEVICE-FLAG-ONLY
-
-!DEVICE: module attributes {{{.*}}, omp.is_target_device = true{{.*}}}
-!HOST: module attributes {{{.*}}, omp.is_target_device = false{{.*}}}
-!DEVICE-FLAG-ONLY: module attributes {{{.*}}"
-!DEVICE-FLAG-ONLY-NOT: , omp.is_target_device = {{.*}}
-!DEVICE-FLAG-ONLY-SAME: }
-subroutine omp_subroutine()
-end subroutine omp_subroutine
diff --git a/flang/test/Lower/OpenMP/FIR/lastprivate-commonblock.f90 b/flang/test/Lower/OpenMP/FIR/lastprivate-commonblock.f90
deleted file mode 100644
index 86c4d917fa51..000000000000
--- a/flang/test/Lower/OpenMP/FIR/lastprivate-commonblock.f90
+++ /dev/null
@@ -1,49 +0,0 @@
-! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s
-
-!CHECK: func.func @_QPlastprivate_common() {
-!CHECK: %[[val_0:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
-!CHECK: %[[val_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFlastprivate_commonEi"}
-!CHECK: %[[val_2:.*]] = fir.address_of(@c_) : !fir.ref<!fir.array<8xi8>>
-!CHECK: %[[val_3:.*]] = fir.convert %[[val_2]] : (!fir.ref<!fir.array<8xi8>>) -> !fir.ref<!fir.array<?xi8>>
-!CHECK: %[[val_c0:.*]] = arith.constant 0 : index
-!CHECK: %[[val_4:.*]] = fir.coordinate_of %[[val_3]], %[[val_c0]] : (!fir.ref<!fir.array<?xi8>>, index) -> !fir.ref<i8>
-!CHECK: %[[val_5:.*]] = fir.convert %[[val_4]] : (!fir.ref<i8>) -> !fir.ref<f32>
-!CHECK: %[[val_6:.*]] = fir.convert %[[val_2]] : (!fir.ref<!fir.array<8xi8>>) -> !fir.ref<!fir.array<?xi8>>
-!CHECK: %[[val_c4:.*]] = arith.constant 4 : index
-!CHECK: %[[val_7:.*]] = fir.coordinate_of %[[val_6]], %[[val_c4]] : (!fir.ref<!fir.array<?xi8>>, index) -> !fir.ref<i8>
-!CHECK: %[[val_8:.*]] = fir.convert %[[val_7]] : (!fir.ref<i8>) -> !fir.ref<f32>
-!CHECK: %[[val_9:.*]] = fir.alloca f32 {bindc_name = "x", pinned, uniq_name = "_QFlastprivate_commonEx"}
-!CHECK: %[[val_10:.*]] = fir.alloca f32 {bindc_name = "y", pinned, uniq_name = "_QFlastprivate_commonEy"}
-!CHECK: %[[val_c1_i32:.*]] = arith.constant 1 : i32
-!CHECK: %[[val_c100_i32:.*]] = arith.constant 100 : i32
-!CHECK: %[[val_c1_i32_0:.*]] = arith.constant 1 : i32
-!CHECK: omp.wsloop {
-!CHECK-NEXT: omp.loop_nest (%[[arg:.*]]) : i32 = (%[[val_c1_i32]]) to (%[[val_c100_i32]]) inclusive step (%[[val_c1_i32_0]]) {
-!CHECK: fir.store %[[arg]] to %[[val_0]] : !fir.ref<i32>
-!CHECK: %[[val_11:.*]] = arith.addi %[[arg]], %[[val_c1_i32_0]] : i32
-!CHECK: %[[val_c0_i32:.*]] = arith.constant 0 : i32
-!CHECK: %[[val_12:.*]] = arith.cmpi slt, %[[val_c1_i32_0]], %[[val_c0_i32]] : i32
-!CHECK: %[[val_13:.*]] = arith.cmpi slt, %[[val_11]], %[[val_c100_i32]] : i32
-!CHECK: %[[val_14:.*]] = arith.cmpi sgt, %[[val_11]], %[[val_c100_i32]] : i32
-!CHECK: %[[val_15:.*]] = arith.select %[[val_12]], %[[val_13]], %[[val_14]] : i1
-!CHECK: fir.if %[[val_15]] {
-!CHECK: fir.store %[[val_11]] to %[[val_0]] : !fir.ref<i32>
-!CHECK: %[[val_16:.*]] = fir.load %[[val_9]] : !fir.ref<f32>
-!CHECK: fir.store %[[val_16]] to %[[val_5]] : !fir.ref<f32>
-!CHECK: %[[val_17:.*]] = fir.load %[[val_10]] : !fir.ref<f32>
-!CHECK: fir.store %[[val_17]] to %[[val_8]] : !fir.ref<f32>
-!CHECK: }
-!CHECK: omp.yield
-!CHECK: }
-!CHECK: omp.terminator
-!CHECK: }
-!CHECK: return
-!CHECK: }
-subroutine lastprivate_common
- common /c/ x, y
- real x, y
- !$omp do lastprivate(/c/)
- do i=1,100
- end do
- !$omp end do
-end subroutine
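
The fir.if guard checked above implements the observable lastprivate semantics: the values from the sequentially last iteration are copied back to the original common-block storage after the loop. Illustrative sketch (not taken from the test):

    subroutine lastprivate_common_semantics
      common /c/ x, y
      real x, y
      integer i
      !$omp do lastprivate(/c/)
      do i = 1, 100
         x = real(i)          ! after the loop, x holds the value from i == 100
         y = 2.0 * real(i)    ! likewise y holds 200.0
      end do
      !$omp end do
    end subroutine
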
diff --git a/flang/test/Lower/OpenMP/FIR/location.f90 b/flang/test/Lower/OpenMP/FIR/location.f90
deleted file mode 100644
index 6a7fb3c03584..000000000000
--- a/flang/test/Lower/OpenMP/FIR/location.f90
+++ /dev/null
@@ -1,71 +0,0 @@
-! This test checks the source locations of OpenMP constructs and clauses.
-
-!RUN: %flang_fc1 -emit-fir -fopenmp -mmlir --mlir-print-debuginfo %s -o - | FileCheck %s
-
-!CHECK-LABEL: sub_parallel
-subroutine sub_parallel()
- print *, x
-!CHECK: omp.parallel {
- !$omp parallel
- print *, x
-!CHECK: omp.terminator loc(#[[PAR_LOC:.*]])
-!CHECK: } loc(#[[PAR_LOC]])
- !$omp end parallel
- print *, x
-end
-
-!CHECK-LABEL: sub_target
-subroutine sub_target()
- print *, x
-!CHECK: omp.target {{.*}} {
- !$omp target
- print *, x
-!CHECK: omp.terminator loc(#[[TAR_LOC:.*]])
-!CHECK: } loc(#[[TAR_LOC]])
- !$omp end target
- print *, x
-end
-
-!CHECK-LABEL: sub_loop
-subroutine sub_loop()
-!CHECK: omp.wsloop {
-!CHECK-NEXT: omp.loop_nest {{.*}} {
- !$omp do
- do i=1,10
- print *, i
-!CHECK: omp.yield loc(#[[LOOP_LOC:.*]])
-!CHECK: } loc(#[[LOOP_LOC]])
-!CHECK: omp.terminator loc(#[[LOOP_LOC]])
-!CHECK: } loc(#[[LOOP_LOC]])
- end do
- !$omp end do
-end
-
-!CHECK-LABEL: sub_standalone
-subroutine sub_standalone()
- !CHECK: omp.barrier loc(#[[BAR_LOC:.*]])
- !$omp barrier
- !CHECK: omp.taskwait loc(#[[TW_LOC:.*]])
- !$omp taskwait
- !CHECK: omp.taskyield loc(#[[TY_LOC:.*]])
- !$omp taskyield
-end
-
-subroutine sub_if(c)
- logical(kind=4) :: c
- !CHECK: %[[CVT:.*]] = fir.convert %{{.*}} : (!fir.logical<4>) -> i1 loc(#[[IF_LOC:.*]])
- !CHECK: omp.task if(%[[CVT]])
- !$omp task if(c)
- print *, "Task"
- !$omp end task
- !CHECK: } loc(#[[TASK_LOC:.*]])
-end subroutine
-
-!CHECK: #[[PAR_LOC]] = loc("{{.*}}location.f90":9:9)
-!CHECK: #[[TAR_LOC]] = loc("{{.*}}location.f90":21:9)
-!CHECK: #[[LOOP_LOC]] = loc("{{.*}}location.f90":33:9)
-!CHECK: #[[BAR_LOC]] = loc("{{.*}}location.f90":47:9)
-!CHECK: #[[TW_LOC]] = loc("{{.*}}location.f90":49:9)
-!CHECK: #[[TY_LOC]] = loc("{{.*}}location.f90":51:9)
-!CHECK: #[[IF_LOC]] = loc("{{.*}}location.f90":58:14)
-!CHECK: #[[TASK_LOC]] = loc("{{.*}}location.f90":58:9)
diff --git a/flang/test/Lower/OpenMP/FIR/loop-combined.f90 b/flang/test/Lower/OpenMP/FIR/loop-combined.f90
deleted file mode 100644
index 6c6618dc9fb5..000000000000
--- a/flang/test/Lower/OpenMP/FIR/loop-combined.f90
+++ /dev/null
@@ -1,83 +0,0 @@
-! This test checks lowering of OpenMP combined loop constructs.
-
-! RUN: bbc -fopenmp -emit-fir %s -o - | FileCheck %s
-! RUN: %flang_fc1 -fopenmp -emit-fir %s -o - | FileCheck %s
-
-program main
- integer :: i
-
- ! TODO When DISTRIBUTE, TASKLOOP and TEAMS are supported, add:
- ! - DISTRIBUTE PARALLEL DO SIMD
- ! - DISTRIBUTE PARALLEL DO
- ! - DISTRIBUTE SIMD
- ! - TARGET TEAMS DISTRIBUTE PARALLEL DO SIMD
- ! - TARGET TEAMS DISTRIBUTE PARALLEL DO
- ! - TARGET TEAMS DISTRIBUTE SIMD
- ! - TARGET TEAMS DISTRIBUTE
- ! - TASKLOOP SIMD
- ! - TEAMS DISTRIBUTE PARALLEL DO SIMD
- ! - TEAMS DISTRIBUTE PARALLEL DO
- ! - TEAMS DISTRIBUTE SIMD
- ! - TEAMS DISTRIBUTE
-
- ! ----------------------------------------------------------------------------
- ! DO SIMD
- ! ----------------------------------------------------------------------------
- ! CHECK: omp.wsloop
- !$omp do simd
- do i = 1, 10
- end do
- !$omp end do simd
-
- ! ----------------------------------------------------------------------------
- ! PARALLEL DO SIMD
- ! ----------------------------------------------------------------------------
- ! CHECK: omp.parallel
- ! CHECK: omp.wsloop
- !$omp parallel do simd
- do i = 1, 10
- end do
- !$omp end parallel do simd
-
- ! ----------------------------------------------------------------------------
- ! PARALLEL DO
- ! ----------------------------------------------------------------------------
- ! CHECK: omp.parallel
- ! CHECK: omp.wsloop
- !$omp parallel do
- do i = 1, 10
- end do
- !$omp end parallel do
-
- ! ----------------------------------------------------------------------------
- ! TARGET PARALLEL DO SIMD
- ! ----------------------------------------------------------------------------
- ! CHECK: omp.target
- ! CHECK: omp.parallel
- ! CHECK: omp.wsloop
- !$omp target parallel do simd
- do i = 1, 10
- end do
- !$omp end target parallel do simd
-
- ! ----------------------------------------------------------------------------
- ! TARGET PARALLEL DO
- ! ----------------------------------------------------------------------------
- ! CHECK: omp.target
- ! CHECK: omp.parallel
- ! CHECK: omp.wsloop
- !$omp target parallel do
- do i = 1, 10
- end do
- !$omp end target parallel do
-
- ! ----------------------------------------------------------------------------
- ! TARGET SIMD
- ! ----------------------------------------------------------------------------
- ! CHECK: omp.target
- ! CHECK: omp.simd
- !$omp target simd
- do i = 1, 10
- end do
- !$omp end target simd
-end program main
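
The CHECK ordering above reflects that a combined construct lowers to one nested operation per constituent directive; semantically, the combined form behaves essentially like the explicitly nested form. Illustrative sketch:

    subroutine combined_equiv_sketch
      integer :: i
      !$omp target parallel do          ! the combined form ...
      do i = 1, 10
      end do
      !$omp end target parallel do
      !$omp target                      ! ... behaves like the nested form
      !$omp parallel
      !$omp do
      do i = 1, 10
      end do
      !$omp end do
      !$omp end parallel
      !$omp end target
    end subroutine
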
diff --git a/flang/test/Lower/OpenMP/FIR/map-component-ref.f90 b/flang/test/Lower/OpenMP/FIR/map-component-ref.f90
deleted file mode 100644
index 6799941701f4..000000000000
--- a/flang/test/Lower/OpenMP/FIR/map-component-ref.f90
+++ /dev/null
@@ -1,33 +0,0 @@
-! RUN: %flang_fc1 -emit-fir -fopenmp %s -o - | FileCheck %s
-! RUN: bbc -fopenmp -emit-fir %s -o - | FileCheck %s
-
-! CHECK: %[[V0:[0-9]+]] = fir.alloca !fir.type<_QFfooTt0{a0:i32,a1:i32}> {bindc_name = "a", uniq_name = "_QFfooEa"}
-! CHECK: %[[V1:[0-9]+]] = fir.declare %[[V0]] {uniq_name = "_QFfooEa"} : (!fir.ref<!fir.type<_QFfooTt0{a0:i32,a1:i32}>>) -> !fir.ref<!fir.type<_QFfooTt0{a0:i32,a1:i32}>>
-! CHECK: %[[V2:[0-9]+]] = fir.field_index a1, !fir.type<_QFfooTt0{a0:i32,a1:i32}>
-! CHECK: %[[V3:[0-9]+]] = fir.coordinate_of %[[V1]], %[[V2]] : (!fir.ref<!fir.type<_QFfooTt0{a0:i32,a1:i32}>>, !fir.field) -> !fir.ref<i32>
-! CHECK: %[[V4:[0-9]+]] = omp.map.info var_ptr(%[[V3]] : !fir.ref<i32>, i32) map_clauses(tofrom) capture(ByRef) -> !fir.ref<i32> {name = "a%a1"}
-! CHECK: %[[V5:[0-9]+]] = omp.map.info var_ptr(%[[V1]] : !fir.ref<!fir.type<_QFfooTt0{a0:i32,a1:i32}>>, !fir.type<_QFfooTt0{a0:i32,a1:i32}>) map_clauses(implicit, tofrom) capture(ByRef) -> !fir.ref<!fir.type<_QFfooTt0{a0:i32,a1:i32}>> {name = "a"}
-! CHECK: omp.target map_entries(%[[V4]] -> %arg0, %[[V5]] -> %arg1 : !fir.ref<i32>, !fir.ref<!fir.type<_QFfooTt0{a0:i32,a1:i32}>>) {
-! CHECK: ^bb0(%arg0: !fir.ref<i32>, %arg1: !fir.ref<!fir.type<_QFfooTt0{a0:i32,a1:i32}>>):
-! CHECK: %c0_i32 = arith.constant 0 : i32
-! CHECK: %[[V6:[0-9]+]] = fir.declare %arg1 {uniq_name = "_QFfooEa"} : (!fir.ref<!fir.type<_QFfooTt0{a0:i32,a1:i32}>>) -> !fir.ref<!fir.type<_QFfooTt0{a0:i32,a1:i32}>>
-! CHECK: %[[V7:[0-9]+]] = fir.field_index a1, !fir.type<_QFfooTt0{a0:i32,a1:i32}>
-! CHECK: %[[V8:[0-9]+]] = fir.coordinate_of %[[V6]], %[[V7]] : (!fir.ref<!fir.type<_QFfooTt0{a0:i32,a1:i32}>>, !fir.field) -> !fir.ref<i32>
-! CHECK: fir.store %c0_i32 to %[[V8]] : !fir.ref<i32>
-! CHECK: omp.terminator
-! CHECK: }
-
-subroutine foo()
- implicit none
-
- type t0
- integer :: a0, a1
- end type
-
- type(t0) :: a
-
- !$omp target map(a%a1)
- a%a1 = 0
- !$omp end target
-end
-
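
Mapping a single derived-type component yields an explicit map entry for the component plus an implicit tofrom entry for the enclosing object, which is what the two omp.map.info results above show. Minimal illustrative usage (type and names invented):

    subroutine map_component_sketch
      type t
         integer :: a0, a1
      end type
      type(t) :: a
      !$omp target map(tofrom: a%a1)   ! only a%a1 is mapped explicitly;
      a%a1 = a%a1 + 1                  ! the parent object is mapped implicitly
      !$omp end target
    end subroutine
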
diff --git a/flang/test/Lower/OpenMP/FIR/master.f90 b/flang/test/Lower/OpenMP/FIR/master.f90
deleted file mode 100644
index dd9910da2f41..000000000000
--- a/flang/test/Lower/OpenMP/FIR/master.f90
+++ /dev/null
@@ -1,100 +0,0 @@
-!RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp %s -o - | FileCheck %s --check-prefixes="FIRDialect,OMPDialect"
-!RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp %s -o - | fir-opt --cfg-conversion | fir-opt --fir-to-llvm-ir | FileCheck %s --check-prefixes="OMPDialect"
-
-!===============================================================================
-! parallel construct with a function call that has a master construct internally
-!===============================================================================
-!FIRDialect-LABEL: func @_QPomp_master
-subroutine omp_master()
-
-!OMPDialect: omp.master {
-!$omp master
-
- !FIRDialect: fir.call @_QPmaster() {{.*}}: () -> ()
- call master()
-
-!OMPDialect: omp.terminator
-!$omp end master
-
-end subroutine omp_master
-
-!FIRDialect-LABEL: func @_QPparallel_function_master
-subroutine parallel_function_master()
-
-!OMPDialect: omp.parallel {
-!$omp parallel
-
- !FIRDialect: fir.call @_QPfoo() {{.*}}: () -> ()
- call foo()
-
-!OMPDialect: omp.terminator
-!$omp end parallel
-
-end subroutine parallel_function_master
-
-!===============================================================================
-! master construct nested inside parallel construct
-!===============================================================================
-
-!FIRDialect-LABEL: func @_QPomp_parallel_master
-subroutine omp_parallel_master()
-
-!OMPDialect: omp.parallel {
-!$omp parallel
- !FIRDialect: fir.call @_QPparallel() {{.*}}: () -> ()
- call parallel()
-
-!OMPDialect: omp.master {
-!$omp master
-
- !FIRDialect: fir.call @_QPparallel_master() {{.*}}: () -> ()
- call parallel_master()
-
-!OMPDialect: omp.terminator
-!$omp end master
-
-!OMPDialect: omp.terminator
-!$omp end parallel
-
-end subroutine omp_parallel_master
-
-!===============================================================================
-! parallel construct nested inside master construct with conditional flow
-!===============================================================================
-
-!FIRDialect-LABEL: func @_QPomp_master_parallel
-subroutine omp_master_parallel()
- integer :: alpha, beta, gama
- alpha = 4
- beta = 5
- gama = 6
-
-!OMPDialect: omp.master {
-!$omp master
-
- !FIRDialect: %{{.*}} = fir.load %{{.*}}
- !FIRDialect: %{{.*}} = fir.load %{{.*}}
- !FIRDialect: %[[RESULT:.*]] = arith.cmpi sge, %{{.*}}, %{{.*}}
- !FIRDialect: fir.if %[[RESULT]] {
- if (alpha .ge. gama) then
-
-!OMPDialect: omp.parallel {
-!$omp parallel
- !FIRDialect: fir.call @_QPinside_if_parallel() {{.*}}: () -> ()
- call inside_if_parallel()
-
-!OMPDialect: omp.terminator
-!$omp end parallel
-
- !FIRDialect: %{{.*}} = fir.load %{{.*}}
- !FIRDialect: %{{.*}} = fir.load %{{.*}}
- !FIRDialect: %{{.*}} = arith.addi %{{.*}}, %{{.*}}
- !FIRDialect: fir.store %{{.*}} to %{{.*}}
- beta = alpha + gama
- end if
- !FIRDialect: else
-
-!OMPDialect: omp.terminator
-!$omp end master
-
-end subroutine omp_master_parallel
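
The master construct limits execution of its region to the master thread of the team and, unlike single, implies no barrier on entry or exit. Minimal illustrative usage (not taken from the test):

    subroutine master_usage_sketch
      !$omp parallel
      !$omp master
      print *, "executed by the master thread only"
      !$omp end master      ! note: no implied barrier here
      !$omp end parallel
    end subroutine
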
diff --git a/flang/test/Lower/OpenMP/FIR/omp-declare-target-program-var.f90 b/flang/test/Lower/OpenMP/FIR/omp-declare-target-program-var.f90
deleted file mode 100644
index 0da76f6d9ad2..000000000000
--- a/flang/test/Lower/OpenMP/FIR/omp-declare-target-program-var.f90
+++ /dev/null
@@ -1,12 +0,0 @@
-!RUN: %flang_fc1 -emit-fir -fopenmp %s -o - | FileCheck %s --check-prefixes=HOST,ALL
-!RUN: %flang_fc1 -emit-fir -fopenmp -fopenmp-is-target-device %s -o - | FileCheck %s --check-prefix=ALL
-
-PROGRAM main
- ! HOST-DAG: %0 = fir.alloca f32 {bindc_name = "i", uniq_name = "_QFEi"}
- REAL :: I
- ! ALL-DAG: fir.global internal @_QFEi {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (to)>} : f32 {
- ! ALL-DAG: %0 = fir.undefined f32
- ! ALL-DAG: fir.has_value %0 : f32
- ! ALL-DAG: }
- !$omp declare target(I)
-END
diff --git a/flang/test/Lower/OpenMP/FIR/omp-is-gpu.f90 b/flang/test/Lower/OpenMP/FIR/omp-is-gpu.f90
deleted file mode 100644
index ac8d24974801..000000000000
--- a/flang/test/Lower/OpenMP/FIR/omp-is-gpu.f90
+++ /dev/null
@@ -1,16 +0,0 @@
-!REQUIRES: amdgpu-registered-target, nvptx-registered-target
-
-!RUN: %flang_fc1 -triple amdgcn-amd-amdhsa -emit-fir -fopenmp -fopenmp-is-target-device %s -o - | FileCheck %s
-!RUN: %flang_fc1 -triple nvptx64-nvidia-cuda -emit-fir -fopenmp -fopenmp-is-target-device %s -o - | FileCheck %s
-!RUN: bbc -fopenmp -fopenmp-is-target-device -fopenmp-is-gpu -emit-fir -o - %s | FileCheck %s
-
-!RUN: not %flang_fc1 -triple amdgcn-amd-amdhsa -emit-fir -fopenmp %s -o - 2>&1 | FileCheck %s --check-prefix=FLANG-ERROR
-!RUN: not %flang_fc1 -triple nvptx64-nvidia-cuda -emit-fir -fopenmp %s -o - 2>&1 | FileCheck %s --check-prefix=FLANG-ERROR
-!RUN: not bbc -fopenmp -fopenmp-is-gpu -emit-fir %s -o - 2>&1 | FileCheck %s --check-prefix=BBC-ERROR
-
-!CHECK: module attributes {{{.*}}omp.is_gpu = true
-subroutine omp_subroutine()
-end subroutine omp_subroutine
-
-!FLANG-ERROR: error: OpenMP AMDGPU/NVPTX is only prepared to deal with device code.
-!BBC-ERROR: FATAL: -fopenmp-is-gpu can only be set if -fopenmp-is-target-device is also set
diff --git a/flang/test/Lower/OpenMP/FIR/ordered-threads.f90 b/flang/test/Lower/OpenMP/FIR/ordered-threads.f90
deleted file mode 100644
index 2dea4c857e87..000000000000
--- a/flang/test/Lower/OpenMP/FIR/ordered-threads.f90
+++ /dev/null
@@ -1,40 +0,0 @@
-! This test checks lowering of the OpenMP ordered directive with the threads
-! clause. Without a clause, the ordered directive behaves as if the threads
-! clause were specified.
-
-!RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp %s -o - | FileCheck %s --check-prefix=FIRDialect
-!RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp %s -o - | fir-opt --fir-to-llvm-ir | FileCheck %s --check-prefix=LLVMIRDialect
-!RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp %s -o - | fir-opt --fir-to-llvm-ir | tco | FileCheck %s --check-prefix=LLVMIR
-
-subroutine ordered
- integer :: i
- integer :: a(20)
-
-!FIRDialect: omp.ordered.region {
-!LLVMIRDialect: omp.ordered.region {
-!LLVMIR: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB0:[0-9]+]])
-!LLVMIR-NEXT: call void @__kmpc_ordered(ptr @[[GLOB0]], i32 [[TMP0]])
-!$OMP ORDERED
- a(i) = a(i-1) + 1
-!FIRDialect: omp.terminator
-!FIRDialect-NEXT: }
-!LLVMIRDialect: omp.terminator
-!LLVMIRDialect-NEXT: }
-!LLVMIR: call void @__kmpc_end_ordered(ptr @[[GLOB0]], i32 [[TMP0]])
-!$OMP END ORDERED
-
-!FIRDialect: omp.ordered.region {
-!LLVMIRDialect: omp.ordered.region {
-!LLVMIR: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1:[0-9]+]])
-!LLVMIR-NEXT: call void @__kmpc_ordered(ptr @[[GLOB1]], i32 [[TMP1]])
-!$OMP ORDERED THREADS
- a(i) = a(i-1) + 1
-!FIRDialect: omp.terminator
-!FIRDialect-NEXT: }
-!LLVMIRDialect: omp.terminator
-!LLVMIRDialect-NEXT: }
-!LLVMIR: call void @__kmpc_end_ordered(ptr @[[GLOB1]], i32 [[TMP1]])
-!LLVMIR-NEXT: ret void
-!$OMP END ORDERED
-
-end
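
Since a bare ordered is interchangeable with ordered threads, both regions above lower to the same __kmpc_ordered/__kmpc_end_ordered call pairs. For reference, a conforming use also requires the ordered clause on the enclosing loop directive, which the lowering-only test above omits; an illustrative sketch:

    subroutine ordered_usage_sketch(a)
      integer :: a(20), i
      !$omp do ordered
      do i = 2, 20
         !$omp ordered       ! equivalent to: !$omp ordered threads
         a(i) = a(i-1) + 1
         !$omp end ordered
      end do
      !$omp end do
    end subroutine
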
diff --git a/flang/test/Lower/OpenMP/FIR/parallel-firstprivate-clause-scalar.f90 b/flang/test/Lower/OpenMP/FIR/parallel-firstprivate-clause-scalar.f90
deleted file mode 100644
index 37f916ecb84c..000000000000
--- a/flang/test/Lower/OpenMP/FIR/parallel-firstprivate-clause-scalar.f90
+++ /dev/null
@@ -1,159 +0,0 @@
-! This test checks lowering of the `FIRSTPRIVATE` clause for scalar types.
-
-! REQUIRES: shell
-! RUN: bbc -fopenmp -emit-fir -hlfir=false %s -o - | FileCheck %s --check-prefix=FIRDialect
-
-!FIRDialect-DAG: func @_QPfirstprivate_complex(%[[ARG1:.*]]: !fir.ref<!fir.complex<4>>{{.*}}, %[[ARG2:.*]]: !fir.ref<!fir.complex<8>>{{.*}}) {
-!FIRDialect: omp.parallel {
-!FIRDialect: %[[ARG1_PVT:.*]] = fir.alloca !fir.complex<4> {bindc_name = "arg1", pinned, uniq_name = "_QFfirstprivate_complexEarg1"}
-!FIRDialect: %[[ARG1_VAL:.*]] = fir.load %[[ARG1]] : !fir.ref<!fir.complex<4>>
-!FIRDialect: fir.store %[[ARG1_VAL]] to %[[ARG1_PVT]] : !fir.ref<!fir.complex<4>>
-!FIRDialect: %[[ARG2_PVT:.*]] = fir.alloca !fir.complex<8> {bindc_name = "arg2", pinned, uniq_name = "_QFfirstprivate_complexEarg2"}
-!FIRDialect: %[[ARG2_VAL:.*]] = fir.load %[[ARG2]] : !fir.ref<!fir.complex<8>>
-!FIRDialect: fir.store %[[ARG2_VAL]] to %[[ARG2_PVT]] : !fir.ref<!fir.complex<8>>
-!FIRDialect: fir.call @_QPfoo(%[[ARG1_PVT]], %[[ARG2_PVT]]) {{.*}}: (!fir.ref<!fir.complex<4>>, !fir.ref<!fir.complex<8>>) -> ()
-!FIRDialect: omp.terminator
-!FIRDialect: }
-
-subroutine firstprivate_complex(arg1, arg2)
- complex(4) :: arg1
- complex(8) :: arg2
-
-!$OMP PARALLEL FIRSTPRIVATE(arg1, arg2)
- call foo(arg1, arg2)
-!$OMP END PARALLEL
-
-end subroutine
-
-!FIRDialect-DAG: func @_QPfirstprivate_integer(%[[ARG1:.*]]: !fir.ref<i32>{{.*}}, %[[ARG2:.*]]: !fir.ref<i8>{{.*}}, %[[ARG3:.*]]: !fir.ref<i16>{{.*}}, %[[ARG4:.*]]: !fir.ref<i32>{{.*}}, %[[ARG5:.*]]: !fir.ref<i64>{{.*}}, %[[ARG6:.*]]: !fir.ref<i128>{{.*}}) {
-!FIRDialect: omp.parallel {
-!FIRDialect: %[[ARG1_PVT:.*]] = fir.alloca i32 {bindc_name = "arg1", pinned, uniq_name = "_QFfirstprivate_integerEarg1"}
-!FIRDialect: %[[ARG1_VAL:.*]] = fir.load %[[ARG1]] : !fir.ref<i32>
-!FIRDialect: fir.store %[[ARG1_VAL]] to %[[ARG1_PVT]] : !fir.ref<i32>
-!FIRDialect: %[[ARG2_PVT:.*]] = fir.alloca i8 {bindc_name = "arg2", pinned, uniq_name = "_QFfirstprivate_integerEarg2"}
-!FIRDialect: %[[ARG2_VAL:.*]] = fir.load %[[ARG2]] : !fir.ref<i8>
-!FIRDialect: fir.store %[[ARG2_VAL]] to %[[ARG2_PVT]] : !fir.ref<i8>
-!FIRDialect: %[[ARG3_PVT:.*]] = fir.alloca i16 {bindc_name = "arg3", pinned, uniq_name = "_QFfirstprivate_integerEarg3"}
-!FIRDialect: %[[ARG3_VAL:.*]] = fir.load %[[ARG3]] : !fir.ref<i16>
-!FIRDialect: fir.store %[[ARG3_VAL]] to %[[ARG3_PVT]] : !fir.ref<i16>
-!FIRDialect: %[[ARG4_PVT:.*]] = fir.alloca i32 {bindc_name = "arg4", pinned, uniq_name = "_QFfirstprivate_integerEarg4"}
-!FIRDialect: %[[ARG4_VAL:.*]] = fir.load %[[ARG4]] : !fir.ref<i32>
-!FIRDialect: fir.store %[[ARG4_VAL]] to %[[ARG4_PVT]] : !fir.ref<i32>
-!FIRDialect: %[[ARG5_PVT:.*]] = fir.alloca i64 {bindc_name = "arg5", pinned, uniq_name = "_QFfirstprivate_integerEarg5"}
-!FIRDialect: %[[ARG5_VAL:.*]] = fir.load %[[ARG5]] : !fir.ref<i64>
-!FIRDialect: fir.store %[[ARG5_VAL]] to %[[ARG5_PVT]] : !fir.ref<i64>
-!FIRDialect: %[[ARG6_PVT:.*]] = fir.alloca i128 {bindc_name = "arg6", pinned, uniq_name = "_QFfirstprivate_integerEarg6"}
-!FIRDialect: %[[ARG6_VAL:.*]] = fir.load %[[ARG6]] : !fir.ref<i128>
-!FIRDialect: fir.store %[[ARG6_VAL]] to %[[ARG6_PVT]] : !fir.ref<i128>
-!FIRDialect: fir.call @_QPbar(%[[ARG1_PVT]], %[[ARG2_PVT]], %[[ARG3_PVT]], %[[ARG4_PVT]], %[[ARG5_PVT]], %[[ARG6_PVT]]) {{.*}}: (!fir.ref<i32>, !fir.ref<i8>, !fir.ref<i16>, !fir.ref<i32>, !fir.ref<i64>, !fir.ref<i128>) -> ()
-!FIRDialect: omp.terminator
-!FIRDialect: }
-
-subroutine firstprivate_integer(arg1, arg2, arg3, arg4, arg5, arg6)
- integer :: arg1
- integer(kind=1) :: arg2
- integer(kind=2) :: arg3
- integer(kind=4) :: arg4
- integer(kind=8) :: arg5
- integer(kind=16) :: arg6
-
-!$OMP PARALLEL FIRSTPRIVATE(arg1, arg2, arg3, arg4, arg5, arg6)
- call bar(arg1, arg2, arg3, arg4, arg5, arg6)
-!$OMP END PARALLEL
-
-end subroutine
-
-!FIRDialect-DAG: func @_QPfirstprivate_logical(%[[ARG1:.*]]: !fir.ref<!fir.logical<4>>{{.*}}, %[[ARG2:.*]]: !fir.ref<!fir.logical<1>>{{.*}}, %[[ARG3:.*]]: !fir.ref<!fir.logical<2>>{{.*}}, %[[ARG4:.*]]: !fir.ref<!fir.logical<4>>{{.*}}, %[[ARG5:.*]]: !fir.ref<!fir.logical<8>>{{.*}}) {
-!FIRDialect: omp.parallel {
-!FIRDialect: %[[ARG1_PVT:.*]] = fir.alloca !fir.logical<4> {bindc_name = "arg1", pinned, uniq_name = "_QFfirstprivate_logicalEarg1"}
-!FIRDialect: %[[ARG1_VAL:.*]] = fir.load %[[ARG1]] : !fir.ref<!fir.logical<4>>
-!FIRDialect: fir.store %[[ARG1_VAL]] to %[[ARG1_PVT]] : !fir.ref<!fir.logical<4>>
-!FIRDialect: %[[ARG2_PVT:.*]] = fir.alloca !fir.logical<1> {bindc_name = "arg2", pinned, uniq_name = "_QFfirstprivate_logicalEarg2"}
-!FIRDialect: %[[ARG2_VAL:.*]] = fir.load %[[ARG2]] : !fir.ref<!fir.logical<1>>
-!FIRDialect: fir.store %[[ARG2_VAL]] to %[[ARG2_PVT]] : !fir.ref<!fir.logical<1>>
-!FIRDialect: %[[ARG3_PVT:.*]] = fir.alloca !fir.logical<2> {bindc_name = "arg3", pinned, uniq_name = "_QFfirstprivate_logicalEarg3"}
-!FIRDialect: %[[ARG3_VAL:.*]] = fir.load %[[ARG3]] : !fir.ref<!fir.logical<2>>
-!FIRDialect: fir.store %[[ARG3_VAL]] to %[[ARG3_PVT]] : !fir.ref<!fir.logical<2>>
-!FIRDialect: %[[ARG4_PVT:.*]] = fir.alloca !fir.logical<4> {bindc_name = "arg4", pinned, uniq_name = "_QFfirstprivate_logicalEarg4"}
-!FIRDialect: %[[ARG4_VAL:.*]] = fir.load %[[ARG4]] : !fir.ref<!fir.logical<4>>
-!FIRDialect: fir.store %[[ARG4_VAL]] to %[[ARG4_PVT]] : !fir.ref<!fir.logical<4>>
-!FIRDialect: %[[ARG5_PVT:.*]] = fir.alloca !fir.logical<8> {bindc_name = "arg5", pinned, uniq_name = "_QFfirstprivate_logicalEarg5"}
-!FIRDialect: %[[ARG5_VAL:.*]] = fir.load %[[ARG5]] : !fir.ref<!fir.logical<8>>
-!FIRDialect: fir.store %[[ARG5_VAL]] to %[[ARG5_PVT]] : !fir.ref<!fir.logical<8>>
-!FIRDialect: fir.call @_QPbaz(%[[ARG1_PVT]], %[[ARG2_PVT]], %[[ARG3_PVT]], %[[ARG4_PVT]], %[[ARG5_PVT]]) {{.*}}: (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<1>>, !fir.ref<!fir.logical<2>>, !fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<8>>) -> ()
-!FIRDialect: omp.terminator
-!FIRDialect: }
-
-subroutine firstprivate_logical(arg1, arg2, arg3, arg4, arg5)
- logical :: arg1
- logical(kind=1) :: arg2
- logical(kind=2) :: arg3
- logical(kind=4) :: arg4
- logical(kind=8) :: arg5
-
-!$OMP PARALLEL FIRSTPRIVATE(arg1, arg2, arg3, arg4, arg5)
- call baz(arg1, arg2, arg3, arg4, arg5)
-!$OMP END PARALLEL
-
-end subroutine
-
-!FIRDialect-DAG: func @_QPfirstprivate_real(%[[ARG1:.*]]: !fir.ref<f32>{{.*}}, %[[ARG2:.*]]: !fir.ref<f16>{{.*}}, %[[ARG3:.*]]: !fir.ref<f32>{{.*}}, %[[ARG4:.*]]: !fir.ref<f64>{{.*}}, %[[ARG5:.*]]: !fir.ref<f80>{{.*}}, %[[ARG6:.*]]: !fir.ref<f128>{{.*}}) {
-!FIRDialect: omp.parallel {
-!FIRDialect: %[[ARG1_PVT:.*]] = fir.alloca f32 {bindc_name = "arg1", pinned, uniq_name = "_QFfirstprivate_realEarg1"}
-!FIRDialect: %[[ARG1_VAL:.*]] = fir.load %[[ARG1]] : !fir.ref<f32>
-!FIRDialect: fir.store %[[ARG1_VAL]] to %[[ARG1_PVT]] : !fir.ref<f32>
-!FIRDialect: %[[ARG2_PVT:.*]] = fir.alloca f16 {bindc_name = "arg2", pinned, uniq_name = "_QFfirstprivate_realEarg2"}
-!FIRDialect: %[[ARG2_VAL:.*]] = fir.load %[[ARG2]] : !fir.ref<f16>
-!FIRDialect: fir.store %[[ARG2_VAL]] to %[[ARG2_PVT]] : !fir.ref<f16>
-!FIRDialect: %[[ARG3_PVT:.*]] = fir.alloca f32 {bindc_name = "arg3", pinned, uniq_name = "_QFfirstprivate_realEarg3"}
-!FIRDialect: %[[ARG3_VAL:.*]] = fir.load %[[ARG3]] : !fir.ref<f32>
-!FIRDialect: fir.store %[[ARG3_VAL]] to %[[ARG3_PVT]] : !fir.ref<f32>
-!FIRDialect: %[[ARG4_PVT:.*]] = fir.alloca f64 {bindc_name = "arg4", pinned, uniq_name = "_QFfirstprivate_realEarg4"}
-!FIRDialect: %[[ARG4_VAL:.*]] = fir.load %[[ARG4]] : !fir.ref<f64>
-!FIRDialect: fir.store %[[ARG4_VAL]] to %[[ARG4_PVT]] : !fir.ref<f64>
-!FIRDialect: %[[ARG5_PVT:.*]] = fir.alloca f80 {bindc_name = "arg5", pinned, uniq_name = "_QFfirstprivate_realEarg5"}
-!FIRDialect: %[[ARG5_VAL:.*]] = fir.load %[[ARG5]] : !fir.ref<f80>
-!FIRDialect: fir.store %[[ARG5_VAL]] to %[[ARG5_PVT]] : !fir.ref<f80>
-!FIRDialect: %[[ARG6_PVT:.*]] = fir.alloca f128 {bindc_name = "arg6", pinned, uniq_name = "_QFfirstprivate_realEarg6"}
-!FIRDialect: %[[ARG6_VAL:.*]] = fir.load %[[ARG6]] : !fir.ref<f128>
-!FIRDialect: fir.store %[[ARG6_VAL]] to %[[ARG6_PVT]] : !fir.ref<f128>
-!FIRDialect: fir.call @_QPqux(%[[ARG1_PVT]], %[[ARG2_PVT]], %[[ARG3_PVT]], %[[ARG4_PVT]], %[[ARG5_PVT]], %[[ARG6_PVT]]) {{.*}}: (!fir.ref<f32>, !fir.ref<f16>, !fir.ref<f32>, !fir.ref<f64>, !fir.ref<f80>, !fir.ref<f128>) -> ()
-!FIRDialect: omp.terminator
-!FIRDialect: }
-
-subroutine firstprivate_real(arg1, arg2, arg3, arg4, arg5, arg6)
- real :: arg1
- real(kind=2) :: arg2
- real(kind=4) :: arg3
- real(kind=8) :: arg4
- real(kind=10) :: arg5
- real(kind=16) :: arg6
-
-!$OMP PARALLEL FIRSTPRIVATE(arg1, arg2, arg3, arg4, arg5, arg6)
- call qux(arg1, arg2, arg3, arg4, arg5, arg6)
-!$OMP END PARALLEL
-
-end subroutine
-
-!FIRDialect-LABEL: func.func @_QPmultiple_firstprivate(
-!FIRDialect-SAME: %[[A_ADDR:.*]]: !fir.ref<i32> {fir.bindc_name = "a"},
-!FIRDialect-SAME: %[[B_ADDR:.*]]: !fir.ref<i32> {fir.bindc_name = "b"}) {
-!FIRDialect: omp.parallel {
-!FIRDialect: %[[A_PRIV_ADDR:.*]] = fir.alloca i32 {bindc_name = "a", pinned, uniq_name = "_QFmultiple_firstprivateEa"}
-!FIRDialect: %[[A:.*]] = fir.load %[[A_ADDR]] : !fir.ref<i32>
-!FIRDialect: fir.store %[[A]] to %[[A_PRIV_ADDR]] : !fir.ref<i32>
-!FIRDialect: %[[B_PRIV_ADDR:.*]] = fir.alloca i32 {bindc_name = "b", pinned, uniq_name = "_QFmultiple_firstprivateEb"}
-!FIRDialect: %[[B:.*]] = fir.load %[[B_ADDR]] : !fir.ref<i32>
-!FIRDialect: fir.store %[[B]] to %[[B_PRIV_ADDR]] : !fir.ref<i32>
-!FIRDialect: fir.call @_QPquux(%[[A_PRIV_ADDR]], %[[B_PRIV_ADDR]]) {{.*}}: (!fir.ref<i32>, !fir.ref<i32>) -> ()
-!FIRDialect: omp.terminator
-!FIRDialect: }
-!FIRDialect: return
-!FIRDialect: }
-
-subroutine multiple_firstprivate(a, b)
- integer :: a, b
-!$OMP PARALLEL FIRSTPRIVATE(a) FIRSTPRIVATE(b)
- call quux(a, b)
-!$OMP END PARALLEL
-end subroutine multiple_firstprivate
diff --git a/flang/test/Lower/OpenMP/FIR/parallel-lastprivate-clause-scalar.f90 b/flang/test/Lower/OpenMP/FIR/parallel-lastprivate-clause-scalar.f90
deleted file mode 100644
index 16832355f5d1..000000000000
--- a/flang/test/Lower/OpenMP/FIR/parallel-lastprivate-clause-scalar.f90
+++ /dev/null
@@ -1,261 +0,0 @@
-! This test checks lowering of the `LASTPRIVATE` clause for scalar types.
-
-! RUN: bbc -fopenmp -emit-fir -hlfir=false %s -o - | FileCheck %s
-! RUN: %flang_fc1 -fopenmp -emit-fir -flang-deprecated-no-hlfir %s -o - | FileCheck %s
-
-!CHECK: func @_QPlastprivate_character(%[[ARG1:.*]]: !fir.boxchar<1>{{.*}}) {
-!CHECK-DAG: %[[ARG1_UNBOX:.*]]:2 = fir.unboxchar
-!CHECK-DAG: %[[FIVE:.*]] = arith.constant 5 : index
-!CHECK-DAG: %[[ARG1_REF:.*]] = fir.convert %[[ARG1_UNBOX]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.char<1,5>>
-
-!CHECK: omp.parallel {
-!CHECK-DAG: %[[ARG1_PVT:.*]] = fir.alloca !fir.char<1,5> {bindc_name = "arg1",
-
-! Check that we are accessing the clone inside the loop
-!CHECK: omp.wsloop {
-!CHECK-NEXT: omp.loop_nest (%[[INDX_WS:.*]]) : {{.*}} {
-!CHECK: %[[UNIT:.*]] = arith.constant 6 : i32
-!CHECK-NEXT: %[[ADDR:.*]] = fir.address_of(@_QQclX
-!CHECK-NEXT: %[[CVT0:.*]] = fir.convert %[[ADDR]]
-!CHECK-NEXT: %[[CNST:.*]] = arith.constant
-!CHECK-NEXT: %[[CALL_BEGIN_IO:.*]] = fir.call @_FortranAioBeginExternalListOutput(%[[UNIT]], %[[CVT0]], %[[CNST]]) {{.*}}: (i32, !fir.ref<i8>, i32) -> !fir.ref<i8>
-!CHECK-NEXT: %[[CVT_0_1:.*]] = fir.convert %[[ARG1_PVT]]
-!CHECK-NEXT: %[[CVT_0_2:.*]] = fir.convert %[[FIVE]]
-!CHECK-NEXT: %[[CALL_OP_ASCII:.*]] = fir.call @_FortranAioOutputAscii(%[[CALL_BEGIN_IO]], %[[CVT_0_1]], %[[CVT_0_2]])
-!CHECK-NEXT: %[[CALL_END_IO:.*]] = fir.call @_FortranAioEndIoStatement(%[[CALL_BEGIN_IO]])
-
-! Testing last iteration check
-!CHECK: %[[V:.*]] = arith.addi %[[INDX_WS]], %{{.*}} : i32
-!CHECK: %[[C0:.*]] = arith.constant 0 : i32
-!CHECK: %[[T1:.*]] = arith.cmpi slt, %{{.*}}, %[[C0]] : i32
-!CHECK: %[[T2:.*]] = arith.cmpi slt, %[[V]], %{{.*}} : i32
-!CHECK: %[[T3:.*]] = arith.cmpi sgt, %[[V]], %{{.*}} : i32
-!CHECK: %[[IV_CMP:.*]] = arith.select %[[T1]], %[[T2]], %[[T3]] : i1
-!CHECK: fir.if %[[IV_CMP]] {
-!CHECK: fir.store %[[V]] to %{{.*}} : !fir.ref<i32>
-
-! Testing lastprivate val update
-!CHECK-DAG: %[[CVT:.*]] = fir.convert %[[ARG1_REF]] : (!fir.ref<!fir.char<1,5>>) -> !fir.ref<i8>
-!CHECK-DAG: %[[CVT1:.*]] = fir.convert %[[ARG1_PVT]] : (!fir.ref<!fir.char<1,5>>) -> !fir.ref<i8>
-!CHECK: fir.call @llvm.memmove.p0.p0.i64(%[[CVT]], %[[CVT1]]{{.*}})
-!CHECK: }
-!CHECK: omp.yield
-!CHECK: }
-!CHECK: omp.terminator
-!CHECK: }
-
-subroutine lastprivate_character(arg1)
- character(5) :: arg1
-!$OMP PARALLEL
-!$OMP DO LASTPRIVATE(arg1)
-do n = 1, 5
- arg1(n:n) = 'c'
- print *, arg1
-end do
-!$OMP END DO
-!$OMP END PARALLEL
-end subroutine
-
-!CHECK: func @_QPlastprivate_int(%[[ARG1:.*]]: !fir.ref<i32> {fir.bindc_name = "arg1"}) {
-!CHECK-DAG: omp.parallel {
-!CHECK-DAG: %[[CLONE:.*]] = fir.alloca i32 {bindc_name = "arg1"
-!CHECK: omp.wsloop {
-!CHECK-NEXT: omp.loop_nest (%[[INDX_WS:.*]]) : {{.*}} {
-
-! Testing last iteration check
-!CHECK: %[[V:.*]] = arith.addi %[[INDX_WS]], %{{.*}} : i32
-!CHECK: %[[C0:.*]] = arith.constant 0 : i32
-!CHECK: %[[T1:.*]] = arith.cmpi slt, %{{.*}}, %[[C0]] : i32
-!CHECK: %[[T2:.*]] = arith.cmpi slt, %[[V]], %{{.*}} : i32
-!CHECK: %[[T3:.*]] = arith.cmpi sgt, %[[V]], %{{.*}} : i32
-!CHECK: %[[IV_CMP:.*]] = arith.select %[[T1]], %[[T2]], %[[T3]] : i1
-!CHECK: fir.if %[[IV_CMP]] {
-!CHECK: fir.store %[[V]] to %{{.*}} : !fir.ref<i32>
-
-! Testing lastprivate val update
-!CHECK-NEXT: %[[CLONE_LD:.*]] = fir.load %[[CLONE]] : !fir.ref<i32>
-!CHECK-NEXT: fir.store %[[CLONE_LD]] to %[[ARG1]] : !fir.ref<i32>
-!CHECK: }
-!CHECK: omp.yield
-!CHECK: }
-!CHECK: omp.terminator
-!CHECK: }
-
-subroutine lastprivate_int(arg1)
- integer :: arg1
-!$OMP PARALLEL
-!$OMP DO LASTPRIVATE(arg1)
-do n = 1, 5
- arg1 = 2
- print *, arg1
-end do
-!$OMP END DO
-!$OMP END PARALLEL
-print *, arg1
-end subroutine
-
-!CHECK: func.func @_QPmult_lastprivate_int(%[[ARG1:.*]]: !fir.ref<i32> {fir.bindc_name = "arg1"}, %[[ARG2:.*]]: !fir.ref<i32> {fir.bindc_name = "arg2"}) {
-!CHECK: omp.parallel {
-!CHECK-DAG: %[[CLONE1:.*]] = fir.alloca i32 {bindc_name = "arg1"
-!CHECK-DAG: %[[CLONE2:.*]] = fir.alloca i32 {bindc_name = "arg2"
-!CHECK: omp.wsloop {
-!CHECK-NEXT: omp.loop_nest (%[[INDX_WS:.*]]) : {{.*}} {
-
-! Testing last iteration check
-!CHECK: %[[V:.*]] = arith.addi %[[INDX_WS]], %{{.*}} : i32
-!CHECK: %[[C0:.*]] = arith.constant 0 : i32
-!CHECK: %[[T1:.*]] = arith.cmpi slt, %{{.*}}, %[[C0]] : i32
-!CHECK: %[[T2:.*]] = arith.cmpi slt, %[[V]], %{{.*}} : i32
-!CHECK: %[[T3:.*]] = arith.cmpi sgt, %[[V]], %{{.*}} : i32
-!CHECK: %[[IV_CMP:.*]] = arith.select %[[T1]], %[[T2]], %[[T3]] : i1
-!CHECK: fir.if %[[IV_CMP]] {
-!CHECK: fir.store %[[V]] to %{{.*}} : !fir.ref<i32>
-! Testing lastprivate val update
-!CHECK-DAG: %[[CLONE_LD1:.*]] = fir.load %[[CLONE1]] : !fir.ref<i32>
-!CHECK-DAG: fir.store %[[CLONE_LD1]] to %[[ARG1]] : !fir.ref<i32>
-!CHECK-DAG: %[[CLONE_LD2:.*]] = fir.load %[[CLONE2]] : !fir.ref<i32>
-!CHECK-DAG: fir.store %[[CLONE_LD2]] to %[[ARG2]] : !fir.ref<i32>
-!CHECK: }
-!CHECK: omp.yield
-!CHECK: }
-!CHECK: omp.terminator
-!CHECK: }
-
-subroutine mult_lastprivate_int(arg1, arg2)
- integer :: arg1, arg2
-!$OMP PARALLEL
-!$OMP DO LASTPRIVATE(arg1) LASTPRIVATE(arg2)
-do n = 1, 5
- arg1 = 2
- arg2 = 3
- print *, arg1, arg2
-end do
-!$OMP END DO
-!$OMP END PARALLEL
-print *, arg1, arg2
-end subroutine
-
-!CHECK: func.func @_QPmult_lastprivate_int2(%[[ARG1:.*]]: !fir.ref<i32> {fir.bindc_name = "arg1"}, %[[ARG2:.*]]: !fir.ref<i32> {fir.bindc_name = "arg2"}) {
-!CHECK: omp.parallel {
-!CHECK-DAG: %[[CLONE1:.*]] = fir.alloca i32 {bindc_name = "arg1"
-!CHECK-DAG: %[[CLONE2:.*]] = fir.alloca i32 {bindc_name = "arg2"
-!CHECK: omp.wsloop {
-!CHECK-NEXT: omp.loop_nest (%[[INDX_WS:.*]]) : {{.*}} {
-
-! Testing last iteration check
-!CHECK: %[[V:.*]] = arith.addi %[[INDX_WS]], %{{.*}} : i32
-!CHECK: %[[C0:.*]] = arith.constant 0 : i32
-!CHECK: %[[T1:.*]] = arith.cmpi slt, %{{.*}}, %[[C0]] : i32
-!CHECK: %[[T2:.*]] = arith.cmpi slt, %[[V]], %{{.*}} : i32
-!CHECK: %[[T3:.*]] = arith.cmpi sgt, %[[V]], %{{.*}} : i32
-!CHECK: %[[IV_CMP:.*]] = arith.select %[[T1]], %[[T2]], %[[T3]] : i1
-!CHECK: fir.if %[[IV_CMP]] {
-!CHECK: fir.store %[[V]] to %{{.*}} : !fir.ref<i32>
-! Testing lastprivate val update
-!CHECK-DAG: %[[CLONE_LD2:.*]] = fir.load %[[CLONE2]] : !fir.ref<i32>
-!CHECK-DAG: fir.store %[[CLONE_LD2]] to %[[ARG2]] : !fir.ref<i32>
-!CHECK-DAG: %[[CLONE_LD1:.*]] = fir.load %[[CLONE1]] : !fir.ref<i32>
-!CHECK-DAG: fir.store %[[CLONE_LD1]] to %[[ARG1]] : !fir.ref<i32>
-!CHECK: }
-!CHECK: omp.yield
-!CHECK: }
-!CHECK: omp.terminator
-!CHECK: }
-
-subroutine mult_lastprivate_int2(arg1, arg2)
- integer :: arg1, arg2
-!$OMP PARALLEL
-!$OMP DO LASTPRIVATE(arg1, arg2)
-do n = 1, 5
- arg1 = 2
- arg2 = 3
- print *, arg1, arg2
-end do
-!$OMP END DO
-!$OMP END PARALLEL
-print *, arg1, arg2
-end subroutine
-
-!CHECK: func.func @_QPfirstpriv_lastpriv_int(%[[ARG1:.*]]: !fir.ref<i32> {fir.bindc_name = "arg1"}, %[[ARG2:.*]]: !fir.ref<i32> {fir.bindc_name = "arg2"}) {
-!CHECK: omp.parallel {
-! Firstprivate update
-!CHECK-DAG: %[[CLONE1:.*]] = fir.alloca i32 {bindc_name = "arg1"
-!CHECK-DAG: %[[FPV_LD:.*]] = fir.load %[[ARG1]] : !fir.ref<i32>
-!CHECK-DAG: fir.store %[[FPV_LD]] to %[[CLONE1]] : !fir.ref<i32>
-! Lastprivate allocation
-!CHECK-DAG: %[[CLONE2:.*]] = fir.alloca i32 {bindc_name = "arg2"
-!CHECK-NOT: omp.barrier
-!CHECK: omp.wsloop {
-!CHECK-NEXT: omp.loop_nest (%[[INDX_WS:.*]]) : {{.*}} {
-
-! Testing last iteration check
-!CHECK: %[[V:.*]] = arith.addi %[[INDX_WS]], %{{.*}} : i32
-!CHECK: %[[C0:.*]] = arith.constant 0 : i32
-!CHECK: %[[T1:.*]] = arith.cmpi slt, %{{.*}}, %[[C0]] : i32
-!CHECK: %[[T2:.*]] = arith.cmpi slt, %[[V]], %{{.*}} : i32
-!CHECK: %[[T3:.*]] = arith.cmpi sgt, %[[V]], %{{.*}} : i32
-!CHECK: %[[IV_CMP:.*]] = arith.select %[[T1]], %[[T2]], %[[T3]] : i1
-!CHECK: fir.if %[[IV_CMP]] {
-!CHECK: fir.store %[[V]] to %{{.*}} : !fir.ref<i32>
-! Testing lastprivate val update
-!CHECK-NEXT: %[[CLONE_LD:.*]] = fir.load %[[CLONE2]] : !fir.ref<i32>
-!CHECK-NEXT: fir.store %[[CLONE_LD]] to %[[ARG2]] : !fir.ref<i32>
-!CHECK-NEXT: }
-!CHECK-NEXT: omp.yield
-!CHECK-NEXT: }
-!CHECK-NEXT: omp.terminator
-!CHECK-NEXT: }
-
-subroutine firstpriv_lastpriv_int(arg1, arg2)
- integer :: arg1, arg2
-!$OMP PARALLEL
-!$OMP DO FIRSTPRIVATE(arg1) LASTPRIVATE(arg2)
-do n = 1, 5
- arg1 = 2
- arg2 = 3
- print *, arg1, arg2
-end do
-!$OMP END DO
-!$OMP END PARALLEL
-print *, arg1, arg2
-end subroutine
-
-!CHECK: func.func @_QPfirstpriv_lastpriv_int2(%[[ARG1:.*]]: !fir.ref<i32> {fir.bindc_name = "arg1"}) {
-!CHECK: omp.parallel {
-! Firstprivate update
-!CHECK: %[[CLONE1:.*]] = fir.alloca i32 {bindc_name = "arg1"
-!CHECK-NEXT: %[[FPV_LD:.*]] = fir.load %[[ARG1]] : !fir.ref<i32>
-!CHECK-NEXT: fir.store %[[FPV_LD]] to %[[CLONE1]] : !fir.ref<i32>
-!CHECK-NEXT: omp.barrier
-!CHECK: omp.wsloop {
-!CHECK-NEXT: omp.loop_nest (%[[INDX_WS:.*]]) : {{.*}} {
-! Testing last iteration check
-!CHECK: %[[V:.*]] = arith.addi %[[INDX_WS]], %{{.*}} : i32
-!CHECK: %[[C0:.*]] = arith.constant 0 : i32
-!CHECK: %[[T1:.*]] = arith.cmpi slt, %{{.*}}, %[[C0]] : i32
-!CHECK: %[[T2:.*]] = arith.cmpi slt, %[[V]], %{{.*}} : i32
-!CHECK: %[[T3:.*]] = arith.cmpi sgt, %[[V]], %{{.*}} : i32
-!CHECK: %[[IV_CMP:.*]] = arith.select %[[T1]], %[[T2]], %[[T3]] : i1
-!CHECK: fir.if %[[IV_CMP]] {
-!CHECK: fir.store %[[V]] to %{{.*}} : !fir.ref<i32>
-! Testing lastprivate val update
-!CHECK-NEXT: %[[CLONE_LD:.*]] = fir.load %[[CLONE1]] : !fir.ref<i32>
-!CHECK-NEXT: fir.store %[[CLONE_LD]] to %[[ARG1]] : !fir.ref<i32>
-!CHECK-NEXT: }
-!CHECK-NEXT: omp.yield
-!CHECK-NEXT: }
-!CHECK-NEXT: omp.terminator
-!CHECK-NEXT: }
-
-subroutine firstpriv_lastpriv_int2(arg1)
- integer :: arg1
-!$OMP PARALLEL
-!$OMP DO FIRSTPRIVATE(arg1) LASTPRIVATE(arg1)
-do n = 1, 5
- arg1 = 2
- print *, arg1
-end do
-!$OMP END DO
-!$OMP END PARALLEL
-print *, arg1
-end subroutine
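-
-! A rough sketch, not from the original test, of the ordering the barrier
-! checked above enforces when one variable is both FIRSTPRIVATE and
-! LASTPRIVATE: no thread may copy its clone back into arg1 before every
-! thread has read arg1 for its own copy-in. With distinct variables (see the
-! CHECK-NOT above) no barrier is needed. Names here are illustrative only.
-subroutine copyin_copyback_order(arg1)
-  integer, intent(inout) :: arg1
-  integer :: clone, n
-  !$omp parallel private(clone, n)
-  clone = arg1       ! firstprivate copy-in reads the original
-  !$omp barrier      ! every copy-in finishes before any copy-back starts
-  !$omp do
-  do n = 1, 5
-    clone = 2
-  end do
-  !$omp end do
-  !$omp single
-  arg1 = clone       ! stands in for the last-iteration copy-back
-  !$omp end single
-  !$omp end parallel
-end subroutine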
diff --git a/flang/test/Lower/OpenMP/FIR/parallel-private-clause-fixes.f90 b/flang/test/Lower/OpenMP/FIR/parallel-private-clause-fixes.f90
deleted file mode 100644
index fb0fb9594c35..000000000000
--- a/flang/test/Lower/OpenMP/FIR/parallel-private-clause-fixes.f90
+++ /dev/null
@@ -1,84 +0,0 @@
-! This test checks a few bug fixes in the PRIVATE clause lowering
-
-! RUN: bbc -fopenmp -emit-fir -hlfir=false %s -o - | FileCheck %s
-
-! CHECK-LABEL: multiple_private_fix
-! CHECK: %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFmultiple_private_fixEi"}
-! CHECK: %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "j", uniq_name = "_QFmultiple_private_fixEj"}
-! CHECK: %[[VAL_2:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFmultiple_private_fixEx"}
-! CHECK: omp.parallel {
-! CHECK-DAG: %[[PRIV_J:.*]] = fir.alloca i32 {bindc_name = "j", pinned
-! CHECK-DAG: %[[PRIV_I:.*]] = fir.alloca i32 {adapt.valuebyref, pinned
-! CHECK-DAG: %[[PRIV_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned
-! CHECK: %[[ONE:.*]] = arith.constant 1 : i32
-! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_4:.*]] : !fir.ref<i32>
-! CHECK: %[[VAL_5:.*]] = arith.constant 1 : i32
-! CHECK: omp.wsloop {
-! CHECK-NEXT: omp.loop_nest (%[[VAL_6:.*]]) : i32 = (%[[ONE]]) to (%[[VAL_3]]) inclusive step (%[[VAL_5]]) {
-! CHECK: fir.store %[[VAL_6]] to %[[PRIV_I]] : !fir.ref<i32>
-! CHECK: %[[VAL_7:.*]] = arith.constant 1 : i32
-! CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_7]] : (i32) -> index
-! CHECK: %[[VAL_9:.*]] = fir.load %[[VAL_4]] : !fir.ref<i32>
-! CHECK: %[[VAL_10:.*]] = fir.convert %[[VAL_9]] : (i32) -> index
-! CHECK: %[[VAL_11:.*]] = arith.constant 1 : index
-! CHECK: %[[LB:.*]] = fir.convert %[[VAL_8]] : (index) -> i32
-! CHECK: %[[VAL_12:.*]]:2 = fir.do_loop %[[VAL_13:[^ ]*]] =
-! CHECK-SAME: %[[VAL_8]] to %[[VAL_10]] step %[[VAL_11]]
-! CHECK-SAME: iter_args(%[[IV:.*]] = %[[LB]]) -> (index, i32) {
-! CHECK: fir.store %[[IV]] to %[[PRIV_J]] : !fir.ref<i32>
-! CHECK: %[[LOAD:.*]] = fir.load %[[PRIV_I]] : !fir.ref<i32>
-! CHECK: %[[VAL_15:.*]] = fir.load %[[PRIV_J]] : !fir.ref<i32>
-! CHECK: %[[VAL_16:.*]] = arith.addi %[[LOAD]], %[[VAL_15]] : i32
-! CHECK: fir.store %[[VAL_16]] to %[[PRIV_X]] : !fir.ref<i32>
-! CHECK: %[[VAL_17:.*]] = arith.addi %[[VAL_13]], %[[VAL_11]] : index
-! CHECK: %[[STEPCAST:.*]] = fir.convert %[[VAL_11]] : (index) -> i32
-! CHECK: %[[IVLOAD:.*]] = fir.load %[[PRIV_J]] : !fir.ref<i32>
-! CHECK: %[[IVINC:.*]] = arith.addi %[[IVLOAD]], %[[STEPCAST]]
-! CHECK: fir.result %[[VAL_17]], %[[IVINC]] : index, i32
-! CHECK: }
-! CHECK: fir.store %[[VAL_12]]#1 to %[[PRIV_J]] : !fir.ref<i32>
-! CHECK: omp.yield
-! CHECK: }
-! CHECK: omp.terminator
-! CHECK: }
-! CHECK: omp.terminator
-! CHECK: }
-! CHECK: return
-subroutine multiple_private_fix(gama)
- integer :: i, j, x, gama
-!$OMP PARALLEL DO PRIVATE(j,x)
- do i = 1, gama
- do j = 1, gama
- x = i + j
- end do
- end do
-!$OMP END PARALLEL DO
-end subroutine
-
-! CHECK-LABEL: multiple_private_fix2
-! CHECK: %[[X1:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFmultiple_private_fix2Ex"}
-! CHECK: omp.parallel {
-! CHECK: %[[X2:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFmultiple_private_fix2Ex"}
-! CHECK: omp.parallel {
-! CHECK: %[[X3:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFmultiple_private_fix2Ex"}
-! CHECK: %[[C3:.*]] = arith.constant 1 : i32
-! CHECK: fir.store %[[C3]] to %[[X3]] : !fir.ref<i32>
-! CHECK: omp.terminator
-! CHECK: }
-! CHECK: %[[C2:.*]] = arith.constant 1 : i32
-! CHECK: fir.store %[[C2]] to %[[X2]] : !fir.ref<i32>
-! CHECK: omp.terminator
-! CHECK: }
-! CHECK: %[[C1:.*]] = arith.constant 1 : i32
-! CHECK: fir.store %[[C1]] to %[[X1]] : !fir.ref<i32>
-! CHECK: return
-subroutine multiple_private_fix2()
- integer :: x
- !$omp parallel private(x)
- !$omp parallel private(x)
- x = 1
- !$omp end parallel
- x = 1
- !$omp end parallel
- x = 1
-end subroutine
diff --git a/flang/test/Lower/OpenMP/FIR/parallel-private-clause.f90 b/flang/test/Lower/OpenMP/FIR/parallel-private-clause.f90
deleted file mode 100644
index 2e68d25a15ed..000000000000
--- a/flang/test/Lower/OpenMP/FIR/parallel-private-clause.f90
+++ /dev/null
@@ -1,387 +0,0 @@
-! This test checks lowering of the OpenMP parallel directive with a
-! `PRIVATE` clause present.
-
-! REQUIRES: shell
-! RUN: bbc --use-desc-for-alloc=false -fopenmp -emit-fir -hlfir=false %s -o - | \
-! RUN: FileCheck %s --check-prefix=FIRDialect
-
-!FIRDialect: func @_QPprivate_clause(%[[ARG1:.*]]: !fir.ref<i32>{{.*}}, %[[ARG2:.*]]: !fir.ref<!fir.array<10xi32>>{{.*}}, %[[ARG3:.*]]: !fir.boxchar<1>{{.*}}, %[[ARG4:.*]]: !fir.boxchar<1>{{.*}}) {
-!FIRDialect-DAG: %[[ALPHA:.*]] = fir.alloca i32 {{{.*}}, uniq_name = "{{.*}}Ealpha"}
-!FIRDialect-DAG: %[[ALPHA_ARRAY:.*]] = fir.alloca !fir.array<10xi32> {{{.*}}, uniq_name = "{{.*}}Ealpha_array"}
-!FIRDialect-DAG: %[[BETA:.*]] = fir.alloca !fir.char<1,5> {{{.*}}, uniq_name = "{{.*}}Ebeta"}
-!FIRDialect-DAG: %[[BETA_ARRAY:.*]] = fir.alloca !fir.array<10x!fir.char<1,5>> {{{.*}}, uniq_name = "{{.*}}Ebeta_array"}
-
-!FIRDialect-DAG: omp.parallel {
-!FIRDialect-DAG: %[[ALPHA_PRIVATE:.*]] = fir.alloca i32 {{{.*}}, pinned, uniq_name = "{{.*}}Ealpha"}
-!FIRDialect-DAG: %[[ALPHA_ARRAY_PRIVATE:.*]] = fir.alloca !fir.array<10xi32> {{{.*}}, pinned, uniq_name = "{{.*}}Ealpha_array"}
-!FIRDialect-DAG: %[[BETA_PRIVATE:.*]] = fir.alloca !fir.char<1,5> {{{.*}}, pinned, uniq_name = "{{.*}}Ebeta"}
-!FIRDialect-DAG: %[[BETA_ARRAY_PRIVATE:.*]] = fir.alloca !fir.array<10x!fir.char<1,5>> {{{.*}}, pinned, uniq_name = "{{.*}}Ebeta_array"}
-!FIRDialect-DAG: %[[ARG1_PRIVATE:.*]] = fir.alloca i32 {{{.*}}, pinned, uniq_name = "{{.*}}Earg1"}
-!FIRDialect-DAG: %[[ARG2_ARRAY_PRIVATE:.*]] = fir.alloca !fir.array<10xi32> {{{.*}}, pinned, uniq_name = "{{.*}}Earg2"}
-!FIRDialect-DAG: %[[ARG3_PRIVATE:.*]] = fir.alloca !fir.char<1,5> {{{.*}}, pinned, uniq_name = "{{.*}}Earg3"}
-!FIRDialect-DAG: %[[ARG4_ARRAY_PRIVATE:.*]] = fir.alloca !fir.array<10x!fir.char<1,5>> {{{.*}}, pinned, uniq_name = "{{.*}}Earg4"}
-!FIRDialect: omp.terminator
-!FIRDialect: }
-
-subroutine private_clause(arg1, arg2, arg3, arg4)
-
- integer :: arg1, arg2(10)
- integer :: alpha, alpha_array(10)
- character(5) :: arg3, arg4(10)
- character(5) :: beta, beta_array(10)
-
-!$OMP PARALLEL PRIVATE(alpha, alpha_array, beta, beta_array, arg1, arg2, arg3, arg4)
- alpha = 1
- alpha_array = 4
- beta = "hi"
- beta_array = "hi"
- arg1 = 2
- arg2 = 3
- arg3 = "world"
- arg4 = "world"
-!$OMP END PARALLEL
-
-end subroutine
-
-!FIRDialect: func @_QPprivate_clause_scalar() {
-!FIRDialect-DAG: {{.*}} = fir.alloca !fir.complex<4> {bindc_name = "c", uniq_name = "{{.*}}Ec"}
-!FIRDialect-DAG: {{.*}} = fir.alloca i8 {bindc_name = "i1", uniq_name = "{{.*}}Ei1"}
-!FIRDialect-DAG: {{.*}} = fir.alloca i128 {bindc_name = "i16", uniq_name = "{{.*}}Ei16"}
-!FIRDialect-DAG: {{.*}} = fir.alloca i16 {bindc_name = "i2", uniq_name = "{{.*}}Ei2"}
-!FIRDialect-DAG: {{.*}} = fir.alloca i32 {bindc_name = "i4", uniq_name = "{{.*}}Ei4"}
-!FIRDialect-DAG: {{.*}} = fir.alloca i64 {bindc_name = "i8", uniq_name = "{{.*}}Ei8"}
-!FIRDialect-DAG: {{.*}} = fir.alloca !fir.logical<4> {bindc_name = "l", uniq_name = "{{.*}}El"}
-!FIRDialect-DAG: {{.*}} = fir.alloca f32 {bindc_name = "r", uniq_name = "{{.*}}Er"}
-
-!FIRDialect: omp.parallel {
-!FIRDialect-DAG: {{.*}} = fir.alloca i8 {bindc_name = "i1", pinned, uniq_name = "{{.*}}Ei1"}
-!FIRDialect-DAG: {{.*}} = fir.alloca i16 {bindc_name = "i2", pinned, uniq_name = "{{.*}}Ei2"}
-!FIRDialect-DAG: {{.*}} = fir.alloca i32 {bindc_name = "i4", pinned, uniq_name = "{{.*}}Ei4"}
-!FIRDialect-DAG: {{.*}} = fir.alloca i64 {bindc_name = "i8", pinned, uniq_name = "{{.*}}Ei8"}
-!FIRDialect-DAG: {{.*}} = fir.alloca i128 {bindc_name = "i16", pinned, uniq_name = "{{.*}}Ei16"}
-!FIRDialect-DAG: {{.*}} = fir.alloca !fir.complex<4> {bindc_name = "c", pinned, uniq_name = "{{.*}}Ec"}
-!FIRDialect-DAG: {{.*}} = fir.alloca !fir.logical<4> {bindc_name = "l", pinned, uniq_name = "{{.*}}El"}
-!FIRDialect-DAG: {{.*}} = fir.alloca f32 {bindc_name = "r", pinned, uniq_name = "{{.*}}Er"}
-
-subroutine private_clause_scalar()
-
- integer(kind=1) :: i1
- integer(kind=2) :: i2
- integer(kind=4) :: i4
- integer(kind=8) :: i8
- integer(kind=16) :: i16
- complex :: c
- logical :: l
- real :: r
-
-!$OMP PARALLEL PRIVATE(i1, i2, i4, i8, i16, c, l, r)
- print *, i1, i2, i4, i8, i16, c, l, r
-!$OMP END PARALLEL
-
-end subroutine
-
-!FIRDialect: func @_QPprivate_clause_derived_type() {
-!FIRDialect: {{.*}} = fir.alloca !fir.type<{{.*}}{t_i:i32,t_arr:!fir.array<5xi32>}> {bindc_name = "t", uniq_name = "{{.*}}Et"}
-
-!FIRDialect: omp.parallel {
-!FIRDialect: {{.*}} = fir.alloca !fir.type<{{.*}}{t_i:i32,t_arr:!fir.array<5xi32>}> {bindc_name = "t", pinned, uniq_name = "{{.*}}Et"}
-
-subroutine private_clause_derived_type()
-
- type my_type
- integer :: t_i
- integer :: t_arr(5)
- end type my_type
- type(my_type) :: t
-
-!$OMP PARALLEL PRIVATE(t)
- print *, t%t_i
-!$OMP END PARALLEL
-
-end subroutine
-
-!FIRDialect: func @_QPprivate_clause_allocatable() {
-!FIRDialect-DAG: {{.*}} = fir.alloca !fir.box<!fir.heap<i32>> {bindc_name = "x", uniq_name = "{{.*}}Ex"}
-!FIRDialect-DAG: {{.*}} = fir.alloca !fir.heap<i32> {uniq_name = "{{.*}}Ex.addr"}
-!FIRDialect-DAG: {{.*}} = fir.alloca !fir.box<!fir.heap<!fir.array<?xi32>>> {bindc_name = "x2", uniq_name = "{{.*}}Ex2"}
-!FIRDialect-DAG: {{.*}} = fir.alloca !fir.heap<!fir.array<?xi32>> {uniq_name = "{{.*}}Ex2.addr"}
-!FIRDialect-DAG: {{.*}} = fir.address_of(@{{.*}}Ex3) : !fir.ref<!fir.box<!fir.heap<i32>>>
-!FIRDialect-DAG: [[TMP8:%.*]] = fir.address_of(@{{.*}}Ex4) : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
-
-!FIRDialect: omp.parallel {
-!FIRDialect-DAG: [[TMP35:%.*]] = fir.alloca !fir.box<!fir.heap<i32>> {bindc_name = "x", pinned, uniq_name = "{{.*}}Ex"}
-!FIRDialect-DAG: [[TMP39:%.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xi32>>> {bindc_name = "x2", pinned, uniq_name = "{{.*}}Ex2"}
-!FIRDialect-DAG: [[TMP45:%.*]] = fir.alloca !fir.box<!fir.heap<i32>> {bindc_name = "x3", pinned, uniq_name = "{{.*}}Ex3"}
-
-!FIRDialect-DAG: [[TMP51:%.*]] = fir.load [[TMP8]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
-!FIRDialect-DAG: [[TMP97:%.*]] = fir.load [[TMP8]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
-!FIRDialect-DAG: [[TMP98:%.*]]:3 = fir.box_dims [[TMP97]], {{.*}} : (!fir.box<!fir.heap<!fir.array<?xi32>>>, index) -> (index, index, index)
-!FIRDialect-DAG: [[TMP50:%.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xi32>>> {bindc_name = "x4", pinned, uniq_name = "{{.*}}Ex4"}
-
-!FIRDialect-DAG: [[TMP101:%.*]] = fir.allocmem !fir.array<?xi32>, {{.*}} {fir.must_be_heap = true, uniq_name = "{{.*}}Ex4.alloc"}
-!FIRDialect-DAG: [[TMP102:%.*]] = fir.shape_shift {{.*}}#0, {{.*}} : (index, index) -> !fir.shapeshift<1>
-!FIRDialect-DAG: [[TMP103:%.*]] = fir.embox [[TMP101]]([[TMP102]]) : (!fir.heap<!fir.array<?xi32>>, !fir.shapeshift<1>) -> !fir.box<!fir.heap<!fir.array<?xi32>>>
-!FIRDialect-DAG: fir.store [[TMP103]] to [[TMP50]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
-
-
-subroutine private_clause_allocatable()
-
- integer, allocatable :: x, x2(:)
- integer, allocatable, save :: x3, x4(:)
-
- print *, x, x2, x3, x4
-
-!$OMP PARALLEL PRIVATE(x, x2, x3, x4)
- print *, x, x2, x3, x4
-!$OMP END PARALLEL
-
-end subroutine
-
-
-!FIRDialect: func @_QPprivate_clause_real_call_allocatable() {
-!FIRDialect-DAG: {{.*}} = fir.alloca !fir.box<!fir.heap<f32>> {bindc_name = "x5", uniq_name = "{{.*}}Ex5"}
-!FIRDialect-DAG: {{.*}} = fir.zero_bits !fir.heap<f32>
-!FIRDialect-DAG: {{.*}} = fir.embox %{{.*}} : (!fir.heap<f32>) -> !fir.box<!fir.heap<f32>>
-!FIRDialect-DAG: fir.store %{{.*}} to %{{.*}} : !fir.ref<!fir.box<!fir.heap<f32>>>
-!FIRDialect-DAG: omp.parallel {
-!FIRDialect-DAG: [[TMP203:%.*]] = fir.alloca !fir.box<!fir.heap<f32>> {bindc_name = "x5", pinned, uniq_name = "{{.*}}Ex5"}
-
-!FIRDialect-DAG: fir.if %{{.*}} {
-
-!FIRDialect-DAG: fir.store %{{.*}} to [[TMP203]] : !fir.ref<!fir.box<!fir.heap<f32>>>
-!FIRDialect-DAG: } else {
-
-!FIRDialect-DAG: fir.store %{{.*}} to [[TMP203]] : !fir.ref<!fir.box<!fir.heap<f32>>>
-!FIRDialect-DAG: }
-!FIRDialect-DAG: fir.call @_QFprivate_clause_real_call_allocatablePhelper_private_clause_real_call_allocatable([[TMP203]]) fastmath<contract> : (!fir.ref<!fir.box<!fir.heap<f32>>>) -> ()
-!FIRDialect-DAG: %{{.*}} = fir.load [[TMP203]] : !fir.ref<!fir.box<!fir.heap<f32>>>
-
-!FIRDialect-DAG: fir.if %{{.*}} {
-!FIRDialect-DAG: %{{.*}} = fir.load [[TMP203]] : !fir.ref<!fir.box<!fir.heap<f32>>>
-
-!FIRDialect-DAG: fir.store %{{.*}} to [[TMP203]] : !fir.ref<!fir.box<!fir.heap<f32>>>
-!FIRDialect-DAG: }
-!FIRDialect-DAG: omp.terminator
-!FIRDialect-DAG: }
-!FIRDialect-DAG: return
-!FIRDialect-DAG: }
-
-
-subroutine private_clause_real_call_allocatable
- real, allocatable :: x5
- !$omp parallel private(x5)
- call helper_private_clause_real_call_allocatable(x5)
- !$omp end parallel
- contains
- subroutine helper_private_clause_real_call_allocatable(x6)
- real, allocatable :: x6
- print *, allocated(x6)
- end subroutine
-end subroutine
-
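-! A rough sketch, not from the original test, of what the fir.if/else pair
-! above encodes: the private copy of an allocatable starts with the
-! allocation status of the original, not with its value. Names are
-! illustrative only.
-subroutine private_allocatable_status(orig)
-  real, allocatable, intent(in) :: orig
-  real, allocatable :: priv
-  if (allocated(orig)) then
-    allocate(priv)   ! then-branch: embox freshly allocated storage
-  end if             ! else-branch: priv is left unallocated
-end subroutine
-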
-!FIRDialect: func.func @_QPincrement_list_items(%arg0: !fir.ref<!fir.box<!fir.ptr<!fir.type<_QFincrement_list_itemsTnode{payload:i32,next:!fir.box<!fir.ptr<!fir.type<_QFincrement_list_itemsTnode>>>}>>>> {fir.bindc_name = "head"}) {
-!FIRDialect: {{%.*}} = fir.alloca !fir.box<!fir.ptr<!fir.type<_QFincrement_list_itemsTnode{payload:i32,next:!fir.box<!fir.ptr<!fir.type<_QFincrement_list_itemsTnode>>>}>>> {bindc_name = "p", uniq_name = "_QFincrement_list_itemsEp"}
-!FIRDialect: omp.parallel {
-!FIRDialect: {{%.*}} = fir.alloca !fir.box<!fir.ptr<!fir.type<_QFincrement_list_itemsTnode{payload:i32,next:!fir.box<!fir.ptr<!fir.type<_QFincrement_list_itemsTnode>>>}>>> {bindc_name = "p", pinned, uniq_name = "_QFincrement_list_itemsEp"}
-!FIRDialect: omp.single {
-
-!FIRDialect: omp.terminator
-!FIRDialect: omp.terminator
-!FIRDialect: return
-
-subroutine increment_list_items (head)
- type node
- integer :: payload
- type (node), pointer :: next
- end type node
-
- type (node), pointer :: head
- type (node), pointer :: p
-!$omp parallel private(p)
-!$omp single
- p => head
- do
- p => p%next
- if (.not. associated(p)) exit
- end do
-!$omp end single
-!$omp end parallel
-end subroutine increment_list_items
-
-!FIRDialect: func.func @_QPparallel_pointer() {
-!FIRDialect-DAG: [[PP0:%.*]] = fir.alloca !fir.box<!fir.ptr<i32>> {bindc_name = "y1", uniq_name = "{{.*}}Ey1"}
-!FIRDialect-DAG: [[PP1:%.*]] = fir.alloca !fir.ptr<i32> {uniq_name = "{{.*}}Ey1.addr"}
-!FIRDialect-DAG: [[PP2:%.*]] = fir.zero_bits !fir.ptr<i32>
-!FIRDialect: fir.store [[PP2]] to [[PP1]] : !fir.ref<!fir.ptr<i32>>
-!FIRDialect-DAG: [[PP3:%.*]] = fir.alloca !fir.box<!fir.ptr<!fir.array<?xi32>>> {bindc_name = "y2", uniq_name = "{{.*}}Ey2"}
-
-!FIRDialect: fir.store %{{.*}} to [[PP3]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
-!FIRDialect-DAG: [[PP7:%.*]] = fir.alloca i32 {bindc_name = "z1", fir.target, uniq_name = "{{.*}}Ez1"}
-
-!FIRDialect-DAG: [[PP8:%.*]] = fir.alloca !fir.array<10xi32> {bindc_name = "z2", fir.target, uniq_name = "{{.*}}Ez2"}
-!FIRDialect: omp.parallel {
-!FIRDialect-DAG: [[PP9:%.*]] = fir.alloca !fir.box<!fir.ptr<i32>> {bindc_name = "y1", pinned, uniq_name = "{{.*}}Ey1"}
-!FIRDialect-DAG: [[PP10:%.*]] = fir.alloca !fir.box<!fir.ptr<!fir.array<?xi32>>> {bindc_name = "y2", pinned, uniq_name = "{{.*}}Ey2"}
-!FIRDialect-DAG: [[PP11:%.*]] = fir.embox [[PP7]] : (!fir.ref<i32>) -> !fir.box<!fir.ptr<i32>>
-!FIRDialect: fir.store [[PP11]] to [[PP9]] : !fir.ref<!fir.box<!fir.ptr<i32>>>
-!FIRDialect-DAG: [[PP12:%.*]] = fir.shape %c{{.*}} : (index) -> !fir.shape<1>
-!FIRDialect-DAG: [[PP13:%.*]] = fir.embox [[PP8]]([[PP12]]) : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> !fir.box<!fir.ptr<!fir.array<?xi32>>>
-!FIRDialect: fir.store [[PP13]] to [[PP10]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
-!FIRDialect: omp.terminator
-!FIRDialect: }
-!FIRDialect: return
-!FIRDialect: }
-
-subroutine parallel_pointer()
- integer, pointer :: y1, y2(:)
- integer, target :: z1, z2(10)
-
-!$omp parallel private(y1, y2)
- y1=>z1
- y2=>z2
-!$omp end parallel
-end subroutine parallel_pointer
-
-
-!FIRDialect-LABEL: func @_QPsimple_loop_1()
-subroutine simple_loop_1
- integer :: i
- real, allocatable :: r
- ! FIRDialect: omp.parallel
- !$OMP PARALLEL PRIVATE(r)
- ! FIRDialect: %[[ALLOCA_IV:.*]] = fir.alloca i32 {{{.*}}, pinned}
-
- ! FIRDialect: [[R:%.*]] = fir.alloca !fir.box<!fir.heap<f32>> {bindc_name = "r", pinned, uniq_name = "{{.*}}Er"}
- ! FIRDialect: fir.store {{%.*}} to [[R]] : !fir.ref<!fir.box<!fir.heap<f32>>>
- ! FIRDialect: fir.store {{%.*}} to [[R]] : !fir.ref<!fir.box<!fir.heap<f32>>>
-
- ! FIRDialect: %[[WS_LB:.*]] = arith.constant 1 : i32
- ! FIRDialect: %[[WS_UB:.*]] = arith.constant 9 : i32
- ! FIRDialect: %[[WS_STEP:.*]] = arith.constant 1 : i32
-
- ! FIRDialect: omp.wsloop {
- ! FIRDialect-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[WS_LB]]) to (%[[WS_UB]]) inclusive step (%[[WS_STEP]]) {
- !$OMP DO
- do i=1, 9
- ! FIRDialect: fir.store %[[I]] to %[[ALLOCA_IV:.*]] : !fir.ref<i32>
- ! FIRDialect: %[[LOAD_IV:.*]] = fir.load %[[ALLOCA_IV]] : !fir.ref<i32>
- ! FIRDialect: fir.call @_FortranAioOutputInteger32({{.*}}, %[[LOAD_IV]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
- print*, i
- end do
- ! FIRDialect: omp.yield
- ! FIRDialect: omp.terminator
- ! FIRDialect: {{%.*}} = fir.load [[R]] : !fir.ref<!fir.box<!fir.heap<f32>>>
- ! FIRDialect: fir.if {{%.*}} {
- ! FIRDialect: [[LD:%.*]] = fir.load [[R]] : !fir.ref<!fir.box<!fir.heap<f32>>>
- ! FIRDialect: [[AD:%.*]] = fir.box_addr [[LD]] : (!fir.box<!fir.heap<f32>>) -> !fir.heap<f32>
- ! FIRDialect: fir.freemem [[AD]] : !fir.heap<f32>
- ! FIRDialect: fir.store {{%.*}} to [[R]] : !fir.ref<!fir.box<!fir.heap<f32>>>
- !$OMP END DO
- ! FIRDialect: omp.terminator
- !$OMP END PARALLEL
-end subroutine
-
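-! A rough sketch, not from the original test, of the cleanup the
-! load / fir.if / fir.box_addr / fir.freemem sequence above performs at the
-! end of the region for the private allocatable. The name is illustrative.
-subroutine private_cleanup_sketch()
-  real, allocatable :: r
-  if (allocated(r)) deallocate(r)   ! freemem runs only for an allocated box
-end subroutine
-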
-!FIRDialect-LABEL: func @_QPsimple_loop_2()
-subroutine simple_loop_2
- integer :: i
- real, allocatable :: r
- ! FIRDialect: omp.parallel
- !$OMP PARALLEL
- ! FIRDialect: %[[ALLOCA_IV:.*]] = fir.alloca i32 {{{.*}}, pinned}
-
- ! FIRDialect: [[R:%.*]] = fir.alloca !fir.box<!fir.heap<f32>> {bindc_name = "r", pinned, uniq_name = "{{.*}}Er"}
- ! FIRDialect: fir.store {{%.*}} to [[R]] : !fir.ref<!fir.box<!fir.heap<f32>>>
- ! FIRDialect: fir.store {{%.*}} to [[R]] : !fir.ref<!fir.box<!fir.heap<f32>>>
-
- ! FIRDialect: %[[WS_LB:.*]] = arith.constant 1 : i32
- ! FIRDialect: %[[WS_UB:.*]] = arith.constant 9 : i32
- ! FIRDialect: %[[WS_STEP:.*]] = arith.constant 1 : i32
-
- ! FIRDialect: omp.wsloop {
- ! FIRDialect-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[WS_LB]]) to (%[[WS_UB]]) inclusive step (%[[WS_STEP]]) {
- !$OMP DO PRIVATE(r)
- do i=1, 9
- ! FIRDialect: fir.store %[[I]] to %[[ALLOCA_IV:.*]] : !fir.ref<i32>
- ! FIRDialect: %[[LOAD_IV:.*]] = fir.load %[[ALLOCA_IV]] : !fir.ref<i32>
- ! FIRDialect: fir.call @_FortranAioOutputInteger32({{.*}}, %[[LOAD_IV]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
- print*, i
- end do
- ! FIRDialect: omp.yield
- ! FIRDialect: omp.terminator
- ! FIRDialect: {{%.*}} = fir.load [[R]] : !fir.ref<!fir.box<!fir.heap<f32>>>
- ! FIRDialect: fir.if {{%.*}} {
- ! FIRDialect: [[LD:%.*]] = fir.load [[R]] : !fir.ref<!fir.box<!fir.heap<f32>>>
- ! FIRDialect: [[AD:%.*]] = fir.box_addr [[LD]] : (!fir.box<!fir.heap<f32>>) -> !fir.heap<f32>
- ! FIRDialect: fir.freemem [[AD]] : !fir.heap<f32>
- ! FIRDialect: fir.store {{%.*}} to [[R]] : !fir.ref<!fir.box<!fir.heap<f32>>>
- !$OMP END DO
- ! FIRDialect: omp.terminator
- !$OMP END PARALLEL
-end subroutine
-
-!FIRDialect-LABEL: func @_QPsimple_loop_3()
-subroutine simple_loop_3
- integer :: i
- real, allocatable :: r
- ! FIRDialect: omp.parallel
- ! FIRDialect: %[[ALLOCA_IV:.*]] = fir.alloca i32 {{{.*}}, pinned}
-
- ! FIRDialect: [[R:%.*]] = fir.alloca !fir.box<!fir.heap<f32>> {bindc_name = "r", pinned, uniq_name = "{{.*}}Er"}
- ! FIRDialect: fir.store {{%.*}} to [[R]] : !fir.ref<!fir.box<!fir.heap<f32>>>
- ! FIRDialect: fir.store {{%.*}} to [[R]] : !fir.ref<!fir.box<!fir.heap<f32>>>
-
- ! FIRDialect: %[[WS_LB:.*]] = arith.constant 1 : i32
- ! FIRDialect: %[[WS_UB:.*]] = arith.constant 9 : i32
- ! FIRDialect: %[[WS_STEP:.*]] = arith.constant 1 : i32
-
- ! FIRDialect: omp.wsloop {
- ! FIRDialect-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[WS_LB]]) to (%[[WS_UB]]) inclusive step (%[[WS_STEP]]) {
- !$OMP PARALLEL DO PRIVATE(r)
- do i=1, 9
- ! FIRDialect: fir.store %[[I]] to %[[ALLOCA_IV:.*]] : !fir.ref<i32>
- ! FIRDialect: %[[LOAD_IV:.*]] = fir.load %[[ALLOCA_IV]] : !fir.ref<i32>
- ! FIRDialect: fir.call @_FortranAioOutputInteger32({{.*}}, %[[LOAD_IV]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
- print*, i
- end do
- ! FIRDialect: omp.yield
- ! FIRDialect: omp.terminator
- ! FIRDialect: {{%.*}} = fir.load [[R]] : !fir.ref<!fir.box<!fir.heap<f32>>>
- ! FIRDialect: fir.if {{%.*}} {
- ! FIRDialect: [[LD:%.*]] = fir.load [[R]] : !fir.ref<!fir.box<!fir.heap<f32>>>
- ! FIRDialect: [[AD:%.*]] = fir.box_addr [[LD]] : (!fir.box<!fir.heap<f32>>) -> !fir.heap<f32>
- ! FIRDialect: fir.freemem [[AD]] : !fir.heap<f32>
- ! FIRDialect: fir.store {{%.*}} to [[R]] : !fir.ref<!fir.box<!fir.heap<f32>>>
- !$OMP END PARALLEL DO
- ! FIRDialect: omp.terminator
-end subroutine
-
-!FIRDialect-LABEL: func @_QPsimd_loop_1()
-subroutine simd_loop_1
- integer :: i
- real, allocatable :: r
- ! FIRDialect: [[R:%.*]] = fir.alloca !fir.box<!fir.heap<f32>> {bindc_name = "r", pinned, uniq_name = "{{.*}}Er"}
- ! FIRDialect: fir.store {{%.*}} to [[R]] : !fir.ref<!fir.box<!fir.heap<f32>>>
- ! FIRDialect: fir.store {{%.*}} to [[R]] : !fir.ref<!fir.box<!fir.heap<f32>>>
-
- ! FIRDialect: %[[LB:.*]] = arith.constant 1 : i32
- ! FIRDialect: %[[UB:.*]] = arith.constant 9 : i32
- ! FIRDialect: %[[STEP:.*]] = arith.constant 1 : i32
-
- ! FIRDialect: omp.simd {
- ! FIRDialect-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) {
- !$OMP SIMD PRIVATE(r)
- do i=1, 9
- ! FIRDialect: fir.store %[[I]] to %[[LOCAL:.*]] : !fir.ref<i32>
- ! FIRDialect: %[[LOAD_IV:.*]] = fir.load %[[LOCAL]] : !fir.ref<i32>
- ! FIRDialect: fir.call @_FortranAioOutputInteger32({{.*}}, %[[LOAD_IV]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
- print*, i
- end do
- !$OMP END SIMD
- ! FIRDialect: omp.yield
- ! FIRDialect: {{%.*}} = fir.load [[R]] : !fir.ref<!fir.box<!fir.heap<f32>>>
- ! FIRDialect: fir.if {{%.*}} {
- ! FIRDialect: [[LD:%.*]] = fir.load [[R]] : !fir.ref<!fir.box<!fir.heap<f32>>>
- ! FIRDialect: [[AD:%.*]] = fir.box_addr [[LD]] : (!fir.box<!fir.heap<f32>>) -> !fir.heap<f32>
- ! FIRDialect: fir.freemem [[AD]] : !fir.heap<f32>
- ! FIRDialect: fir.store {{%.*}} to [[R]] : !fir.ref<!fir.box<!fir.heap<f32>>>
-end subroutine
diff --git a/flang/test/Lower/OpenMP/FIR/parallel-reduction-add-byref.f90 b/flang/test/Lower/OpenMP/FIR/parallel-reduction-add-byref.f90
deleted file mode 100644
index ea45e716ceae..000000000000
--- a/flang/test/Lower/OpenMP/FIR/parallel-reduction-add-byref.f90
+++ /dev/null
@@ -1,117 +0,0 @@
-! RUN: bbc -emit-fir -hlfir=false -fopenmp --force-byref-reduction -o - %s 2>&1 | FileCheck %s
-! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp -mmlir --force-byref-reduction -o - %s 2>&1 | FileCheck %s
-
-!CHECK-LABEL: omp.declare_reduction
-!CHECK-SAME: @[[RED_F32_NAME:.*]] : !fir.ref<f32>
-!CHECK-SAME: init {
-!CHECK: ^bb0(%{{.*}}: !fir.ref<f32>):
-!CHECK: %[[C0_1:.*]] = arith.constant 0.000000e+00 : f32
-!CHECK: %[[REF:.*]] = fir.alloca f32
-!CHECK: fir.store %[[C0_1]] to %[[REF]] : !fir.ref<f32>
-!CHECK: omp.yield(%[[REF]] : !fir.ref<f32>)
-!CHECK: } combiner {
-!CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<f32>, %[[ARG1:.*]]: !fir.ref<f32>):
-!CHECK: %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<f32>
-!CHECK: %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<f32>
-!CHECK: %[[RES:.*]] = arith.addf %[[LD0]], %[[LD1]] {{.*}}: f32
-!CHECK: fir.store %[[RES]] to %[[ARG0]] : !fir.ref<f32>
-!CHECK: omp.yield(%[[ARG0]] : !fir.ref<f32>)
-!CHECK: }
-
-!CHECK-LABEL: omp.declare_reduction
-!CHECK-SAME: @[[RED_I32_NAME:.*]] : !fir.ref<i32>
-!CHECK-SAME: init {
-!CHECK: ^bb0(%{{.*}}: !fir.ref<i32>):
-!CHECK: %[[C0_1:.*]] = arith.constant 0 : i32
-!CHECK: %[[REF:.*]] = fir.alloca i32
-!CHECK: fir.store %[[C0_1]] to %[[REF]] : !fir.ref<i32>
-!CHECK: omp.yield(%[[REF]] : !fir.ref<i32>)
-!CHECK: } combiner {
-!CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<i32>, %[[ARG1:.*]]: !fir.ref<i32>):
-!CHECK: %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<i32>
-!CHECK: %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<i32>
-!CHECK: %[[RES:.*]] = arith.addi %[[LD0]], %[[LD1]] : i32
-!CHECK: fir.store %[[RES]] to %[[ARG0]] : !fir.ref<i32>
-!CHECK: omp.yield(%[[ARG0]] : !fir.ref<i32>)
-!CHECK: }
-
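-! A rough sketch, not from the original test, tying the init and combiner
-! regions above to source-level semantics: each thread's partial result
-! starts from the neutral element produced by init, and partial results are
-! merged pairwise by the combiner; the byref variant routes the same values
-! through !fir.ref temporaries. The subroutine is illustrative only.
-subroutine reduction_semantics_sketch(n, total)
-  integer, intent(in) :: n
-  integer, intent(out) :: total
-  integer :: i
-  total = 0                 ! init region: arith.constant 0
-  !$omp parallel do reduction(+:total)
-  do i = 1, n
-    total = total + 1       ! combiner region: arith.addi
-  end do
-end subroutine
-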
-!CHECK-LABEL: func.func @_QPsimple_int_add
-!CHECK: %[[IREF:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_int_addEi"}
-!CHECK: %[[I_START:.*]] = arith.constant 0 : i32
-!CHECK: fir.store %[[I_START]] to %[[IREF]] : !fir.ref<i32>
-!CHECK: omp.parallel byref reduction(@[[RED_I32_NAME]] %[[IREF]] -> %[[PRV:.+]] : !fir.ref<i32>) {
-!CHECK: %[[LPRV:.+]] = fir.load %[[PRV]] : !fir.ref<i32>
-!CHECK: %[[I_INCR:.+]] = arith.constant 1 : i32
-!CHECK: %[[RES:.+]] = arith.addi %[[LPRV]], %[[I_INCR]]
-!CHECK: fir.store %[[RES]] to %[[PRV]] : !fir.ref<i32>
-!CHECK: omp.terminator
-!CHECK: }
-!CHECK: return
-subroutine simple_int_add
- integer :: i
- i = 0
-
- !$omp parallel reduction(+:i)
- i = i + 1
- !$omp end parallel
-
- print *, i
-end subroutine
-
-!CHECK-LABEL: func.func @_QPsimple_real_add
-!CHECK: %[[RREF:.*]] = fir.alloca f32 {bindc_name = "r", uniq_name = "_QFsimple_real_addEr"}
-!CHECK: %[[R_START:.*]] = arith.constant 0.000000e+00 : f32
-!CHECK: fir.store %[[R_START]] to %[[RREF]] : !fir.ref<f32>
-!CHECK: omp.parallel byref reduction(@[[RED_F32_NAME]] %[[RREF]] -> %[[PRV:.+]] : !fir.ref<f32>) {
-!CHECK: %[[LPRV:.+]] = fir.load %[[PRV]] : !fir.ref<f32>
-!CHECK: %[[R_INCR:.+]] = arith.constant 1.500000e+00 : f32
-!CHECK: %[[RES:.+]] = arith.addf %[[LPRV]], %[[R_INCR]] {{.*}} : f32
-!CHECK: fir.store %[[RES]] to %[[PRV]] : !fir.ref<f32>
-!CHECK: omp.terminator
-!CHECK: }
-!CHECK: return
-subroutine simple_real_add
- real :: r
- r = 0.0
-
- !$omp parallel reduction(+:r)
- r = r + 1.5
- !$omp end parallel
-
- print *, r
-end subroutine
-
-!CHECK-LABEL: func.func @_QPint_real_add
-!CHECK: %[[IREF:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFint_real_addEi"}
-!CHECK: %[[RREF:.*]] = fir.alloca f32 {bindc_name = "r", uniq_name = "_QFint_real_addEr"}
-!CHECK: %[[R_START:.*]] = arith.constant 0.000000e+00 : f32
-!CHECK: fir.store %[[R_START]] to %[[RREF]] : !fir.ref<f32>
-!CHECK: %[[I_START:.*]] = arith.constant 0 : i32
-!CHECK: fir.store %[[I_START]] to %[[IREF]] : !fir.ref<i32>
-!CHECK: omp.parallel byref reduction(@[[RED_I32_NAME]] %[[IREF]] -> %[[PRV0:.+]] : !fir.ref<i32>, @[[RED_F32_NAME]] %[[RREF]] -> %[[PRV1:.+]] : !fir.ref<f32>) {
-!CHECK: %[[R_INCR:.*]] = arith.constant 1.500000e+00 : f32
-!CHECK: %[[LPRV1:.+]] = fir.load %[[PRV1]] : !fir.ref<f32>
-!CHECK: %[[RES1:.+]] = arith.addf %[[R_INCR]], %[[LPRV1]] {{.*}} : f32
-!CHECK: fir.store %[[RES1]] to %[[PRV1]]
-!CHECK: %[[LPRV0:.+]] = fir.load %[[PRV0]] : !fir.ref<i32>
-!CHECK: %[[I_INCR:.*]] = arith.constant 3 : i32
-!CHECK: %[[RES0:.+]] = arith.addi %[[LPRV0]], %[[I_INCR]]
-!CHECK: fir.store %[[RES0]] to %[[PRV0]]
-!CHECK: omp.terminator
-!CHECK: }
-!CHECK: return
-subroutine int_real_add
- real :: r
- integer :: i
-
- r = 0.0
- i = 0
-
- !$omp parallel reduction(+:i,r)
- r = 1.5 + r
- i = i + 3
- !$omp end parallel
-
- print *, r
- print *, i
-end subroutine
diff --git a/flang/test/Lower/OpenMP/FIR/parallel-reduction-add.f90 b/flang/test/Lower/OpenMP/FIR/parallel-reduction-add.f90
deleted file mode 100644
index 3f6d9e647c9b..000000000000
--- a/flang/test/Lower/OpenMP/FIR/parallel-reduction-add.f90
+++ /dev/null
@@ -1,105 +0,0 @@
-! RUN: bbc -emit-fir -hlfir=false -fopenmp -o - %s 2>&1 | FileCheck %s
-! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s
-
-!CHECK-LABEL: omp.declare_reduction
-!CHECK-SAME: @[[RED_F32_NAME:.*]] : f32 init {
-!CHECK: ^bb0(%{{.*}}: f32):
-!CHECK: %[[C0_1:.*]] = arith.constant 0.000000e+00 : f32
-!CHECK: omp.yield(%[[C0_1]] : f32)
-!CHECK: } combiner {
-!CHECK: ^bb0(%[[ARG0:.*]]: f32, %[[ARG1:.*]]: f32):
-!CHECK: %[[RES:.*]] = arith.addf %[[ARG0]], %[[ARG1]] {{.*}}: f32
-!CHECK: omp.yield(%[[RES]] : f32)
-!CHECK: }
-
-!CHECK-LABEL: omp.declare_reduction
-!CHECK-SAME: @[[RED_I32_NAME:.*]] : i32 init {
-!CHECK: ^bb0(%{{.*}}: i32):
-!CHECK: %[[C0_1:.*]] = arith.constant 0 : i32
-!CHECK: omp.yield(%[[C0_1]] : i32)
-!CHECK: } combiner {
-!CHECK: ^bb0(%[[ARG0:.*]]: i32, %[[ARG1:.*]]: i32):
-!CHECK: %[[RES:.*]] = arith.addi %[[ARG0]], %[[ARG1]] : i32
-!CHECK: omp.yield(%[[RES]] : i32)
-!CHECK: }
-
-!CHECK-LABEL: func.func @_QPsimple_int_add
-!CHECK: %[[IREF:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_int_addEi"}
-!CHECK: %[[I_START:.*]] = arith.constant 0 : i32
-!CHECK: fir.store %[[I_START]] to %[[IREF]] : !fir.ref<i32>
-!CHECK: omp.parallel reduction(@[[RED_I32_NAME]] %[[IREF]] -> %[[PRV:.+]] : !fir.ref<i32>) {
-!CHECK: %[[LPRV:.+]] = fir.load %[[PRV]] : !fir.ref<i32>
-!CHECK: %[[I_INCR:.+]] = arith.constant 1 : i32
-!CHECK: %[[RES:.+]] = arith.addi %[[LPRV]], %[[I_INCR]]
-!CHECK: fir.store %[[RES]] to %[[PRV]] : !fir.ref<i32>
-!CHECK: omp.terminator
-!CHECK: }
-!CHECK: return
-subroutine simple_int_add
- integer :: i
- i = 0
-
- !$omp parallel reduction(+:i)
- i = i + 1
- !$omp end parallel
-
- print *, i
-end subroutine
-
-!CHECK-LABEL: func.func @_QPsimple_real_add
-!CHECK: %[[RREF:.*]] = fir.alloca f32 {bindc_name = "r", uniq_name = "_QFsimple_real_addEr"}
-!CHECK: %[[R_START:.*]] = arith.constant 0.000000e+00 : f32
-!CHECK: fir.store %[[R_START]] to %[[RREF]] : !fir.ref<f32>
-!CHECK: omp.parallel reduction(@[[RED_F32_NAME]] %[[RREF]] -> %[[PRV:.+]] : !fir.ref<f32>) {
-!CHECK: %[[LPRV:.+]] = fir.load %[[PRV]] : !fir.ref<f32>
-!CHECK: %[[R_INCR:.+]] = arith.constant 1.500000e+00 : f32
-!CHECK: %[[RES:.+]] = arith.addf %[[LPRV]], %[[R_INCR]] {{.*}} : f32
-!CHECK: fir.store %[[RES]] to %[[PRV]] : !fir.ref<f32>
-!CHECK: omp.terminator
-!CHECK: }
-!CHECK: return
-subroutine simple_real_add
- real :: r
- r = 0.0
-
- !$omp parallel reduction(+:r)
- r = r + 1.5
- !$omp end parallel
-
- print *, r
-end subroutine
-
-!CHECK-LABEL: func.func @_QPint_real_add
-!CHECK: %[[IREF:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFint_real_addEi"}
-!CHECK: %[[RREF:.*]] = fir.alloca f32 {bindc_name = "r", uniq_name = "_QFint_real_addEr"}
-!CHECK: %[[R_START:.*]] = arith.constant 0.000000e+00 : f32
-!CHECK: fir.store %[[R_START]] to %[[RREF]] : !fir.ref<f32>
-!CHECK: %[[I_START:.*]] = arith.constant 0 : i32
-!CHECK: fir.store %[[I_START]] to %[[IREF]] : !fir.ref<i32>
-!CHECK: omp.parallel reduction(@[[RED_I32_NAME]] %[[IREF]] -> %[[PRV0:.+]] : !fir.ref<i32>, @[[RED_F32_NAME]] %[[RREF]] -> %[[PRV1:.+]] : !fir.ref<f32>) {
-!CHECK: %[[R_INCR:.*]] = arith.constant 1.500000e+00 : f32
-!CHECK: %[[LPRV1:.+]] = fir.load %[[PRV1]] : !fir.ref<f32>
-!CHECK: %[[RES1:.+]] = arith.addf %[[R_INCR]], %[[LPRV1]] {{.*}} : f32
-!CHECK: fir.store %[[RES1]] to %[[PRV1]]
-!CHECK: %[[LPRV0:.+]] = fir.load %[[PRV0]] : !fir.ref<i32>
-!CHECK: %[[I_INCR:.*]] = arith.constant 3 : i32
-!CHECK: %[[RES0:.+]] = arith.addi %[[LPRV0]], %[[I_INCR]]
-!CHECK: fir.store %[[RES0]] to %[[PRV0]]
-!CHECK: omp.terminator
-!CHECK: }
-!CHECK: return
-subroutine int_real_add
- real :: r
- integer :: i
-
- r = 0.0
- i = 0
-
- !$omp parallel reduction(+:i,r)
- r = 1.5 + r
- i = i + 3
- !$omp end parallel
-
- print *, r
- print *, i
-end subroutine
diff --git a/flang/test/Lower/OpenMP/FIR/parallel-sections.f90 b/flang/test/Lower/OpenMP/FIR/parallel-sections.f90
deleted file mode 100644
index 7730ab87a719..000000000000
--- a/flang/test/Lower/OpenMP/FIR/parallel-sections.f90
+++ /dev/null
@@ -1,65 +0,0 @@
-! REQUIRES: openmp_runtime
-
-!RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp %s -o - | FileCheck %s --check-prefixes="FIRDialect,OMPDialect"
-!RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp %s -o - | fir-opt --cfg-conversion | fir-opt --fir-to-llvm-ir | FileCheck %s --check-prefixes="OMPDialect,LLVMDialect"
-
-!===============================================================================
-! Parallel sections construct
-!===============================================================================
-
-!FIRDialect: func @_QPomp_parallel_sections
-subroutine omp_parallel_sections(x, y)
- integer, intent(inout) :: x, y
- !OMPDialect: omp.parallel {
- !OMPDialect: omp.sections {
- !$omp parallel sections
- !OMPDialect: omp.section {
- !$omp section
- !FIRDialect: fir.load
- !FIRDialect: arith.addi
- !FIRDialect: fir.store
- x = x + 12
- !OMPDialect: omp.terminator
- !OMPDialect: omp.section {
- !$omp section
- !FIRDialect: fir.load
- !FIRDialect: arith.subi
- !FIRDialect: fir.store
- y = y - 5
- !OMPDialect: omp.terminator
- !OMPDialect: omp.terminator
- !OMPDialect: omp.terminator
- !$omp end parallel sections
-end subroutine omp_parallel_sections
-
-!===============================================================================
-! Parallel sections construct with allocate clause
-!===============================================================================
-
-!FIRDialect: func @_QPomp_parallel_sections_allocate
-subroutine omp_parallel_sections_allocate(x, y)
- use omp_lib
- integer, intent(inout) :: x, y
- !FIRDialect: %[[allocator_1:.*]] = arith.constant 4 : i64
- !FIRDialect: %[[allocator_2:.*]] = arith.constant 4 : i64
- !LLVMDialect: %[[allocator_1:.*]] = llvm.mlir.constant(4 : i64) : i64
- !LLVMDialect: %[[allocator_2:.*]] = llvm.mlir.constant(4 : i64) : i64
- !OMPDialect: omp.parallel allocate(
- !FIRDialect: %[[allocator_2]] : i64 -> %{{.*}} : !fir.ref<i32>) {
- !LLVMDialect: %[[allocator_2]] : i64 -> %{{.*}} : !llvm.ptr) {
- !OMPDialect: omp.sections allocate(
- !FIRDialect: %[[allocator_1]] : i64 -> %{{.*}} : !fir.ref<i32>) {
- !LLVMDialect: %[[allocator_1]] : i64 -> %{{.*}} : !llvm.ptr) {
- !$omp parallel sections allocate(omp_high_bw_mem_alloc: x)
- !OMPDialect: omp.section {
- !$omp section
- x = x + 12
- !OMPDialect: omp.terminator
- !OMPDialect: omp.section {
- !$omp section
- y = y + 5
- !OMPDialect: omp.terminator
- !OMPDialect: omp.terminator
- !OMPDialect: omp.terminator
- !$omp end parallel sections
-end subroutine omp_parallel_sections_allocate
diff --git a/flang/test/Lower/OpenMP/FIR/parallel-wsloop-firstpriv.f90 b/flang/test/Lower/OpenMP/FIR/parallel-wsloop-firstpriv.f90
deleted file mode 100644
index 490f6d0cf7bc..000000000000
--- a/flang/test/Lower/OpenMP/FIR/parallel-wsloop-firstpriv.f90
+++ /dev/null
@@ -1,69 +0,0 @@
-! This test checks lowering of OpenMP parallel DO, with the loop bound being
-! a firstprivate variable
-
-! RUN: bbc -fopenmp -emit-fir -hlfir=false %s -o - | FileCheck %s
-
-! CHECK: func @_QPomp_do_firstprivate(%[[ARG0:.*]]: !fir.ref<i32> {fir.bindc_name = "a"})
-subroutine omp_do_firstprivate(a)
- integer::a
- integer::n
- n = a+1
- !$omp parallel do firstprivate(a)
- ! CHECK: omp.parallel {
- ! CHECK-NEXT: %[[REF:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
- ! CHECK-NEXT: %[[CLONE:.*]] = fir.alloca i32 {bindc_name = "a", pinned
- ! CHECK-NEXT: %[[LD:.*]] = fir.load %[[ARG0]] : !fir.ref<i32>
- ! CHECK-NEXT: fir.store %[[LD]] to %[[CLONE]] : !fir.ref<i32>
- ! CHECK: %[[LB:.*]] = arith.constant 1 : i32
- ! CHECK-NEXT: %[[UB:.*]] = fir.load %[[CLONE]] : !fir.ref<i32>
- ! CHECK-NEXT: %[[STEP:.*]] = arith.constant 1 : i32
- ! CHECK-NEXT: omp.wsloop {
- ! CHECK-NEXT: omp.loop_nest (%[[ARG1:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) {
- ! CHECK-NEXT: fir.store %[[ARG1]] to %[[REF]] : !fir.ref<i32>
- ! CHECK-NEXT: fir.call @_QPfoo(%[[REF]], %[[CLONE]]) {{.*}}: (!fir.ref<i32>, !fir.ref<i32>) -> ()
- ! CHECK-NEXT: omp.yield
- ! CHECK-NEXT: }
- ! CHECK-NEXT: omp.terminator
- ! CHECK-NEXT: }
- do i=1, a
- call foo(i, a)
- end do
- !$omp end parallel do
- !CHECK: fir.call @_QPbar(%[[ARG0]]) {{.*}}: (!fir.ref<i32>) -> ()
- call bar(a)
-end subroutine omp_do_firstprivate
-
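-! A rough sketch, not from the original test (foo is the same external as in
-! the test), of the property the %[[UB]] load above pins down: the loop
-! bound is read from the firstprivate clone of "a", never from the original
-! dummy. Other names are illustrative only.
-subroutine do_firstprivate_bound_sketch(a)
-  integer, intent(in) :: a
-  integer :: a_clone, i
-  a_clone = a         ! copy-in, as in %[[LD]] / fir.store above
-  do i = 1, a_clone   ! bound loaded from the clone (%[[UB]]), not %[[ARG0]]
-    call foo(i, a_clone)
-  end do
-end subroutine
-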
-! CHECK: func @_QPomp_do_firstprivate2(%[[ARG0:.*]]: !fir.ref<i32> {fir.bindc_name = "a"}, %[[ARG1:.*]]: !fir.ref<i32> {fir.bindc_name = "n"})
-subroutine omp_do_firstprivate2(a, n)
- integer::a
- integer::n
- n = a+1
- !$omp parallel do firstprivate(a, n)
- ! CHECK: omp.parallel {
- ! CHECK-NEXT: %[[REF:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
- ! CHECK-NEXT: %[[CLONE:.*]] = fir.alloca i32 {bindc_name = "a", pinned
- ! CHECK-NEXT: %[[LD:.*]] = fir.load %[[ARG0]] : !fir.ref<i32>
- ! CHECK-NEXT: fir.store %[[LD]] to %[[CLONE]] : !fir.ref<i32>
- ! CHECK-NEXT: %[[CLONE1:.*]] = fir.alloca i32 {bindc_name = "n", pinned
- ! CHECK-NEXT: %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<i32>
- ! CHECK-NEXT: fir.store %[[LD1]] to %[[CLONE1]] : !fir.ref<i32>
-
-
- ! CHECK: %[[LB:.*]] = fir.load %[[CLONE]] : !fir.ref<i32>
- ! CHECK-NEXT: %[[UB:.*]] = fir.load %[[CLONE1]] : !fir.ref<i32>
- ! CHECK-NEXT: %[[STEP:.*]] = arith.constant 1 : i32
- ! CHECK-NEXT: omp.wsloop {
- ! CHECK-NEXT: omp.loop_nest (%[[ARG2:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) {
- ! CHECK-NEXT: fir.store %[[ARG2]] to %[[REF]] : !fir.ref<i32>
- ! CHECK-NEXT: fir.call @_QPfoo(%[[REF]], %[[CLONE]]) {{.*}}: (!fir.ref<i32>, !fir.ref<i32>) -> ()
- ! CHECK-NEXT: omp.yield
- ! CHECK-NEXT: }
- ! CHECK-NEXT: omp.terminator
- ! CHECK-NEXT: }
- do i=a, n
- call foo(i, a)
- end do
- !$omp end parallel do
- !CHECK: fir.call @_QPbar(%[[ARG1]]) {{.*}}: (!fir.ref<i32>) -> ()
- call bar(n)
-end subroutine omp_do_firstprivate2
diff --git a/flang/test/Lower/OpenMP/FIR/parallel-wsloop.f90 b/flang/test/Lower/OpenMP/FIR/parallel-wsloop.f90
deleted file mode 100644
index 630d647bc64b..000000000000
--- a/flang/test/Lower/OpenMP/FIR/parallel-wsloop.f90
+++ /dev/null
@@ -1,297 +0,0 @@
-! This test checks lowering of the OpenMP DO directive (worksharing).
-
-! RUN: bbc -fopenmp -emit-fir -hlfir=false %s -o - | FileCheck %s
-
-! CHECK-LABEL: func @_QPsimple_parallel_do()
-subroutine simple_parallel_do
- integer :: i
- ! CHECK: omp.parallel
- ! CHECK: %[[WS_LB:.*]] = arith.constant 1 : i32
- ! CHECK: %[[WS_UB:.*]] = arith.constant 9 : i32
- ! CHECK: %[[WS_STEP:.*]] = arith.constant 1 : i32
- ! CHECK: omp.wsloop {
- ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[WS_LB]]) to (%[[WS_UB]]) inclusive step (%[[WS_STEP]]) {
- !$OMP PARALLEL DO
- do i=1, 9
- ! CHECK: fir.store %[[I]] to %[[IV_ADDR:.*]] : !fir.ref<i32>
- ! CHECK: %[[LOAD_IV:.*]] = fir.load %[[IV_ADDR]] : !fir.ref<i32>
- ! CHECK: fir.call @_FortranAioOutputInteger32({{.*}}, %[[LOAD_IV]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
- print*, i
- end do
- ! CHECK: omp.yield
- ! CHECK: omp.terminator
- ! CHECK: omp.terminator
- !$OMP END PARALLEL DO
-end subroutine
-
-! CHECK-LABEL: func @_QPparallel_do_with_parallel_clauses
-! CHECK-SAME: %[[COND_REF:.*]]: !fir.ref<!fir.logical<4>> {fir.bindc_name = "cond"}, %[[NT_REF:.*]]: !fir.ref<i32> {fir.bindc_name = "nt"}
-subroutine parallel_do_with_parallel_clauses(cond, nt)
- logical :: cond
- integer :: nt
- integer :: i
- ! CHECK: %[[COND:.*]] = fir.load %[[COND_REF]] : !fir.ref<!fir.logical<4>>
- ! CHECK: %[[COND_CVT:.*]] = fir.convert %[[COND]] : (!fir.logical<4>) -> i1
- ! CHECK: %[[NT:.*]] = fir.load %[[NT_REF]] : !fir.ref<i32>
- ! CHECK: omp.parallel if(%[[COND_CVT]] : i1) num_threads(%[[NT]] : i32) proc_bind(close)
- ! CHECK: %[[WS_LB:.*]] = arith.constant 1 : i32
- ! CHECK: %[[WS_UB:.*]] = arith.constant 9 : i32
- ! CHECK: %[[WS_STEP:.*]] = arith.constant 1 : i32
- ! CHECK: omp.wsloop {
- ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[WS_LB]]) to (%[[WS_UB]]) inclusive step (%[[WS_STEP]]) {
- !$OMP PARALLEL DO IF(cond) NUM_THREADS(nt) PROC_BIND(close)
- do i=1, 9
- ! CHECK: fir.store %[[I]] to %[[IV_ADDR:.*]] : !fir.ref<i32>
- ! CHECK: %[[LOAD_IV:.*]] = fir.load %[[IV_ADDR]] : !fir.ref<i32>
- ! CHECK: fir.call @_FortranAioOutputInteger32({{.*}}, %[[LOAD_IV]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
- print*, i
- end do
- ! CHECK: omp.yield
- ! CHECK: omp.terminator
- ! CHECK: omp.terminator
- !$OMP END PARALLEL DO
-end subroutine
-
-! CHECK-LABEL: func @_QPparallel_do_with_clauses
-! CHECK-SAME: %[[NT_REF:.*]]: !fir.ref<i32> {fir.bindc_name = "nt"}
-subroutine parallel_do_with_clauses(nt)
- integer :: nt
- integer :: i
- ! CHECK: %[[NT:.*]] = fir.load %[[NT_REF]] : !fir.ref<i32>
- ! CHECK: omp.parallel num_threads(%[[NT]] : i32)
- ! CHECK: %[[WS_LB:.*]] = arith.constant 1 : i32
- ! CHECK: %[[WS_UB:.*]] = arith.constant 9 : i32
- ! CHECK: %[[WS_STEP:.*]] = arith.constant 1 : i32
- ! CHECK: omp.wsloop schedule(dynamic) {
- ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[WS_LB]]) to (%[[WS_UB]]) inclusive step (%[[WS_STEP]])
- !$OMP PARALLEL DO NUM_THREADS(nt) SCHEDULE(dynamic)
- do i=1, 9
- ! CHECK: fir.store %[[I]] to %[[IV_ADDR:.*]] : !fir.ref<i32>
- ! CHECK: %[[LOAD_IV:.*]] = fir.load %[[IV_ADDR]] : !fir.ref<i32>
- ! CHECK: fir.call @_FortranAioOutputInteger32({{.*}}, %[[LOAD_IV]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
- print*, i
- end do
- ! CHECK: omp.yield
- ! CHECK: omp.terminator
- ! CHECK: omp.terminator
- !$OMP END PARALLEL DO
-end subroutine
-
-!===============================================================================
-! Checking for the following construct:
-! !$omp parallel do private(...) firstprivate(...)
-!===============================================================================
-
-! CHECK-LABEL: func @_QPparallel_do_with_privatisation_clauses
-! CHECK-SAME: %[[COND_REF:.*]]: !fir.ref<!fir.logical<4>> {fir.bindc_name = "cond"}, %[[NT_REF:.*]]: !fir.ref<i32> {fir.bindc_name = "nt"}
-subroutine parallel_do_with_privatisation_clauses(cond,nt)
- logical :: cond
- integer :: nt
- integer :: i
- ! CHECK: omp.parallel
- ! CHECK: %[[PRIVATE_COND_REF:.*]] = fir.alloca !fir.logical<4> {bindc_name = "cond", pinned, uniq_name = "_QFparallel_do_with_privatisation_clausesEcond"}
- ! CHECK: %[[PRIVATE_NT_REF:.*]] = fir.alloca i32 {bindc_name = "nt", pinned, uniq_name = "_QFparallel_do_with_privatisation_clausesEnt"}
- ! CHECK: %[[NT_VAL:.*]] = fir.load %[[NT_REF]] : !fir.ref<i32>
- ! CHECK: fir.store %[[NT_VAL]] to %[[PRIVATE_NT_REF]] : !fir.ref<i32>
- ! CHECK: %[[WS_LB:.*]] = arith.constant 1 : i32
- ! CHECK: %[[WS_UB:.*]] = arith.constant 9 : i32
- ! CHECK: %[[WS_STEP:.*]] = arith.constant 1 : i32
- ! CHECK: omp.wsloop {
- ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[WS_LB]]) to (%[[WS_UB]]) inclusive step (%[[WS_STEP]]) {
- !$OMP PARALLEL DO PRIVATE(cond) FIRSTPRIVATE(nt)
- do i=1, 9
- ! CHECK: fir.store %[[I]] to %[[IV_ADDR:.*]] : !fir.ref<i32>
- ! CHECK: %[[LOAD_IV:.*]] = fir.load %[[IV_ADDR]] : !fir.ref<i32>
- ! CHECK: fir.call @_FortranAioOutputInteger32({{.*}}, %[[LOAD_IV]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
- ! CHECK: %[[PRIVATE_COND_VAL:.*]] = fir.load %[[PRIVATE_COND_REF]] : !fir.ref<!fir.logical<4>>
- ! CHECK: %[[PRIVATE_COND_VAL_CVT:.*]] = fir.convert %[[PRIVATE_COND_VAL]] : (!fir.logical<4>) -> i1
- ! CHECK: fir.call @_FortranAioOutputLogical({{.*}}, %[[PRIVATE_COND_VAL_CVT]]) {{.*}}: (!fir.ref<i8>, i1) -> i1
- ! CHECK: %[[PRIVATE_NT_VAL:.*]] = fir.load %[[PRIVATE_NT_REF]] : !fir.ref<i32>
- ! CHECK: fir.call @_FortranAioOutputInteger32({{.*}}, %[[PRIVATE_NT_VAL]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
- print*, i, cond, nt
- end do
- ! CHECK: omp.yield
- ! CHECK: omp.terminator
- ! CHECK: omp.terminator
- !$OMP END PARALLEL DO
-end subroutine
-
-!===============================================================================
-! Checking for the following construct
-! !$omp parallel private(...) firstprivate(...)
-! !$omp do
-!===============================================================================
-
-subroutine parallel_private_do(cond,nt)
- logical :: cond
- integer :: nt
- integer :: i
- !$OMP PARALLEL PRIVATE(cond) FIRSTPRIVATE(nt)
- !$OMP DO
- do i=1, 9
- call foo(i, cond, nt)
- end do
- !$OMP END DO
- !$OMP END PARALLEL
-end subroutine parallel_private_do
-
-! CHECK-LABEL: func.func @_QPparallel_private_do(
-! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.logical<4>> {fir.bindc_name = "cond"},
-! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<i32> {fir.bindc_name = "nt"}) {
-! CHECK: %[[I:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFparallel_private_doEi"}
-! CHECK: omp.parallel {
-! CHECK: %[[I_PRIV:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
-! CHECK: %[[COND_ADDR:.*]] = fir.alloca !fir.logical<4> {bindc_name = "cond", pinned, uniq_name = "_QFparallel_private_doEcond"}
-! CHECK: %[[NT_ADDR:.*]] = fir.alloca i32 {bindc_name = "nt", pinned, uniq_name = "_QFparallel_private_doEnt"}
-! CHECK: %[[NT:.*]] = fir.load %[[VAL_1]] : !fir.ref<i32>
-! CHECK: fir.store %[[NT]] to %[[NT_ADDR]] : !fir.ref<i32>
-! CHECK: %[[VAL_7:.*]] = arith.constant 1 : i32
-! CHECK: %[[VAL_8:.*]] = arith.constant 9 : i32
-! CHECK: %[[VAL_9:.*]] = arith.constant 1 : i32
-! CHECK: omp.wsloop {
-! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[VAL_7]]) to (%[[VAL_8]]) inclusive step (%[[VAL_9]]) {
-! CHECK: fir.store %[[I]] to %[[I_PRIV]] : !fir.ref<i32>
-! CHECK: fir.call @_QPfoo(%[[I_PRIV]], %[[COND_ADDR]], %[[NT_ADDR]]) {{.*}}: (!fir.ref<i32>, !fir.ref<!fir.logical<4>>, !fir.ref<i32>) -> ()
-! CHECK: omp.yield
-! CHECK: }
-! CHECK: omp.terminator
-! CHECK: }
-! CHECK: omp.terminator
-! CHECK: }
-! CHECK: return
-! CHECK: }
-
-!===============================================================================
-! Checking for the following construct
-! !$omp parallel
-! !$omp do firstprivate(...) firstprivate(...)
-!===============================================================================
-
-subroutine omp_parallel_multiple_firstprivate_do(a, b)
- integer::a, b
- !$OMP PARALLEL FIRSTPRIVATE(a) FIRSTPRIVATE(b)
- !$OMP DO
- do i=1, 10
- call bar(i, a)
- end do
- !$OMP END DO
- !$OMP END PARALLEL
-end subroutine omp_parallel_multiple_firstprivate_do
-
-! CHECK-LABEL: func.func @_QPomp_parallel_multiple_firstprivate_do(
-! CHECK-SAME: %[[A_ADDR:.*]]: !fir.ref<i32> {fir.bindc_name = "a"},
-! CHECK-SAME: %[[B_ADDR:.*]]: !fir.ref<i32> {fir.bindc_name = "b"}) {
-! CHECK: %[[I_ADDR:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFomp_parallel_multiple_firstprivate_doEi"}
-! CHECK: omp.parallel {
-! CHECK: %[[I_PRIV_ADDR:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
-! CHECK: %[[A_PRIV_ADDR:.*]] = fir.alloca i32 {bindc_name = "a", pinned, uniq_name = "_QFomp_parallel_multiple_firstprivate_doEa"}
-! CHECK: %[[A:.*]] = fir.load %[[A_ADDR]] : !fir.ref<i32>
-! CHECK: fir.store %[[A]] to %[[A_PRIV_ADDR]] : !fir.ref<i32>
-! CHECK: %[[B_PRIV_ADDR:.*]] = fir.alloca i32 {bindc_name = "b", pinned, uniq_name = "_QFomp_parallel_multiple_firstprivate_doEb"}
-! CHECK: %[[B:.*]] = fir.load %[[B_ADDR]] : !fir.ref<i32>
-! CHECK: fir.store %[[B]] to %[[B_PRIV_ADDR]] : !fir.ref<i32>
-! CHECK: %[[VAL_8:.*]] = arith.constant 1 : i32
-! CHECK: %[[VAL_9:.*]] = arith.constant 10 : i32
-! CHECK: %[[VAL_10:.*]] = arith.constant 1 : i32
-! CHECK: omp.wsloop {
-! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[VAL_8]]) to (%[[VAL_9]]) inclusive step (%[[VAL_10]]) {
-! CHECK: fir.store %[[I]] to %[[I_PRIV_ADDR]] : !fir.ref<i32>
-! CHECK: fir.call @_QPbar(%[[I_PRIV_ADDR]], %[[A_PRIV_ADDR]]) {{.*}}: (!fir.ref<i32>, !fir.ref<i32>) -> ()
-! CHECK: omp.yield
-! CHECK: }
-! CHECK: omp.terminator
-! CHECK: }
-! CHECK: omp.terminator
-! CHECK: }
-! CHECK: return
-! CHECK: }
-
-!===============================================================================
-! Checking for the following construct
-! !$omp parallel
-! !$omp do private(...) firstprivate(...)
-!===============================================================================
-
-subroutine parallel_do_private(cond,nt)
- logical :: cond
- integer :: nt
- integer :: i
- !$OMP PARALLEL
- !$OMP DO PRIVATE(cond) FIRSTPRIVATE(nt)
- do i=1, 9
- call foo(i, cond, nt)
- end do
- !$OMP END DO
- !$OMP END PARALLEL
-end subroutine parallel_do_private
-
-! CHECK-LABEL: func.func @_QPparallel_do_private(
-! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.logical<4>> {fir.bindc_name = "cond"},
-! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<i32> {fir.bindc_name = "nt"}) {
-! CHECK: %[[I_ADDR:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFparallel_do_privateEi"}
-! CHECK: omp.parallel {
-! CHECK: %[[I_PRIV_ADDR:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
-! CHECK: %[[COND_ADDR:.*]] = fir.alloca !fir.logical<4> {bindc_name = "cond", pinned, uniq_name = "_QFparallel_do_privateEcond"}
-! CHECK: %[[NT_ADDR:.*]] = fir.alloca i32 {bindc_name = "nt", pinned, uniq_name = "_QFparallel_do_privateEnt"}
-! CHECK: %[[NT:.*]] = fir.load %[[VAL_1]] : !fir.ref<i32>
-! CHECK: fir.store %[[NT]] to %[[NT_ADDR]] : !fir.ref<i32>
-! CHECK: %[[VAL_7:.*]] = arith.constant 1 : i32
-! CHECK: %[[VAL_8:.*]] = arith.constant 9 : i32
-! CHECK: %[[VAL_9:.*]] = arith.constant 1 : i32
-! CHECK: omp.wsloop {
-! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[VAL_7]]) to (%[[VAL_8]]) inclusive step (%[[VAL_9]]) {
-! CHECK: fir.store %[[I]] to %[[I_PRIV_ADDR]] : !fir.ref<i32>
-! CHECK: fir.call @_QPfoo(%[[I_PRIV_ADDR]], %[[COND_ADDR]], %[[NT_ADDR]]) {{.*}}: (!fir.ref<i32>, !fir.ref<!fir.logical<4>>, !fir.ref<i32>) -> ()
-! CHECK: omp.yield
-! CHECK: }
-! CHECK: omp.terminator
-! CHECK: }
-! CHECK: omp.terminator
-! CHECK: }
-! CHECK: return
-! CHECK: }
-
-!===============================================================================
-! Checking for the following construct
-! !$omp parallel
-! !$omp do firstprivate(...) firstprivate(...)
-!===============================================================================
-
-subroutine omp_parallel_do_multiple_firstprivate(a, b)
- integer::a, b
- !$OMP PARALLEL
- !$OMP DO FIRSTPRIVATE(a) FIRSTPRIVATE(b)
- do i=1, 10
- call bar(i, a)
- end do
- !$OMP END DO
- !$OMP END PARALLEL
-end subroutine omp_parallel_do_multiple_firstprivate
-
-! CHECK-LABEL: func.func @_QPomp_parallel_do_multiple_firstprivate(
-! CHECK-SAME: %[[A_ADDR:.*]]: !fir.ref<i32> {fir.bindc_name = "a"},
-! CHECK-SAME: %[[B_ADDR:.*]]: !fir.ref<i32> {fir.bindc_name = "b"}) {
-! CHECK: %[[I_ADDR:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFomp_parallel_do_multiple_firstprivateEi"}
-! CHECK: omp.parallel {
-! CHECK: %[[I_PRIV_ADDR:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
-! CHECK: %[[A_PRIV_ADDR:.*]] = fir.alloca i32 {bindc_name = "a", pinned, uniq_name = "_QFomp_parallel_do_multiple_firstprivateEa"}
-! CHECK: %[[A:.*]] = fir.load %[[A_ADDR]] : !fir.ref<i32>
-! CHECK: fir.store %[[A]] to %[[A_PRIV_ADDR]] : !fir.ref<i32>
-! CHECK: %[[B_PRIV_ADDR:.*]] = fir.alloca i32 {bindc_name = "b", pinned, uniq_name = "_QFomp_parallel_do_multiple_firstprivateEb"}
-! CHECK: %[[B:.*]] = fir.load %[[B_ADDR]] : !fir.ref<i32>
-! CHECK: fir.store %[[B]] to %[[B_PRIV_ADDR]] : !fir.ref<i32>
-! CHECK: %[[VAL_8:.*]] = arith.constant 1 : i32
-! CHECK: %[[VAL_9:.*]] = arith.constant 10 : i32
-! CHECK: %[[VAL_10:.*]] = arith.constant 1 : i32
-! CHECK: omp.wsloop {
-! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[VAL_8]]) to (%[[VAL_9]]) inclusive step (%[[VAL_10]]) {
-! CHECK: fir.store %[[I]] to %[[I_PRIV_ADDR]] : !fir.ref<i32>
-! CHECK: fir.call @_QPbar(%[[I_PRIV_ADDR]], %[[A_PRIV_ADDR]]) {{.*}}: (!fir.ref<i32>, !fir.ref<i32>) -> ()
-! CHECK: omp.yield
-! CHECK: }
-! CHECK: omp.terminator
-! CHECK: }
-! CHECK: omp.terminator
-! CHECK: }
-! CHECK: return
-! CHECK: }
diff --git a/flang/test/Lower/OpenMP/FIR/parallel.f90 b/flang/test/Lower/OpenMP/FIR/parallel.f90
deleted file mode 100644
index a2ceb2d939f2..000000000000
--- a/flang/test/Lower/OpenMP/FIR/parallel.f90
+++ /dev/null
@@ -1,211 +0,0 @@
-! REQUIRES: openmp_runtime
-
-!RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp %s -o - | FileCheck %s --check-prefixes="FIRDialect,OMPDialect"
-!RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp %s -o - | fir-opt --fir-to-llvm-ir | FileCheck %s --check-prefixes="LLVMDialect,OMPDialect"
-
-!FIRDialect-LABEL: func @_QPparallel_simple
-subroutine parallel_simple()
- !OMPDialect: omp.parallel
-!$omp parallel
- !FIRDialect: fir.call
- call f1()
-!$omp end parallel
-end subroutine parallel_simple
-
-!===============================================================================
-! `if` clause
-!===============================================================================
-
-!FIRDialect-LABEL: func @_QPparallel_if
-subroutine parallel_if(alpha, beta, gamma)
- integer, intent(in) :: alpha
- logical, intent(in) :: beta
- logical(1) :: logical1
- logical(2) :: logical2
- logical(4) :: logical4
- logical(8) :: logical8
-
- !OMPDialect: omp.parallel if(%{{.*}} : i1) {
- !$omp parallel if(alpha .le. 0)
- !FIRDialect: fir.call
- call f1()
- !OMPDialect: omp.terminator
- !$omp end parallel
-
- !OMPDialect: omp.parallel if(%{{.*}} : i1) {
- !$omp parallel if(.false.)
- !FIRDialect: fir.call
- call f2()
- !OMPDialect: omp.terminator
- !$omp end parallel
-
- !OMPDialect: omp.parallel if(%{{.*}} : i1) {
- !$omp parallel if(alpha .ge. 0)
- !FIRDialect: fir.call
- call f3()
- !OMPDialect: omp.terminator
- !$omp end parallel
-
- !OMPDialect: omp.parallel if(%{{.*}} : i1) {
- !$omp parallel if(.true.)
- !FIRDialect: fir.call
- call f4()
- !OMPDialect: omp.terminator
- !$omp end parallel
-
- !OMPDialect: omp.parallel if(%{{.*}} : i1) {
- !$omp parallel if(beta)
- !FIRDialect: fir.call
- call f1()
- !OMPDialect: omp.terminator
- !$omp end parallel
-
- !OMPDialect: omp.parallel if(%{{.*}} : i1) {
- !$omp parallel if(logical1)
- !FIRDialect: fir.call
- call f1()
- !OMPDialect: omp.terminator
- !$omp end parallel
-
- !OMPDialect: omp.parallel if(%{{.*}} : i1) {
- !$omp parallel if(logical2)
- !FIRDialect: fir.call
- call f1()
- !OMPDialect: omp.terminator
- !$omp end parallel
-
- !OMPDialect: omp.parallel if(%{{.*}} : i1) {
- !$omp parallel if(logical4)
- !FIRDialect: fir.call
- call f1()
- !OMPDialect: omp.terminator
- !$omp end parallel
-
- !OMPDialect: omp.parallel if(%{{.*}} : i1) {
- !$omp parallel if(logical8)
- !FIRDialect: fir.call
- call f1()
- !OMPDialect: omp.terminator
- !$omp end parallel
-
-end subroutine parallel_if
-
-!===============================================================================
-! `num_threads` clause
-!===============================================================================
-
-!FIRDialect-LABEL: func @_QPparallel_numthreads
-subroutine parallel_numthreads(num_threads)
- integer, intent(inout) :: num_threads
-
- !OMPDialect: omp.parallel num_threads(%{{.*}} : i32) {
- !$omp parallel num_threads(16)
- !FIRDialect: fir.call
- call f1()
- !OMPDialect: omp.terminator
- !$omp end parallel
-
- num_threads = 4
-
- !OMPDialect: omp.parallel num_threads(%{{.*}} : i32) {
- !$omp parallel num_threads(num_threads)
- !FIRDialect: fir.call
- call f2()
- !OMPDialect: omp.terminator
- !$omp end parallel
-
-end subroutine parallel_numthreads
-
-!===============================================================================
-! `proc_bind` clause
-!===============================================================================
-
-!FIRDialect-LABEL: func @_QPparallel_proc_bind
-subroutine parallel_proc_bind()
-
- !OMPDialect: omp.parallel proc_bind(master) {
- !$omp parallel proc_bind(master)
- !FIRDialect: fir.call
- call f1()
- !OMPDialect: omp.terminator
- !$omp end parallel
-
- !OMPDialect: omp.parallel proc_bind(close) {
- !$omp parallel proc_bind(close)
- !FIRDialect: fir.call
- call f2()
- !OMPDialect: omp.terminator
- !$omp end parallel
-
- !OMPDialect: omp.parallel proc_bind(spread) {
- !$omp parallel proc_bind(spread)
- !FIRDialect: fir.call
- call f3()
- !OMPDialect: omp.terminator
- !$omp end parallel
-
-end subroutine parallel_proc_bind
-
-!===============================================================================
-! `allocate` clause
-!===============================================================================
-
-!FIRDialect-LABEL: func @_QPparallel_allocate
-subroutine parallel_allocate()
- use omp_lib
- integer :: x
- !OMPDialect: omp.parallel allocate(
- !FIRDialect: %{{.+}} : i64 -> %{{.+}} : !fir.ref<i32>
- !LLVMDialect: %{{.+}} : i64 -> %{{.+}} : !llvm.ptr
- !OMPDialect: ) {
- !$omp parallel allocate(omp_high_bw_mem_alloc: x) private(x)
- !FIRDialect: arith.addi
- x = x + 12
- !OMPDialect: omp.terminator
- !$omp end parallel
-end subroutine parallel_allocate
-
-!===============================================================================
-! multiple clauses
-!===============================================================================
-
-!FIRDialect-LABEL: func @_QPparallel_multiple_clauses
-subroutine parallel_multiple_clauses(alpha, num_threads)
- use omp_lib
- integer, intent(inout) :: alpha
- integer, intent(in) :: num_threads
-
- !OMPDialect: omp.parallel if({{.*}} : i1) proc_bind(master) {
- !$omp parallel if(alpha .le. 0) proc_bind(master)
- !FIRDialect: fir.call
- call f1()
- !OMPDialect: omp.terminator
- !$omp end parallel
-
- !OMPDialect: omp.parallel num_threads({{.*}} : i32) proc_bind(close) {
- !$omp parallel proc_bind(close) num_threads(num_threads)
- !FIRDialect: fir.call
- call f2()
- !OMPDialect: omp.terminator
- !$omp end parallel
-
- !OMPDialect: omp.parallel if({{.*}} : i1) num_threads({{.*}} : i32) {
- !$omp parallel num_threads(num_threads) if(alpha .le. 0)
- !FIRDialect: fir.call
- call f3()
- !OMPDialect: omp.terminator
- !$omp end parallel
-
- !OMPDialect: omp.parallel if({{.*}} : i1) num_threads({{.*}} : i32) allocate(
- !FIRDialect: %{{.+}} : i64 -> %{{.+}} : !fir.ref<i32>
- !LLVMDialect: %{{.+}} : i64 -> %{{.+}} : !llvm.ptr
- !OMPDialect: ) {
- !$omp parallel num_threads(num_threads) if(alpha .le. 0) allocate(omp_high_bw_mem_alloc: alpha) private(alpha)
- !FIRDialect: fir.call
- call f3()
- !FIRDialect: arith.addi
- alpha = alpha + 12
- !OMPDialect: omp.terminator
- !$omp end parallel
-
-end subroutine parallel_multiple_clauses
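For context, a minimal sketch (not part of the diff) of the run-time semantics behind the `if` clause tests above, assuming the standard omp_lib module; the subroutine name is illustrative. When the clause evaluates to .false., the region still executes, but with a team of exactly one thread:

  subroutine if_clause_semantics()
    use omp_lib
    integer :: nthreads
    !$omp parallel if(.false.)
    ! The if clause is false, so the team consists of one thread.
    nthreads = omp_get_num_threads()   ! always 1 here
    !$omp end parallel
  end subroutine if_clause_semantics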
diff --git a/flang/test/Lower/OpenMP/FIR/pre-fir-tree-loop.f90 b/flang/test/Lower/OpenMP/FIR/pre-fir-tree-loop.f90
deleted file mode 100644
index eca8fb304986..000000000000
--- a/flang/test/Lower/OpenMP/FIR/pre-fir-tree-loop.f90
+++ /dev/null
@@ -1,70 +0,0 @@
-! RUN: bbc -fopenmp -pft-test -o %t %s | FileCheck %s
-! RUN: %flang_fc1 -fopenmp -fdebug-dump-pft -o %t %s | FileCheck %s
-
-! Loop constructs always have an `end do` statement that can serve as the
-! target of a branch, so OpenMP loop constructs do not need an artificial
-! continue statement inserted as a branch target.
-
-!CHECK-LABEL: sb0
-!CHECK-NOT: continue
-subroutine sb0(cond)
- implicit none
- logical :: cond
- integer :: i
- !$omp parallel do
- do i = 1, 20
- if( cond) then
- cycle
- end if
- end do
- return
-end subroutine
-
-!CHECK-LABEL: sb1
-!CHECK-NOT: continue
-subroutine sb1(cond)
- implicit none
- logical :: cond
- integer :: i
- !$omp parallel do
- do i = 1, 20
- if( cond) then
- cycle
- end if
- end do
- !$omp end parallel do
- return
-end subroutine
-
-!CHECK-LABEL: sb2
-!CHECK-NOT: continue
-subroutine sb2
- integer :: ifld, isum, n
- integer :: tmp
-
- !$omp parallel do
- do ifld=1,n
- do isum=1,n
- if (tmp > n) then
- exit
- endif
- enddo
- tmp = n
- enddo
-end subroutine
-
-!CHECK-LABEL: sb3
-!CHECK-NOT: continue
-subroutine sb3
- integer :: ifld, isum, n
- integer :: tmp
-
- !$omp parallel do
- do ifld=1,n
- do isum=1,n
- if (tmp > n) then
- exit
- endif
- enddo
- enddo
-end subroutine
diff --git a/flang/test/Lower/OpenMP/FIR/pre-fir-tree01.f90 b/flang/test/Lower/OpenMP/FIR/pre-fir-tree01.f90
deleted file mode 100644
index fc817942513e..000000000000
--- a/flang/test/Lower/OpenMP/FIR/pre-fir-tree01.f90
+++ /dev/null
@@ -1,19 +0,0 @@
-! RUN: bbc -fopenmp -pft-test -o %t %s | FileCheck %s
-! RUN: %flang_fc1 -fopenmp -fdebug-dump-pft -o %t %s | FileCheck %s
-
-! Test structure of the Pre-FIR tree with OpenMP
-
-subroutine sub1(a, b, n)
- real :: a(:), b(:)
- integer :: n, i
- !$omp parallel do
- do i = 1, n
- b(i) = exp(a(i))
- end do
- !$omp end parallel do
-end subroutine
-
-! CHECK-LABEL: Subroutine sub1
-! CHECK: <<OpenMPConstruct>>
-! CHECK: <<DoConstruct>>
-! CHECK: <<End OpenMPConstruct>>
diff --git a/flang/test/Lower/OpenMP/FIR/private-commonblock.f90 b/flang/test/Lower/OpenMP/FIR/private-commonblock.f90
deleted file mode 100644
index 90036e0c0c7e..000000000000
--- a/flang/test/Lower/OpenMP/FIR/private-commonblock.f90
+++ /dev/null
@@ -1,109 +0,0 @@
-! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s
-
-!CHECK: func.func @_QPprivate_common() {
-!CHECK: omp.parallel {
-!CHECK: %[[X:.*]] = fir.alloca f32 {bindc_name = "x", pinned, uniq_name = "_QFprivate_commonEx"}
-!CHECK: %[[Y:.*]] = fir.alloca f32 {bindc_name = "y", pinned, uniq_name = "_QFprivate_commonEy"}
-!CHECK: omp.terminator
-!CHECK: }
-!CHECK: return
-!CHECK: }
-subroutine private_common
- common /c/ x, y
- real x, y
- !$omp parallel private(/c/)
- !$omp end parallel
-end subroutine
-
-!CHECK: %[[val_0:.*]] = fir.address_of(@blk_) : !fir.ref<!fir.array<74xi8>>
-!CHECK: %[[val_1:.*]] = fir.convert %[[val_0]] : (!fir.ref<!fir.array<74xi8>>) -> !fir.ref<!fir.array<?xi8>>
-!CHECK: %[[val_c0:.*]] = arith.constant 0 : index
-!CHECK: %[[val_2:.*]] = fir.coordinate_of %[[val_1]], %[[val_c0]] : (!fir.ref<!fir.array<?xi8>>, index) -> !fir.ref<i8>
-!CHECK: %[[val_3:.*]] = fir.convert %[[val_2]] : (!fir.ref<i8>) -> !fir.ref<i32>
-!CHECK: %[[val_4:.*]] = fir.convert %[[val_0]] : (!fir.ref<!fir.array<74xi8>>) -> !fir.ref<!fir.array<?xi8>>
-!CHECK: %[[val_c4:.*]] = arith.constant 4 : index
-!CHECK: %[[val_5:.*]] = fir.coordinate_of %[[val_4]], %[[val_c4]] : (!fir.ref<!fir.array<?xi8>>, index) -> !fir.ref<i8>
-!CHECK: %[[val_6:.*]] = fir.convert %[[val_5]] : (!fir.ref<i8>) -> !fir.ref<!fir.array<10xf32>>
-!CHECK: %[[val_7:.*]] = fir.convert %[[val_0]] : (!fir.ref<!fir.array<74xi8>>) -> !fir.ref<!fir.array<?xi8>>
-!CHECK: %[[val_c44:.*]] = arith.constant 44 : index
-!CHECK: %[[val_8:.*]] = fir.coordinate_of %[[val_7]], %[[val_c44]] : (!fir.ref<!fir.array<?xi8>>, index) -> !fir.ref<i8>
-!CHECK: %[[val_9:.*]] = fir.convert %[[val_8]] : (!fir.ref<i8>) -> !fir.ref<!fir.char<1,5>>
-!CHECK: %[[val_c5:.*]] = arith.constant 5 : index
-!CHECK: %[[val_10:.*]] = fir.convert %[[val_0]] : (!fir.ref<!fir.array<74xi8>>) -> !fir.ref<!fir.array<?xi8>>
-!CHECK: %[[val_c49:.*]] = arith.constant 49 : index
-!CHECK: %[[val_11:.*]] = fir.coordinate_of %[[val_10]], %[[val_c49]] : (!fir.ref<!fir.array<?xi8>>, index) -> !fir.ref<i8>
-!CHECK: %[[val_12:.*]] = fir.convert %[[val_11]] : (!fir.ref<i8>) -> !fir.ref<!fir.array<5x!fir.char<1,5>>>
-!CHECK: %[[val_c5_0:.*]] = arith.constant 5 : index
-!CHECK: %[[val_14:.*]] = fir.emboxchar %[[val_9]], %[[val_c5]] : (!fir.ref<!fir.char<1,5>>, index) -> !fir.boxchar<1>
-!CHECK: %[[val_15:.*]] = fir.convert %[[val_12]] : (!fir.ref<!fir.array<5x!fir.char<1,5>>>) -> !fir.ref<!fir.char<1,?>>
-!CHECK: %[[val_16:.*]] = fir.emboxchar %[[val_15]], %[[val_c5_0]] : (!fir.ref<!fir.char<1,?>>, index) -> !fir.boxchar<1>
-!CHECK: fir.call @_QPsub1(%[[val_3]], %[[val_6]], %[[val_14]], %[[val_16]]) fastmath<contract> : (!fir.ref<i32>, !fir.ref<!fir.array<10xf32>>, !fir.boxchar<1>, !fir.boxchar<1>) -> ()
-!CHECK: omp.parallel {
-!CHECK: %[[val_21:.*]] = fir.alloca i32 {bindc_name = "a", pinned, uniq_name = "_QFprivate_clause_commonblockEa"}
-!CHECK: %[[val_22:.*]] = fir.alloca !fir.array<10xf32> {bindc_name = "b", pinned, uniq_name = "_QFprivate_clause_commonblockEb"}
-!CHECK: %[[val_23:.*]] = fir.alloca !fir.char<1,5> {bindc_name = "c", pinned, uniq_name = "_QFprivate_clause_commonblockEc"}
-!CHECK: %[[val_24:.*]] = fir.alloca !fir.array<5x!fir.char<1,5>> {bindc_name = "d", pinned, uniq_name = "_QFprivate_clause_commonblockEd"}
-!CHECK: %[[val_26:.*]] = fir.emboxchar %[[val_23]], %[[val_c5]] : (!fir.ref<!fir.char<1,5>>, index) -> !fir.boxchar<1>
-!CHECK: %[[val_27:.*]] = fir.convert %[[val_24]] : (!fir.ref<!fir.array<5x!fir.char<1,5>>>) -> !fir.ref<!fir.char<1,?>>
-!CHECK: %[[val_28:.*]] = fir.emboxchar %[[val_27]], %[[val_c5_0]] : (!fir.ref<!fir.char<1,?>>, index) -> !fir.boxchar<1>
-!CHECK: fir.call @_QPsub2(%[[val_21]], %[[val_22]], %[[val_26]], %[[val_28]]) fastmath<contract> : (!fir.ref<i32>, !fir.ref<!fir.array<10xf32>>, !fir.boxchar<1>, !fir.boxchar<1>) -> ()
-!CHECK: omp.terminator
-!CHECK: }
-!CHECK: %[[val_18:.*]] = fir.emboxchar %[[val_9]], %[[val_c5]] : (!fir.ref<!fir.char<1,5>>, index) -> !fir.boxchar<1>
-!CHECK: %[[val_19:.*]] = fir.convert %[[val_12]] : (!fir.ref<!fir.array<5x!fir.char<1,5>>>) -> !fir.ref<!fir.char<1,?>>
-!CHECK: %[[val_20:.*]] = fir.emboxchar %[[val_19]], %[[val_c5_0]] : (!fir.ref<!fir.char<1,?>>, index) -> !fir.boxchar<1>
-!CHECK: fir.call @_QPsub3(%[[val_3]], %[[val_6]], %[[val_18]], %[[val_20]]) fastmath<contract> : {{.*}}
-!CHECK: return
-!CHECK: }
-subroutine private_clause_commonblock()
- integer :: a
- real :: b(10)
- character(5) :: c, d(5)
- common /blk/ a, b, c, d
-
- call sub1(a, b, c, d)
- !$omp parallel private(/blk/)
- call sub2(a, b, c, d)
- !$omp end parallel
- call sub3(a, b, c, d)
-end subroutine
-
-!CHECK: func.func @_QPprivate_clause_commonblock_pointer() {
-!CHECK: %[[val_0:.*]] = fir.address_of(@blk_) : !fir.ref<!fir.array<74xi8>>
-!CHECK: %[[val_1:.*]] = fir.convert %[[val_0]] : (!fir.ref<!fir.array<74xi8>>) -> !fir.ref<!fir.array<?xi8>>
-!CHECK: %[[val_c24:.*]] = arith.constant 24 : index
-!CHECK: %[[val_2:.*]] = fir.coordinate_of %[[val_1]], %[[val_c24]] : (!fir.ref<!fir.array<?xi8>>, index) -> !fir.ref<i8>
-!CHECK: %[[val_3:.*]] = fir.convert %[[val_2]] : (!fir.ref<i8>) -> !fir.ref<i32>
-!CHECK: %[[val_4:.*]] = fir.convert %[[val_0]] : (!fir.ref<!fir.array<74xi8>>) -> !fir.ref<!fir.array<?xi8>>
-!CHECK: %[[val_c0:.*]] = arith.constant 0 : index
-!CHECK: %[[val_5:.*]] = fir.coordinate_of %[[val_4]], %[[val_c0]] : (!fir.ref<!fir.array<?xi8>>, index) -> !fir.ref<i8>
-!CHECK: %[[val_6:.*]] = fir.convert %[[val_5]] : (!fir.ref<i8>) -> !fir.ref<!fir.box<!fir.ptr<!fir.complex<4>>>>
-!CHECK: %[[val_7:.*]] = fir.load %[[val_6]] : !fir.ref<!fir.box<!fir.ptr<!fir.complex<4>>>>
-!CHECK: %[[val_8:.*]] = fir.box_addr %[[val_7]] : (!fir.box<!fir.ptr<!fir.complex<4>>>) -> !fir.ptr<!fir.complex<4>>
-!CHECK: %[[val_9:.*]] = fir.convert %[[val_8]] : (!fir.ptr<!fir.complex<4>>) -> !fir.ref<!fir.complex<4>>
-!CHECK: fir.call @_QPsub4(%[[val_9]], %[[val_3]]) fastmath<contract> : (!fir.ref<!fir.complex<4>>, !fir.ref<i32>) -> ()
-!CHECK: omp.parallel {
-!CHECK: %[[val_13:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.complex<4>>> {bindc_name = "c", pinned, uniq_name = "_QFprivate_clause_commonblock_pointerEc"}
-!CHECK: %[[val_14:.*]] = fir.alloca i32 {bindc_name = "a", pinned, uniq_name = "_QFprivate_clause_commonblock_pointerEa"}
-!CHECK: %[[val_15:.*]] = fir.load %[[val_13]] : !fir.ref<!fir.box<!fir.ptr<!fir.complex<4>>>>
-!CHECK: %[[val_16:.*]] = fir.box_addr %[[val_15]] : (!fir.box<!fir.ptr<!fir.complex<4>>>) -> !fir.ptr<!fir.complex<4>>
-!CHECK: %[[val_17:.*]] = fir.convert %[[val_16]] : (!fir.ptr<!fir.complex<4>>) -> !fir.ref<!fir.complex<4>>
-!CHECK: fir.call @_QPsub5(%[[val_17]], %[[val_14]]) fastmath<contract> : (!fir.ref<!fir.complex<4>>, !fir.ref<i32>) -> ()
-!CHECK: omp.terminator
-!CHECK: }
-!CHECK: %[[val_10:.*]] = fir.load %[[val_6]] : !fir.ref<!fir.box<!fir.ptr<!fir.complex<4>>>>
-!CHECK: %[[val_11:.*]] = fir.box_addr %[[val_10]] : (!fir.box<!fir.ptr<!fir.complex<4>>>) -> !fir.ptr<!fir.complex<4>>
-!CHECK: %[[val_12:.*]] = fir.convert %[[val_11]] : (!fir.ptr<!fir.complex<4>>) -> !fir.ref<!fir.complex<4>>
-!CHECK: fir.call @_QPsub6(%[[val_12]], %[[val_3]]) fastmath<contract> : (!fir.ref<!fir.complex<4>>, !fir.ref<i32>) -> ()
-!CHECK: return
-!CHECK: }
-subroutine private_clause_commonblock_pointer()
- complex, pointer :: c
- integer:: a
- common /blk/ c, a
- call sub4(c, a)
- !$omp parallel private(/blk/)
- call sub5(c, a)
- !$omp end parallel
- call sub6(c, a)
-end subroutine
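For context, a minimal sketch (not part of the diff) of the semantics that `private(/blk/)` exercises above; names are illustrative. Privatizing a common block gives each thread an uninitialized private copy of every member, leaving the global storage untouched:

  subroutine private_common_semantics()
    integer :: a
    real :: b
    common /blk2/ a, b
    a = 1
    !$omp parallel private(/blk2/)
    ! a and b are thread-local, uninitialized copies here.
    a = 99
    !$omp end parallel
    ! The global a is still 1 at this point.
  end subroutine private_common_semantics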
diff --git a/flang/test/Lower/OpenMP/FIR/requires-common.f90 b/flang/test/Lower/OpenMP/FIR/requires-common.f90
deleted file mode 100644
index 2e112d72de3f..000000000000
--- a/flang/test/Lower/OpenMP/FIR/requires-common.f90
+++ /dev/null
@@ -1,19 +0,0 @@
-! RUN: %flang_fc1 -emit-fir -fopenmp %s -o - | FileCheck %s
-! RUN: %flang_fc1 -emit-fir -fopenmp -fopenmp-is-target-device %s -o - | FileCheck %s
-! RUN: bbc -fopenmp -emit-fir %s -o - | FileCheck %s
-! RUN: bbc -fopenmp -fopenmp-is-target-device -emit-fir %s -o - | FileCheck %s
-
-! This test checks the lowering of the requires directive into MLIR.
-
-!CHECK: module attributes {
-!CHECK-SAME: omp.requires = #omp<clause_requires unified_shared_memory>
-block data init
- !$omp requires unified_shared_memory
- integer :: x
- common /block/ x
- data x / 10 /
-end
-
-subroutine f
- !$omp declare target
-end subroutine f
diff --git a/flang/test/Lower/OpenMP/FIR/requires-notarget.f90 b/flang/test/Lower/OpenMP/FIR/requires-notarget.f90
deleted file mode 100644
index bfa509208428..000000000000
--- a/flang/test/Lower/OpenMP/FIR/requires-notarget.f90
+++ /dev/null
@@ -1,14 +0,0 @@
-! RUN: %flang_fc1 -emit-fir -fopenmp %s -o - | FileCheck %s
-! RUN: %flang_fc1 -emit-fir -fopenmp -fopenmp-is-target-device %s -o - | FileCheck %s
-! RUN: bbc -fopenmp -emit-fir %s -o - | FileCheck %s
-! RUN: bbc -fopenmp -fopenmp-is-target-device -emit-fir %s -o - | FileCheck %s
-
-! This test checks that lowering of the requires directive into MLIR skips
-! creating the omp.requires attribute for target-related clauses when there
-! are no device functions in the compilation unit.
-
-!CHECK: module attributes {
-!CHECK-NOT: omp.requires
-program requires
- !$omp requires unified_shared_memory reverse_offload atomic_default_mem_order(seq_cst)
-end program requires
diff --git a/flang/test/Lower/OpenMP/FIR/requires.f90 b/flang/test/Lower/OpenMP/FIR/requires.f90
deleted file mode 100644
index bc53931b9f24..000000000000
--- a/flang/test/Lower/OpenMP/FIR/requires.f90
+++ /dev/null
@@ -1,14 +0,0 @@
-! RUN: %flang_fc1 -emit-fir -fopenmp %s -o - | FileCheck %s
-! RUN: %flang_fc1 -emit-fir -fopenmp -fopenmp-is-target-device %s -o - | FileCheck %s
-! RUN: bbc -fopenmp -emit-fir %s -o - | FileCheck %s
-! RUN: bbc -fopenmp -fopenmp-is-target-device -emit-fir %s -o - | FileCheck %s
-
-! This test checks the lowering of the requires directive into MLIR.
-
-!CHECK: module attributes {
-!CHECK-SAME: omp.requires = #omp<clause_requires reverse_offload|unified_shared_memory>
-program requires
- !$omp requires unified_shared_memory reverse_offload atomic_default_mem_order(seq_cst)
- !$omp target
- !$omp end target
-end program requires
diff --git a/flang/test/Lower/OpenMP/FIR/rtl-flags.f90 b/flang/test/Lower/OpenMP/FIR/rtl-flags.f90
deleted file mode 100644
index ad8eb9e73213..000000000000
--- a/flang/test/Lower/OpenMP/FIR/rtl-flags.f90
+++ /dev/null
@@ -1,39 +0,0 @@
-!RUN: %flang_fc1 -emit-fir -fopenmp -fopenmp-is-target-device %s -o - | FileCheck %s --check-prefix=DEFAULT-DEVICE-FIR
-!RUN: %flang_fc1 -emit-fir -fopenmp -fopenmp-version=45 %s -o - | FileCheck %s --check-prefix=DEFAULT-HOST-FIR
-!RUN: %flang_fc1 -emit-fir -fopenmp -fopenmp-is-target-device -fopenmp-version=45 %s -o - | FileCheck %s --check-prefix=DEFAULT-DEVICE-FIR-VERSION
-!RUN: %flang_fc1 -emit-fir -fopenmp -fopenmp-version=45 %s -o - | FileCheck %s --check-prefix=DEFAULT-HOST-FIR-VERSION
-!RUN: %flang_fc1 -emit-fir -fopenmp -fopenmp-target-debug -fopenmp-is-target-device %s -o - | FileCheck %s --check-prefix=DBG-DEVICE-FIR
-!RUN: %flang_fc1 -emit-fir -fopenmp -fopenmp-target-debug=111 -fopenmp-is-target-device %s -o - | FileCheck %s --check-prefix=DBG-EQ-DEVICE-FIR
-!RUN: %flang_fc1 -emit-fir -fopenmp -fopenmp-assume-teams-oversubscription -fopenmp-is-target-device %s -o - | FileCheck %s --check-prefix=TEAMS-OSUB-DEVICE-FIR
-!RUN: %flang_fc1 -emit-fir -fopenmp -fopenmp-assume-threads-oversubscription -fopenmp-is-target-device %s -o - | FileCheck %s --check-prefix=THREAD-OSUB-DEVICE-FIR
-!RUN: %flang_fc1 -emit-fir -fopenmp -fopenmp-assume-no-thread-state -fopenmp-is-target-device %s -o - | FileCheck %s --check-prefix=THREAD-STATE-DEVICE-FIR
-!RUN: %flang_fc1 -emit-fir -fopenmp -fopenmp-assume-no-nested-parallelism -fopenmp-is-target-device %s -o - | FileCheck %s --check-prefix=NEST-PAR-DEVICE-FIR
-!RUN: %flang_fc1 -emit-fir -fopenmp -fopenmp-target-debug -fopenmp-assume-teams-oversubscription -fopenmp-assume-no-nested-parallelism -fopenmp-assume-threads-oversubscription -fopenmp-assume-no-thread-state -fopenmp-is-target-device %s -o - | FileCheck %s --check-prefix=ALL-DEVICE-FIR
-!RUN: bbc -emit-fir -fopenmp -fopenmp-is-target-device -o - %s | FileCheck %s --check-prefix=DEFAULT-DEVICE-FIR
-!RUN: bbc -emit-fir -fopenmp -fopenmp-is-target-device -fopenmp-version=45 -o - %s | FileCheck %s --check-prefix=DEFAULT-DEVICE-FIR-VERSION
-!RUN: bbc -emit-fir -fopenmp -o - %s | FileCheck %s --check-prefix=DEFAULT-HOST-FIR
-!RUN: bbc -emit-fir -fopenmp -fopenmp-version=45 -o - %s | FileCheck %s --check-prefix=DEFAULT-HOST-FIR-VERSION
-!RUN: bbc -emit-fir -fopenmp -fopenmp-target-debug=111 -fopenmp-is-target-device -o - %s | FileCheck %s --check-prefix=DBG-EQ-DEVICE-FIR
-!RUN: bbc -emit-fir -fopenmp -fopenmp-assume-teams-oversubscription -fopenmp-is-target-device -o - %s | FileCheck %s --check-prefix=TEAMS-OSUB-DEVICE-FIR
-!RUN: bbc -emit-fir -fopenmp -fopenmp-assume-threads-oversubscription -fopenmp-is-target-device -o - %s | FileCheck %s --check-prefix=THREAD-OSUB-DEVICE-FIR
-!RUN: bbc -emit-fir -fopenmp -fopenmp-assume-no-thread-state -fopenmp-is-target-device -o - %s | FileCheck %s --check-prefix=THREAD-STATE-DEVICE-FIR
-!RUN: bbc -emit-fir -fopenmp -fopenmp-assume-no-nested-parallelism -fopenmp-is-target-device -o - %s | FileCheck %s --check-prefix=NEST-PAR-DEVICE-FIR
-!RUN: bbc -emit-fir -fopenmp -fopenmp-target-debug=1 -fopenmp-assume-teams-oversubscription -fopenmp-assume-no-nested-parallelism -fopenmp-assume-threads-oversubscription -fopenmp-assume-no-thread-state -fopenmp-is-target-device -o - %s | FileCheck %s --check-prefix=ALL-DEVICE-FIR
-
-!DEFAULT-DEVICE-FIR: module attributes {{{.*}}omp.flags = #omp.flags<openmp_device_version = 11>
-!DEFAULT-DEVICE-FIR-SAME: omp.is_target_device = true
-!DEFAULT-DEVICE-FIR-VERSION: module attributes {{{.*}}omp.flags = #omp.flags<openmp_device_version = 45>
-!DEFAULT-DEVICE-FIR-VERSION-SAME: omp.is_target_device = true
-!DEFAULT-DEVICE-FIR-VERSION-SAME: omp.version = #omp.version<version = 45>
-!DEFAULT-HOST-FIR: module attributes {{{.*}}omp.is_target_device = false{{.*}}
-!DEFAULT-HOST-FIR-VERSION: module attributes {{{.*}}omp.is_target_device = false
-!DEFAULT-HOST-FIR-VERSION-SAME: omp.version = #omp.version<version = 45>
-!DBG-DEVICE-FIR: module attributes {{{.*}}omp.flags = #omp.flags<debug_kind = 1, openmp_device_version = 11>
-!DBG-EQ-DEVICE-FIR: module attributes {{{.*}}omp.flags = #omp.flags<debug_kind = 111, openmp_device_version = 11>
-!TEAMS-OSUB-DEVICE-FIR: module attributes {{{.*}}omp.flags = #omp.flags<assume_teams_oversubscription = true, openmp_device_version = 11>
-!THREAD-OSUB-DEVICE-FIR: module attributes {{{.*}}omp.flags = #omp.flags<assume_threads_oversubscription = true, openmp_device_version = 11>
-!THREAD-STATE-DEVICE-FIR: module attributes {{{.*}}omp.flags = #omp.flags<assume_no_thread_state = true, openmp_device_version = 11>
-!NEST-PAR-DEVICE-FIR: module attributes {{{.*}}omp.flags = #omp.flags<assume_no_nested_parallelism = true, openmp_device_version = 11>
-!ALL-DEVICE-FIR: module attributes {{{.*}}omp.flags = #omp.flags<debug_kind = 1, assume_teams_oversubscription = true, assume_threads_oversubscription = true, assume_no_thread_state = true, assume_no_nested_parallelism = true, openmp_device_version = 11>
-subroutine omp_subroutine()
-end subroutine omp_subroutine
diff --git a/flang/test/Lower/OpenMP/FIR/sections-pft.f90 b/flang/test/Lower/OpenMP/FIR/sections-pft.f90
deleted file mode 100644
index 7b20a87022c9..000000000000
--- a/flang/test/Lower/OpenMP/FIR/sections-pft.f90
+++ /dev/null
@@ -1,91 +0,0 @@
-! RUN: %flang_fc1 -fdebug-pre-fir-tree -fopenmp %s | FileCheck %s
-
-subroutine openmp_sections(x, y)
-
- integer, intent(inout) :: x, y
-
-!==============================================================================
-! empty construct
-!==============================================================================
-!$omp sections
-!$omp end sections
-
-!CHECK: OpenMPConstruct
-!CHECK: End OpenMPConstruct
-
-!==============================================================================
-! single section, without `!$omp section`
-!==============================================================================
-!$omp sections
- call F1()
-!$omp end sections
-
-!CHECK: OpenMPConstruct
-!CHECK: OpenMPConstruct
-!CHECK: CallStmt
-!CHECK: End OpenMPConstruct
-!CHECK: End OpenMPConstruct
-
-!==============================================================================
-! single section with `!$omp section`
-!==============================================================================
-!$omp sections
- !$omp section
- call F1
-!$omp end sections
-
-!CHECK: OpenMPConstruct
-!CHECK: OpenMPConstruct
-!CHECK: CallStmt
-!CHECK: End OpenMPConstruct
-!CHECK: End OpenMPConstruct
-
-!==============================================================================
-! multiple sections
-!==============================================================================
-!$omp sections
- !$omp section
- call F1
- !$omp section
- call F2
- !$omp section
- call F3
-!$omp end sections
-
-!CHECK: OpenMPConstruct
-!CHECK: OpenMPConstruct
-!CHECK: CallStmt
-!CHECK: End OpenMPConstruct
-!CHECK: OpenMPConstruct
-!CHECK: CallStmt
-!CHECK: End OpenMPConstruct
-!CHECK: OpenMPConstruct
-!CHECK: CallStmt
-!CHECK: End OpenMPConstruct
-!CHECK: End OpenMPConstruct
-
-!==============================================================================
-! multiple sections with clauses
-!==============================================================================
-!$omp sections PRIVATE(x) FIRSTPRIVATE(y)
- !$omp section
- call F1
- !$omp section
- call F2
- !$omp section
- call F3
-!$omp end sections NOWAIT
-
-!CHECK: OpenMPConstruct
-!CHECK: OpenMPConstruct
-!CHECK: CallStmt
-!CHECK: End OpenMPConstruct
-!CHECK: OpenMPConstruct
-!CHECK: CallStmt
-!CHECK: End OpenMPConstruct
-!CHECK: OpenMPConstruct
-!CHECK: CallStmt
-!CHECK: End OpenMPConstruct
-!CHECK: End OpenMPConstruct
-
-end subroutine openmp_sections
diff --git a/flang/test/Lower/OpenMP/FIR/sections.f90 b/flang/test/Lower/OpenMP/FIR/sections.f90
deleted file mode 100644
index 7b313f3dc0b4..000000000000
--- a/flang/test/Lower/OpenMP/FIR/sections.f90
+++ /dev/null
@@ -1,288 +0,0 @@
-! REQUIRES: openmp_runtime
-
-! This test checks the lowering of the OpenMP sections construct with several clauses present.
-
-! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp %s -o - | FileCheck %s
-
-!CHECK: func @_QQmain() attributes {fir.bindc_name = "sample"} {
-!CHECK: %[[COUNT:.*]] = fir.address_of(@_QFEcount) : !fir.ref<i32>
-!CHECK: %[[ETA:.*]] = fir.alloca f32 {bindc_name = "eta", uniq_name = "_QFEeta"}
-!CHECK: %[[CONST_1:.*]] = arith.constant 4 : i64
-!CHECK: omp.sections allocate(%[[CONST_1]] : i64 -> %[[COUNT]] : !fir.ref<i32>) {
-!CHECK: omp.section {
-!CHECK: %[[PRIVATE_ETA:.*]] = fir.alloca f32 {bindc_name = "eta", pinned, uniq_name = "_QFEeta"}
-!CHECK: %[[PRIVATE_DOUBLE_COUNT:.*]] = fir.alloca i32 {bindc_name = "double_count", pinned, uniq_name = "_QFEdouble_count"}
-!CHECK: %[[const:.*]] = arith.constant 5 : i32
-!CHECK: fir.store %[[const]] to %[[COUNT]] : !fir.ref<i32>
-!CHECK: %[[temp_count:.*]] = fir.load %[[COUNT]] : !fir.ref<i32>
-!CHECK: %[[temp_double_count:.*]] = fir.load %[[PRIVATE_DOUBLE_COUNT]] : !fir.ref<i32>
-!CHECK: %[[result:.*]] = arith.muli %[[temp_count]], %[[temp_double_count]] : i32
-!CHECK: {{.*}} = fir.convert %[[result]] : (i32) -> f32
-!CHECK: fir.store {{.*}} to %[[PRIVATE_ETA]] : !fir.ref<f32>
-!CHECK: omp.terminator
-!CHECK: }
-!CHECK: omp.section {
-!CHECK: %[[PRIVATE_ETA:.*]] = fir.alloca f32 {bindc_name = "eta", pinned, uniq_name = "_QFEeta"}
-!CHECK: %[[PRIVATE_DOUBLE_COUNT:.*]] = fir.alloca i32 {bindc_name = "double_count", pinned, uniq_name = "_QFEdouble_count"}
-!CHECK: %[[temp:.*]] = fir.load %[[PRIVATE_DOUBLE_COUNT]] : !fir.ref<i32>
-!CHECK: %[[const:.*]] = arith.constant 1 : i32
-!CHECK: %[[result:.*]] = arith.addi %[[temp]], %[[const]] : i32
-!CHECK: fir.store %[[result]] to %[[PRIVATE_DOUBLE_COUNT]] : !fir.ref<i32>
-!CHECK: omp.terminator
-!CHECK: }
-!CHECK: omp.section {
-!CHECK: %[[PRIVATE_ETA:.*]] = fir.alloca f32 {bindc_name = "eta", pinned, uniq_name = "_QFEeta"}
-!CHECK: %[[PRIVATE_DOUBLE_COUNT:.*]] = fir.alloca i32 {bindc_name = "double_count", pinned, uniq_name = "_QFEdouble_count"}
-!CHECK: %[[temp:.*]] = fir.load %[[PRIVATE_ETA]] : !fir.ref<f32>
-!CHECK: %[[const:.*]] = arith.constant 7.000000e+00 : f32
-!CHECK: %[[result:.*]] = arith.subf %[[temp]], %[[const]] {{.*}}: f32
-!CHECK: fir.store %[[result]] to %[[PRIVATE_ETA]] : !fir.ref<f32>
-!CHECK: {{.*}} = fir.load %[[COUNT]] : !fir.ref<i32>
-!CHECK: %[[temp_count:.*]] = fir.convert {{.*}} : (i32) -> f32
-!CHECK: %[[temp_eta:.*]] = fir.load %[[PRIVATE_ETA]] : !fir.ref<f32>
-!CHECK: {{.*}} = arith.mulf %[[temp_count]], %[[temp_eta]] {{.*}}: f32
-!CHECK: %[[result:.*]] = fir.convert {{.*}} : (f32) -> i32
-!CHECK: fir.store %[[result]] to %[[COUNT]] : !fir.ref<i32>
-!CHECK: {{.*}} = fir.load %[[COUNT]] : !fir.ref<i32>
-!CHECK: %[[temp_count:.*]] = fir.convert {{.*}} : (i32) -> f32
-!CHECK: %[[temp_eta:.*]] = fir.load %[[PRIVATE_ETA]] : !fir.ref<f32>
-!CHECK: {{.*}} = arith.subf %[[temp_count]], %[[temp_eta]] {{.*}}: f32
-!CHECK: %[[result:.*]] = fir.convert {{.*}} : (f32) -> i32
-!CHECK: fir.store %[[result]] to %[[PRIVATE_DOUBLE_COUNT]] : !fir.ref<i32>
-!CHECK: omp.terminator
-!CHECK: }
-!CHECK: omp.terminator
-!CHECK: }
-!CHECK: omp.sections nowait {
-!CHECK: omp.terminator
-!CHECK: }
-!CHECK: return
-!CHECK: }
-
-program sample
- use omp_lib
- integer :: count = 0, double_count = 1
- !$omp sections private (eta, double_count) allocate(omp_high_bw_mem_alloc: count)
- !$omp section
- count = 1 + 4
- eta = count * double_count
- !$omp section
- double_count = double_count + 1
- !$omp section
- eta = eta - 7
- count = count * eta
- double_count = count - eta
- !$omp end sections
-
- !$omp sections
- !$omp end sections nowait
-end program sample
-
-!CHECK: func @_QPfirstprivate(%[[ARG:.*]]: !fir.ref<f32> {fir.bindc_name = "alpha"}) {
-!CHECK: omp.sections {
-!CHECK: omp.section {
-!CHECK: %[[PRIVATE_ALPHA:.*]] = fir.alloca f32 {bindc_name = "alpha", pinned, uniq_name = "_QFfirstprivateEalpha"}
-!CHECK: %[[temp:.*]] = fir.load %[[ARG]] : !fir.ref<f32>
-!CHECK: fir.store %[[temp]] to %[[PRIVATE_ALPHA]] : !fir.ref<f32>
-!CHECK: omp.terminator
-!CHECK: }
-!CHECK: omp.terminator
-!CHECK: }
-!CHECK: omp.sections {
-!CHECK: omp.section {
-!CHECK: %[[PRIVATE_VAR:.*]] = fir.load %[[ARG]] : !fir.ref<f32>
-!CHECK: %[[CONSTANT:.*]] = arith.constant 5.000000e+00 : f32
-!CHECK: %[[PRIVATE_VAR_2:.*]] = arith.mulf %[[PRIVATE_VAR]], %[[CONSTANT]] {{.*}}: f32
-!CHECK: fir.store %[[PRIVATE_VAR_2]] to %[[ARG]] : !fir.ref<f32>
-!CHECK: omp.terminator
-!CHECK: }
-!CHECK: omp.terminator
-!CHECK: }
-!CHECK: return
-!CHECK: }
-
-subroutine firstprivate(alpha)
- real :: alpha
- !$omp sections firstprivate(alpha)
- !$omp end sections
-
- !$omp sections
- alpha = alpha * 5
- !$omp end sections
-end subroutine
-
-subroutine lastprivate()
- integer :: x
-!CHECK: %[[X:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFlastprivateEx"}
-!CHECK: omp.sections {
- !$omp sections lastprivate(x)
-!CHECK: omp.section {
-!CHECK: %[[PRIVATE_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFlastprivateEx"}
-!CHECK: %[[const:.*]] = arith.constant 10 : i32
-!CHECK: %[[temp:.*]] = fir.load %[[PRIVATE_X]] : !fir.ref<i32>
-!CHECK: %[[result:.*]] = arith.muli %[[const]], %[[temp]] : i32
-!CHECK: fir.store %[[result]] to %[[PRIVATE_X]] : !fir.ref<i32>
-!CHECK: omp.terminator
-!CHECK: }
- !$omp section
- x = x * 10
-!CHECK: omp.section {
-!CHECK: %[[PRIVATE_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFlastprivateEx"}
-!CHECK: %[[temp:.*]] = fir.load %[[PRIVATE_X]] : !fir.ref<i32>
-!CHECK: %[[const:.*]] = arith.constant 1 : i32
-!CHECK: %[[result:.*]] = arith.addi %[[temp]], %[[const]] : i32
-!CHECK: fir.store %[[result]] to %[[PRIVATE_X]] : !fir.ref<i32>
-!CHECK: %[[true:.*]] = arith.constant true
-!CHECK: fir.if %[[true]] {
-!CHECK: %[[temp:.*]] = fir.load %[[PRIVATE_X]] : !fir.ref<i32>
-!CHECK: fir.store %[[temp]] to %[[X]] : !fir.ref<i32>
-!CHECK: }
-!CHECK: omp.terminator
-!CHECK: }
- !$omp section
- x = x + 1
-!CHECK: omp.terminator
-!CHECK: }
- !$omp end sections
-
-!CHECK: omp.sections {
- !$omp sections firstprivate(x) lastprivate(x)
-!CHECK: omp.section {
-!CHECK: %[[PRIVATE_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFlastprivateEx"}
-!CHECK: %[[temp:.*]] = fir.load %[[X]] : !fir.ref<i32>
-!CHECK: fir.store %[[temp]] to %[[PRIVATE_X]] : !fir.ref<i32>
-!CHECK: omp.barrier
-!CHECK: %[[const:.*]] = arith.constant 10 : i32
-!CHECK: %[[temp:.*]] = fir.load %[[PRIVATE_X]] : !fir.ref<i32>
-!CHECK: %[[result:.*]] = arith.muli %[[const]], %[[temp]] : i32
-!CHECK: fir.store %[[result]] to %[[PRIVATE_X]] : !fir.ref<i32>
-!CHECK: omp.terminator
-!CHECK: }
- !$omp section
- x = x * 10
-!CHECK: omp.section {
-!CHECK: %[[PRIVATE_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFlastprivateEx"}
-!CHECK: %[[temp:.*]] = fir.load %[[X]] : !fir.ref<i32>
-!CHECK: fir.store %[[temp]] to %[[PRIVATE_X]] : !fir.ref<i32>
-!CHECK: omp.barrier
-!CHECK: %[[temp:.*]] = fir.load %[[PRIVATE_X]] : !fir.ref<i32>
-!CHECK: %[[const:.*]] = arith.constant 1 : i32
-!CHECK: %[[result:.*]] = arith.addi %[[temp]], %[[const]] : i32
-!CHECK: fir.store %[[result]] to %[[PRIVATE_X]] : !fir.ref<i32>
-!CHECK: %[[true:.*]] = arith.constant true
-!CHECK: fir.if %[[true]] {
-!CHECK: %[[temp:.*]] = fir.load %[[PRIVATE_X]] : !fir.ref<i32>
-!CHECK: fir.store %[[temp]] to %[[X]] : !fir.ref<i32>
-!CHECK: }
-!CHECK: omp.terminator
-!CHECK: }
- !$omp section
- x = x + 1
-!CHECK: omp.terminator
-!CHECK: }
- !$omp end sections
-
-!CHECK: omp.sections nowait {
- !$omp sections firstprivate(x) lastprivate(x)
-!CHECK: omp.section {
-!CHECK: %[[PRIVATE_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFlastprivateEx"}
-!CHECK: %[[temp:.*]] = fir.load %[[X]] : !fir.ref<i32>
-!CHECK: fir.store %[[temp]] to %[[PRIVATE_X]] : !fir.ref<i32>
-!CHECK: omp.barrier
-!CHECK: %[[const:.*]] = arith.constant 10 : i32
-!CHECK: %[[temp:.*]] = fir.load %[[PRIVATE_X]] : !fir.ref<i32>
-!CHECK: %[[result:.*]] = arith.muli %[[const]], %[[temp]] : i32
-!CHECK: fir.store %[[result]] to %[[PRIVATE_X]] : !fir.ref<i32>
-!CHECK: omp.terminator
-!CHECK: }
- !$omp section
- x = x * 10
-!CHECK: omp.section {
-!CHECK: %[[PRIVATE_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFlastprivateEx"}
-!CHECK: %[[temp:.*]] = fir.load %[[X]] : !fir.ref<i32>
-!CHECK: fir.store %[[temp]] to %[[PRIVATE_X]] : !fir.ref<i32>
-!CHECK: omp.barrier
-!CHECK: %[[temp:.*]] = fir.load %[[PRIVATE_X]] : !fir.ref<i32>
-!CHECK: %[[const:.*]] = arith.constant 1 : i32
-!CHECK: %[[result:.*]] = arith.addi %[[temp]], %[[const]] : i32
-!CHECK: fir.store %[[result]] to %[[PRIVATE_X]] : !fir.ref<i32>
-!CHECK: %[[true:.*]] = arith.constant true
-!CHECK: fir.if %[[true]] {
-!CHECK: %[[temp:.*]] = fir.load %[[PRIVATE_X]] : !fir.ref<i32>
-!CHECK: fir.store %[[temp]] to %[[X]] : !fir.ref<i32>
-!CHECK: omp.barrier
-!CHECK: }
-!CHECK: omp.terminator
-!CHECK: }
- !$omp section
- x = x + 1
-!CHECK: omp.terminator
-!CHECK: }
- !$omp end sections nowait
-
-!CHECK: omp.sections {
-!CHECK: omp.section {
-!CHECK: %[[PRIVATE_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFlastprivateEx"}
-!CHECK: cf.br ^bb1
-!CHECK: ^bb1: // pred: ^bb0
-!CHECK: %[[INNER_PRIVATE_X:.*]] = fir.load %[[PRIVATE_X]] : !fir.ref<i32>
-!CHECK: %[[const:.*]] = arith.constant 1 : i32
-!CHECK: %[[result:.*]] = arith.addi %[[INNER_PRIVATE_X]], %[[const]] : i32
-!CHECK: fir.store %[[result]] to %[[PRIVATE_X]] : !fir.ref<i32>
-!CHECK: %[[loaded_value:.*]] = fir.load %[[PRIVATE_X]] : !fir.ref<i32>
-!CHECK: fir.store %[[loaded_value]] to %[[X]] : !fir.ref<i32>
-!CHECK: omp.terminator
-!CHECK: }
-!CHECK: omp.terminator
-!CHECK: }
-!CHECK: return
-!CHECK: }
-
- !$omp sections lastprivate(x)
- !$omp section
- goto 30
- 30 x = x + 1
- !$omp end sections
-end subroutine
-
-subroutine unstructured_sections_privatization()
-!CHECK: %[[X:.*]] = fir.alloca f32 {bindc_name = "x", uniq_name = "_QFunstructured_sections_privatizationEx"}
-!CHECK: omp.sections {
-!CHECK: omp.section {
-!CHECK: %[[PRIVATE_X:.*]] = fir.alloca f32 {bindc_name = "x", pinned, uniq_name = "_QFunstructured_sections_privatizationEx"}
-!CHECK: cf.br ^bb1
-!CHECK: ^bb1: // pred: ^bb0
-!CHECK: %[[INNER_PRIVATE_X:.*]] = fir.load %[[PRIVATE_X]] : !fir.ref<f32>
-!CHECK: %[[constant:.*]] = arith.constant 1.000000e+00 : f32
-!CHECK: %[[result:.*]] = arith.addf %[[INNER_PRIVATE_X]], %[[constant]] fastmath<contract> : f32
-!CHECK: fir.store %[[result]] to %[[PRIVATE_X]] : !fir.ref<f32>
-!CHECK: omp.terminator
-!CHECK: }
-!CHECK: omp.terminator
-!CHECK: }
- !$omp sections private(x)
- !$omp section
- goto 40
- 40 x = x + 1
- !$omp end sections
-!CHECK: omp.sections {
-!CHECK: omp.section {
-!CHECK: %[[PRIVATE_X:.*]] = fir.alloca f32 {bindc_name = "x", pinned, uniq_name = "_QFunstructured_sections_privatizationEx"}
-!CHECK: %[[temp:.*]] = fir.load %[[X]] : !fir.ref<f32>
-!CHECK: fir.store %[[temp]] to %[[PRIVATE_X]] : !fir.ref<f32>
-!CHECK: cf.br ^bb1
-!CHECK: ^bb1: // pred: ^bb0
-!CHECK: %[[INNER_PRIVATE_X:.*]] = fir.load %[[PRIVATE_X]] : !fir.ref<f32>
-!CHECK: %[[constant:.*]] = arith.constant 1.000000e+00 : f32
-!CHECK: %[[result:.*]] = arith.addf %[[INNER_PRIVATE_X]], %[[constant]] fastmath<contract> : f32
-!CHECK: fir.store %[[result]] to %[[PRIVATE_X]] : !fir.ref<f32>
-!CHECK: omp.terminator
-!CHECK: }
-!CHECK: omp.terminator
-!CHECK: }
- !$omp sections firstprivate(x)
- !$omp section
- goto 50
- 50 x = x + 1
- !$omp end sections
-end subroutine
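For context, a minimal sketch (not part of the diff) of the `lastprivate` semantics the checks above encode; the subroutine name is illustrative. On a sections construct, the value copied back to the original variable comes from the lexically last section, which is why the lowered code stores to the original variable under an `fir.if`:

  subroutine lastprivate_semantics()
    integer :: x
    x = 0
    !$omp parallel sections lastprivate(x)
    !$omp section
    x = 10   ! assigns some thread's private copy
    !$omp section
    x = 20   ! lexically last section; this value is copied out
    !$omp end parallel sections
    ! x == 20 here, regardless of which thread ran which section.
  end subroutine lastprivate_semantics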
diff --git a/flang/test/Lower/OpenMP/FIR/simd.f90 b/flang/test/Lower/OpenMP/FIR/simd.f90
deleted file mode 100644
index 91e8750578bf..000000000000
--- a/flang/test/Lower/OpenMP/FIR/simd.f90
+++ /dev/null
@@ -1,176 +0,0 @@
-! Tests for 2.9.3.1 Simd
-
-! The "if" clause was added to the "simd" directive in OpenMP 5.0.
-! RUN: bbc -fopenmp -fopenmp-version=50 -emit-fir -hlfir=false %s -o - | FileCheck %s
-
-!CHECK-LABEL: func @_QPsimd()
-subroutine simd
- integer :: i
- !$OMP SIMD
- ! CHECK: %[[LB:.*]] = arith.constant 1 : i32
- ! CHECK-NEXT: %[[UB:.*]] = arith.constant 9 : i32
- ! CHECK-NEXT: %[[STEP:.*]] = arith.constant 1 : i32
- ! CHECK-NEXT: omp.simd {
- ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) {
- do i=1, 9
- ! CHECK: fir.store %[[I]] to %[[LOCAL:.*]] : !fir.ref<i32>
- ! CHECK: %[[LD:.*]] = fir.load %[[LOCAL]] : !fir.ref<i32>
- ! CHECK: fir.call @_FortranAioOutputInteger32({{.*}}, %[[LD]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
- print*, i
- end do
- !$OMP END SIMD
-end subroutine
-
-!CHECK-LABEL: func @_QPsimd_with_if_clause
-subroutine simd_with_if_clause(n, threshold)
- integer :: i, n, threshold
- !$OMP SIMD IF( n .GE. threshold )
- ! CHECK: %[[LB:.*]] = arith.constant 1 : i32
- ! CHECK: %[[UB:.*]] = fir.load %arg0
- ! CHECK: %[[STEP:.*]] = arith.constant 1 : i32
- ! CHECK: %[[COND:.*]] = arith.cmpi sge
- ! CHECK: omp.simd if(%[[COND:.*]]) {
- ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) {
- do i = 1, n
- ! CHECK: fir.store %[[I]] to %[[LOCAL:.*]] : !fir.ref<i32>
- ! CHECK: %[[LD:.*]] = fir.load %[[LOCAL]] : !fir.ref<i32>
- ! CHECK: fir.call @_FortranAioOutputInteger32({{.*}}, %[[LD]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
- print*, i
- end do
- !$OMP END SIMD
-end subroutine
-
-!CHECK-LABEL: func @_QPsimd_with_simdlen_clause
-subroutine simd_with_simdlen_clause(n, threshold)
- integer :: i, n, threshold
- !$OMP SIMD SIMDLEN(2)
- ! CHECK: %[[LB:.*]] = arith.constant 1 : i32
- ! CHECK: %[[UB:.*]] = fir.load %arg0
- ! CHECK: %[[STEP:.*]] = arith.constant 1 : i32
- ! CHECK: omp.simd simdlen(2) {
- ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) {
- do i = 1, n
- ! CHECK: fir.store %[[I]] to %[[LOCAL:.*]] : !fir.ref<i32>
- ! CHECK: %[[LD:.*]] = fir.load %[[LOCAL]] : !fir.ref<i32>
- ! CHECK: fir.call @_FortranAioOutputInteger32({{.*}}, %[[LD]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
- print*, i
- end do
- !$OMP END SIMD
-end subroutine
-
-!CHECK-LABEL: func @_QPsimd_with_simdlen_clause_from_param
-subroutine simd_with_simdlen_clause_from_param(n, threshold)
- integer :: i, n, threshold
- integer, parameter :: simdlen = 2
- !$OMP SIMD SIMDLEN(simdlen)
- ! CHECK: %[[LB:.*]] = arith.constant 1 : i32
- ! CHECK: %[[UB:.*]] = fir.load %arg0
- ! CHECK: %[[STEP:.*]] = arith.constant 1 : i32
- ! CHECK: omp.simd simdlen(2) {
- ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) {
- do i = 1, n
- ! CHECK: fir.store %[[I]] to %[[LOCAL:.*]] : !fir.ref<i32>
- ! CHECK: %[[LD:.*]] = fir.load %[[LOCAL]] : !fir.ref<i32>
- ! CHECK: fir.call @_FortranAioOutputInteger32({{.*}}, %[[LD]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
- print*, i
- end do
- !$OMP END SIMD
-end subroutine
-
-!CHECK-LABEL: func @_QPsimd_with_simdlen_clause_from_expr_from_param
-subroutine simd_with_simdlen_clause_from_expr_from_param(n, threshold)
- integer :: i, n, threshold
- integer, parameter :: simdlen = 2
- !$OMP SIMD SIMDLEN(simdlen*2 + 2)
- ! CHECK: %[[LB:.*]] = arith.constant 1 : i32
- ! CHECK: %[[UB:.*]] = fir.load %arg0
- ! CHECK: %[[STEP:.*]] = arith.constant 1 : i32
- ! CHECK: omp.simd simdlen(6) {
- ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) {
- do i = 1, n
- ! CHECK: fir.store %[[I]] to %[[LOCAL:.*]] : !fir.ref<i32>
- ! CHECK: %[[LD:.*]] = fir.load %[[LOCAL]] : !fir.ref<i32>
- ! CHECK: fir.call @_FortranAioOutputInteger32({{.*}}, %[[LD]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
- print*, i
- end do
- !$OMP END SIMD
-end subroutine
-
-!CHECK-LABEL: func @_QPsimd_with_safelen_clause
-subroutine simd_with_safelen_clause(n, threshold)
- integer :: i, n, threshold
- !$OMP SIMD SAFELEN(2)
- ! CHECK: %[[LB:.*]] = arith.constant 1 : i32
- ! CHECK: %[[UB:.*]] = fir.load %arg0
- ! CHECK: %[[STEP:.*]] = arith.constant 1 : i32
- ! CHECK: omp.simd safelen(2) {
- ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) {
- do i = 1, n
- ! CHECK: fir.store %[[I]] to %[[LOCAL:.*]] : !fir.ref<i32>
- ! CHECK: %[[LD:.*]] = fir.load %[[LOCAL]] : !fir.ref<i32>
- ! CHECK: fir.call @_FortranAioOutputInteger32({{.*}}, %[[LD]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
- print*, i
- end do
- !$OMP END SIMD
-end subroutine
-
-!CHECK-LABEL: func @_QPsimd_with_safelen_clause_from_expr_from_param
-subroutine simd_with_safelen_clause_from_expr_from_param(n, threshold)
- integer :: i, n, threshold
- integer, parameter :: safelen = 2
- !$OMP SIMD SAFELEN(safelen*2 + 2)
- ! CHECK: %[[LB:.*]] = arith.constant 1 : i32
- ! CHECK: %[[UB:.*]] = fir.load %arg0
- ! CHECK: %[[STEP:.*]] = arith.constant 1 : i32
- ! CHECK: omp.simd safelen(6) {
- ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) {
- do i = 1, n
- ! CHECK: fir.store %[[I]] to %[[LOCAL:.*]] : !fir.ref<i32>
- ! CHECK: %[[LD:.*]] = fir.load %[[LOCAL]] : !fir.ref<i32>
- ! CHECK: fir.call @_FortranAioOutputInteger32({{.*}}, %[[LD]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
- print*, i
- end do
- !$OMP END SIMD
-end subroutine
-
-!CHECK-LABEL: func @_QPsimd_with_simdlen_safelen_clause
-subroutine simd_with_simdlen_safelen_clause(n, threshold)
- integer :: i, n, threshold
- !$OMP SIMD SIMDLEN(1) SAFELEN(2)
- ! CHECK: %[[LB:.*]] = arith.constant 1 : i32
- ! CHECK: %[[UB:.*]] = fir.load %arg0
- ! CHECK: %[[STEP:.*]] = arith.constant 1 : i32
- ! CHECK: omp.simd simdlen(1) safelen(2) {
- ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) {
- do i = 1, n
- ! CHECK: fir.store %[[I]] to %[[LOCAL:.*]] : !fir.ref<i32>
- ! CHECK: %[[LD:.*]] = fir.load %[[LOCAL]] : !fir.ref<i32>
- ! CHECK: fir.call @_FortranAioOutputInteger32({{.*}}, %[[LD]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
- print*, i
- end do
- !$OMP END SIMD
-end subroutine
-
-!CHECK-LABEL: func @_QPsimd_with_collapse_clause
-subroutine simd_with_collapse_clause(n)
- integer :: i, j, n
- integer :: A(n,n)
- ! CHECK: %[[LOWER_I:.*]] = arith.constant 1 : i32
- ! CHECK: %[[UPPER_I:.*]] = fir.load %[[PARAM_ARG:.*]] : !fir.ref<i32>
- ! CHECK: %[[STEP_I:.*]] = arith.constant 1 : i32
- ! CHECK: %[[LOWER_J:.*]] = arith.constant 1 : i32
- ! CHECK: %[[UPPER_J:.*]] = fir.load %[[PARAM_ARG:.*]] : !fir.ref<i32>
- ! CHECK: %[[STEP_J:.*]] = arith.constant 1 : i32
- ! CHECK: omp.simd {
- ! CHECK-NEXT: omp.loop_nest (%[[ARG_0:.*]], %[[ARG_1:.*]]) : i32 = (
- ! CHECK-SAME: %[[LOWER_I]], %[[LOWER_J]]) to (
- ! CHECK-SAME: %[[UPPER_I]], %[[UPPER_J]]) inclusive step (
- ! CHECK-SAME: %[[STEP_I]], %[[STEP_J]]) {
- !$OMP SIMD COLLAPSE(2)
- do i = 1, n
- do j = 1, n
- A(i,j) = i + j
- end do
- end do
- !$OMP END SIMD
-end subroutine
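For context, a minimal sketch (not part of the diff) of the constraint the simdlen/safelen test above respects; names are illustrative. When both clauses appear, the simdlen value must not exceed the safelen value, since safelen bounds how far apart concurrently executed iterations may be:

  subroutine simdlen_safelen_constraint(a, n)
    integer :: n, i
    real :: a(n)
    ! Valid combination: simdlen(2) <= safelen(4).
    !$omp simd simdlen(2) safelen(4)
    do i = 1, n
      a(i) = a(i) + 1.0
    end do
    !$omp end simd
  end subroutine simdlen_safelen_constraint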
diff --git a/flang/test/Lower/OpenMP/FIR/simple-barrier.f90 b/flang/test/Lower/OpenMP/FIR/simple-barrier.f90
deleted file mode 100644
index c621b8062eaa..000000000000
--- a/flang/test/Lower/OpenMP/FIR/simple-barrier.f90
+++ /dev/null
@@ -1,6 +0,0 @@
-! RUN: bbc -fopenmp -emit-fir -o - %s | FileCheck %s
-
-subroutine sample()
-! CHECK: omp.barrier
-!$omp barrier
-end subroutine sample
diff --git a/flang/test/Lower/OpenMP/FIR/single.f90 b/flang/test/Lower/OpenMP/FIR/single.f90
deleted file mode 100644
index 65ae07c2c284..000000000000
--- a/flang/test/Lower/OpenMP/FIR/single.f90
+++ /dev/null
@@ -1,123 +0,0 @@
-! REQUIRES: openmp_runtime
-
-!RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp %s -o - | FileCheck %s
-!RUN: bbc -emit-fir -hlfir=false -fopenmp %s -o - | FileCheck %s
-
-!===============================================================================
-! Single construct
-!===============================================================================
-
-!CHECK-LABEL: func @_QPomp_single
-!CHECK-SAME: (%[[x:.*]]: !fir.ref<i32> {fir.bindc_name = "x"})
-subroutine omp_single(x)
- integer, intent(inout) :: x
- !CHECK: omp.parallel
- !$omp parallel
- !CHECK: omp.single
- !$omp single
- !CHECK: %[[xval:.*]] = fir.load %[[x]] : !fir.ref<i32>
- !CHECK: %[[res:.*]] = arith.addi %[[xval]], %{{.*}} : i32
- !CHECK: fir.store %[[res]] to %[[x]] : !fir.ref<i32>
- x = x + 12
- !CHECK: omp.terminator
- !$omp end single
- !CHECK: omp.terminator
- !$omp end parallel
-end subroutine omp_single
-
-!===============================================================================
-! Single construct with nowait
-!===============================================================================
-
-!CHECK-LABEL: func @_QPomp_single_nowait
-!CHECK-SAME: (%[[x:.*]]: !fir.ref<i32> {fir.bindc_name = "x"})
-subroutine omp_single_nowait(x)
- integer, intent(inout) :: x
- !CHECK: omp.parallel
- !$omp parallel
- !CHECK: omp.single nowait
- !$omp single
- !CHECK: %[[xval:.*]] = fir.load %[[x]] : !fir.ref<i32>
- !CHECK: %[[res:.*]] = arith.addi %[[xval]], %{{.*}} : i32
- !CHECK: fir.store %[[res]] to %[[x]] : !fir.ref<i32>
- x = x + 12
- !CHECK: omp.terminator
- !$omp end single nowait
- !CHECK: omp.terminator
- !$omp end parallel
-end subroutine omp_single_nowait
-
-!===============================================================================
-! Single construct with allocate
-!===============================================================================
-
-!CHECK-LABEL: func @_QPsingle_allocate
-subroutine single_allocate()
- use omp_lib
- integer :: x
- !CHECK: omp.parallel {
- !$omp parallel
- !CHECK: omp.single allocate(%{{.+}} : i64 -> %{{.+}} : !fir.ref<i32>) {
- !$omp single allocate(omp_high_bw_mem_alloc: x) private(x)
- !CHECK: arith.addi
- x = x + 12
- !CHECK: omp.terminator
- !$omp end single
- !CHECK: omp.terminator
- !$omp end parallel
-end subroutine single_allocate
-
-!===============================================================================
-! Single construct with private/firstprivate
-!===============================================================================
-
-! CHECK-LABEL: func.func @_QPsingle_privatization(
-! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<f32> {fir.bindc_name = "x"},
-! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<f64> {fir.bindc_name = "y"}) {
-! CHECK: omp.single {
-! CHECK: %[[VAL_2:.*]] = fir.alloca f32 {bindc_name = "x", pinned, uniq_name = "_QFsingle_privatizationEx"}
-! CHECK: %[[VAL_3:.*]] = fir.alloca f64 {bindc_name = "y", pinned, uniq_name = "_QFsingle_privatizationEy"}
-! CHECK: %[[VAL_4:.*]] = fir.load %[[VAL_1]] : !fir.ref<f64>
-! CHECK: fir.store %[[VAL_4]] to %[[VAL_3]] : !fir.ref<f64>
-! CHECK: fir.call @_QPbar(%[[VAL_2]], %[[VAL_3]]) {{.*}}: (!fir.ref<f32>, !fir.ref<f64>) -> ()
-! CHECK: omp.terminator
-! CHECK: }
-! CHECK: return
-! CHECK: }
-
-subroutine single_privatization(x, y)
- real :: x
- real(8) :: y
-
- !$omp single private(x) firstprivate(y)
- call bar(x, y)
- !$omp end single
-end subroutine
-
-! CHECK-LABEL: func.func @_QPsingle_privatization2(
-! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<f32> {fir.bindc_name = "x"},
-! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<f64> {fir.bindc_name = "y"}) {
-! CHECK: omp.parallel {
-! CHECK: omp.single {
-! CHECK: %[[VAL_2:.*]] = fir.alloca f32 {bindc_name = "x", pinned, uniq_name = "_QFsingle_privatization2Ex"}
-! CHECK: %[[VAL_3:.*]] = fir.alloca f64 {bindc_name = "y", pinned, uniq_name = "_QFsingle_privatization2Ey"}
-! CHECK: %[[VAL_4:.*]] = fir.load %[[VAL_1]] : !fir.ref<f64>
-! CHECK: fir.store %[[VAL_4]] to %[[VAL_3]] : !fir.ref<f64>
-! CHECK: fir.call @_QPbar(%[[VAL_2]], %[[VAL_3]]) {{.*}}: (!fir.ref<f32>, !fir.ref<f64>) -> ()
-! CHECK: omp.terminator
-! CHECK: }
-! CHECK: omp.terminator
-! CHECK: }
-! CHECK: return
-! CHECK: }
-
-subroutine single_privatization2(x, y)
- real :: x
- real(8) :: y
-
- !$omp parallel
- !$omp single private(x) firstprivate(y)
- call bar(x, y)
- !$omp end single
- !$omp end parallel
-end subroutine
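For context, a minimal sketch (not part of the diff) of the `single` semantics checked above; the subroutine name is illustrative. Exactly one thread executes the block, `firstprivate` initializes that thread's private copy from the original variable, and the other threads wait at the implicit barrier unless `nowait` is given on the end directive:

  subroutine single_semantics(y)
    real(8) :: y
    !$omp parallel
    !$omp single firstprivate(y)
    ! One thread runs this; its y starts as a copy of the original.
    y = y + 1.0d0
    !$omp end single
    ! All threads synchronize here because no nowait was specified.
    !$omp end parallel
  end subroutine single_semantics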
diff --git a/flang/test/Lower/OpenMP/FIR/stop-stmt-in-region.f90 b/flang/test/Lower/OpenMP/FIR/stop-stmt-in-region.f90
deleted file mode 100644
index 32cc6d17c420..000000000000
--- a/flang/test/Lower/OpenMP/FIR/stop-stmt-in-region.f90
+++ /dev/null
@@ -1,154 +0,0 @@
-! This test checks the lowering of the stop statement in an OpenMP region.
-
-! RUN: bbc -fopenmp -emit-fir -hlfir=false %s -o - | FileCheck %s
-! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp %s -o - | FileCheck %s
-
-! CHECK-LABEL: func.func @_QPtest_stop_in_region1() {
-! CHECK: omp.parallel {
-! CHECK: %[[VAL_0:.*]] = arith.constant 1 : i32
-! CHECK: %[[VAL_1:.*]] = arith.constant false
-! CHECK: %[[VAL_2:.*]] = arith.constant false
-! CHECK: %[[VAL_3:.*]] = fir.call @_FortranAStopStatement(%[[VAL_0]], %[[VAL_1]], %[[VAL_2]]) {{.*}} : (i32, i1, i1) -> none
-! CHECK-NOT: fir.unreachable
-! CHECK: omp.terminator
-! CHECK: }
-! CHECK: return
-! CHECK: }
-
-subroutine test_stop_in_region1()
- !$omp parallel
- stop 1
- !$omp end parallel
-end
-
-! CHECK-LABEL: func.func @_QPtest_stop_in_region2() {
-! CHECK: %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFtest_stop_in_region2Ex"}
-! CHECK: omp.parallel {
-! CHECK: %[[VAL_1:.*]] = arith.constant 1 : i32
-! CHECK: %[[VAL_2:.*]] = arith.constant false
-! CHECK: %[[VAL_3:.*]] = arith.constant false
-! CHECK: %[[VAL_4:.*]] = fir.call @_FortranAStopStatement(%[[VAL_1]], %[[VAL_2]], %[[VAL_3]]) {{.*}} : (i32, i1, i1) -> none
-! CHECK: omp.terminator
-! CHECK: }
-! CHECK: return
-! CHECK: }
-
-subroutine test_stop_in_region2()
- integer :: x
- !$omp parallel
- stop 1
- x = 2
- !$omp end parallel
-end
-
-! CHECK-LABEL: func.func @_QPtest_stop_in_region3() {
-! CHECK: %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFtest_stop_in_region3Ex"}
-! CHECK: omp.parallel {
-! CHECK: %[[VAL_1:.*]] = arith.constant 3 : i32
-! CHECK: fir.store %[[VAL_1]] to %[[VAL_0]] : !fir.ref<i32>
-! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_0]] : !fir.ref<i32>
-! CHECK: %[[VAL_3:.*]] = arith.constant 1 : i32
-! CHECK: %[[VAL_4:.*]] = arith.cmpi sgt, %[[VAL_2]], %[[VAL_3]] : i32
-! CHECK: cf.cond_br %[[VAL_4]], ^bb1, ^bb2
-! CHECK: ^bb1:
-! CHECK: %[[VAL_5:.*]] = fir.load %[[VAL_0]] : !fir.ref<i32>
-! CHECK: %[[VAL_6:.*]] = arith.constant false
-! CHECK: %[[VAL_7:.*]] = arith.constant false
-! CHECK: %[[VAL_8:.*]] = fir.call @_FortranAStopStatement(%[[VAL_5]], %[[VAL_6]], %[[VAL_7]]) {{.*}} : (i32, i1, i1) -> none
-! CHECK: omp.terminator
-! CHECK: ^bb2:
-! CHECK: omp.terminator
-! CHECK: }
-! CHECK: return
-! CHECK: }
-
-subroutine test_stop_in_region3()
- integer :: x
- !$omp parallel
- x = 3
- if (x > 1) stop x
- !$omp end parallel
-end
-
-! CHECK-LABEL: func.func @_QPtest_stop_in_region4() {
-! CHECK: %[[VAL_0:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
-! CHECK: %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFtest_stop_in_region4Ei"}
-! CHECK: %[[VAL_2:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFtest_stop_in_region4Ex"}
-! CHECK: %[[VAL_3:.*]] = arith.constant 1 : i32
-! CHECK: %[[VAL_4:.*]] = arith.constant 10 : i32
-! CHECK: %[[VAL_5:.*]] = arith.constant 1 : i32
-! CHECK: omp.wsloop {
-! CHECK-NEXT: omp.loop_nest (%[[VAL_6:.*]]) : i32 = (%[[VAL_3]]) to (%[[VAL_4]]) inclusive step (%[[VAL_5]]) {
-! CHECK: fir.store %[[VAL_6]] to %[[VAL_0]] : !fir.ref<i32>
-! CHECK: cf.br ^bb1
-! CHECK: ^bb1:
-! CHECK: %[[VAL_7:.*]] = arith.constant 3 : i32
-! CHECK: fir.store %[[VAL_7]] to %[[VAL_2]] : !fir.ref<i32>
-! CHECK: %[[VAL_8:.*]] = fir.load %[[VAL_2]] : !fir.ref<i32>
-! CHECK: %[[VAL_9:.*]] = arith.constant 1 : i32
-! CHECK: %[[VAL_10:.*]] = arith.cmpi sgt, %[[VAL_8]], %[[VAL_9]] : i32
-! CHECK: cf.cond_br %[[VAL_10]], ^bb2, ^bb3
-! CHECK: ^bb2:
-! CHECK: %[[VAL_11:.*]] = fir.load %[[VAL_2]] : !fir.ref<i32>
-! CHECK: %[[VAL_12:.*]] = arith.constant false
-! CHECK: %[[VAL_13:.*]] = arith.constant false
-! CHECK: %[[VAL_14:.*]] = fir.call @_FortranAStopStatement(%[[VAL_11]], %[[VAL_12]], %[[VAL_13]]) {{.*}} : (i32, i1, i1) -> none
-! CHECK: omp.yield
-! CHECK: ^bb3:
-! CHECK: omp.yield
-! CHECK: }
-! CHECK: omp.terminator
-! CHECK: }
-! CHECK: cf.br ^bb1
-! CHECK: ^bb1:
-! CHECK: return
-! CHECK: }
-
-subroutine test_stop_in_region4()
- integer :: x
- !$omp do
- do i = 1, 10
- x = 3
- if (x > 1) stop x
- enddo
- !$omp end do
-end
-
-
-!CHECK-LABEL: func.func @_QPtest_stop_in_region5
-!CHECK: omp.parallel {
-!CHECK: {{.*}} fir.call @_FortranAStopStatement({{.*}}, {{.*}}, {{.*}}) fastmath<contract> : (i32, i1, i1) -> none
-!CHECK: omp.terminator
-!CHECK: }
-!CHECK: return
-
-subroutine test_stop_in_region5()
- !$omp parallel
- block
- stop 1
- end block
- !$omp end parallel
-end
-
-!CHECK-LABEL: func.func @_QPtest_stop_in_region6
-!CHECK: omp.parallel {
-!CHECK: cf.cond_br %{{.*}}, ^[[BB1:.*]], ^[[BB2:.*]]
-!CHECK: ^[[BB1]]:
-!CHECK: {{.*}}fir.call @_FortranAStopStatement({{.*}}, {{.*}}, {{.*}}) fastmath<contract> : (i32, i1, i1) -> none
-!CHECK: omp.terminator
-!CHECK: ^[[BB2]]:
-!CHECK: {{.*}}fir.call @_FortranAStopStatement({{.*}}, {{.*}}, {{.*}}) fastmath<contract> : (i32, i1, i1) -> none
-!CHECK: omp.terminator
-!CHECK: }
-!CHECK: return
-
-subroutine test_stop_in_region6(x)
- integer :: x
- !$omp parallel
- if (x .gt. 1) then
- stop 1
- else
- stop 2
- end if
- !$omp end parallel
-end
diff --git a/flang/test/Lower/OpenMP/FIR/target.f90 b/flang/test/Lower/OpenMP/FIR/target.f90
deleted file mode 100644
index ea4b9240e9e5..000000000000
--- a/flang/test/Lower/OpenMP/FIR/target.f90
+++ /dev/null
@@ -1,554 +0,0 @@
-! The "thread_limit" clause was added to the "target" construct in OpenMP 5.1.
-! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp -fopenmp-version=51 %s -o - | FileCheck %s
-
-!===============================================================================
-! Target_Enter Simple
-!===============================================================================
-
-!CHECK-LABEL: func.func @_QPomp_target_enter_simple() {
-subroutine omp_target_enter_simple
- integer :: a(1024)
- !CHECK: %[[BOUNDS:.*]] = omp.map.bounds lower_bound({{.*}}) upper_bound({{.*}}) extent({{.*}}) stride({{.*}}) start_idx({{.*}})
- !CHECK: %[[MAP:.*]] = omp.map.info var_ptr({{.*}}) map_clauses(to) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.ref<!fir.array<1024xi32>> {name = "a"}
- !CHECK: omp.target_enter_data map_entries(%[[MAP]] : !fir.ref<!fir.array<1024xi32>>)
- !$omp target enter data map(to: a)
-end subroutine omp_target_enter_simple
-
-!===============================================================================
-! Target_Enter Map types
-!===============================================================================
-
-!CHECK-LABEL: func.func @_QPomp_target_enter_mt() {
-subroutine omp_target_enter_mt
- integer :: a(1024)
- integer :: b(1024)
- integer :: c(1024)
- integer :: d(1024)
- !CHECK: %[[BOUNDS_0:.*]] = omp.map.bounds lower_bound({{.*}}) upper_bound({{.*}}) extent({{.*}}) stride({{.*}}) start_idx({{.*}})
- !CHECK: %[[MAP_0:.*]] = omp.map.info var_ptr({{.*}}) map_clauses(to) capture(ByRef) bounds(%[[BOUNDS_0]]) -> !fir.ref<!fir.array<1024xi32>> {name = "a"}
- !CHECK: %[[BOUNDS_1:.*]] = omp.map.bounds lower_bound({{.*}}) upper_bound({{.*}}) extent({{.*}}) stride({{.*}}) start_idx({{.*}})
- !CHECK: %[[MAP_1:.*]] = omp.map.info var_ptr(%{{.*}}) map_clauses(to) capture(ByRef) bounds(%[[BOUNDS_1]]) -> !fir.ref<!fir.array<1024xi32>> {name = "b"}
- !CHECK: %[[BOUNDS_2:.*]] = omp.map.bounds lower_bound({{.*}}) upper_bound({{.*}}) extent({{.*}}) stride({{.*}}) start_idx({{.*}})
- !CHECK: %[[MAP_2:.*]] = omp.map.info var_ptr({{.*}}) map_clauses(always, exit_release_or_enter_alloc) capture(ByRef) bounds(%[[BOUNDS_2]]) -> !fir.ref<!fir.array<1024xi32>> {name = "c"}
- !CHECK: %[[BOUNDS_3:.*]] = omp.map.bounds lower_bound({{.*}}) upper_bound({{.*}}) extent({{.*}}) stride({{.*}}) start_idx({{.*}})
- !CHECK: %[[MAP_3:.*]] = omp.map.info var_ptr({{.*}}) map_clauses(to) capture(ByRef) bounds(%[[BOUNDS_3]]) -> !fir.ref<!fir.array<1024xi32>> {name = "d"}
- !CHECK: omp.target_enter_data map_entries(%[[MAP_0]], %[[MAP_1]], %[[MAP_2]], %[[MAP_3]] : !fir.ref<!fir.array<1024xi32>>, !fir.ref<!fir.array<1024xi32>>, !fir.ref<!fir.array<1024xi32>>, !fir.ref<!fir.array<1024xi32>>)
- !$omp target enter data map(to: a, b) map(always, alloc: c) map(to: d)
-end subroutine omp_target_enter_mt
-
-!===============================================================================
-! `Nowait` clause
-!===============================================================================
-
-!CHECK-LABEL: func.func @_QPomp_target_enter_nowait() {
-subroutine omp_target_enter_nowait
- integer :: a(1024)
- !CHECK: %[[BOUNDS:.*]] = omp.map.bounds lower_bound({{.*}}) upper_bound({{.*}}) extent({{.*}}) stride({{.*}}) start_idx({{.*}})
- !CHECK: %[[MAP:.*]] = omp.map.info var_ptr({{.*}}) map_clauses(to) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.ref<!fir.array<1024xi32>> {name = "a"}
- !CHECK: omp.target_enter_data nowait map_entries(%[[MAP]] : !fir.ref<!fir.array<1024xi32>>)
- !$omp target enter data map(to: a) nowait
-end subroutine omp_target_enter_nowait
-
-!===============================================================================
-! `if` clause
-!===============================================================================
-
-!CHECK-LABEL: func.func @_QPomp_target_enter_if() {
-subroutine omp_target_enter_if
- integer :: a(1024)
- integer :: i
- i = 5
- !CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_1:.*]] : !fir.ref<i32>
- !CHECK: %[[VAL_4:.*]] = arith.constant 10 : i32
- !CHECK: %[[VAL_5:.*]] = arith.cmpi slt, %[[VAL_3]], %[[VAL_4]] : i32
- !CHECK: %[[BOUNDS:.*]] = omp.map.bounds lower_bound({{.*}}) upper_bound({{.*}}) extent({{.*}}) stride({{.*}}) start_idx({{.*}})
- !CHECK: %[[MAP:.*]] = omp.map.info var_ptr({{.*}}) map_clauses(to) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.ref<!fir.array<1024xi32>> {name = "a"}
- !CHECK: omp.target_enter_data if(%[[VAL_5]] : i1) map_entries(%[[MAP]] : !fir.ref<!fir.array<1024xi32>>)
- !$omp target enter data if(i<10) map(to: a)
-end subroutine omp_target_enter_if
-
-!===============================================================================
-! `device` clause
-!===============================================================================
-
-!CHECK-LABEL: func.func @_QPomp_target_enter_device() {
-subroutine omp_target_enter_device
- integer :: a(1024)
- !CHECK: %[[VAL_1:.*]] = arith.constant 2 : i32
- !CHECK: %[[BOUNDS:.*]] = omp.map.bounds lower_bound({{.*}}) upper_bound({{.*}}) extent({{.*}}) stride({{.*}}) start_idx({{.*}})
- !CHECK: %[[MAP:.*]] = omp.map.info var_ptr({{.*}}) map_clauses(to) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.ref<!fir.array<1024xi32>> {name = "a"}
- !CHECK: omp.target_enter_data device(%[[VAL_1]] : i32) map_entries(%[[MAP]] : !fir.ref<!fir.array<1024xi32>>)
- !$omp target enter data map(to: a) device(2)
-end subroutine omp_target_enter_device
-
-!===============================================================================
-! Target_Exit Simple
-!===============================================================================
-
-!CHECK-LABEL: func.func @_QPomp_target_exit_simple() {
-subroutine omp_target_exit_simple
- integer :: a(1024)
- !CHECK: %[[BOUNDS:.*]] = omp.map.bounds lower_bound({{.*}}) upper_bound({{.*}}) extent({{.*}}) stride({{.*}}) start_idx({{.*}})
- !CHECK: %[[MAP:.*]] = omp.map.info var_ptr({{.*}}) map_clauses(from) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.ref<!fir.array<1024xi32>> {name = "a"}
- !CHECK: omp.target_exit_data map_entries(%[[MAP]] : !fir.ref<!fir.array<1024xi32>>)
- !$omp target exit data map(from: a)
-end subroutine omp_target_exit_simple
-
-!===============================================================================
-! Target_Exit Map types
-!===============================================================================
-
-!CHECK-LABEL: func.func @_QPomp_target_exit_mt() {
-subroutine omp_target_exit_mt
- integer :: a(1024)
- integer :: b(1024)
- integer :: c(1024)
- integer :: d(1024)
- integer :: e(1024)
- !CHECK: %[[BOUNDS_0:.*]] = omp.map.bounds lower_bound({{.*}}) upper_bound({{.*}}) extent({{.*}}) stride({{.*}}) start_idx({{.*}})
- !CHECK: %[[MAP_0:.*]] = omp.map.info var_ptr({{.*}}) map_clauses(from) capture(ByRef) bounds(%[[BOUNDS_0]]) -> !fir.ref<!fir.array<1024xi32>> {name = "a"}
- !CHECK: %[[BOUNDS_1:.*]] = omp.map.bounds lower_bound({{.*}}) upper_bound({{.*}}) extent({{.*}}) stride({{.*}}) start_idx({{.*}})
- !CHECK: %[[MAP_1:.*]] = omp.map.info var_ptr({{.*}}) map_clauses(from) capture(ByRef) bounds(%[[BOUNDS_1]]) -> !fir.ref<!fir.array<1024xi32>> {name = "b"}
- !CHECK: %[[BOUNDS_2:.*]] = omp.map.bounds lower_bound({{.*}}) upper_bound({{.*}}) extent({{.*}}) stride({{.*}}) start_idx({{.*}})
- !CHECK: %[[MAP_2:.*]] = omp.map.info var_ptr({{.*}}) map_clauses(exit_release_or_enter_alloc) capture(ByRef) bounds(%[[BOUNDS_2]]) -> !fir.ref<!fir.array<1024xi32>> {name = "c"}
- !CHECK: %[[BOUNDS_3:.*]] = omp.map.bounds lower_bound({{.*}}) upper_bound({{.*}}) extent({{.*}}) stride({{.*}}) start_idx({{.*}})
- !CHECK: %[[MAP_3:.*]] = omp.map.info var_ptr({{.*}}) map_clauses(always, delete) capture(ByRef) bounds(%[[BOUNDS_3]]) -> !fir.ref<!fir.array<1024xi32>> {name = "d"}
- !CHECK: %[[BOUNDS_4:.*]] = omp.map.bounds lower_bound({{.*}}) upper_bound({{.*}}) extent({{.*}}) stride({{.*}}) start_idx({{.*}})
- !CHECK: %[[MAP_4:.*]] = omp.map.info var_ptr({{.*}}) map_clauses(from) capture(ByRef) bounds(%[[BOUNDS_4]]) -> !fir.ref<!fir.array<1024xi32>> {name = "e"}
- !CHECK: omp.target_exit_data map_entries(%[[MAP_0]], %[[MAP_1]], %[[MAP_2]], %[[MAP_3]], %[[MAP_4]] : !fir.ref<!fir.array<1024xi32>>, !fir.ref<!fir.array<1024xi32>>, !fir.ref<!fir.array<1024xi32>>, !fir.ref<!fir.array<1024xi32>>, !fir.ref<!fir.array<1024xi32>>)
- !$omp target exit data map(from: a, b) map(release: c) map(always, delete: d) map(from: e)
-end subroutine omp_target_exit_mt
-
-!===============================================================================
-! `device` clause
-!===============================================================================
-
-!CHECK-LABEL: func.func @_QPomp_target_exit_device() {
-subroutine omp_target_exit_device
- integer :: a(1024)
- integer :: d
- !CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_1:.*]] : !fir.ref<i32>
- !CHECK: %[[BOUNDS:.*]] = omp.map.bounds lower_bound({{.*}}) upper_bound({{.*}}) extent({{.*}}) stride({{.*}}) start_idx({{.*}})
- !CHECK: %[[MAP:.*]] = omp.map.info var_ptr({{.*}}) map_clauses(from) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.ref<!fir.array<1024xi32>> {name = "a"}
- !CHECK: omp.target_exit_data device(%[[VAL_2]] : i32) map_entries(%[[MAP]] : !fir.ref<!fir.array<1024xi32>>)
- !$omp target exit data map(from: a) device(d)
-end subroutine omp_target_exit_device
-
-!===============================================================================
-! Target_Update `to` clause
-!===============================================================================
-
-subroutine omp_target_update_to
- integer :: a(1024)
-
- !CHECK-DAG: %[[A_ALLOC:.*]] = fir.alloca !fir.array<1024xi32> {bindc_name = "a", uniq_name = "_QFomp_target_update_toEa"}
- !CHECK-DAG: %[[BOUNDS:.*]] = omp.map.bounds
-
- !CHECK: %[[TO_MAP:.*]] = omp.map.info var_ptr(%[[A_ALLOC]] : !fir.ref<!fir.array<1024xi32>>, !fir.array<1024xi32>)
- !CHECK-SAME: map_clauses(to) capture(ByRef)
- !CHECK-SAME: bounds(%[[BOUNDS]]) -> !fir.ref<!fir.array<1024xi32>> {name = "a"}
-
- !CHECK: omp.target_update
- !CHECK-SAME: motion_entries(%[[TO_MAP]] : !fir.ref<!fir.array<1024xi32>>)
- !$omp target update to(a)
-end subroutine omp_target_update_to
-
-!===============================================================================
-! Target_Update `from` clause
-!===============================================================================
-
-subroutine omp_target_update_from
- integer :: a(1024)
-
- !CHECK-DAG: %[[A_ALLOC:.*]] = fir.alloca !fir.array<1024xi32> {bindc_name = "a", uniq_name = "_QFomp_target_update_fromEa"}
- !CHECK-DAG: %[[BOUNDS:.*]] = omp.map.bounds
-
- !CHECK: %[[FROM_MAP:.*]] = omp.map.info var_ptr(%[[A_ALLOC]] : !fir.ref<!fir.array<1024xi32>>, !fir.array<1024xi32>)
- !CHECK-SAME: map_clauses(from) capture(ByRef)
- !CHECK-SAME: bounds(%[[BOUNDS]]) -> !fir.ref<!fir.array<1024xi32>> {name = "a"}
-
- !CHECK: omp.target_update
- !CHECK-SAME: motion_entries(%[[FROM_MAP]] : !fir.ref<!fir.array<1024xi32>>)
- !$omp target update from(a)
-end subroutine omp_target_update_from
-
-!===============================================================================
-! Target_Update `if` clause
-!===============================================================================
-
-subroutine omp_target_update_if
- integer :: a(1024)
- logical :: i
-
- !CHECK-DAG: %[[A_ALLOC:.*]] = fir.alloca
- !CHECK-DAG: %[[BOUNDS:.*]] = omp.map.bounds
- !CHECK-DAG: %[[COND:.*]] = fir.convert %{{.*}} : (!fir.logical<4>) -> i1
-
- !CHECK: %[[TO_MAP:.*]] = omp.map.info
-
- !CHECK: omp.target_update if(%[[COND]] : i1)
- !CHECK-SAME: motion_entries(%[[TO_MAP]] : !fir.ref<!fir.array<1024xi32>>)
- !$omp target update to(a) if(i)
-end subroutine omp_target_update_if
-
-!===============================================================================
-! Target_Update `device` clause
-!===============================================================================
-
-subroutine omp_target_update_device
- integer :: a(1024)
-
- !CHECK-DAG: %[[A_ALLOC:.*]] = fir.alloca
- !CHECK-DAG: %[[BOUNDS:.*]] = omp.map.bounds
- !CHECK-DAG: %[[DEVICE:.*]] = arith.constant 1 : i32
-
- !CHECK: %[[TO_MAP:.*]] = omp.map.info
-
- !CHECK: omp.target_update
- !CHECK-SAME: device(%[[DEVICE]] : i32)
- !CHECK-SAME: motion_entries(%[[TO_MAP]] : !fir.ref<!fir.array<1024xi32>>)
- !$omp target update to(a) device(1)
-end subroutine omp_target_update_device
-
-!===============================================================================
-! Target_Update `nowait` clause
-!===============================================================================
-
-subroutine omp_target_update_nowait
- integer :: a(1024)
-
- !CHECK-DAG: %[[A_ALLOC:.*]] = fir.alloca
- !CHECK-DAG: %[[BOUNDS:.*]] = omp.map.bounds
-
- !CHECK: %[[TO_MAP:.*]] = omp.map.info
-
- !CHECK: omp.target_update
- !CHECK-SAME: nowait
- !CHECK-SAME: motion_entries(%[[TO_MAP]] : !fir.ref<!fir.array<1024xi32>>)
- !$omp target update to(a) nowait
-end subroutine omp_target_update_nowait
-
-!===============================================================================
-! Target_Data with region
-!===============================================================================
-
-!CHECK-LABEL: func.func @_QPomp_target_data() {
-subroutine omp_target_data
- !CHECK: %[[VAL_0:.*]] = fir.alloca !fir.array<1024xi32> {bindc_name = "a", uniq_name = "_QFomp_target_dataEa"}
- integer :: a(1024)
- !CHECK: %[[BOUNDS:.*]] = omp.map.bounds lower_bound({{.*}}) upper_bound({{.*}}) extent({{.*}}) stride({{.*}}) start_idx({{.*}})
- !CHECK: %[[MAP:.*]] = omp.map.info var_ptr(%[[VAL_0]] : !fir.ref<!fir.array<1024xi32>>, !fir.array<1024xi32>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.ref<!fir.array<1024xi32>> {name = "a"}
- !CHECK: omp.target_data map_entries(%[[MAP]] : !fir.ref<!fir.array<1024xi32>>) {
- !$omp target data map(tofrom: a)
- !CHECK: %[[VAL_1:.*]] = arith.constant 10 : i32
- !CHECK: %[[VAL_2:.*]] = arith.constant 1 : i64
- !CHECK: %[[VAL_3:.*]] = arith.constant 1 : i64
- !CHECK: %[[VAL_4:.*]] = arith.subi %[[VAL_2]], %[[VAL_3]] : i64
- !CHECK: %[[VAL_5:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_4]] : (!fir.ref<!fir.array<1024xi32>>, i64) -> !fir.ref<i32>
- !CHECK: fir.store %[[VAL_1]] to %[[VAL_5]] : !fir.ref<i32>
- a(1) = 10
- !CHECK: omp.terminator
- !$omp end target data
- !CHECK: }
-end subroutine omp_target_data
-
-!CHECK-LABEL: func.func @_QPomp_target_data_mt
-subroutine omp_target_data_mt
- integer :: a(1024)
- integer :: b(1024)
- !CHECK: %[[VAR_A:.*]] = fir.alloca !fir.array<1024xi32> {bindc_name = "a", uniq_name = "_QFomp_target_data_mtEa"}
- !CHECK: %[[VAR_B:.*]] = fir.alloca !fir.array<1024xi32> {bindc_name = "b", uniq_name = "_QFomp_target_data_mtEb"}
- !CHECK: %[[BOUNDS_A:.*]] = omp.map.bounds lower_bound({{.*}}) upper_bound({{.*}}) extent({{.*}}) stride({{.*}}) start_idx({{.*}})
- !CHECK: %[[MAP_A:.*]] = omp.map.info var_ptr(%[[VAR_A]] : !fir.ref<!fir.array<1024xi32>>, !fir.array<1024xi32>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS_A]]) -> !fir.ref<!fir.array<1024xi32>> {name = "a"}
- !CHECK: omp.target_data map_entries(%[[MAP_A]] : !fir.ref<!fir.array<1024xi32>>) {
- !$omp target data map(a)
- !CHECK: omp.terminator
- !$omp end target data
- !CHECK: }
- !CHECK: %[[BOUNDS_B:.*]] = omp.map.bounds lower_bound({{.*}}) upper_bound({{.*}}) extent({{.*}}) stride({{.*}}) start_idx({{.*}})
- !CHECK: %[[MAP_B:.*]] = omp.map.info var_ptr(%[[VAR_B]] : !fir.ref<!fir.array<1024xi32>>, !fir.array<1024xi32>) map_clauses(always, from) capture(ByRef) bounds(%[[BOUNDS_B]]) -> !fir.ref<!fir.array<1024xi32>> {name = "b"}
- !CHECK: omp.target_data map_entries(%[[MAP_B]] : !fir.ref<!fir.array<1024xi32>>) {
- !$omp target data map(always, from : b)
- !CHECK: omp.terminator
- !$omp end target data
- !CHECK: }
-end subroutine omp_target_data_mt
-
-!===============================================================================
-! Target with region
-!===============================================================================
-
-!CHECK-LABEL: func.func @_QPomp_target() {
-subroutine omp_target
- !CHECK: %[[VAL_0:.*]] = fir.alloca !fir.array<1024xi32> {bindc_name = "a", uniq_name = "_QFomp_targetEa"}
- integer :: a(1024)
- !CHECK: %[[BOUNDS:.*]] = omp.map.bounds lower_bound({{.*}}) upper_bound({{.*}}) extent({{.*}}) stride({{.*}}) start_idx({{.*}})
- !CHECK: %[[MAP:.*]] = omp.map.info var_ptr(%[[VAL_0]] : !fir.ref<!fir.array<1024xi32>>, !fir.array<1024xi32>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.ref<!fir.array<1024xi32>> {name = "a"}
- !CHECK: omp.target map_entries(%[[MAP]] -> %[[ARG_0:.*]] : !fir.ref<!fir.array<1024xi32>>) {
- !CHECK: ^bb0(%[[ARG_0]]: !fir.ref<!fir.array<1024xi32>>):
- !$omp target map(tofrom: a)
- !CHECK: %[[VAL_1:.*]] = arith.constant 10 : i32
- !CHECK: %[[VAL_2:.*]] = arith.constant 1 : i64
- !CHECK: %[[VAL_3:.*]] = arith.constant 1 : i64
- !CHECK: %[[VAL_4:.*]] = arith.subi %[[VAL_2]], %[[VAL_3]] : i64
- !CHECK: %[[VAL_5:.*]] = fir.coordinate_of %[[ARG_0]], %[[VAL_4]] : (!fir.ref<!fir.array<1024xi32>>, i64) -> !fir.ref<i32>
- !CHECK: fir.store %[[VAL_1]] to %[[VAL_5]] : !fir.ref<i32>
- a(1) = 10
- !CHECK: omp.terminator
- !$omp end target
- !CHECK: }
-end subroutine omp_target
-
-!===============================================================================
-! Target implicit capture
-!===============================================================================
-
-!CHECK-LABEL: func.func @_QPomp_target_implicit() {
-subroutine omp_target_implicit
- !CHECK: %[[VAL_0:.*]] = fir.alloca !fir.array<1024xi32> {bindc_name = "a", uniq_name = "_QFomp_target_implicitEa"}
- integer :: a(1024)
- !CHECK: %[[MAP:.*]] = omp.map.info var_ptr(%[[VAL_0]] : !fir.ref<!fir.array<1024xi32>>, !fir.array<1024xi32>) map_clauses(implicit, tofrom) capture(ByRef) bounds(%{{.*}}) -> !fir.ref<!fir.array<1024xi32>> {name = "a"}
- !CHECK: omp.target map_entries(%[[MAP]] -> %[[ARG_0:.*]] : !fir.ref<!fir.array<1024xi32>>) {
- !CHECK: ^bb0(%[[ARG_0]]: !fir.ref<!fir.array<1024xi32>>):
- !$omp target
- !CHECK: %[[VAL_5:.*]] = fir.coordinate_of %[[ARG_0]], %{{.*}} : (!fir.ref<!fir.array<1024xi32>>, i64) -> !fir.ref<i32>
- a(1) = 10
- !CHECK: omp.terminator
- !$omp end target
- !CHECK: }
-end subroutine omp_target_implicit
-
-!===============================================================================
-! Target implicit capture nested
-!===============================================================================
-
-!CHECK-LABEL: func.func @_QPomp_target_implicit_nested() {
-subroutine omp_target_implicit_nested
- integer :: a, b
- !CHECK: omp.target map_entries(%{{.*}} -> %[[ARG0:.*]], %{{.*}} -> %[[ARG1:.*]] : !fir.ref<i32>, !fir.ref<i32>) {
- !CHECK: ^bb0(%[[ARG0]]: !fir.ref<i32>, %[[ARG1]]: !fir.ref<i32>):
- !$omp target
- !CHECK: fir.store %{{.*}} to %[[ARG0]] : !fir.ref<i32>
- a = 10
- !$omp parallel
- !CHECK: fir.store %{{.*}} to %[[ARG1]] : !fir.ref<i32>
- b = 20
- !CHECK: omp.terminator
- !$omp end parallel
- !CHECK: omp.terminator
- !$omp end target
- !CHECK: }
-end subroutine omp_target_implicit_nested
-
-!===============================================================================
-! Target implicit capture with bounds
-!===============================================================================
-
-!CHECK-LABEL: func.func @_QPomp_target_implicit_bounds(
-!CHECK: %[[VAL_0:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}) {
-subroutine omp_target_implicit_bounds(n)
- !CHECK: %[[VAL_COPY:.*]] = fir.alloca i32
- !CHECK: %[[VAL_1:.*]] = fir.load %[[VAL_0]] : !fir.ref<i32>
- !CHECK: fir.store %[[VAL_1]] to %[[VAL_COPY]] : !fir.ref<i32>
- !CHECK: %[[VAL_2:.*]] = fir.convert %[[VAL_1]] : (i32) -> i64
- !CHECK: %[[VAL_3:.*]] = fir.convert %[[VAL_2]] : (i64) -> index
- !CHECK: %[[VAL_4:.*]] = arith.constant 0 : index
- !CHECK: %[[VAL_5:.*]] = arith.cmpi sgt, %[[VAL_3]], %[[VAL_4]] : index
- !CHECK: %[[VAL_6:.*]] = arith.select %[[VAL_5]], %[[VAL_3]], %[[VAL_4]] : index
- !CHECK: %[[VAL_7:.*]] = arith.constant 1024 : i64
- !CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_7]] : (i64) -> index
- !CHECK: %[[VAL_9:.*]] = arith.constant 0 : index
- !CHECK: %[[VAL_10:.*]] = arith.cmpi sgt, %[[VAL_8]], %[[VAL_9]] : index
- !CHECK: %[[VAL_11:.*]] = arith.select %[[VAL_10]], %[[VAL_8]], %[[VAL_9]] : index
- !CHECK: %[[VAL_12:.*]] = fir.alloca !fir.array<?x1024xi32>, %[[VAL_6]] {bindc_name = "a", uniq_name = "_QFomp_target_implicit_boundsEa"}
- integer :: n
- integer :: a(n, 1024)
- !CHECK: %[[VAL_13:.*]] = arith.constant 1 : index
- !CHECK: %[[VAL_14:.*]] = arith.constant 0 : index
- !CHECK: %[[VAL_15:.*]] = arith.subi %[[VAL_6]], %[[VAL_13]] : index
- !CHECK: %[[VAL_16:.*]] = omp.map.bounds lower_bound(%[[VAL_14]] : index) upper_bound(%[[VAL_15]] : index) extent(%[[VAL_6]] : index) stride(%[[VAL_13]] : index) start_idx(%[[VAL_13]] : index)
- !CHECK: %[[VAL_17:.*]] = arith.constant 0 : index
- !CHECK: %[[VAL_18:.*]] = arith.subi %[[VAL_11]], %[[VAL_13]] : index
- !CHECK: %[[VAL_19:.*]] = omp.map.bounds lower_bound(%[[VAL_17]] : index) upper_bound(%[[VAL_18]] : index) extent(%[[VAL_11]] : index) stride(%[[VAL_13]] : index) start_idx(%[[VAL_13]] : index)
- !CHECK: %[[VAL_20:.*]] = omp.map.info var_ptr(%[[VAL_12]] : !fir.ref<!fir.array<?x1024xi32>>, !fir.array<?x1024xi32>) map_clauses(implicit, tofrom) capture(ByRef) bounds(%[[VAL_16]], %[[VAL_19]]) -> !fir.ref<!fir.array<?x1024xi32>> {name = "a"}
- !CHECK: %[[VAL_21:.*]] = omp.map.info var_ptr(%[[VAL_COPY]] : !fir.ref<i32>, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !fir.ref<i32> {name = ""}
- !CHECK: omp.target map_entries(%[[VAL_20]] -> %[[VAL_22:.*]], %[[VAL_21]] -> %[[VAL_23:.*]] : !fir.ref<!fir.array<?x1024xi32>>, !fir.ref<i32>) {
- !CHECK: ^bb0(%[[VAL_22]]: !fir.ref<!fir.array<?x1024xi32>>, %[[VAL_23]]: !fir.ref<i32>):
- !$omp target
- !CHECK: %[[VAL_24:.*]] = fir.load %[[VAL_23]] : !fir.ref<i32>
- !CHECK: %[[VAL_25:.*]] = fir.convert %[[VAL_24]] : (i32) -> i64
- !CHECK: %[[VAL_26:.*]] = arith.constant 0 : index
- !CHECK: %[[VAL_27:.*]] = fir.convert %[[VAL_25]] : (i64) -> index
- !CHECK: %[[VAL_28:.*]] = arith.cmpi sgt, %[[VAL_27]], %[[VAL_26]] : index
- !CHECK: %[[VAL_29:.*]] = arith.select %[[VAL_28]], %[[VAL_27]], %[[VAL_26]] : index
- !CHECK: %[[VAL_30:.*]] = arith.constant 33 : i32
- !CHECK: %[[VAL_31:.*]] = fir.convert %[[VAL_22]] : (!fir.ref<!fir.array<?x1024xi32>>) -> !fir.ref<!fir.array<?xi32>>
- !CHECK: %[[VAL_32:.*]] = arith.constant 1 : index
- !CHECK: %[[VAL_33:.*]] = arith.constant 0 : index
- !CHECK: %[[VAL_34:.*]] = arith.constant 11 : i64
- !CHECK: %[[VAL_35:.*]] = fir.convert %[[VAL_34]] : (i64) -> index
- !CHECK: %[[VAL_36:.*]] = arith.subi %[[VAL_35]], %[[VAL_32]] : index
- !CHECK: %[[VAL_37:.*]] = arith.muli %[[VAL_32]], %[[VAL_36]] : index
- !CHECK: %[[VAL_38:.*]] = arith.addi %[[VAL_37]], %[[VAL_33]] : index
- !CHECK: %[[VAL_39:.*]] = arith.muli %[[VAL_32]], %[[VAL_29]] : index
- !CHECK: %[[VAL_40:.*]] = arith.constant 22 : i64
- !CHECK: %[[VAL_41:.*]] = fir.convert %[[VAL_40]] : (i64) -> index
- !CHECK: %[[VAL_42:.*]] = arith.subi %[[VAL_41]], %[[VAL_32]] : index
- !CHECK: %[[VAL_43:.*]] = arith.muli %[[VAL_39]], %[[VAL_42]] : index
- !CHECK: %[[VAL_44:.*]] = arith.addi %[[VAL_43]], %[[VAL_38]] : index
- !CHECK: %[[VAL_45:.*]] = fir.coordinate_of %[[VAL_31]], %[[VAL_44]] : (!fir.ref<!fir.array<?xi32>>, index) -> !fir.ref<i32>
- !CHECK: fir.store %[[VAL_30]] to %[[VAL_45]] : !fir.ref<i32>
- a(11, 22) = 33
- !CHECK: omp.terminator
- !$omp end target
- !CHECK: }
-end subroutine omp_target_implicit_bounds
-
-!===============================================================================
-! Target `thread_limit` clause
-!===============================================================================
-
-!CHECK-LABEL: func.func @_QPomp_target_thread_limit() {
-subroutine omp_target_thread_limit
- integer :: a
- !CHECK: %[[MAP:.*]] = omp.map.info var_ptr({{.*}}) map_clauses(tofrom) capture(ByRef) -> !fir.ref<i32> {name = "a"}
- !CHECK: %[[VAL_1:.*]] = arith.constant 64 : i32
- !CHECK: omp.target thread_limit(%[[VAL_1]] : i32) map_entries(%[[MAP]] -> %[[ARG_0:.*]] : !fir.ref<i32>) {
- !CHECK: ^bb0(%[[ARG_0]]: !fir.ref<i32>):
- !$omp target map(tofrom: a) thread_limit(64)
- a = 10
- !CHECK: omp.terminator
- !$omp end target
- !CHECK: }
-end subroutine omp_target_thread_limit
-
-!===============================================================================
-! Target_Data `use_device_ptr` clause
-!===============================================================================
-
-!CHECK-LABEL: func.func @_QPomp_target_device_ptr() {
-subroutine omp_target_device_ptr
- use iso_c_binding, only : c_ptr, c_loc
- type(c_ptr) :: a
- integer, target :: b
- !CHECK: %[[MAP:.*]] = omp.map.info var_ptr({{.*}}) map_clauses(tofrom) capture(ByRef) -> {{.*}} {name = "a"}
- !CHECK: omp.target_data map_entries(%[[MAP]]{{.*}}
- !$omp target data map(tofrom: a) use_device_ptr(a)
- !CHECK: ^bb0(%[[VAL_1:.*]]: !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>):
- !CHECK: {{.*}} = fir.coordinate_of %[[VAL_1]], {{.*}} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.field) -> !fir.ref<i64>
- a = c_loc(b)
- !CHECK: omp.terminator
- !$omp end target data
- !CHECK: }
-end subroutine omp_target_device_ptr
-
-!===============================================================================
-! Target_Data `use_device_addr` clause
-!===============================================================================
-
-!CHECK-LABEL: func.func @_QPomp_target_device_addr() {
-subroutine omp_target_device_addr
- integer, pointer :: a
- !CHECK: %[[VAL_0:.*]] = fir.alloca !fir.box<!fir.ptr<i32>> {bindc_name = "a", uniq_name = "_QFomp_target_device_addrEa"}
- !CHECK: %[[MAP_MEMBERS:.*]] = omp.map.info var_ptr({{.*}} : !fir.ref<!fir.box<!fir.ptr<i32>>>, i32) var_ptr_ptr({{.*}} : !fir.llvm_ptr<!fir.ref<i32>>) map_clauses(tofrom) capture(ByRef) -> !fir.llvm_ptr<!fir.ref<i32>> {name = ""}
- !CHECK: %[[MAP:.*]] = omp.map.info var_ptr({{.*}} : !fir.ref<!fir.box<!fir.ptr<i32>>>, !fir.box<!fir.ptr<i32>>) map_clauses(tofrom) capture(ByRef) members(%[[MAP_MEMBERS]] : !fir.llvm_ptr<!fir.ref<i32>>) -> !fir.ref<!fir.box<!fir.ptr<i32>>> {name = "a"}
- !CHECK: omp.target_data map_entries(%[[MAP_MEMBERS]], %[[MAP]] : {{.*}}) use_device_addr(%[[VAL_0]] : !fir.ref<!fir.box<!fir.ptr<i32>>>) {
- !$omp target data map(tofrom: a) use_device_addr(a)
- !CHECK: ^bb0(%[[VAL_1:.*]]: !fir.ref<!fir.box<!fir.ptr<i32>>>):
- !CHECK: {{.*}} = fir.load %[[VAL_1]] : !fir.ref<!fir.box<!fir.ptr<i32>>>
- a = 10
- !CHECK: omp.terminator
- !$omp end target data
- !CHECK: }
-end subroutine omp_target_device_addr
-
-!===============================================================================
-! Target with parallel loop
-!===============================================================================
-
-!CHECK-LABEL: func.func @_QPomp_target_parallel_do() {
-subroutine omp_target_parallel_do
- !CHECK: %[[C1024:.*]] = arith.constant 1024 : index
- !CHECK: %[[VAL_0:.*]] = fir.alloca !fir.array<1024xi32> {bindc_name = "a", uniq_name = "_QFomp_target_parallel_doEa"}
- integer :: a(1024)
- !CHECK: %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFomp_target_parallel_doEi"}
- integer :: i
- !CHECK: %[[C1:.*]] = arith.constant 1 : index
- !CHECK: %[[C0:.*]] = arith.constant 0 : index
- !CHECK: %[[SUB:.*]] = arith.subi %[[C1024]], %[[C1]] : index
- !CHECK: %[[BOUNDS:.*]] = omp.map.bounds lower_bound(%[[C0]] : index) upper_bound(%[[SUB]] : index) extent(%[[C1024]] : index) stride(%[[C1]] : index) start_idx(%[[C1]] : index)
- !CHECK: %[[MAP1:.*]] = omp.map.info var_ptr(%[[VAL_0]] : !fir.ref<!fir.array<1024xi32>>, !fir.array<1024xi32>) map_clauses(tofrom) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.ref<!fir.array<1024xi32>> {name = "a"}
- !CHECK: %[[MAP2:.*]] = omp.map.info var_ptr(%[[VAL_1]] : !fir.ref<i32>, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !fir.ref<i32> {name = "i"}
- !CHECK: omp.target map_entries(%[[MAP1]] -> %[[VAL_2:.*]], %[[MAP2]] -> %[[VAL_3:.*]] : !fir.ref<!fir.array<1024xi32>>, !fir.ref<i32>) {
- !CHECK: ^bb0(%[[VAL_2]]: !fir.ref<!fir.array<1024xi32>>, %[[VAL_3]]: !fir.ref<i32>):
- !CHECK-NEXT: omp.parallel
- !$omp target parallel do map(tofrom: a)
- !CHECK: %[[VAL_4:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
- !CHECK: %[[VAL_5:.*]] = arith.constant 1 : i32
- !CHECK: %[[VAL_6:.*]] = arith.constant 1024 : i32
- !CHECK: %[[VAL_7:.*]] = arith.constant 1 : i32
- !CHECK: omp.wsloop {
- !CHECK: omp.loop_nest (%[[VAL_8:.*]]) : i32 = (%[[VAL_5]]) to (%[[VAL_6]]) inclusive step (%[[VAL_7]]) {
- !CHECK: fir.store %[[VAL_8]] to %[[VAL_4]] : !fir.ref<i32>
- !CHECK: %[[VAL_9:.*]] = arith.constant 10 : i32
- !CHECK: %[[VAL_10:.*]] = fir.load %[[VAL_4]] : !fir.ref<i32>
- !CHECK: %[[VAL_11:.*]] = fir.convert %[[VAL_10]] : (i32) -> i64
- !CHECK: %[[VAL_12:.*]] = arith.constant 1 : i64
- !CHECK: %[[VAL_13:.*]] = arith.subi %[[VAL_11]], %[[VAL_12]] : i64
- !CHECK: %[[VAL_14:.*]] = fir.coordinate_of %[[VAL_2]], %[[VAL_13]] : (!fir.ref<!fir.array<1024xi32>>, i64) -> !fir.ref<i32>
- !CHECK: fir.store %[[VAL_9]] to %[[VAL_14]] : !fir.ref<i32>
- do i = 1, 1024
- a(i) = 10
- end do
- !CHECK: omp.yield
- !CHECK: }
- !CHECK: omp.terminator
- !CHECK: }
- !CHECK: omp.terminator
- !CHECK: }
- !CHECK: omp.terminator
- !CHECK: }
- !$omp end target parallel do
-end subroutine omp_target_parallel_do
-
-!===============================================================================
-! Target `is_device_ptr` clause
-!===============================================================================
-
-!CHECK-LABEL: func.func @_QPomp_target_is_device_ptr() {
-subroutine omp_target_is_device_ptr
- use iso_c_binding, only : c_ptr, c_loc
- !CHECK: %[[VAL_0:.*]] = fir.alloca !fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}> {bindc_name = "a", uniq_name = "_QFomp_target_is_device_ptrEa"}
- type(c_ptr) :: a
- !CHECK: %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "b", fir.target, uniq_name = "_QFomp_target_is_device_ptrEb"}
- integer, target :: b
- !CHECK: %[[MAP_0:.*]] = omp.map.info var_ptr(%[[DEV_PTR:.*]] : !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>) map_clauses(tofrom) capture(ByRef) -> !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>> {name = "a"}
- !CHECK: %[[MAP_1:.*]] = omp.map.info var_ptr(%[[VAL_1]] : !fir.ref<i32>, i32) map_clauses(tofrom) capture(ByRef) -> !fir.ref<i32> {name = "b"}
- !CHECK: %[[MAP_2:.*]] = omp.map.info var_ptr(%[[DEV_PTR]] : !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByRef) -> !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>> {name = "a"}
- !CHECK: omp.target is_device_ptr(%[[DEV_PTR]] : !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>) map_entries(%[[MAP_0]] -> %[[ARG0:.*]], %[[MAP_1]] -> %[[ARG1:.*]], %[[MAP_2]] -> %[[ARG2:.*]] : !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.ref<i32>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>) {
- !CHECK: ^bb0(%[[ARG0]]: !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, %[[ARG1]]: !fir.ref<i32>, %[[ARG2]]: !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>):
- !$omp target map(tofrom: a,b) is_device_ptr(a)
- !CHECK: {{.*}} = fir.coordinate_of %{{.*}}, {{.*}} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.field) -> !fir.ref<i64>
- a = c_loc(b)
- !CHECK: omp.terminator
- !$omp end target
- !CHECK: }
-end subroutine omp_target_is_device_ptr
-
-!===============================================================================
-! Target `has_device_addr` clause
-!===============================================================================
-
-!CHECK-LABEL: func.func @_QPomp_target_has_device_addr() {
-subroutine omp_target_has_device_addr
- !CHECK: %[[VAL_0:.*]] = fir.alloca !fir.box<!fir.ptr<i32>> {bindc_name = "a", uniq_name = "_QFomp_target_has_device_addrEa"}
- integer, pointer :: a
- !CHECK: omp.target has_device_addr(%[[VAL_0]] : !fir.ref<!fir.box<!fir.ptr<i32>>>) map_entries({{.*}} -> {{.*}}, {{.*}} -> {{.*}} : !fir.llvm_ptr<!fir.ref<i32>>, !fir.ref<!fir.box<!fir.ptr<i32>>>) {
- !$omp target has_device_addr(a)
- !CHECK: {{.*}} = fir.load %{{.*}} : !fir.ref<!fir.box<!fir.ptr<i32>>>
- a = 10
- !CHECK: omp.terminator
- !$omp end target
- !CHECK: }
-end subroutine omp_target_has_device_addr
diff --git a/flang/test/Lower/OpenMP/FIR/target_cpu_features.f90 b/flang/test/Lower/OpenMP/FIR/target_cpu_features.f90
deleted file mode 100644
index 5154782e1ae1..000000000000
--- a/flang/test/Lower/OpenMP/FIR/target_cpu_features.f90
+++ /dev/null
@@ -1,19 +0,0 @@
-!REQUIRES: amdgpu-registered-target, nvptx-registered-target
-!RUN: %flang_fc1 -emit-fir -triple amdgcn-amd-amdhsa -target-cpu gfx908 -fopenmp -fopenmp-is-target-device %s -o - | FileCheck --check-prefix=AMDGCN %s
-!RUN: %flang_fc1 -emit-fir -triple nvptx64-nvidia-cuda -target-cpu sm_80 -fopenmp -fopenmp-is-target-device %s -o - | FileCheck --check-prefix=NVPTX %s
-
-!===============================================================================
-! Target CPU and feature attributes
-!===============================================================================
-
-!AMDGCN: module attributes {
-!AMDGCN-SAME: fir.target_cpu = "gfx908"
-!AMDGCN-SAME: fir.target_features = #llvm.target_features<["+16-bit-insts", "+ci-insts",
-!AMDGCN-SAME: "+dl-insts", "+dot1-insts", "+dot10-insts", "+dot2-insts", "+dot3-insts",
-!AMDGCN-SAME: "+dot4-insts", "+dot5-insts", "+dot6-insts", "+dot7-insts", "+dpp",
-!AMDGCN-SAME: "+gfx8-insts", "+gfx9-insts", "+gws", "+image-insts", "+mai-insts",
-!AMDGCN-SAME: "+s-memrealtime", "+s-memtime-inst", "+wavefrontsize64"]>
-
-!NVPTX: module attributes {
-!NVPTX-SAME: fir.target_cpu = "sm_80"
-!NVPTX-SAME: fir.target_features = #llvm.target_features<["+ptx61", "+sm_80"]>
diff --git a/flang/test/Lower/OpenMP/FIR/task.f90 b/flang/test/Lower/OpenMP/FIR/task.f90
deleted file mode 100644
index 012ac757d304..000000000000
--- a/flang/test/Lower/OpenMP/FIR/task.f90
+++ /dev/null
@@ -1,237 +0,0 @@
-! REQUIRES: openmp_runtime
-
-!RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp %s -o - | FileCheck %s
-
-!CHECK-LABEL: func @_QPomp_task_simple() {
-subroutine omp_task_simple
- !CHECK: omp.task {
- !$omp task
- !CHECK: fir.call @_QPfoo() {{.*}}: () -> ()
- call foo()
- !CHECK: omp.terminator
- !$omp end task
-end subroutine omp_task_simple
-
-!===============================================================================
-! `if` clause
-!===============================================================================
-
-!CHECK-LABEL: func @_QPomp_task_if(%{{.+}}) {
-subroutine omp_task_if(bar)
- logical, intent(inout) :: bar
- !CHECK: omp.task if(%{{.+}}) {
- !$omp task if(bar)
- !CHECK: fir.call @_QPfoo() {{.*}}: () -> ()
- call foo()
- !CHECK: omp.terminator
- !$omp end task
-end subroutine omp_task_if
-
-!===============================================================================
-! `final` clause
-!===============================================================================
-
-!CHECK-LABEL: func @_QPomp_task_final(%{{.+}}) {
-subroutine omp_task_final(bar)
- logical, intent(inout) :: bar
- !CHECK: omp.task final(%{{.+}}) {
- !$omp task final(bar)
- !CHECK: fir.call @_QPfoo() {{.*}}: () -> ()
- call foo()
- !CHECK: omp.terminator
- !$omp end task
-end subroutine omp_task_final
-
-!===============================================================================
-! `priority` clause
-!===============================================================================
-
-!CHECK-LABEL: func @_QPomp_task_priority(%{{.+}}) {
-subroutine omp_task_priority(bar)
- integer, intent(inout) :: bar
- !CHECK: omp.task priority(%{{.+}}) {
- !$omp task priority(bar)
- !CHECK: fir.call @_QPfoo() {{.*}}: () -> ()
- call foo()
- !CHECK: omp.terminator
- !$omp end task
-end subroutine omp_task_priority
-
-!===============================================================================
-! `allocate` clause
-!===============================================================================
-
-!CHECK-LABEL: func @_QPtask_allocate
-subroutine task_allocate()
- use omp_lib
- integer :: x
- !CHECK: omp.task allocate(%{{.+}} : i64 -> %{{.+}} : !fir.ref<i32>) {
- !$omp task allocate(omp_high_bw_mem_alloc: x) private(x)
- !CHECK: arith.addi
- x = x + 12
- !CHECK: omp.terminator
- !$omp end task
-end subroutine task_allocate
-
-!===============================================================================
-! `depend` clause
-!===============================================================================
-
-!CHECK-LABEL: func @_QPtask_depend
-subroutine task_depend()
- integer :: x
- !CHECK: omp.task depend(taskdependin -> %{{.+}} : !fir.ref<i32>) {
- !$omp task depend(in : x)
- !CHECK: arith.addi
- x = x + 12
- !CHECK: omp.terminator
- !$omp end task
-end subroutine task_depend
-
-!CHECK-LABEL: func @_QPtask_depend_non_int
-subroutine task_depend_non_int()
- character(len = 15) :: x
- integer, allocatable :: y
- complex :: z
- !CHECK: omp.task depend(taskdependin -> %{{.+}} : !fir.ref<!fir.char<1,15>>, taskdependin -> %{{.+}} : !fir.ref<!fir.box<!fir.heap<i32>>>, taskdependin -> %{{.+}} : !fir.ref<!fir.complex<4>>) {
- !$omp task depend(in : x, y, z)
- !CHECK: omp.terminator
- !$omp end task
-end subroutine task_depend_non_int
-
-!CHECK-LABEL: func @_QPtask_depend_all_kinds_one_task
-subroutine task_depend_all_kinds_one_task()
- integer :: x
- !CHECK: omp.task depend(taskdependin -> %{{.+}} : !fir.ref<i32>, taskdependout -> %{{.+}} : !fir.ref<i32>, taskdependinout -> %{{.+}} : !fir.ref<i32>) {
- !$omp task depend(in : x) depend(out : x) depend(inout : x)
- !CHECK: arith.addi
- x = x + 12
- !CHECK: omp.terminator
- !$omp end task
-end subroutine task_depend_all_kinds_one_task
-
-!CHECK-LABEL: func @_QPtask_depend_multi_var
-subroutine task_depend_multi_var()
- integer :: x
- integer :: y
- !CHECK: omp.task depend(taskdependin -> %{{.*}} : !fir.ref<i32>, taskdependin -> %{{.+}} : !fir.ref<i32>) {
- !$omp task depend(in: x, y)
- !CHECK: arith.addi
- x = x + 12
- y = y + 12
- !CHECK: omp.terminator
- !$omp end task
-end subroutine task_depend_multi_var
-
-!CHECK-LABEL: func @_QPtask_depend_multi_task
-subroutine task_depend_multi_task()
- integer :: x
- !CHECK: omp.task depend(taskdependout -> %{{.+}} : !fir.ref<i32>)
- !$omp task depend(out : x)
- !CHECK: arith.addi
- x = x + 12
- !CHECK: omp.terminator
- !$omp end task
- !CHECK: omp.task depend(taskdependinout -> %{{.+}} : !fir.ref<i32>)
- !$omp task depend(inout : x)
- !CHECK: arith.addi
- x = x + 12
- !CHECK: omp.terminator
- !$omp end task
- !CHECK: omp.task depend(taskdependin -> %{{.+}} : !fir.ref<i32>)
- !$omp task depend(in : x)
- !CHECK: arith.addi
- x = x + 12
- !CHECK: omp.terminator
- !$omp end task
-end subroutine task_depend_multi_task
-
-!===============================================================================
-! `private` clause
-!===============================================================================
-!CHECK-LABEL: func @_QPtask_private
-subroutine task_private
- type mytype
- integer :: x
- end type mytype
-
- !CHECK: %[[int_var:.+]] = fir.alloca i32
- !CHECK: %[[mytype_var:.+]] = fir.alloca !fir.type<_QFtask_privateTmytype{x:i32}>
- integer :: int_var
- type(mytype) :: mytype_var
-
- !CHECK: fir.call @_QPbar(%[[int_var]], %[[mytype_var]]) {{.*}}: (!fir.ref<i32>, !fir.ref<!fir.type<_QFtask_privateTmytype{x:i32}>>) -> ()
- call bar(int_var, mytype_var)
-
- !CHECK: omp.task {
- !$omp task private(int_var, mytype_var)
- !CHECK: %[[int_var_private:.+]] = fir.alloca i32
- !CHECK: %[[mytype_var_private:.+]] = fir.alloca !fir.type<_QFtask_privateTmytype{x:i32}>
-
- !CHECK: fir.call @_QPbar(%[[int_var_private]], %[[mytype_var_private]]) {{.*}}: (!fir.ref<i32>, !fir.ref<!fir.type<_QFtask_privateTmytype{x:i32}>>) -> ()
- call bar(int_var, mytype_var)
- !CHECK: omp.terminator
- !$omp end task
-end subroutine task_private
-
-!===============================================================================
-! `firstprivate` clause
-!===============================================================================
-!CHECK-LABEL: func @_QPtask_firstprivate
-subroutine task_firstprivate
- type mytype
- integer :: x
- end type mytype
-
- !CHECK: %[[int_var:.+]] = fir.alloca i32
- !CHECK: %[[mytype_var:.+]] = fir.alloca !fir.type<_QFtask_firstprivateTmytype{x:i32}>
- integer :: int_var
- type(mytype) :: mytype_var
-
- !CHECK: fir.call @_QPbaz(%[[int_var]], %[[mytype_var]]) {{.*}}: (!fir.ref<i32>, !fir.ref<!fir.type<_QFtask_firstprivateTmytype{x:i32}>>) -> ()
- call baz(int_var, mytype_var)
-
- !CHECK: omp.task {
- !$omp task firstprivate(int_var, mytype_var)
- !CHECK: %[[int_var_firstprivate:.+]] = fir.alloca i32
- !CHECK: %[[int_var_load:.+]] = fir.load %[[int_var]] : !fir.ref<i32>
- !CHECK: fir.store %[[int_var_load]] to %[[int_var_firstprivate]] : !fir.ref<i32>
- !CHECK: %[[mytype_var_firstprivate:.+]] = fir.alloca !fir.type<_QFtask_firstprivateTmytype{x:i32}>
- !CHECK: %[[mytype_var_load:.+]] = fir.load %[[mytype_var]] : !fir.ref<!fir.type<_QFtask_firstprivateTmytype{x:i32}>>
- !CHECK: fir.store %[[mytype_var_load]] to %[[mytype_var_firstprivate]]
- !CHECK: fir.call @_QPbaz(%[[int_var_firstprivate]], %[[mytype_var_firstprivate]]) {{.*}}: (!fir.ref<i32>, !fir.ref<!fir.type<_QFtask_firstprivateTmytype{x:i32}>>) -> ()
- call baz(int_var, mytype_var)
- !CHECK: omp.terminator
- !$omp end task
-end subroutine task_firstprivate
-
-!===============================================================================
-! Multiple clauses
-!===============================================================================
-
-!CHECK-LABEL: func @_QPtask_multiple_clauses
-subroutine task_multiple_clauses()
- use omp_lib
-
- !CHECK: %[[x:.+]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFtask_multiple_clausesEx"}
- !CHECK: %[[y:.+]] = fir.alloca i32 {bindc_name = "y", uniq_name = "_QFtask_multiple_clausesEy"}
- !CHECK: %[[z:.+]] = fir.alloca i32 {bindc_name = "z", uniq_name = "_QFtask_multiple_clausesEz"}
- integer :: x, y, z
- logical :: buzz
-
- !CHECK: omp.task if(%{{.+}}) final(%{{.+}}) priority(%{{.+}}) allocate(%{{.+}} : i64 -> %{{.+}} : !fir.ref<i32>) {
- !$omp task if(buzz) final(buzz) priority(z) allocate(omp_high_bw_mem_alloc: x) private(x) firstprivate(y)
-
- !CHECK: %[[x_priv:.+]] = fir.alloca i32
- !CHECK: %[[y_priv:.+]] = fir.alloca i32
- !CHECK: %[[y_load:.+]] = fir.load %[[y]] : !fir.ref<i32>
- !CHECK: fir.store %[[y_load]] to %[[y_priv]] : !fir.ref<i32>
-
- !CHECK: arith.addi
- x = x + 12
- !CHECK: arith.subi
- y = y - 12
-
- !CHECK: omp.terminator
- !$omp end task
-end subroutine task_multiple_clauses
diff --git a/flang/test/Lower/OpenMP/FIR/taskgroup.f90 b/flang/test/Lower/OpenMP/FIR/taskgroup.f90
deleted file mode 100644
index 78b9da8e9b09..000000000000
--- a/flang/test/Lower/OpenMP/FIR/taskgroup.f90
+++ /dev/null
@@ -1,21 +0,0 @@
-! REQUIRES: openmp_runtime
-
-!RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp %s -o - | FileCheck %s
-
-!CHECK-LABEL: @_QPomp_taskgroup
-subroutine omp_taskgroup
- use omp_lib
- integer :: allocated_x
- !CHECK-DAG: %{{.*}} = fir.alloca i32 {bindc_name = "allocated_x", uniq_name = "_QFomp_taskgroupEallocated_x"}
- !CHECK-DAG: %{{.*}} = arith.constant 4 : i64
-
- !CHECK: omp.taskgroup allocate(%{{.*}} : i64 -> %{{.*}} : !fir.ref<i32>)
- !$omp taskgroup allocate(omp_high_bw_mem_alloc: allocated_x)
- !$omp task
- !CHECK: fir.call @_QPwork() {{.*}}: () -> ()
- call work()
- !CHECK: omp.terminator
- !$omp end task
- !CHECK: omp.terminator
- !$omp end taskgroup
-end subroutine
diff --git a/flang/test/Lower/OpenMP/FIR/taskwait.f90 b/flang/test/Lower/OpenMP/FIR/taskwait.f90
deleted file mode 100644
index eed4f1b84a22..000000000000
--- a/flang/test/Lower/OpenMP/FIR/taskwait.f90
+++ /dev/null
@@ -1,12 +0,0 @@
-!RUN: %flang_fc1 -emit-fir -fopenmp %s -o - | FileCheck %s --check-prefixes="FIRDialect,OMPDialect"
-!RUN: %flang_fc1 -emit-fir -fopenmp %s -o - | fir-opt --fir-to-llvm-ir | FileCheck %s --check-prefixes="OMPDialect"
-
-!FIRDialect-LABEL: @_QPomp_taskwait
-subroutine omp_taskwait
- !OMPDialect: omp.taskwait
- !$omp taskwait
- !FIRDialect: fir.call @_QPfoo() {{.*}}: () -> ()
- call foo()
- !OMPDialect: omp.taskwait
- !$omp taskwait
-end subroutine omp_taskwait
diff --git a/flang/test/Lower/OpenMP/FIR/taskyield.f90 b/flang/test/Lower/OpenMP/FIR/taskyield.f90
deleted file mode 100644
index ca0bc1d071df..000000000000
--- a/flang/test/Lower/OpenMP/FIR/taskyield.f90
+++ /dev/null
@@ -1,12 +0,0 @@
-!RUN: %flang_fc1 -emit-fir -fopenmp %s -o - | FileCheck %s --check-prefixes="FIRDialect,OMPDialect"
-!RUN: %flang_fc1 -emit-fir -fopenmp %s -o - | fir-opt --fir-to-llvm-ir | FileCheck %s --check-prefixes="OMPDialect"
-
-!FIRDialect-LABEL: @_QPomp_taskyield
-subroutine omp_taskyield
- !OMPDialect: omp.taskyield
- !$omp taskyield
- !FIRDialect: fir.call @_QPfoo() {{.*}}: () -> ()
- call foo()
- !OMPDialect: omp.taskyield
- !$omp taskyield
-end subroutine omp_taskyield
diff --git a/flang/test/Lower/OpenMP/FIR/teams.f90 b/flang/test/Lower/OpenMP/FIR/teams.f90
deleted file mode 100644
index 9c0593a24f2d..000000000000
--- a/flang/test/Lower/OpenMP/FIR/teams.f90
+++ /dev/null
@@ -1,117 +0,0 @@
-! REQUIRES: openmp_runtime
-
-! RUN: %flang_fc1 -emit-fir -fopenmp %s -o - | FileCheck %s
-
-! CHECK-LABEL: func @_QPteams_simple
-subroutine teams_simple()
- ! CHECK: omp.teams
- !$omp teams
- ! CHECK: fir.call
- call f1()
- ! CHECK: omp.terminator
- !$omp end teams
-end subroutine teams_simple
-
-!===============================================================================
-! `num_teams` clause
-!===============================================================================
-
-! CHECK-LABEL: func @_QPteams_numteams
-subroutine teams_numteams(num_teams)
- integer, intent(inout) :: num_teams
-
- ! CHECK: omp.teams
- ! CHECK-SAME: num_teams( to %{{.*}}: i32)
- !$omp teams num_teams(4)
- ! CHECK: fir.call
- call f1()
- ! CHECK: omp.terminator
- !$omp end teams
-
- ! CHECK: omp.teams
- ! CHECK-SAME: num_teams( to %{{.*}}: i32)
- !$omp teams num_teams(num_teams)
- ! CHECK: fir.call
- call f2()
- ! CHECK: omp.terminator
- !$omp end teams
-
-end subroutine teams_numteams
-
-!===============================================================================
-! `if` clause
-!===============================================================================
-
-! CHECK-LABEL: func @_QPteams_if
-subroutine teams_if(alpha)
- integer, intent(in) :: alpha
- logical :: condition
-
- ! CHECK: omp.teams
- ! CHECK-SAME: if(%{{.*}})
- !$omp teams if(.false.)
- ! CHECK: fir.call
- call f1()
- ! CHECK: omp.terminator
- !$omp end teams
-
- ! CHECK: omp.teams
- ! CHECK-SAME: if(%{{.*}})
- !$omp teams if(alpha .le. 0)
- ! CHECK: fir.call
- call f2()
- ! CHECK: omp.terminator
- !$omp end teams
-
- ! CHECK: omp.teams
- ! CHECK-SAME: if(%{{.*}})
- !$omp teams if(condition)
- ! CHECK: fir.call
- call f3()
- ! CHECK: omp.terminator
- !$omp end teams
-end subroutine teams_if
-
-!===============================================================================
-! `thread_limit` clause
-!===============================================================================
-
-! CHECK-LABEL: func @_QPteams_threadlimit
-subroutine teams_threadlimit(thread_limit)
- integer, intent(inout) :: thread_limit
-
- ! CHECK: omp.teams
- ! CHECK-SAME: thread_limit(%{{.*}}: i32)
- !$omp teams thread_limit(4)
- ! CHECK: fir.call
- call f1()
- ! CHECK: omp.terminator
- !$omp end teams
-
- ! CHECK: omp.teams
- ! CHECK-SAME: thread_limit(%{{.*}}: i32)
- !$omp teams thread_limit(thread_limit)
- ! CHECK: fir.call
- call f2()
- ! CHECK: omp.terminator
- !$omp end teams
-
-end subroutine teams_threadlimit
-
-!===============================================================================
-! `allocate` clause
-!===============================================================================
-
-! CHECK-LABEL: func @_QPteams_allocate
-subroutine teams_allocate()
- use omp_lib
- integer :: x
- integer :: y
- ! CHECK: omp.teams
- ! CHECK-SAME: allocate(%{{.+}} : i64 -> %{{.+}} : !fir.ref<i32>)
- !$omp teams allocate(omp_high_bw_mem_alloc: x) private(x)
- ! CHECK: arith.addi
- x = x + 12
- ! CHECK: omp.terminator
- !$omp end teams
-end subroutine teams_allocate
diff --git a/flang/test/Lower/OpenMP/FIR/threadprivate-char-array-chararray.f90 b/flang/test/Lower/OpenMP/FIR/threadprivate-char-array-chararray.f90
deleted file mode 100644
index 3580add37ef4..000000000000
--- a/flang/test/Lower/OpenMP/FIR/threadprivate-char-array-chararray.f90
+++ /dev/null
@@ -1,46 +0,0 @@
-! This test checks lowering of OpenMP Threadprivate Directive.
-! Test for character, array, and character-array variables.
-
-!RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp %s -o - | FileCheck %s
-
-module test
- character :: x
- integer :: y(5)
- character(5) :: z(5)
-
- !$omp threadprivate(x, y, z)
-
-!CHECK-DAG: fir.global @_QMtestEx : !fir.char<1> {
-!CHECK-DAG: fir.global @_QMtestEy : !fir.array<5xi32> {
-!CHECK-DAG: fir.global @_QMtestEz : !fir.array<5x!fir.char<1,5>> {
-
-contains
- subroutine sub()
-!CHECK-DAG: [[ADDR0:%.*]] = fir.address_of(@_QMtestEx) : !fir.ref<!fir.char<1>>
-!CHECK-DAG: [[NEWADDR0:%.*]] = omp.threadprivate [[ADDR0]] : !fir.ref<!fir.char<1>> -> !fir.ref<!fir.char<1>>
-!CHECK-DAG: [[ADDR1:%.*]] = fir.address_of(@_QMtestEy) : !fir.ref<!fir.array<5xi32>>
-!CHECK-DAG: [[NEWADDR1:%.*]] = omp.threadprivate [[ADDR1]] : !fir.ref<!fir.array<5xi32>> -> !fir.ref<!fir.array<5xi32>>
-!CHECK-DAG: [[ADDR2:%.*]] = fir.address_of(@_QMtestEz) : !fir.ref<!fir.array<5x!fir.char<1,5>>>
-!CHECK-DAG: [[NEWADDR2:%.*]] = omp.threadprivate [[ADDR2]] : !fir.ref<!fir.array<5x!fir.char<1,5>>> -> !fir.ref<!fir.array<5x!fir.char<1,5>>>
-!CHECK-DAG: %{{.*}} = fir.convert [[NEWADDR0]] : (!fir.ref<!fir.char<1>>) -> !fir.ref<i8>
-!CHECK-DAG: %{{.*}} = fir.embox [[NEWADDR1]](%{{.*}}) : (!fir.ref<!fir.array<5xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<5xi32>>
-!CHECK-DAG: %{{.*}} = fir.embox [[NEWADDR2]](%{{.*}}) : (!fir.ref<!fir.array<5x!fir.char<1,5>>>, !fir.shape<1>) -> !fir.box<!fir.array<5x!fir.char<1,5>>>
- print *, x, y, z
-
- !$omp parallel
-!CHECK-DAG: [[ADDR33:%.*]] = omp.threadprivate [[ADDR0]] : !fir.ref<!fir.char<1>> -> !fir.ref<!fir.char<1>>
-!CHECK-DAG: [[ADDR34:%.*]] = omp.threadprivate [[ADDR1]] : !fir.ref<!fir.array<5xi32>> -> !fir.ref<!fir.array<5xi32>>
-!CHECK-DAG: [[ADDR35:%.*]] = omp.threadprivate [[ADDR2]] : !fir.ref<!fir.array<5x!fir.char<1,5>>> -> !fir.ref<!fir.array<5x!fir.char<1,5>>>
-!CHECK-DAG: %{{.*}} = fir.convert [[ADDR33]] : (!fir.ref<!fir.char<1>>) -> !fir.ref<i8>
-!CHECK-DAG: %{{.*}} = fir.embox [[ADDR34]](%{{.*}}) : (!fir.ref<!fir.array<5xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<5xi32>>
-!CHECK-DAG: %{{.*}} = fir.embox [[ADDR35]](%{{.*}}) : (!fir.ref<!fir.array<5x!fir.char<1,5>>>, !fir.shape<1>) -> !fir.box<!fir.array<5x!fir.char<1,5>>>
- print *, x, y, z
- !$omp end parallel
-
-!CHECK-DAG: %{{.*}} = fir.convert [[NEWADDR0]] : (!fir.ref<!fir.char<1>>) -> !fir.ref<i8>
-!CHECK-DAG: %{{.*}} = fir.embox [[NEWADDR1]](%{{.*}}) : (!fir.ref<!fir.array<5xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<5xi32>>
-!CHECK-DAG: %{{.*}} = fir.embox [[NEWADDR2]](%{{.*}}) : (!fir.ref<!fir.array<5x!fir.char<1,5>>>, !fir.shape<1>) -> !fir.box<!fir.array<5x!fir.char<1,5>>>
- print *, x, y, z
-
- end
-end
diff --git a/flang/test/Lower/OpenMP/FIR/threadprivate-commonblock.f90 b/flang/test/Lower/OpenMP/FIR/threadprivate-commonblock.f90
deleted file mode 100644
index 49f592ec8121..000000000000
--- a/flang/test/Lower/OpenMP/FIR/threadprivate-commonblock.f90
+++ /dev/null
@@ -1,91 +0,0 @@
-! This test checks lowering of OpenMP Threadprivate Directive.
-! Test for common block.
-
-!RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp %s -o - | FileCheck %s
-
-module test
- integer:: a
- real :: b(2)
- complex, pointer :: c, d(:)
- character(5) :: e, f(2)
- common /blk/ a, b, c, d, e, f
-
- !$omp threadprivate(/blk/)
-
-!CHECK: fir.global common @blk_(dense<0> : vector<103xi8>) : !fir.array<103xi8>
-
-contains
- subroutine sub()
-!CHECK: [[ADDR0:%.*]] = fir.address_of(@blk_) : !fir.ref<!fir.array<103xi8>>
-!CHECK: [[NEWADDR0:%.*]] = omp.threadprivate [[ADDR0]] : !fir.ref<!fir.array<103xi8>> -> !fir.ref<!fir.array<103xi8>>
-!CHECK-DAG: [[ADDR1:%.*]] = fir.convert [[NEWADDR0]] : (!fir.ref<!fir.array<103xi8>>) -> !fir.ref<!fir.array<?xi8>>
-!CHECK-DAG: [[C0:%.*]] = arith.constant 0 : index
-!CHECK-DAG: [[ADDR2:%.*]] = fir.coordinate_of [[ADDR1]], [[C0]] : (!fir.ref<!fir.array<?xi8>>, index) -> !fir.ref<i8>
-!CHECK-DAG: [[ADDR3:%.*]] = fir.convert [[ADDR2]] : (!fir.ref<i8>) -> !fir.ref<i32>
-!CHECK-DAG: [[ADDR4:%.*]] = fir.convert [[NEWADDR0]] : (!fir.ref<!fir.array<103xi8>>) -> !fir.ref<!fir.array<?xi8>>
-!CHECK-DAG: [[C1:%.*]] = arith.constant 4 : index
-!CHECK-DAG: [[ADDR5:%.*]] = fir.coordinate_of [[ADDR4]], [[C1]] : (!fir.ref<!fir.array<?xi8>>, index) -> !fir.ref<i8>
-!CHECK-DAG: [[ADDR6:%.*]] = fir.convert [[ADDR5]] : (!fir.ref<i8>) -> !fir.ref<!fir.array<2xf32>>
-!CHECK-DAG: [[ADDR7:%.*]] = fir.convert [[NEWADDR0]] : (!fir.ref<!fir.array<103xi8>>) -> !fir.ref<!fir.array<?xi8>>
-!CHECK-DAG: [[C2:%.*]] = arith.constant 16 : index
-!CHECK-DAG: [[ADDR8:%.*]] = fir.coordinate_of [[ADDR7]], [[C2]] : (!fir.ref<!fir.array<?xi8>>, index) -> !fir.ref<i8>
-!CHECK-DAG: [[ADDR9:%.*]] = fir.convert [[ADDR8]] : (!fir.ref<i8>) -> !fir.ref<!fir.box<!fir.ptr<!fir.complex<4>>>>
-!CHECK-DAG: [[ADDR10:%.*]] = fir.convert [[NEWADDR0]] : (!fir.ref<!fir.array<103xi8>>) -> !fir.ref<!fir.array<?xi8>>
-!CHECK-DAG: [[C3:%.*]] = arith.constant 40 : index
-!CHECK-DAG: [[ADDR11:%.*]] = fir.coordinate_of [[ADDR10]], [[C3]] : (!fir.ref<!fir.array<?xi8>>, index) -> !fir.ref<i8>
-!CHECK-DAG: [[ADDR12:%.*]] = fir.convert [[ADDR11]] : (!fir.ref<i8>) -> !fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.complex<4>>>>>
-!CHECK-DAG: [[ADDR13:%.*]] = fir.convert [[NEWADDR0]] : (!fir.ref<!fir.array<103xi8>>) -> !fir.ref<!fir.array<?xi8>>
-!CHECK-DAG: [[C4:%.*]] = arith.constant 88 : index
-!CHECK-DAG: [[ADDR14:%.*]] = fir.coordinate_of [[ADDR13]], [[C4]] : (!fir.ref<!fir.array<?xi8>>, index) -> !fir.ref<i8>
-!CHECK-DAG: [[ADDR15:%.*]] = fir.convert [[ADDR14]] : (!fir.ref<i8>) -> !fir.ref<!fir.char<1,5>>
-!CHECK-DAG: [[ADDR16:%.*]] = fir.convert [[NEWADDR0]] : (!fir.ref<!fir.array<103xi8>>) -> !fir.ref<!fir.array<?xi8>>
-!CHECK-DAG: [[C5:%.*]] = arith.constant 93 : index
-!CHECK-DAG: [[ADDR17:%.*]] = fir.coordinate_of [[ADDR16]], [[C5]] : (!fir.ref<!fir.array<?xi8>>, index) -> !fir.ref<i8>
-!CHECK-DAG: [[ADDR18:%.*]] = fir.convert [[ADDR17]] : (!fir.ref<i8>) -> !fir.ref<!fir.array<2x!fir.char<1,5>>>
-!CHECK-DAG: %{{.*}} = fir.load [[ADDR3]] : !fir.ref<i32>
-!CHECK-DAG: %{{.*}} = fir.embox [[ADDR6]](%{{.*}}) : (!fir.ref<!fir.array<2xf32>>, !fir.shape<1>) -> !fir.box<!fir.array<2xf32>>
-!CHECK-DAG: %{{.*}} = fir.load [[ADDR9]] : !fir.ref<!fir.box<!fir.ptr<!fir.complex<4>>>>
-!CHECK-DAG: %{{.*}} = fir.load [[ADDR12]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.complex<4>>>>>
-!CHECK-DAG: %{{.*}} = fir.convert [[ADDR15]] : (!fir.ref<!fir.char<1,5>>) -> !fir.ref<i8>
-!CHECK-DAG: %{{.*}} = fir.embox [[ADDR18]](%{{.*}}) : (!fir.ref<!fir.array<2x!fir.char<1,5>>>, !fir.shape<1>) -> !fir.box<!fir.array<2x!fir.char<1,5>>>
- print *, a, b, c, d, e, f
-
- !$omp parallel
-!CHECK: [[ADDR77:%.*]] = omp.threadprivate [[ADDR0]] : !fir.ref<!fir.array<103xi8>> -> !fir.ref<!fir.array<103xi8>>
-!CHECK-DAG: [[ADDR78:%.*]] = fir.convert [[ADDR77]] : (!fir.ref<!fir.array<103xi8>>) -> !fir.ref<!fir.array<?xi8>>
-!CHECK-DAG: [[ADDR79:%.*]] = fir.coordinate_of [[ADDR78]], [[C0:%.*]] : (!fir.ref<!fir.array<?xi8>>, index) -> !fir.ref<i8>
-!CHECK-DAG: [[ADDR80:%.*]] = fir.convert [[ADDR79:%.*]] : (!fir.ref<i8>) -> !fir.ref<i32>
-!CHECK-DAG: [[ADDR81:%.*]] = fir.convert [[ADDR77]] : (!fir.ref<!fir.array<103xi8>>) -> !fir.ref<!fir.array<?xi8>>
-!CHECK-DAG: [[ADDR82:%.*]] = fir.coordinate_of [[ADDR81]], [[C1:%.*]] : (!fir.ref<!fir.array<?xi8>>, index) -> !fir.ref<i8>
-!CHECK-DAG: [[ADDR83:%.*]] = fir.convert [[ADDR82:%.*]] : (!fir.ref<i8>) -> !fir.ref<!fir.array<2xf32>>
-!CHECK-DAG: [[ADDR84:%.*]] = fir.convert [[ADDR77]] : (!fir.ref<!fir.array<103xi8>>) -> !fir.ref<!fir.array<?xi8>>
-!CHECK-DAG: [[ADDR85:%.*]] = fir.coordinate_of [[ADDR84]], [[C2:%.*]] : (!fir.ref<!fir.array<?xi8>>, index) -> !fir.ref<i8>
-!CHECK-DAG: [[ADDR86:%.*]] = fir.convert [[ADDR85:%.*]] : (!fir.ref<i8>) -> !fir.ref<!fir.box<!fir.ptr<!fir.complex<4>>>>
-!CHECK-DAG: [[ADDR87:%.*]] = fir.convert [[ADDR77]] : (!fir.ref<!fir.array<103xi8>>) -> !fir.ref<!fir.array<?xi8>>
-!CHECK-DAG: [[ADDR88:%.*]] = fir.coordinate_of [[ADDR87]], [[C3:%.*]] : (!fir.ref<!fir.array<?xi8>>, index) -> !fir.ref<i8>
-!CHECK-DAG: [[ADDR89:%.*]] = fir.convert [[ADDR88:%.*]] : (!fir.ref<i8>) -> !fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.complex<4>>>>>
-!CHECK-DAG: [[ADDR90:%.*]] = fir.convert [[ADDR77]] : (!fir.ref<!fir.array<103xi8>>) -> !fir.ref<!fir.array<?xi8>>
-!CHECK-DAG: [[ADDR91:%.*]] = fir.coordinate_of [[ADDR90]], [[C4:%.*]] : (!fir.ref<!fir.array<?xi8>>, index) -> !fir.ref<i8>
-!CHECK-DAG: [[ADDR92:%.*]] = fir.convert [[ADDR91:%.*]] : (!fir.ref<i8>) -> !fir.ref<!fir.char<1,5>>
-!CHECK-DAG: [[ADDR93:%.*]] = fir.convert [[ADDR77]] : (!fir.ref<!fir.array<103xi8>>) -> !fir.ref<!fir.array<?xi8>>
-!CHECK-DAG: [[ADDR94:%.*]] = fir.coordinate_of [[ADDR93]], [[C5:%.*]] : (!fir.ref<!fir.array<?xi8>>, index) -> !fir.ref<i8>
-!CHECK-DAG: [[ADDR95:%.*]] = fir.convert [[ADDR94:%.*]] : (!fir.ref<i8>) -> !fir.ref<!fir.array<2x!fir.char<1,5>>>
-!CHECK-DAG: %{{.*}} = fir.load [[ADDR80]] : !fir.ref<i32>
-!CHECK-DAG: %{{.*}} = fir.embox [[ADDR83]](%{{.*}}) : (!fir.ref<!fir.array<2xf32>>, !fir.shape<1>) -> !fir.box<!fir.array<2xf32>>
-!CHECK-DAG: %{{.*}} = fir.load [[ADDR86]] : !fir.ref<!fir.box<!fir.ptr<!fir.complex<4>>>>
-!CHECK-DAG: %{{.*}} = fir.load [[ADDR89]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.complex<4>>>>>
-!CHECK-DAG: %{{.*}} = fir.convert [[ADDR92]] : (!fir.ref<!fir.char<1,5>>) -> !fir.ref<i8>
-!CHECK-DAG: %{{.*}} = fir.embox [[ADDR95]](%{{.*}}) : (!fir.ref<!fir.array<2x!fir.char<1,5>>>, !fir.shape<1>) -> !fir.box<!fir.array<2x!fir.char<1,5>>>
- print *, a, b, c, d, e, f
- !$omp end parallel
-
-!CHECK-DAG: %{{.*}} = fir.load [[ADDR3]] : !fir.ref<i32>
-!CHECK-DAG: %{{.*}} = fir.embox [[ADDR6]](%{{.*}}) : (!fir.ref<!fir.array<2xf32>>, !fir.shape<1>) -> !fir.box<!fir.array<2xf32>>
-!CHECK-DAG: %{{.*}} = fir.load [[ADDR9]] : !fir.ref<!fir.box<!fir.ptr<!fir.complex<4>>>>
-!CHECK-DAG: %{{.*}} = fir.load [[ADDR12]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.complex<4>>>>>
-!CHECK-DAG: %{{.*}} = fir.convert [[ADDR15]] : (!fir.ref<!fir.char<1,5>>) -> !fir.ref<i8>
-!CHECK-DAG: %{{.*}} = fir.embox [[ADDR18]](%{{.*}}) : (!fir.ref<!fir.array<2x!fir.char<1,5>>>, !fir.shape<1>) -> !fir.box<!fir.array<2x!fir.char<1,5>>>
- print *, a, b, c, d, e, f
-
- end
-end
diff --git a/flang/test/Lower/OpenMP/FIR/threadprivate-integer-different-kinds.f90 b/flang/test/Lower/OpenMP/FIR/threadprivate-integer-different-kinds.f90
deleted file mode 100644
index 39c77406cc22..000000000000
--- a/flang/test/Lower/OpenMP/FIR/threadprivate-integer-different-kinds.f90
+++ /dev/null
@@ -1,67 +0,0 @@
-! This test checks lowering of OpenMP Threadprivate Directive.
-! Test for variables of different kinds.
-
-!REQUIRES: shell
-!RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp %s -o - | FileCheck %s
-
-program test
- integer, save :: i
- integer(kind=1), save :: i1
- integer(kind=2), save :: i2
- integer(kind=4), save :: i4
- integer(kind=8), save :: i8
- integer(kind=16), save :: i16
-
-!CHECK-DAG: [[ADDR0:%.*]] = fir.address_of(@_QFEi) : !fir.ref<i32>
-!CHECK-DAG: [[NEWADDR0:%.*]] = omp.threadprivate [[ADDR0]] : !fir.ref<i32> -> !fir.ref<i32>
-!CHECK-DAG: [[ADDR1:%.*]] = fir.address_of(@_QFEi1) : !fir.ref<i8>
-!CHECK-DAG: [[NEWADDR1:%.*]] = omp.threadprivate [[ADDR1]] : !fir.ref<i8> -> !fir.ref<i8>
-!CHECK-DAG: [[ADDR2:%.*]] = fir.address_of(@_QFEi16) : !fir.ref<i128>
-!CHECK-DAG: [[NEWADDR2:%.*]] = omp.threadprivate [[ADDR2]] : !fir.ref<i128> -> !fir.ref<i128>
-!CHECK-DAG: [[ADDR3:%.*]] = fir.address_of(@_QFEi2) : !fir.ref<i16>
-!CHECK-DAG: [[NEWADDR3:%.*]] = omp.threadprivate [[ADDR3]] : !fir.ref<i16> -> !fir.ref<i16>
-!CHECK-DAG: [[ADDR4:%.*]] = fir.address_of(@_QFEi4) : !fir.ref<i32>
-!CHECK-DAG: [[NEWADDR4:%.*]] = omp.threadprivate [[ADDR4]] : !fir.ref<i32> -> !fir.ref<i32>
-!CHECK-DAG: [[ADDR5:%.*]] = fir.address_of(@_QFEi8) : !fir.ref<i64>
-!CHECK-DAG: [[NEWADDR5:%.*]] = omp.threadprivate [[ADDR5]] : !fir.ref<i64> -> !fir.ref<i64>
- !$omp threadprivate(i, i1, i2, i4, i8, i16)
-
-!CHECK-DAG: %{{.*}} = fir.load [[NEWADDR0]] : !fir.ref<i32>
-!CHECK-DAG: %{{.*}} = fir.load [[NEWADDR1]] : !fir.ref<i8>
-!CHECK-DAG: %{{.*}} = fir.load [[NEWADDR2]] : !fir.ref<i128>
-!CHECK-DAG: %{{.*}} = fir.load [[NEWADDR3]] : !fir.ref<i16>
-!CHECK-DAG: %{{.*}} = fir.load [[NEWADDR4]] : !fir.ref<i32>
-!CHECK-DAG: %{{.*}} = fir.load [[NEWADDR5]] : !fir.ref<i64>
- print *, i, i1, i2, i4, i8, i16
-
- !$omp parallel
-!CHECK-DAG: [[ADDR39:%.*]] = omp.threadprivate [[ADDR0]] : !fir.ref<i32> -> !fir.ref<i32>
-!CHECK-DAG: [[ADDR40:%.*]] = omp.threadprivate [[ADDR1]] : !fir.ref<i8> -> !fir.ref<i8>
-!CHECK-DAG: [[ADDR41:%.*]] = omp.threadprivate [[ADDR2]] : !fir.ref<i128> -> !fir.ref<i128>
-!CHECK-DAG: [[ADDR42:%.*]] = omp.threadprivate [[ADDR3]] : !fir.ref<i16> -> !fir.ref<i16>
-!CHECK-DAG: [[ADDR43:%.*]] = omp.threadprivate [[ADDR4]] : !fir.ref<i32> -> !fir.ref<i32>
-!CHECK-DAG: [[ADDR44:%.*]] = omp.threadprivate [[ADDR5]] : !fir.ref<i64> -> !fir.ref<i64>
-!CHECK-DAG: %{{.*}} = fir.load [[ADDR39]] : !fir.ref<i32>
-!CHECK-DAG: %{{.*}} = fir.load [[ADDR40]] : !fir.ref<i8>
-!CHECK-DAG: %{{.*}} = fir.load [[ADDR41]] : !fir.ref<i128>
-!CHECK-DAG: %{{.*}} = fir.load [[ADDR42]] : !fir.ref<i16>
-!CHECK-DAG: %{{.*}} = fir.load [[ADDR43]] : !fir.ref<i32>
-!CHECK-DAG: %{{.*}} = fir.load [[ADDR44]] : !fir.ref<i64>
- print *, i, i1, i2, i4, i8, i16
- !$omp end parallel
-
-!CHECK-DAG: %{{.*}} = fir.load [[NEWADDR0]] : !fir.ref<i32>
-!CHECK-DAG: %{{.*}} = fir.load [[NEWADDR1]] : !fir.ref<i8>
-!CHECK-DAG: %{{.*}} = fir.load [[NEWADDR2]] : !fir.ref<i128>
-!CHECK-DAG: %{{.*}} = fir.load [[NEWADDR3]] : !fir.ref<i16>
-!CHECK-DAG: %{{.*}} = fir.load [[NEWADDR4]] : !fir.ref<i32>
-!CHECK-DAG: %{{.*}} = fir.load [[NEWADDR5]] : !fir.ref<i64>
- print *, i, i1, i2, i4, i8, i16
-
-!CHECK-DAG: fir.global internal @_QFEi : i32 {
-!CHECK-DAG: fir.global internal @_QFEi1 : i8 {
-!CHECK-DAG: fir.global internal @_QFEi16 : i128 {
-!CHECK-DAG: fir.global internal @_QFEi2 : i16 {
-!CHECK-DAG: fir.global internal @_QFEi4 : i32 {
-!CHECK-DAG: fir.global internal @_QFEi8 : i64 {
-end
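
For orientation, a minimal sketch (hypothetical, not taken from this patch) of the pattern the deleted test exercised: a SAVEd integer of kind K lowers to an integer global of width 8*K bits, and every reference is routed through omp.threadprivate.

    program tp_kinds_sketch
      integer(kind=2), save :: i2   ! accessed as !fir.ref<i16>
      integer(kind=8), save :: i8   ! accessed as !fir.ref<i64>
      !$omp threadprivate(i2, i8)
      print *, i2, i8
    end program tp_kinds_sketch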
diff --git a/flang/test/Lower/OpenMP/FIR/threadprivate-non-global.f90 b/flang/test/Lower/OpenMP/FIR/threadprivate-non-global.f90
deleted file mode 100644
index b089693b2097..000000000000
--- a/flang/test/Lower/OpenMP/FIR/threadprivate-non-global.f90
+++ /dev/null
@@ -1,91 +0,0 @@
-! This test checks lowering of OpenMP Threadprivate Directive.
-! Test for non-character, non-SAVEd, non-initialized scalars, with or without
-! the allocatable or pointer attribute, in the main program.
-
-!RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp %s -o - | FileCheck %s
-
-program test
- integer :: x
- real :: y
- logical :: z
- complex :: w
- integer, pointer :: a
- real, allocatable :: b
-
-!CHECK-DAG: [[ADDR0:%.*]] = fir.address_of(@_QFEa) : !fir.ref<!fir.box<!fir.ptr<i32>>>
-!CHECK-DAG: [[NEWADDR0:%.*]] = omp.threadprivate [[ADDR0]] : !fir.ref<!fir.box<!fir.ptr<i32>>> -> !fir.ref<!fir.box<!fir.ptr<i32>>>
-!CHECK-DAG: [[ADDR1:%.*]] = fir.address_of(@_QFEb) : !fir.ref<!fir.box<!fir.heap<f32>>>
-!CHECK-DAG: [[NEWADDR1:%.*]] = omp.threadprivate [[ADDR1]] : !fir.ref<!fir.box<!fir.heap<f32>>> -> !fir.ref<!fir.box<!fir.heap<f32>>>
-!CHECK-DAG: [[ADDR2:%.*]] = fir.address_of(@_QFEw) : !fir.ref<!fir.complex<4>>
-!CHECK-DAG: [[NEWADDR2:%.*]] = omp.threadprivate [[ADDR2]] : !fir.ref<!fir.complex<4>> -> !fir.ref<!fir.complex<4>>
-!CHECK-DAG: [[ADDR3:%.*]] = fir.address_of(@_QFEx) : !fir.ref<i32>
-!CHECK-DAG: [[NEWADDR3:%.*]] = omp.threadprivate [[ADDR3]] : !fir.ref<i32> -> !fir.ref<i32>
-!CHECK-DAG: [[ADDR4:%.*]] = fir.address_of(@_QFEy) : !fir.ref<f32>
-!CHECK-DAG: [[NEWADDR4:%.*]] = omp.threadprivate [[ADDR4]] : !fir.ref<f32> -> !fir.ref<f32>
-!CHECK-DAG: [[ADDR5:%.*]] = fir.address_of(@_QFEz) : !fir.ref<!fir.logical<4>>
-!CHECK-DAG: [[NEWADDR5:%.*]] = omp.threadprivate [[ADDR5]] : !fir.ref<!fir.logical<4>> -> !fir.ref<!fir.logical<4>>
- !$omp threadprivate(x, y, z, w, a, b)
-
- call sub(a, b)
-
-!CHECK-DAG: %{{.*}} = fir.load [[NEWADDR3]] : !fir.ref<i32>
-!CHECK-DAG: %{{.*}} = fir.load [[NEWADDR4]] : !fir.ref<f32>
-!CHECK-DAG: %{{.*}} = fir.load [[NEWADDR5]] : !fir.ref<!fir.logical<4>>
-!CHECK-DAG: %{{.*}} = fir.load [[NEWADDR2]] : !fir.ref<!fir.complex<4>>
-!CHECK-DAG: %{{.*}} = fir.load [[NEWADDR0]] : !fir.ref<!fir.box<!fir.ptr<i32>>>
-!CHECK-DAG: %{{.*}} = fir.load [[NEWADDR1]] : !fir.ref<!fir.box<!fir.heap<f32>>>
- print *, x, y, z, w, a, b
-
- !$omp parallel
-!CHECK-DAG: [[ADDR68:%.*]] = omp.threadprivate [[ADDR0]] : !fir.ref<!fir.box<!fir.ptr<i32>>> -> !fir.ref<!fir.box<!fir.ptr<i32>>>
-!CHECK-DAG: [[ADDR69:%.*]] = omp.threadprivate [[ADDR1]] : !fir.ref<!fir.box<!fir.heap<f32>>> -> !fir.ref<!fir.box<!fir.heap<f32>>>
-!CHECK-DAG: [[ADDR70:%.*]] = omp.threadprivate [[ADDR2]] : !fir.ref<!fir.complex<4>> -> !fir.ref<!fir.complex<4>>
-!CHECK-DAG: [[ADDR71:%.*]] = omp.threadprivate [[ADDR3]] : !fir.ref<i32> -> !fir.ref<i32>
-!CHECK-DAG: [[ADDR72:%.*]] = omp.threadprivate [[ADDR4]] : !fir.ref<f32> -> !fir.ref<f32>
-!CHECK-DAG: [[ADDR73:%.*]] = omp.threadprivate [[ADDR5]] : !fir.ref<!fir.logical<4>> -> !fir.ref<!fir.logical<4>>
-!CHECK-DAG: %{{.*}} = fir.load [[ADDR71]] : !fir.ref<i32>
-!CHECK-DAG: %{{.*}} = fir.load [[ADDR72]] : !fir.ref<f32>
-!CHECK-DAG: %{{.*}} = fir.load [[ADDR73]] : !fir.ref<!fir.logical<4>>
-!CHECK-DAG: %{{.*}} = fir.load [[ADDR70]] : !fir.ref<!fir.complex<4>>
-!CHECK-DAG: %{{.*}} = fir.load [[ADDR68]] : !fir.ref<!fir.box<!fir.ptr<i32>>>
-!CHECK-DAG: %{{.*}} = fir.load [[ADDR69]] : !fir.ref<!fir.box<!fir.heap<f32>>>
- print *, x, y, z, w, a, b
- !$omp end parallel
-
-!CHECK-DAG: %{{.*}} = fir.load [[NEWADDR3]] : !fir.ref<i32>
-!CHECK-DAG: %{{.*}} = fir.load [[NEWADDR4]] : !fir.ref<f32>
-!CHECK-DAG: %{{.*}} = fir.load [[NEWADDR5]] : !fir.ref<!fir.logical<4>>
-!CHECK-DAG: %{{.*}} = fir.load [[NEWADDR2]] : !fir.ref<!fir.complex<4>>
-!CHECK-DAG: %{{.*}} = fir.load [[NEWADDR0]] : !fir.ref<!fir.box<!fir.ptr<i32>>>
-!CHECK-DAG: %{{.*}} = fir.load [[NEWADDR1]] : !fir.ref<!fir.box<!fir.heap<f32>>>
- print *, x, y, z, w, a, b
-
-!CHECK: return
-
-!CHECK-DAG: fir.global internal @_QFEa : !fir.box<!fir.ptr<i32>> {
-!CHECK-DAG: [[Z0:%.*]] = fir.zero_bits !fir.ptr<i32>
-!CHECK-DAG: [[E0:%.*]] = fir.embox [[Z0]] : (!fir.ptr<i32>) -> !fir.box<!fir.ptr<i32>>
-!CHECK-DAG: fir.has_value [[E0]] : !fir.box<!fir.ptr<i32>>
-!CHECK-DAG: }
-!CHECK-DAG: fir.global internal @_QFEb : !fir.box<!fir.heap<f32>> {
-!CHECK-DAG: [[Z1:%.*]] = fir.zero_bits !fir.heap<f32>
-!CHECK-DAG: [[E1:%.*]] = fir.embox [[Z1]] : (!fir.heap<f32>) -> !fir.box<!fir.heap<f32>>
-!CHECK-DAG: fir.has_value [[E1]] : !fir.box<!fir.heap<f32>>
-!CHECK-DAG: }
-!CHECK-DAG: fir.global internal @_QFEw : !fir.complex<4> {
-!CHECK-DAG: [[Z2:%.*]] = fir.undefined !fir.complex<4>
-!CHECK-DAG: fir.has_value [[Z2]] : !fir.complex<4>
-!CHECK-DAG: }
-!CHECK-DAG: fir.global internal @_QFEx : i32 {
-!CHECK-DAG: [[Z3:%.*]] = fir.undefined i32
-!CHECK-DAG: fir.has_value [[Z3]] : i32
-!CHECK-DAG: }
-!CHECK-DAG: fir.global internal @_QFEy : f32 {
-!CHECK-DAG: [[Z4:%.*]] = fir.undefined f32
-!CHECK-DAG: fir.has_value [[Z4]] : f32
-!CHECK-DAG: }
-!CHECK-DAG: fir.global internal @_QFEz : !fir.logical<4> {
-!CHECK-DAG: [[Z5:%.*]] = fir.undefined !fir.logical<4>
-!CHECK-DAG: fir.has_value [[Z5]] : !fir.logical<4>
-!CHECK-DAG: }
-end
diff --git a/flang/test/Lower/OpenMP/FIR/threadprivate-pointer-allocatable.f90 b/flang/test/Lower/OpenMP/FIR/threadprivate-pointer-allocatable.f90
deleted file mode 100644
index fd33c20f9f93..000000000000
--- a/flang/test/Lower/OpenMP/FIR/threadprivate-pointer-allocatable.f90
+++ /dev/null
@@ -1,51 +0,0 @@
-! This test checks lowering of OpenMP Threadprivate Directive.
-! Test for allocatable and pointer variables.
-
-!RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp %s -o - | FileCheck %s
-
-module test
- integer, pointer :: x(:), m
- real, allocatable :: y(:), n
-
- !$omp threadprivate(x, y, m, n)
-
-!CHECK-DAG: fir.global @_QMtestEm : !fir.box<!fir.ptr<i32>> {
-!CHECK-DAG: fir.global @_QMtestEn : !fir.box<!fir.heap<f32>> {
-!CHECK-DAG: fir.global @_QMtestEx : !fir.box<!fir.ptr<!fir.array<?xi32>>> {
-!CHECK-DAG: fir.global @_QMtestEy : !fir.box<!fir.heap<!fir.array<?xf32>>> {
-
-contains
- subroutine sub()
-!CHECK-DAG: [[ADDR0:%.*]] = fir.address_of(@_QMtestEm) : !fir.ref<!fir.box<!fir.ptr<i32>>>
-!CHECK-DAG: [[NEWADDR0:%.*]] = omp.threadprivate [[ADDR0]] : !fir.ref<!fir.box<!fir.ptr<i32>>> -> !fir.ref<!fir.box<!fir.ptr<i32>>>
-!CHECK-DAG: [[ADDR1:%.*]] = fir.address_of(@_QMtestEn) : !fir.ref<!fir.box<!fir.heap<f32>>>
-!CHECK-DAG: [[NEWADDR1:%.*]] = omp.threadprivate [[ADDR1]] : !fir.ref<!fir.box<!fir.heap<f32>>> -> !fir.ref<!fir.box<!fir.heap<f32>>>
-!CHECK-DAG: [[ADDR2:%.*]] = fir.address_of(@_QMtestEx) : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
-!CHECK-DAG: [[NEWADDR2:%.*]] = omp.threadprivate [[ADDR2]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>> -> !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
-!CHECK-DAG: [[ADDR3:%.*]] = fir.address_of(@_QMtestEy) : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
-!CHECK-DAG: [[NEWADDR3:%.*]] = omp.threadprivate [[ADDR3]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
-!CHECK-DAG: %{{.*}} = fir.load [[NEWADDR2]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
-!CHECK-DAG: %{{.*}} = fir.load [[NEWADDR3]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
-!CHECK-DAG: %{{.*}} = fir.load [[NEWADDR0]] : !fir.ref<!fir.box<!fir.ptr<i32>>>
-!CHECK-DAG: %{{.*}} = fir.load [[NEWADDR1]] : !fir.ref<!fir.box<!fir.heap<f32>>>
- print *, x, y, m, n
-
- !$omp parallel
-!CHECK-DAG: [[ADDR54:%.*]] = omp.threadprivate [[ADDR0]] : !fir.ref<!fir.box<!fir.ptr<i32>>> -> !fir.ref<!fir.box<!fir.ptr<i32>>>
-!CHECK-DAG: [[ADDR55:%.*]] = omp.threadprivate [[ADDR1]] : !fir.ref<!fir.box<!fir.heap<f32>>> -> !fir.ref<!fir.box<!fir.heap<f32>>>
-!CHECK-DAG: [[ADDR56:%.*]] = omp.threadprivate [[ADDR2]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>> -> !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
-!CHECK-DAG: [[ADDR57:%.*]] = omp.threadprivate [[ADDR3]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
-!CHECK-DAG: %{{.*}} = fir.load [[ADDR56]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
-!CHECK-DAG: %{{.*}} = fir.load [[ADDR57]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
-!CHECK-DAG: %{{.*}} = fir.load [[ADDR54]] : !fir.ref<!fir.box<!fir.ptr<i32>>>
-!CHECK-DAG: %{{.*}} = fir.load [[ADDR55]] : !fir.ref<!fir.box<!fir.heap<f32>>>
- print *, x, y, m, n
- !$omp end parallel
-
-!CHECK-DAG: %{{.*}} = fir.load [[NEWADDR2]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
-!CHECK-DAG: %{{.*}} = fir.load [[NEWADDR3]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
-!CHECK-DAG: %{{.*}} = fir.load [[NEWADDR0]] : !fir.ref<!fir.box<!fir.ptr<i32>>>
-!CHECK-DAG: %{{.*}} = fir.load [[NEWADDR1]] : !fir.ref<!fir.box<!fir.heap<f32>>>
- print *, x, y, m, n
- end
-end
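
A minimal sketch (hypothetical names) of the boxed lowering the deleted test pinned down: allocatable and pointer module variables become fir.global descriptors, and threadprivate references load through the box.

    module tp_box_sketch
      integer, pointer :: p        ! fir.global of !fir.box<!fir.ptr<i32>>
      real, allocatable :: a(:)    ! fir.global of !fir.box<!fir.heap<!fir.array<?xf32>>>
      !$omp threadprivate(p, a)
    end module tp_box_sketch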
diff --git a/flang/test/Lower/OpenMP/FIR/threadprivate-real-logical-complex-derivedtype.f90 b/flang/test/Lower/OpenMP/FIR/threadprivate-real-logical-complex-derivedtype.f90
deleted file mode 100644
index 749fe5c8bf54..000000000000
--- a/flang/test/Lower/OpenMP/FIR/threadprivate-real-logical-complex-derivedtype.f90
+++ /dev/null
@@ -1,58 +0,0 @@
-! This test checks lowering of OpenMP Threadprivate Directive.
-! Test for real, logical, complex, and derived-type variables.
-
-!RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp %s -o - | FileCheck %s
-
-module test
- type my_type
- integer :: t_i
- real :: t_arr(5)
- end type my_type
- real :: x
- complex :: y
- logical :: z
- type(my_type) :: t
-
- !$omp threadprivate(x, y, z, t)
-
-!CHECK-DAG: fir.global @_QMtestEt : !fir.type<_QMtestTmy_type{t_i:i32,t_arr:!fir.array<5xf32>}> {
-!CHECK-DAG: fir.global @_QMtestEx : f32 {
-!CHECK-DAG: fir.global @_QMtestEy : !fir.complex<4> {
-!CHECK-DAG: fir.global @_QMtestEz : !fir.logical<4> {
-
-contains
- subroutine sub()
-!CHECK-DAG: [[ADDR0:%.*]] = fir.address_of(@_QMtestEt) : !fir.ref<!fir.type<_QMtestTmy_type{t_i:i32,t_arr:!fir.array<5xf32>}>>
-!CHECK-DAG: [[NEWADDR0:%.*]] = omp.threadprivate [[ADDR0]] : !fir.ref<!fir.type<_QMtestTmy_type{t_i:i32,t_arr:!fir.array<5xf32>}>> -> !fir.ref<!fir.type<_QMtestTmy_type{t_i:i32,t_arr:!fir.array<5xf32>}>>
-!CHECK-DAG: [[ADDR1:%.*]] = fir.address_of(@_QMtestEx) : !fir.ref<f32>
-!CHECK-DAG: [[NEWADDR1:%.*]] = omp.threadprivate [[ADDR1]] : !fir.ref<f32> -> !fir.ref<f32>
-!CHECK-DAG: [[ADDR2:%.*]] = fir.address_of(@_QMtestEy) : !fir.ref<!fir.complex<4>>
-!CHECK-DAG: [[NEWADDR2:%.*]] = omp.threadprivate [[ADDR2]] : !fir.ref<!fir.complex<4>> -> !fir.ref<!fir.complex<4>>
-!CHECK-DAG: [[ADDR3:%.*]] = fir.address_of(@_QMtestEz) : !fir.ref<!fir.logical<4>>
-!CHECK-DAG: [[NEWADDR3:%.*]] = omp.threadprivate [[ADDR3]] : !fir.ref<!fir.logical<4>> -> !fir.ref<!fir.logical<4>>
-!CHECK-DAG: %{{.*}} = fir.load [[NEWADDR1]] : !fir.ref<f32>
-!CHECK-DAG: %{{.*}} = fir.load [[NEWADDR2]] : !fir.ref<!fir.complex<4>>
-!CHECK-DAG: %{{.*}} = fir.load [[NEWADDR3]] : !fir.ref<!fir.logical<4>>
-!CHECK-DAG: %{{.*}} = fir.coordinate_of [[NEWADDR0]]
- print *, x, y, z, t%t_i
-
- !$omp parallel
-!CHECK-DAG: [[ADDR38:%.*]] = omp.threadprivate [[ADDR0]] : !fir.ref<!fir.type<_QMtestTmy_type{t_i:i32,t_arr:!fir.array<5xf32>}>> -> !fir.ref<!fir.type<_QMtestTmy_type{t_i:i32,t_arr:!fir.array<5xf32>}>>
-!CHECK-DAG: [[ADDR39:%.*]] = omp.threadprivate [[ADDR1]] : !fir.ref<f32> -> !fir.ref<f32>
-!CHECK-DAG: [[ADDR40:%.*]] = omp.threadprivate [[ADDR2]] : !fir.ref<!fir.complex<4>> -> !fir.ref<!fir.complex<4>>
-!CHECK-DAG: [[ADDR41:%.*]] = omp.threadprivate [[ADDR3]] : !fir.ref<!fir.logical<4>> -> !fir.ref<!fir.logical<4>>
-!CHECK-DAG: %{{.*}} = fir.load [[ADDR39]] : !fir.ref<f32>
-!CHECK-DAG: %{{.*}} = fir.load [[ADDR40]] : !fir.ref<!fir.complex<4>>
-!CHECK-DAG: %{{.*}} = fir.load [[ADDR41]] : !fir.ref<!fir.logical<4>>
-!CHECK-DAG: %{{.*}} = fir.coordinate_of [[ADDR38]]
- print *, x, y, z, t%t_i
- !$omp end parallel
-
-!CHECK-DAG: %{{.*}} = fir.load [[NEWADDR1]] : !fir.ref<f32>
-!CHECK-DAG: %{{.*}} = fir.load [[NEWADDR2]] : !fir.ref<!fir.complex<4>>
-!CHECK-DAG: %{{.*}} = fir.load [[NEWADDR3]] : !fir.ref<!fir.logical<4>>
-!CHECK-DAG: %{{.*}} = fir.coordinate_of [[NEWADDR0]]
- print *, x, y, z, t%t_i
-
- end
-end
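
For reference, a minimal sketch (hypothetical) of the derived-type case: the whole type(my_type) variable becomes a single threadprivate global, and component accesses go through fir.coordinate_of on the privatized address.

    module tp_dt_sketch
      type :: my_type
        integer :: t_i
      end type my_type
      type(my_type) :: t
      !$omp threadprivate(t)
    end module tp_dt_sketch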
diff --git a/flang/test/Lower/OpenMP/FIR/threadprivate-use-association-2.f90 b/flang/test/Lower/OpenMP/FIR/threadprivate-use-association-2.f90
deleted file mode 100644
index 6db5735c21f1..000000000000
--- a/flang/test/Lower/OpenMP/FIR/threadprivate-use-association-2.f90
+++ /dev/null
@@ -1,39 +0,0 @@
-! This test checks lowering of OpenMP Threadprivate Directive.
-! Test for a threadprivate variable used twice through use association.
-
-!RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp %s -o - | FileCheck %s
-!RUN: bbc -emit-fir -hlfir=false -fopenmp %s -o - | FileCheck %s
-
-! CHECK-LABEL: fir.global @_QMmEx : i32
-module m
- integer :: x
- !$omp threadprivate(x)
-end
-
-! CHECK-LABEL: func.func @_QMm2Ptest() {
-! CHECK: %[[VAL_0:.*]] = fir.address_of(@_QMmEx) : !fir.ref<i32>
-! CHECK: %[[VAL_1:.*]] = omp.threadprivate %[[VAL_0]] : !fir.ref<i32> -> !fir.ref<i32>
-! CHECK: fir.call @_QPbar(%[[VAL_1]]) {{.*}}: (!fir.ref<i32>) -> ()
-! CHECK: return
-! CHECK: }
-!
-! CHECK-LABEL: func.func private @_QMm2FtestPinternal_test() {{.*}} {
-! CHECK: %[[VAL_0:.*]] = fir.address_of(@_QMmEx) : !fir.ref<i32>
-! CHECK: %[[VAL_1:.*]] = omp.threadprivate %[[VAL_0]] : !fir.ref<i32> -> !fir.ref<i32>
-! CHECK: fir.call @_QPbar(%[[VAL_1]]) {{.*}}: (!fir.ref<i32>) -> ()
-! CHECK: return
-! CHECK: }
-
-module m2
- use m
- contains
- subroutine test()
- use m
- call bar(x)
- contains
- subroutine internal_test()
- use m
- call bar(x)
- end
- end
-end
diff --git a/flang/test/Lower/OpenMP/FIR/threadprivate-use-association.f90 b/flang/test/Lower/OpenMP/FIR/threadprivate-use-association.f90
deleted file mode 100644
index 685237430a1c..000000000000
--- a/flang/test/Lower/OpenMP/FIR/threadprivate-use-association.f90
+++ /dev/null
@@ -1,74 +0,0 @@
-! This test checks lowering of OpenMP Threadprivate Directive.
-! Test for a threadprivate variable in use association.
-
-!RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp %s -o - | FileCheck %s
-
-!CHECK-DAG: fir.global common @blk_(dense<0> : vector<24xi8>) : !fir.array<24xi8>
-!CHECK-DAG: fir.global @_QMtestEy : f32 {
-
-module test
- integer :: x
- real :: y, z(5)
- common /blk/ x, z
-
- !$omp threadprivate(y, /blk/)
-
-contains
- subroutine sub()
-! CHECK-LABEL: @_QMtestPsub
-!CHECK-DAG: [[ADDR0:%.*]] = fir.address_of(@blk_) : !fir.ref<!fir.array<24xi8>>
-!CHECK-DAG: [[NEWADDR0:%.*]] = omp.threadprivate [[ADDR0]] : !fir.ref<!fir.array<24xi8>> -> !fir.ref<!fir.array<24xi8>>
-!CHECK-DAG: [[ADDR1:%.*]] = fir.address_of(@_QMtestEy) : !fir.ref<f32>
-!CHECK-DAG: [[NEWADDR1:%.*]] = omp.threadprivate [[ADDR1]] : !fir.ref<f32> -> !fir.ref<f32>
-
- !$omp parallel
-!CHECK-DAG: [[ADDR2:%.*]] = omp.threadprivate [[ADDR0]] : !fir.ref<!fir.array<24xi8>> -> !fir.ref<!fir.array<24xi8>>
-!CHECK-DAG: [[ADDR3:%.*]] = omp.threadprivate [[ADDR1]] : !fir.ref<f32> -> !fir.ref<f32>
-!CHECK-DAG: [[ADDR4:%.*]] = fir.convert [[ADDR2]] : (!fir.ref<!fir.array<24xi8>>) -> !fir.ref<!fir.array<?xi8>>
-!CHECK-DAG: [[ADDR5:%.*]] = fir.coordinate_of [[ADDR4]], %{{.*}} : (!fir.ref<!fir.array<?xi8>>, index) -> !fir.ref<i8>
-!CHECK-DAG: [[ADDR6:%.*]] = fir.convert [[ADDR5:%.*]] : (!fir.ref<i8>) -> !fir.ref<i32>
-!CHECK-DAG: [[ADDR7:%.*]] = fir.convert [[ADDR2]] : (!fir.ref<!fir.array<24xi8>>) -> !fir.ref<!fir.array<?xi8>>
-!CHECK-DAG: [[ADDR8:%.*]] = fir.coordinate_of [[ADDR7]], %{{.*}} : (!fir.ref<!fir.array<?xi8>>, index) -> !fir.ref<i8>
-!CHECK-DAG: [[ADDR9:%.*]] = fir.convert [[ADDR8:%.*]] : (!fir.ref<i8>) -> !fir.ref<!fir.array<5xf32>>
-!CHECK-DAG: %{{.*}} = fir.load [[ADDR6]] : !fir.ref<i32>
-!CHECK-DAG: %{{.*}} = fir.load [[ADDR3]] : !fir.ref<f32>
-!CHECK-DAG: %{{.*}} = fir.embox [[ADDR9]](%{{.*}}) : (!fir.ref<!fir.array<5xf32>>, !fir.shape<1>) -> !fir.box<!fir.array<5xf32>>
- print *, x, y, z
- !$omp end parallel
- end
-end
-
-program main
- use test
- integer :: x1
- real :: z1(5)
- common /blk/ x1, z1
-
- !$omp threadprivate(/blk/)
-
- call sub()
-
-! CHECK-LABEL: @_QQmain()
-!CHECK-DAG: [[ADDR0:%.*]] = fir.address_of(@blk_) : !fir.ref<!fir.array<24xi8>>
-!CHECK-DAG: [[NEWADDR0:%.*]] = omp.threadprivate [[ADDR0]] : !fir.ref<!fir.array<24xi8>> -> !fir.ref<!fir.array<24xi8>>
-!CHECK-DAG: [[ADDR1:%.*]] = fir.address_of(@blk_) : !fir.ref<!fir.array<24xi8>>
-!CHECK-DAG: [[NEWADDR1:%.*]] = omp.threadprivate [[ADDR1]] : !fir.ref<!fir.array<24xi8>> -> !fir.ref<!fir.array<24xi8>>
-!CHECK-DAG: [[ADDR2:%.*]] = fir.address_of(@_QMtestEy) : !fir.ref<f32>
-!CHECK-DAG: [[NEWADDR2:%.*]] = omp.threadprivate [[ADDR2]] : !fir.ref<f32> -> !fir.ref<f32>
-
- !$omp parallel
-!CHECK-DAG: [[ADDR4:%.*]] = omp.threadprivate [[ADDR1]] : !fir.ref<!fir.array<24xi8>> -> !fir.ref<!fir.array<24xi8>>
-!CHECK-DAG: [[ADDR5:%.*]] = omp.threadprivate [[ADDR2]] : !fir.ref<f32> -> !fir.ref<f32>
-!CHECK-DAG: [[ADDR6:%.*]] = fir.convert [[ADDR4]] : (!fir.ref<!fir.array<24xi8>>) -> !fir.ref<!fir.array<?xi8>>
-!CHECK-DAG: [[ADDR7:%.*]] = fir.coordinate_of [[ADDR6]], %{{.*}} : (!fir.ref<!fir.array<?xi8>>, index) -> !fir.ref<i8>
-!CHECK-DAG: [[ADDR8:%.*]] = fir.convert [[ADDR7:%.*]] : (!fir.ref<i8>) -> !fir.ref<i32>
-!CHECK-DAG: [[ADDR9:%.*]] = fir.convert [[ADDR4]] : (!fir.ref<!fir.array<24xi8>>) -> !fir.ref<!fir.array<?xi8>>
-!CHECK-DAG: [[ADDR10:%.*]] = fir.coordinate_of [[ADDR9]], %{{.*}} : (!fir.ref<!fir.array<?xi8>>, index) -> !fir.ref<i8>
-!CHECK-DAG: [[ADDR11:%.*]] = fir.convert [[ADDR10:%.*]] : (!fir.ref<i8>) -> !fir.ref<!fir.array<5xf32>>
-!CHECK-DAG: %{{.*}} = fir.load [[ADDR8]] : !fir.ref<i32>
-!CHECK-DAG: %{{.*}} = fir.load [[ADDR5]] : !fir.ref<f32>
-!CHECK-DAG: %{{.*}} = fir.embox [[ADDR11]](%{{.*}}) : (!fir.ref<!fir.array<5xf32>>, !fir.shape<1>) -> !fir.box<!fir.array<5xf32>>
- print *, x1, y, z1
- !$omp end parallel
-
-end
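
A minimal sketch (hypothetical) of the common-block case this test exercised: marking /blk/ threadprivate privatizes the whole block, and members are then addressed by fir.coordinate_of into the raw byte array.

    subroutine tp_common_sketch()
      integer :: x
      real :: z(5)
      common /blk/ x, z
      !$omp threadprivate(/blk/)
      !$omp parallel
      print *, x, z
      !$omp end parallel
    end subroutine tp_common_sketch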
diff --git a/flang/test/Lower/OpenMP/FIR/unstructured.f90 b/flang/test/Lower/OpenMP/FIR/unstructured.f90
deleted file mode 100644
index 6d1c9aab1464..000000000000
--- a/flang/test/Lower/OpenMP/FIR/unstructured.f90
+++ /dev/null
@@ -1,365 +0,0 @@
-! Test unstructured code adjacent to and inside OpenMP constructs.
-
-! RUN: bbc %s -fopenmp -emit-fir -hlfir=false -o "-" | FileCheck %s
-
-! CHECK-LABEL: func @_QPss1{{.*}} {
-! CHECK: br ^bb1
-! CHECK: ^bb1: // 2 preds: ^bb0, ^bb4
-! CHECK: cond_br %{{[0-9]*}}, ^bb2, ^bb5
-! CHECK: ^bb2: // pred: ^bb1
-! CHECK: cond_br %{{[0-9]*}}, ^bb3, ^bb4
-! CHECK: ^bb4: // pred: ^bb2
-! CHECK: fir.call @_FortranAioBeginExternalListOutput
-! CHECK: br ^bb1
-! CHECK: ^bb5: // 2 preds: ^bb1, ^bb3
-! CHECK: omp.master {
-! CHECK: @_FortranAioBeginExternalListOutput
-! CHECK: omp.terminator
-! CHECK: }
-! CHECK: @_FortranAioBeginExternalListOutput
-! CHECK: }
-subroutine ss1(n) ! unstructured code followed by a structured OpenMP construct
- do i = 1, 3
- if (i .eq. n) exit
- print*, 'ss1-A', i
- enddo
- !$omp master
- print*, 'ss1-B', i
- !$omp end master
- print*
-end
-
-! CHECK-LABEL: func @_QPss2{{.*}} {
-! CHECK: omp.master {
-! CHECK: @_FortranAioBeginExternalListOutput
-! CHECK: br ^bb1
-! CHECK: ^bb1: // 2 preds: ^bb0, ^bb4
-! CHECK: cond_br %{{[0-9]*}}, ^bb2, ^bb5
-! CHECK: ^bb2: // pred: ^bb1
-! CHECK: cond_br %{{[0-9]*}}, ^bb3, ^bb4
-! CHECK: ^bb3: // pred: ^bb2
-! CHECK: @_FortranAioBeginExternalListOutput
-! CHECK: br ^bb1
-! CHECK: ^bb5: // 2 preds: ^bb1, ^bb3
-! CHECK: omp.terminator
-! CHECK: }
-! CHECK: @_FortranAioBeginExternalListOutput
-! CHECK: @_FortranAioBeginExternalListOutput
-! CHECK: }
-subroutine ss2(n) ! unstructured OpenMP construct; loop exit inside construct
- !$omp master
- print*, 'ss2-A', n
- do i = 1, 3
- if (i .eq. n) exit
- print*, 'ss2-B', i
- enddo
- !$omp end master
- print*, 'ss2-C', i
- print*
-end
-
-! CHECK-LABEL: func @_QPss3{{.*}} {
-! CHECK: omp.parallel {
-! CHECK: %[[ALLOCA_K:.*]] = fir.alloca i32 {bindc_name = "k", pinned}
-! CHECK: %[[ALLOCA_1:.*]] = fir.alloca i32 {{{.*}}, pinned}
-! CHECK: %[[ALLOCA_2:.*]] = fir.alloca i32 {{{.*}}, pinned}
-! CHECK: br ^bb1
-! CHECK: ^bb1: // 2 preds: ^bb0, ^bb3
-! CHECK: cond_br %{{[0-9]*}}, ^bb2, ^bb4
-! CHECK: ^bb2: // pred: ^bb1
-! CHECK: omp.wsloop {
-! CHECK: omp.loop_nest (%[[ARG1:.*]]) : {{.*}} {
-! CHECK: fir.store %[[ARG1]] to %[[ALLOCA_2]] : !fir.ref<i32>
-! CHECK: @_FortranAioBeginExternalListOutput
-! CHECK: %[[LOAD_1:.*]] = fir.load %[[ALLOCA_2]] : !fir.ref<i32>
-! CHECK: @_FortranAioOutputInteger32(%{{.*}}, %[[LOAD_1]])
-! CHECK: omp.yield
-! CHECK: }
-! CHECK: omp.terminator
-! CHECK: }
-! CHECK: omp.wsloop {
-! CHECK: omp.loop_nest (%[[ARG2:.*]]) : {{.*}} {
-! CHECK: fir.store %[[ARG2]] to %[[ALLOCA_1]] : !fir.ref<i32>
-! CHECK: br ^bb1
-! CHECK: ^bb2: // 2 preds: ^bb1, ^bb5
-! CHECK: cond_br %{{[0-9]*}}, ^bb3, ^bb6
-! CHECK: ^bb3: // pred: ^bb2
-! CHECK: cond_br %{{[0-9]*}}, ^bb4, ^bb5
-! CHECK: ^bb4: // pred: ^bb3
-! CHECK: @_FortranAioBeginExternalListOutput
-! CHECK: %[[LOAD_2:.*]] = fir.load %[[ALLOCA_K]] : !fir.ref<i32>
-! CHECK: @_FortranAioOutputInteger32(%{{.*}}, %[[LOAD_2]])
-! CHECK: br ^bb2
-! CHECK: ^bb6: // 2 preds: ^bb2, ^bb4
-! CHECK: omp.yield
-! CHECK: }
-! CHECK: omp.terminator
-! CHECK: }
-! CHECK: br ^bb1
-! CHECK: ^bb4: // pred: ^bb1
-! CHECK: omp.terminator
-! CHECK: }
-! CHECK: }
-subroutine ss3(n) ! nested unstructured OpenMP constructs
- !$omp parallel
- do i = 1, 3
- !$omp do
- do k = 1, 3
- print*, 'ss3-A', k
- enddo
- !$omp end do
- !$omp do
- do j = 1, 3
- do k = 1, 3
- if (k .eq. n) exit
- print*, 'ss3-B', k
- enddo
- enddo
- !$omp end do
- enddo
- !$omp end parallel
-end
-
-! CHECK-LABEL: func @_QPss4{{.*}} {
-! CHECK: omp.parallel {
-! CHECK: %[[ALLOCA:.*]] = fir.alloca i32 {{{.*}}, pinned}
-! CHECK: omp.wsloop {
-! CHECK: omp.loop_nest (%[[ARG:.*]]) : {{.*}} {
-! CHECK: fir.store %[[ARG]] to %[[ALLOCA]] : !fir.ref<i32>
-! CHECK: %[[COND:.*]] = arith.cmpi eq, %{{.*}}, %{{.*}}
-! CHECK: %[[COND_XOR:.*]] = arith.xori %[[COND]], %{{.*}}
-! CHECK: fir.if %[[COND_XOR]] {
-! CHECK: @_FortranAioBeginExternalListOutput
-! CHECK: %[[LOAD:.*]] = fir.load %[[ALLOCA]] : !fir.ref<i32>
-! CHECK: @_FortranAioOutputInteger32(%{{.*}}, %[[LOAD]])
-! CHECK: } else {
-! CHECK: }
-! CHECK-NEXT: omp.yield
-! CHECK-NEXT: }
-! CHECK-NEXT: omp.terminator
-! CHECK-NEXT: }
-! CHECK: omp.terminator
-! CHECK-NEXT: }
-subroutine ss4(n) ! CYCLE in OpenMP wsloop constructs
- !$omp parallel
- do i = 1, 3
- !$omp do
- do j = 1, 3
- if (j .eq. n) cycle
- print*, 'ss4', j
- enddo
- !$omp end do
- enddo
- !$omp end parallel
-end
-
-! CHECK-LABEL: func @_QPss5() {
-! CHECK: omp.parallel {
-! CHECK: omp.wsloop {
-! CHECK: omp.loop_nest {{.*}} {
-! CHECK: br ^[[BB1:.*]]
-! CHECK: ^[[BB1]]:
-! CHECK: br ^[[BB2:.*]]
-! CHECK: ^[[BB2]]:
-! CHECK: cond_br %{{.*}}, ^[[BB3:.*]], ^[[BB6:.*]]
-! CHECK: ^[[BB3]]:
-! CHECK: cond_br %{{.*}}, ^[[BB4:.*]], ^[[BB3:.*]]
-! CHECK: ^[[BB4]]:
-! CHECK: br ^[[BB6]]
-! CHECK: ^[[BB3]]:
-! CHECK: br ^[[BB2]]
-! CHECK: ^[[BB6]]:
-! CHECK: omp.yield
-! CHECK: }
-! CHECK: omp.terminator
-! CHECK: }
-! CHECK: omp.terminator
-! CHECK: }
-subroutine ss5() ! EXIT inside OpenMP wsloop (inside parallel)
- integer :: x
- !$omp parallel private(x)
- !$omp do
- do j = 1, 3
- x = j * i
- do k = 1, 3
- if (k .eq. n) exit
- x = k
- x = x + k
- enddo
- x = j - 222
- enddo
- !$omp end do
- !$omp end parallel
-end
-
-! CHECK-LABEL: func @_QPss6() {
-! CHECK: omp.parallel {
-! CHECK: br ^[[BB1_OUTER:.*]]
-! CHECK: ^[[BB1_OUTER]]:
-! CHECK: cond_br %{{.*}}, ^[[BB2_OUTER:.*]], ^[[BB3_OUTER:.*]]
-! CHECK: ^[[BB2_OUTER]]:
-! CHECK: omp.wsloop {
-! CHECK: omp.loop_nest {{.*}} {
-! CHECK: br ^[[BB1:.*]]
-! CHECK: ^[[BB1]]:
-! CHECK: br ^[[BB2:.*]]
-! CHECK: ^[[BB2]]:
-! CHECK: cond_br %{{.*}}, ^[[BB3:.*]], ^[[BB6:.*]]
-! CHECK: ^[[BB3]]:
-! CHECK: cond_br %{{.*}}, ^[[BB4:.*]], ^[[BB5:.*]]
-! CHECK: ^[[BB4]]:
-! CHECK: br ^[[BB6]]
-! CHECK: ^[[BB5]]
-! CHECK: br ^[[BB2]]
-! CHECK: ^[[BB6]]:
-! CHECK: omp.yield
-! CHECK: }
-! CHECK: omp.terminator
-! CHECK: }
-! CHECK: br ^[[BB1_OUTER]]
-! CHECK: ^[[BB3_OUTER]]:
-! CHECK: omp.terminator
-! CHECK: }
-subroutine ss6() ! EXIT inside OpenMP wsloop in a do loop (inside parallel)
- integer :: x
- !$omp parallel private(x)
- do i = 1, 3
- !$omp do
- do j = 1, 3
- x = j * i
- do k = 1, 3
- if (k .eq. n) exit
- x = k
- x = x + k
- enddo
- x = j - 222
- enddo
- !$omp end do
- enddo
- !$omp end parallel
-end
-
-! CHECK-LABEL: func @_QPss7() {
-! CHECK: br ^[[BB1_OUTER:.*]]
-! CHECK: ^[[BB1_OUTER]]:
-! CHECK: cond_br %{{.*}}, ^[[BB2_OUTER:.*]], ^[[BB3_OUTER:.*]]
-! CHECK-NEXT: ^[[BB2_OUTER:.*]]:
-! CHECK: omp.parallel {
-! CHECK: omp.wsloop {
-! CHECK: omp.loop_nest {{.*}} {
-! CHECK: br ^[[BB1:.*]]
-! CHECK-NEXT: ^[[BB1]]:
-! CHECK: br ^[[BB2:.*]]
-! CHECK-NEXT: ^[[BB2]]:
-! CHECK: cond_br %{{.*}}, ^[[BB3:.*]], ^[[BB6:.*]]
-! CHECK-NEXT: ^[[BB3]]:
-! CHECK: cond_br %{{.*}}, ^[[BB4:.*]], ^[[BB5:.*]]
-! CHECK-NEXT: ^[[BB4]]:
-! CHECK: br ^[[BB6]]
-! CHECK-NEXT: ^[[BB5]]:
-! CHECK: br ^[[BB2]]
-! CHECK-NEXT: ^[[BB6]]:
-! CHECK: omp.yield
-! CHECK: }
-! CHECK: omp.terminator
-! CHECK: }
-! CHECK: omp.terminator
-! CHECK: }
-! CHECK: br ^[[BB1_OUTER]]
-! CHECK-NEXT: ^[[BB3_OUTER]]:
-! CHECK-NEXT: return
-subroutine ss7() ! EXIT inside OpenMP parallel do (inside do loop)
- integer :: x
- do i = 1, 3
- !$omp parallel do private(x)
- do j = 1, 3
- x = j * i
- do k = 1, 3
- if (k .eq. n) exit
- x = k
- x = x + k
- enddo
- enddo
- !$omp end parallel do
- enddo
-end
-
-! CHECK-LABEL: func @_QPss8() {
-! CHECK: omp.parallel {
-! CHECK: omp.wsloop {
-! CHECK: omp.loop_nest {{.*}} {
-! CHECK: br ^[[BB1:.*]]
-! CHECK-NEXT: ^[[BB1]]:
-! CHECK: br ^[[BB2:.*]]
-! CHECK: ^[[BB2]]:
-! CHECK: cond_br %{{.*}}, ^[[BB3:.*]], ^[[BB6:.*]]
-! CHECK: ^[[BB3]]:
-! CHECK: cond_br %{{.*}}, ^[[BB4:.*]], ^[[BB5:.*]]
-! CHECK: ^[[BB4]]:
-! CHECK-NEXT: br ^[[BB6]]
-! CHECK: ^[[BB5]]:
-! CHECK: br ^[[BB2]]
-! CHECK-NEXT: ^[[BB6]]:
-! CHECK: omp.yield
-! CHECK: }
-! CHECK: omp.terminator
-! CHECK: }
-! CHECK: omp.terminator
-! CHECK: }
-subroutine ss8() ! EXIT inside OpenMP parallel do
- integer :: x
- !$omp parallel do private(x)
- do j = 1, 3
- x = j * i
- do k = 1, 3
- if (k .eq. n) exit
- x = k
- x = x + k
- enddo
- enddo
- !$omp end parallel do
-end
-
-! CHECK-LABEL: func @_QPss9() {
-! CHECK: omp.parallel {
-! CHECK-NEXT: omp.parallel {
-! CHECK: br ^[[BB1:.*]]
-! CHECK: ^[[BB1]]:
-! CHECK: cond_br %{{.*}}, ^[[BB2:.*]], ^[[BB5:.*]]
-! CHECK-NEXT: ^[[BB2]]:
-! CHECK: cond_br %{{.*}}, ^[[BB3:.*]], ^[[BB4:.*]]
-! CHECK-NEXT: ^[[BB3]]:
-! CHECK-NEXT: br ^[[BB5]]
-! CHECK-NEXT: ^[[BB4]]:
-! CHECK: br ^[[BB1]]
-! CHECK-NEXT: ^[[BB5]]:
-! CHECK: omp.terminator
-! CHECK-NEXT: }
-! CHECK: omp.terminator
-! CHECK-NEXT: }
-! CHECK: }
-subroutine ss9() ! EXIT inside OpenMP parallel (inside parallel)
- integer :: x
- !$omp parallel
- !$omp parallel private(x)
- do k = 1, 3
- if (k .eq. n) exit
- x = k
- x = x + k
- end do
- !$omp end parallel
- !$omp end parallel
-end
-
-! CHECK-LABEL: func @_QQmain
-program p
- call ss1(2)
- call ss2(2)
- call ss3(2)
- call ss4(2)
- call ss5()
- call ss6()
- call ss7()
- call ss8()
- call ss9()
-end
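
A minimal sketch (hypothetical) of the unstructured pattern these subroutines exercise: an EXIT from an inner sequential loop inside a worksharing loop, which forces branch-based (unstructured) FIR inside the omp.loop_nest region.

    subroutine exit_in_wsloop_sketch(n)
      integer :: n, j, k, x
      !$omp parallel do private(x)
      do j = 1, 3
        do k = 1, 3
          if (k == n) exit   ! exits only the inner, sequential k loop
          x = k
        end do
      end do
      !$omp end parallel do
    end subroutine exit_in_wsloop_sketch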
diff --git a/flang/test/Lower/OpenMP/FIR/wsloop-chunks.f90 b/flang/test/Lower/OpenMP/FIR/wsloop-chunks.f90
deleted file mode 100644
index e4b85fb44776..000000000000
--- a/flang/test/Lower/OpenMP/FIR/wsloop-chunks.f90
+++ /dev/null
@@ -1,84 +0,0 @@
-! This test checks that the chunk size is passed correctly when lowering the
-! OpenMP DO Directive (Worksharing) with a chunk size.
-
-! RUN: bbc -fopenmp -emit-fir -hlfir=false %s -o - | FileCheck %s
-
-program wsloop
- integer :: i
- integer :: chunk
-
-! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "wsloop"} {
-! CHECK: %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "chunk", uniq_name = "_QFEchunk"}
-
-!$OMP DO SCHEDULE(static, 4)
-
-do i=1, 9
- print*, i
-
-! CHECK: %[[VAL_2:.*]] = arith.constant 1 : i32
-! CHECK: %[[VAL_3:.*]] = arith.constant 9 : i32
-! CHECK: %[[VAL_4:.*]] = arith.constant 1 : i32
-! CHECK: %[[VAL_5:.*]] = arith.constant 4 : i32
-! CHECK: omp.wsloop schedule(static = %[[VAL_5]] : i32) nowait {
-! CHECK-NEXT: omp.loop_nest (%[[ARG0:.*]]) : i32 = (%[[VAL_2]]) to (%[[VAL_3]]) inclusive step (%[[VAL_4]]) {
-! CHECK: fir.store %[[ARG0]] to %[[STORE_IV:.*]] : !fir.ref<i32>
-! CHECK: %[[LOAD_IV:.*]] = fir.load %[[STORE_IV]] : !fir.ref<i32>
-! CHECK: {{.*}} = fir.call @_FortranAioOutputInteger32({{.*}}, %[[LOAD_IV]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
-! CHECK: omp.yield
-! CHECK: }
-! CHECK: omp.terminator
-! CHECK: }
-
-end do
-!$OMP END DO NOWAIT
-!$OMP DO SCHEDULE(static, 2+2)
-
-do i=1, 9
- print*, i*2
-
-! CHECK: %[[VAL_14:.*]] = arith.constant 1 : i32
-! CHECK: %[[VAL_15:.*]] = arith.constant 9 : i32
-! CHECK: %[[VAL_16:.*]] = arith.constant 1 : i32
-! CHECK: %[[VAL_17:.*]] = arith.constant 4 : i32
-! CHECK: omp.wsloop schedule(static = %[[VAL_17]] : i32) nowait {
-! CHECK-NEXT: omp.loop_nest (%[[ARG1:.*]]) : i32 = (%[[VAL_14]]) to (%[[VAL_15]]) inclusive step (%[[VAL_16]]) {
-! CHECK: fir.store %[[ARG1]] to %[[STORE_IV1:.*]] : !fir.ref<i32>
-! CHECK: %[[VAL_24:.*]] = arith.constant 2 : i32
-! CHECK: %[[LOAD_IV1:.*]] = fir.load %[[STORE_IV1]] : !fir.ref<i32>
-! CHECK: %[[VAL_25:.*]] = arith.muli %[[VAL_24]], %[[LOAD_IV1]] : i32
-! CHECK: {{.*}} = fir.call @_FortranAioOutputInteger32({{.*}}, %[[VAL_25]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
-! CHECK: omp.yield
-! CHECK: }
-! CHECK: omp.terminator
-! CHECK: }
-
-end do
-!$OMP END DO NOWAIT
-chunk = 6
-!$OMP DO SCHEDULE(static, chunk)
-
-do i=1, 9
- print*, i*3
-end do
-!$OMP END DO NOWAIT
-! CHECK: %[[VAL_28:.*]] = arith.constant 6 : i32
-! CHECK: fir.store %[[VAL_28]] to %[[VAL_0]] : !fir.ref<i32>
-! CHECK: %[[VAL_29:.*]] = arith.constant 1 : i32
-! CHECK: %[[VAL_30:.*]] = arith.constant 9 : i32
-! CHECK: %[[VAL_31:.*]] = arith.constant 1 : i32
-! CHECK: %[[VAL_32:.*]] = fir.load %[[VAL_0]] : !fir.ref<i32>
-! CHECK: omp.wsloop schedule(static = %[[VAL_32]] : i32) nowait {
-! CHECK-NEXT: omp.loop_nest (%[[ARG2:.*]]) : i32 = (%[[VAL_29]]) to (%[[VAL_30]]) inclusive step (%[[VAL_31]]) {
-! CHECK: fir.store %[[ARG2]] to %[[STORE_IV2:.*]] : !fir.ref<i32>
-! CHECK: %[[VAL_39:.*]] = arith.constant 3 : i32
-! CHECK: %[[LOAD_IV2:.*]] = fir.load %[[STORE_IV2]] : !fir.ref<i32>
-! CHECK: %[[VAL_40:.*]] = arith.muli %[[VAL_39]], %[[LOAD_IV2]] : i32
-! CHECK: {{.*}} = fir.call @_FortranAioOutputInteger32({{.*}}, %[[VAL_40]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
-! CHECK: omp.yield
-! CHECK: }
-! CHECK: omp.terminator
-! CHECK: }
-! CHECK: return
-! CHECK: }
-
-end
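
A minimal sketch (hypothetical) of the runtime-variable chunk form the deleted test covered, alongside the literal and constant-expression forms that fold at compile time:

    subroutine chunk_sketch()
      integer :: i, chunk
      chunk = 6
      !$omp do schedule(static, chunk)   ! chunk is loaded into the omp.wsloop schedule operand
      do i = 1, 9
        print *, i
      end do
      !$omp end do
    end subroutine chunk_sketch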
diff --git a/flang/test/Lower/OpenMP/FIR/wsloop-collapse.f90 b/flang/test/Lower/OpenMP/FIR/wsloop-collapse.f90
deleted file mode 100644
index a2ba3ebfe196..000000000000
--- a/flang/test/Lower/OpenMP/FIR/wsloop-collapse.f90
+++ /dev/null
@@ -1,66 +0,0 @@
-! This test checks lowering of the OpenMP DO Directive (Worksharing) with the collapse clause.
-
-! RUN: bbc -fopenmp -emit-fir -hlfir=false %s -o - | FileCheck %s
-
-program wsloop_collapse
- integer :: i, j, k
- integer :: a, b, c
- integer :: x
-! CHECK: %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "a", uniq_name = "_QFEa"}
-! CHECK: %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "b", uniq_name = "_QFEb"}
-! CHECK: %[[VAL_2:.*]] = fir.alloca i32 {bindc_name = "c", uniq_name = "_QFEc"}
-! CHECK: %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFEi"}
-! CHECK: %[[VAL_4:.*]] = fir.alloca i32 {bindc_name = "j", uniq_name = "_QFEj"}
-! CHECK: %[[VAL_5:.*]] = fir.alloca i32 {bindc_name = "k", uniq_name = "_QFEk"}
-! CHECK: %[[VAL_6:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFEx"}
- a=3
-! CHECK: %[[VAL_7:.*]] = arith.constant 3 : i32
-! CHECK: fir.store %[[VAL_7]] to %[[VAL_0]] : !fir.ref<i32>
- b=2
-! CHECK: %[[VAL_8:.*]] = arith.constant 2 : i32
-! CHECK: fir.store %[[VAL_8]] to %[[VAL_1]] : !fir.ref<i32>
- c=5
-! CHECK: %[[VAL_9:.*]] = arith.constant 5 : i32
-! CHECK: fir.store %[[VAL_9]] to %[[VAL_2]] : !fir.ref<i32>
- x=0
-! CHECK: %[[VAL_10:.*]] = arith.constant 0 : i32
-! CHECK: fir.store %[[VAL_10]] to %[[VAL_6]] : !fir.ref<i32>
-
- !$omp do collapse(3)
-! CHECK: %[[VAL_20:.*]] = arith.constant 1 : i32
-! CHECK: %[[VAL_21:.*]] = fir.load %[[VAL_0]] : !fir.ref<i32>
-! CHECK: %[[VAL_22:.*]] = arith.constant 1 : i32
-! CHECK: %[[VAL_23:.*]] = arith.constant 1 : i32
-! CHECK: %[[VAL_24:.*]] = fir.load %[[VAL_1]] : !fir.ref<i32>
-! CHECK: %[[VAL_25:.*]] = arith.constant 1 : i32
-! CHECK: %[[VAL_26:.*]] = arith.constant 1 : i32
-! CHECK: %[[VAL_27:.*]] = fir.load %[[VAL_2]] : !fir.ref<i32>
-! CHECK: %[[VAL_28:.*]] = arith.constant 1 : i32
- do i = 1, a
- do j= 1, b
- do k = 1, c
-! CHECK: omp.wsloop {
-! CHECK-NEXT: omp.loop_nest (%[[ARG0:.*]], %[[ARG1:.*]], %[[ARG2:.*]]) : i32 = (%[[VAL_20]], %[[VAL_23]], %[[VAL_26]]) to (%[[VAL_21]], %[[VAL_24]], %[[VAL_27]]) inclusive step (%[[VAL_22]], %[[VAL_25]], %[[VAL_28]]) {
-! CHECK: fir.store %[[ARG0]] to %[[STORE_IV0:.*]] : !fir.ref<i32>
-! CHECK: fir.store %[[ARG1]] to %[[STORE_IV1:.*]] : !fir.ref<i32>
-! CHECK: fir.store %[[ARG2]] to %[[STORE_IV2:.*]] : !fir.ref<i32>
-! CHECK: %[[VAL_12:.*]] = fir.load %[[VAL_6]] : !fir.ref<i32>
-! CHECK: %[[LOAD_IV0:.*]] = fir.load %[[STORE_IV0]] : !fir.ref<i32>
-! CHECK: %[[VAL_13:.*]] = arith.addi %[[VAL_12]], %[[LOAD_IV0]] : i32
-! CHECK: %[[LOAD_IV1:.*]] = fir.load %[[STORE_IV1]] : !fir.ref<i32>
-! CHECK: %[[VAL_14:.*]] = arith.addi %[[VAL_13]], %[[LOAD_IV1]] : i32
-! CHECK: %[[LOAD_IV2:.*]] = fir.load %[[STORE_IV2]] : !fir.ref<i32>
-! CHECK: %[[VAL_15:.*]] = arith.addi %[[VAL_14]], %[[LOAD_IV2]] : i32
-! CHECK: fir.store %[[VAL_15]] to %[[VAL_6]] : !fir.ref<i32>
-! CHECK: omp.yield
-! CHECK: }
-! CHECK: omp.terminator
-! CHECK: }
- x = x + i + j + k
- end do
- end do
- end do
- !$omp end do
-! CHECK: return
-! CHECK: }
-end program wsloop_collapse
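
A minimal sketch (hypothetical) of the collapse lowering: collapse(3) fuses the three Fortran loops into one omp.loop_nest with three induction variables.

    subroutine collapse_sketch(a, b, c)
      integer :: a, b, c, i, j, k, x
      x = 0
      !$omp do collapse(3)   ! all three loops form one omp.loop_nest (i, j, k)
      do i = 1, a
        do j = 1, b
          do k = 1, c
            x = x + i + j + k
          end do
        end do
      end do
      !$omp end do
    end subroutine collapse_sketch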
diff --git a/flang/test/Lower/OpenMP/FIR/wsloop-monotonic.f90 b/flang/test/Lower/OpenMP/FIR/wsloop-monotonic.f90
deleted file mode 100644
index 941885bdb1e3..000000000000
--- a/flang/test/Lower/OpenMP/FIR/wsloop-monotonic.f90
+++ /dev/null
@@ -1,38 +0,0 @@
-! This test checks lowering of the OpenMP DO Directive (Worksharing) with the
-! monotonic schedule modifier.
-
-! RUN: bbc -fopenmp -emit-fir -hlfir=false %s -o - | FileCheck %s
-
-program wsloop_dynamic
- integer :: i
-!CHECK-LABEL: func @_QQmain()
-
-!$OMP PARALLEL
-!CHECK: omp.parallel {
-
-!$OMP DO SCHEDULE(monotonic:dynamic)
-!CHECK: %[[ALLOCA_IV:.*]] = fir.alloca i32 {{{.*}}, pinned}
-!CHECK: %[[WS_LB:.*]] = arith.constant 1 : i32
-!CHECK: %[[WS_UB:.*]] = arith.constant 9 : i32
-!CHECK: %[[WS_STEP:.*]] = arith.constant 1 : i32
-!CHECK: omp.wsloop schedule(dynamic, monotonic) nowait {
-!CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[WS_LB]]) to (%[[WS_UB]]) inclusive step (%[[WS_STEP]]) {
-!CHECK: fir.store %[[I]] to %[[ALLOCA_IV:.*]] : !fir.ref<i32>
-
- do i=1, 9
- print*, i
-!CHECK: %[[RTBEGIN:.*]] = fir.call @_FortranAioBeginExternalListOutput
-!CHECK: %[[LOAD:.*]] = fir.load %[[ALLOCA_IV]] : !fir.ref<i32>
-!CHECK: fir.call @_FortranAioOutputInteger32(%[[RTBEGIN]], %[[LOAD]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
-!CHECK: fir.call @_FortranAioEndIoStatement(%[[RTBEGIN]]) {{.*}}: (!fir.ref<i8>) -> i32
- end do
-!CHECK: omp.yield
-!CHECK: }
-!CHECK: omp.terminator
-!CHECK: }
-!CHECK: omp.terminator
-!CHECK: }
-
-!$OMP END DO NOWAIT
-!$OMP END PARALLEL
-end
diff --git a/flang/test/Lower/OpenMP/FIR/wsloop-nonmonotonic.f90 b/flang/test/Lower/OpenMP/FIR/wsloop-nonmonotonic.f90
deleted file mode 100644
index 96a3e71f34b1..000000000000
--- a/flang/test/Lower/OpenMP/FIR/wsloop-nonmonotonic.f90
+++ /dev/null
@@ -1,39 +0,0 @@
-! This test checks lowering of the OpenMP DO Directive (Worksharing) with the
-! non-monotonic schedule modifier.
-
-! RUN: bbc -fopenmp -emit-fir -hlfir=false %s -o - | FileCheck %s
-
-program wsloop_dynamic
- integer :: i
-!CHECK-LABEL: func @_QQmain()
-
-
-!$OMP PARALLEL
-!CHECK: omp.parallel {
-
-!$OMP DO SCHEDULE(nonmonotonic:dynamic)
-!CHECK: %[[ALLOCA_IV:.*]] = fir.alloca i32 {{{.*}}, pinned}
-!CHECK: %[[WS_LB:.*]] = arith.constant 1 : i32
-!CHECK: %[[WS_UB:.*]] = arith.constant 9 : i32
-!CHECK: %[[WS_STEP:.*]] = arith.constant 1 : i32
-!CHECK: omp.wsloop schedule(dynamic, nonmonotonic) nowait {
-!CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[WS_LB]]) to (%[[WS_UB]]) inclusive step (%[[WS_STEP]])
-!CHECK: fir.store %[[I]] to %[[ALLOCA_IV]] : !fir.ref<i32>
-
- do i=1, 9
- print*, i
-!CHECK: %[[RTBEGIN:.*]] = fir.call @_FortranAioBeginExternalListOutput
-!CHECK: %[[LOAD:.*]] = fir.load %[[ALLOCA_IV]] : !fir.ref<i32>
-!CHECK: fir.call @_FortranAioOutputInteger32(%[[RTBEGIN]], %[[LOAD]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
-!CHECK: fir.call @_FortranAioEndIoStatement(%[[RTBEGIN]]) {{.*}}: (!fir.ref<i8>) -> i32
- end do
-!CHECK: omp.yield
-!CHECK: }
-!CHECK: omp.terminator
-!CHECK: }
-!CHECK: omp.terminator
-!CHECK: }
-
-!$OMP END DO NOWAIT
-!$OMP END PARALLEL
-end
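
A minimal sketch (hypothetical) of the modifier syntax these two tests contrast; the source-level modifier:kind pair lowers to omp.wsloop schedule(kind, modifier).

    subroutine sched_modifier_sketch()
      integer :: i
      !$omp do schedule(nonmonotonic:dynamic)   ! omp.wsloop schedule(dynamic, nonmonotonic)
      do i = 1, 9
        print *, i
      end do
      !$omp end do nowait
    end subroutine sched_modifier_sketch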
diff --git a/flang/test/Lower/OpenMP/FIR/wsloop-ordered.f90 b/flang/test/Lower/OpenMP/FIR/wsloop-ordered.f90
deleted file mode 100644
index fec027608d99..000000000000
--- a/flang/test/Lower/OpenMP/FIR/wsloop-ordered.f90
+++ /dev/null
@@ -1,46 +0,0 @@
-! This test checks lowering of the worksharing-loop construct with the ordered clause.
-
-! RUN: bbc -fopenmp -emit-fir %s -o - | FileCheck %s
-
-! This checks lowering of the ordered clause specified without a parameter.
-subroutine wsloop_ordered_no_para()
- integer :: a(10), i
-
-! CHECK: omp.wsloop ordered(0) {
-! CHECK-NEXT: omp.loop_nest (%{{.*}}) : i32 = (%{{.*}}) to (%{{.*}}) inclusive step (%{{.*}}) {
-! CHECK: omp.yield
-! CHECK: }
-! CHECK: omp.terminator
-! CHECK: }
-
- !$omp do ordered
- do i = 2, 10
- !$omp ordered
- a(i) = a(i-1) + 1
- !$omp end ordered
- end do
- !$omp end do
-
-end
-
-! This checks lowering of the ordered clause specified with a parameter.
-subroutine wsloop_ordered_with_para()
- integer :: a(10), i
-
-! CHECK: func @_QPwsloop_ordered_with_para() {
-! CHECK: omp.wsloop ordered(1) {
-! CHECK-NEXT: omp.loop_nest (%{{.*}}) : i32 = (%{{.*}}) to (%{{.*}}) inclusive step (%{{.*}}) {
-! CHECK: omp.yield
-! CHECK: }
-! CHECK: omp.terminator
-! CHECK: }
-
- !$omp do ordered(1)
- do i = 2, 10
- !!$omp ordered depend(sink: i-1)
- a(i) = a(i-1) + 1
- !!$omp ordered depend(source)
- end do
- !$omp end do
-
-end
diff --git a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-add-byref.f90 b/flang/test/Lower/OpenMP/FIR/wsloop-reduction-add-byref.f90
deleted file mode 100644
index b6dfec09007e..000000000000
--- a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-add-byref.f90
+++ /dev/null
@@ -1,413 +0,0 @@
-! RUN: bbc -emit-fir -hlfir=false -fopenmp --force-byref-reduction %s -o - | FileCheck %s
-! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp -mmlir --force-byref-reduction %s -o - | FileCheck %s
-! NOTE: Assertions have been autogenerated by utils/generate-test-checks.py
-
-! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_f64 : !fir.ref<f64>
-! CHECK-SAME: init {
-! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<f64>):
-! CHECK: %[[C0_1:.*]] = arith.constant 0.000000e+00 : f64
-! CHECK: %[[REF:.*]] = fir.alloca f64
-! CHECK: fir.store %[[C0_1]] to %[[REF]] : !fir.ref<f64>
-! CHECK: omp.yield(%[[REF]] : !fir.ref<f64>)
-
-! CHECK-LABEL: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<f64>, %[[ARG1:.*]]: !fir.ref<f64>):
-! CHECK: %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<f64>
-! CHECK: %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<f64>
-! CHECK: %[[RES:.*]] = arith.addf %[[LD0]], %[[LD1]] fastmath<contract> : f64
-! CHECK: fir.store %[[RES]] to %[[ARG0]] : !fir.ref<f64>
-! CHECK: omp.yield(%[[ARG0]] : !fir.ref<f64>)
-! CHECK: }
-
-! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_i64 : !fir.ref<i64>
-! CHECK-SAME: init {
-! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<i64>):
-! CHECK: %[[C0_1:.*]] = arith.constant 0 : i64
-! CHECK: %[[REF:.*]] = fir.alloca i64
-! CHECK: fir.store %[[C0_1]] to %[[REF]] : !fir.ref<i64>
-! CHECK: omp.yield(%[[REF]] : !fir.ref<i64>)
-
-! CHECK-LABEL: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<i64>, %[[ARG1:.*]]: !fir.ref<i64>):
-! CHECK: %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<i64>
-! CHECK: %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<i64>
-! CHECK: %[[RES:.*]] = arith.addi %[[LD0]], %[[LD1]] : i64
-! CHECK: fir.store %[[RES]] to %[[ARG0]] : !fir.ref<i64>
-! CHECK: omp.yield(%[[ARG0]] : !fir.ref<i64>)
-! CHECK: }
-
-! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_f32 : !fir.ref<f32>
-! CHECK-SAME: init {
-! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<f32>):
-! CHECK: %[[C0_1:.*]] = arith.constant 0.000000e+00 : f32
-! CHECK: %[[REF:.*]] = fir.alloca f32
-! CHECK: fir.store %[[C0_1]] to %[[REF]] : !fir.ref<f32>
-! CHECK: omp.yield(%[[REF]] : !fir.ref<f32>)
-
-! CHECK-LABEL: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<f32>, %[[ARG1:.*]]: !fir.ref<f32>):
-! CHECK: %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<f32>
-! CHECK: %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<f32>
-! CHECK: %[[RES:.*]] = arith.addf %[[LD0]], %[[LD1]] fastmath<contract> : f32
-! CHECK: fir.store %[[RES]] to %[[ARG0]] : !fir.ref<f32>
-! CHECK: omp.yield(%[[ARG0]] : !fir.ref<f32>)
-! CHECK: }
-
-! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_i32 : !fir.ref<i32>
-! CHECK-SAME: init {
-! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<i32>):
-! CHECK: %[[C0_1:.*]] = arith.constant 0 : i32
-! CHECK: %[[REF:.*]] = fir.alloca i32
-! CHECK: fir.store %[[C0_1]] to %[[REF]] : !fir.ref<i32>
-! CHECK: omp.yield(%[[REF]] : !fir.ref<i32>)
-
-! CHECK-LABEL: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<i32>, %[[ARG1:.*]]: !fir.ref<i32>):
-! CHECK: %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<i32>
-! CHECK: %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<i32>
-! CHECK: %[[RES:.*]] = arith.addi %[[LD0]], %[[LD1]] : i32
-! CHECK: fir.store %[[RES]] to %[[ARG0]] : !fir.ref<i32>
-! CHECK: omp.yield(%[[ARG0]] : !fir.ref<i32>)
-! CHECK: }
-
-! CHECK-LABEL: func.func @_QPsimple_int_reduction() {
-! CHECK: %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_int_reductionEi"}
-! CHECK: %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFsimple_int_reductionEx"}
-! CHECK: %[[VAL_2:.*]] = arith.constant 0 : i32
-! CHECK: fir.store %[[VAL_2]] to %[[VAL_1]] : !fir.ref<i32>
-! CHECK: omp.parallel {
-! CHECK: %[[VAL_3:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
-! CHECK: %[[VAL_4:.*]] = arith.constant 1 : i32
-! CHECK: %[[VAL_5:.*]] = arith.constant 100 : i32
-! CHECK: %[[VAL_6:.*]] = arith.constant 1 : i32
-! CHECK: omp.wsloop byref reduction(@add_reduction_byref_i32 %[[VAL_1]] -> %[[VAL_7:.*]] : !fir.ref<i32>) {
-! CHECK-NEXT: omp.loop_nest (%[[VAL_8:.*]]) : i32 = (%[[VAL_4]]) to (%[[VAL_5]]) inclusive step (%[[VAL_6]]) {
-! CHECK: fir.store %[[VAL_8]] to %[[VAL_3]] : !fir.ref<i32>
-! CHECK: %[[VAL_9:.*]] = fir.load %[[VAL_7]] : !fir.ref<i32>
-! CHECK: %[[VAL_10:.*]] = fir.load %[[VAL_3]] : !fir.ref<i32>
-! CHECK: %[[VAL_11:.*]] = arith.addi %[[VAL_9]], %[[VAL_10]] : i32
-! CHECK: fir.store %[[VAL_11]] to %[[VAL_7]] : !fir.ref<i32>
-! CHECK: omp.yield
-! CHECK: }
-! CHECK: omp.terminator
-! CHECK: }
-! CHECK: omp.terminator
-! CHECK: }
-! CHECK: return
-! CHECK: }
-
-subroutine simple_int_reduction
- integer :: x
- x = 0
- !$omp parallel
- !$omp do reduction(+:x)
- do i=1, 100
- x = x + i
- end do
- !$omp end do
- !$omp end parallel
-end subroutine
-
-
-! CHECK-LABEL: func.func @_QPsimple_real_reduction() {
-! CHECK: %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_real_reductionEi"}
-! CHECK: %[[VAL_1:.*]] = fir.alloca f32 {bindc_name = "x", uniq_name = "_QFsimple_real_reductionEx"}
-! CHECK: %[[VAL_2:.*]] = arith.constant 0.000000e+00 : f32
-! CHECK: fir.store %[[VAL_2]] to %[[VAL_1]] : !fir.ref<f32>
-! CHECK: omp.parallel {
-! CHECK: %[[VAL_3:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
-! CHECK: %[[VAL_4:.*]] = arith.constant 1 : i32
-! CHECK: %[[VAL_5:.*]] = arith.constant 100 : i32
-! CHECK: %[[VAL_6:.*]] = arith.constant 1 : i32
-! CHECK: omp.wsloop byref reduction(@add_reduction_byref_f32 %[[VAL_1]] -> %[[VAL_7:.*]] : !fir.ref<f32>) {
-! CHECK-NEXT: omp.loop_nest (%[[VAL_8:.*]]) : i32 = (%[[VAL_4]]) to (%[[VAL_5]]) inclusive step (%[[VAL_6]]) {
-! CHECK: fir.store %[[VAL_8]] to %[[VAL_3]] : !fir.ref<i32>
-! CHECK: %[[VAL_9:.*]] = fir.load %[[VAL_7]] : !fir.ref<f32>
-! CHECK: %[[VAL_10:.*]] = fir.load %[[VAL_3]] : !fir.ref<i32>
-! CHECK: %[[VAL_11:.*]] = fir.convert %[[VAL_10]] : (i32) -> f32
-! CHECK: %[[VAL_12:.*]] = arith.addf %[[VAL_9]], %[[VAL_11]] fastmath<contract> : f32
-! CHECK: fir.store %[[VAL_12]] to %[[VAL_7]] : !fir.ref<f32>
-! CHECK: omp.yield
-! CHECK: }
-! CHECK: omp.terminator
-! CHECK: }
-! CHECK: omp.terminator
-! CHECK: }
-! CHECK: return
-! CHECK: }
-
-subroutine simple_real_reduction
- real :: x
- x = 0.0
- !$omp parallel
- !$omp do reduction(+:x)
- do i=1, 100
- x = x + i
- end do
- !$omp end do
- !$omp end parallel
-end subroutine
-
-! CHECK-LABEL: func.func @_QPsimple_int_reduction_switch_order() {
-! CHECK: %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_int_reduction_switch_orderEi"}
-! CHECK: %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFsimple_int_reduction_switch_orderEx"}
-! CHECK: %[[VAL_2:.*]] = arith.constant 0 : i32
-! CHECK: fir.store %[[VAL_2]] to %[[VAL_1]] : !fir.ref<i32>
-! CHECK: omp.parallel {
-! CHECK: %[[VAL_3:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
-! CHECK: %[[VAL_4:.*]] = arith.constant 1 : i32
-! CHECK: %[[VAL_5:.*]] = arith.constant 100 : i32
-! CHECK: %[[VAL_6:.*]] = arith.constant 1 : i32
-! CHECK: omp.wsloop byref reduction(@add_reduction_byref_i32 %[[VAL_1]] -> %[[VAL_7:.*]] : !fir.ref<i32>) {
-! CHECK-NEXT: omp.loop_nest (%[[VAL_8:.*]]) : i32 = (%[[VAL_4]]) to (%[[VAL_5]]) inclusive step (%[[VAL_6]]) {
-! CHECK: fir.store %[[VAL_8]] to %[[VAL_3]] : !fir.ref<i32>
-! CHECK: %[[VAL_9:.*]] = fir.load %[[VAL_3]] : !fir.ref<i32>
-! CHECK: %[[VAL_10:.*]] = fir.load %[[VAL_7]] : !fir.ref<i32>
-! CHECK: %[[VAL_11:.*]] = arith.addi %[[VAL_9]], %[[VAL_10]] : i32
-! CHECK: fir.store %[[VAL_11]] to %[[VAL_7]] : !fir.ref<i32>
-! CHECK: omp.yield
-! CHECK: }
-! CHECK: omp.terminator
-! CHECK: }
-! CHECK: omp.terminator
-! CHECK: }
-! CHECK: return
-! CHECK: }
-
-subroutine simple_int_reduction_switch_order
- integer :: x
- x = 0
- !$omp parallel
- !$omp do reduction(+:x)
- do i=1, 100
- x = i + x
- end do
- !$omp end do
- !$omp end parallel
-end subroutine
-
-! CHECK-LABEL: func.func @_QPsimple_real_reduction_switch_order() {
-! CHECK: %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_real_reduction_switch_orderEi"}
-! CHECK: %[[VAL_1:.*]] = fir.alloca f32 {bindc_name = "x", uniq_name = "_QFsimple_real_reduction_switch_orderEx"}
-! CHECK: %[[VAL_2:.*]] = arith.constant 0.000000e+00 : f32
-! CHECK: fir.store %[[VAL_2]] to %[[VAL_1]] : !fir.ref<f32>
-! CHECK: omp.parallel {
-! CHECK: %[[VAL_3:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
-! CHECK: %[[VAL_4:.*]] = arith.constant 1 : i32
-! CHECK: %[[VAL_5:.*]] = arith.constant 100 : i32
-! CHECK: %[[VAL_6:.*]] = arith.constant 1 : i32
-! CHECK: omp.wsloop byref reduction(@add_reduction_byref_f32 %[[VAL_1]] -> %[[VAL_7:.*]] : !fir.ref<f32>) {
-! CHECK-NEXT: omp.loop_nest (%[[VAL_8:.*]]) : i32 = (%[[VAL_4]]) to (%[[VAL_5]]) inclusive step (%[[VAL_6]]) {
-! CHECK: fir.store %[[VAL_8]] to %[[VAL_3]] : !fir.ref<i32>
-! CHECK: %[[VAL_9:.*]] = fir.load %[[VAL_3]] : !fir.ref<i32>
-! CHECK: %[[VAL_10:.*]] = fir.convert %[[VAL_9]] : (i32) -> f32
-! CHECK: %[[VAL_11:.*]] = fir.load %[[VAL_7]] : !fir.ref<f32>
-! CHECK: %[[VAL_12:.*]] = arith.addf %[[VAL_10]], %[[VAL_11]] fastmath<contract> : f32
-! CHECK: fir.store %[[VAL_12]] to %[[VAL_7]] : !fir.ref<f32>
-! CHECK: omp.yield
-! CHECK: }
-! CHECK: omp.terminator
-! CHECK: }
-! CHECK: omp.terminator
-! CHECK: }
-! CHECK: return
-! CHECK: }
-
-subroutine simple_real_reduction_switch_order
- real :: x
- x = 0.0
- !$omp parallel
- !$omp do reduction(+:x)
- do i=1, 100
- x = i + x
- end do
- !$omp end do
- !$omp end parallel
-end subroutine
-
-! CHECK-LABEL: func.func @_QPmultiple_int_reductions_same_type() {
-! CHECK: %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFmultiple_int_reductions_same_typeEi"}
-! CHECK: %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFmultiple_int_reductions_same_typeEx"}
-! CHECK: %[[VAL_2:.*]] = fir.alloca i32 {bindc_name = "y", uniq_name = "_QFmultiple_int_reductions_same_typeEy"}
-! CHECK: %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "z", uniq_name = "_QFmultiple_int_reductions_same_typeEz"}
-! CHECK: %[[VAL_4:.*]] = arith.constant 0 : i32
-! CHECK: fir.store %[[VAL_4]] to %[[VAL_1]] : !fir.ref<i32>
-! CHECK: %[[VAL_5:.*]] = arith.constant 0 : i32
-! CHECK: fir.store %[[VAL_5]] to %[[VAL_2]] : !fir.ref<i32>
-! CHECK: %[[VAL_6:.*]] = arith.constant 0 : i32
-! CHECK: fir.store %[[VAL_6]] to %[[VAL_3]] : !fir.ref<i32>
-! CHECK: omp.parallel {
-! CHECK: %[[VAL_7:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
-! CHECK: %[[VAL_8:.*]] = arith.constant 1 : i32
-! CHECK: %[[VAL_9:.*]] = arith.constant 100 : i32
-! CHECK: %[[VAL_10:.*]] = arith.constant 1 : i32
-! CHECK: omp.wsloop byref reduction(@add_reduction_byref_i32 %[[VAL_1]] -> %[[VAL_11:.*]] : !fir.ref<i32>, @add_reduction_byref_i32 %[[VAL_2]] -> %[[VAL_12:.*]] : !fir.ref<i32>, @add_reduction_byref_i32 %[[VAL_3]] -> %[[VAL_13:.*]] : !fir.ref<i32>) {
-! CHECK-NEXT: omp.loop_nest (%[[VAL_14:.*]]) : i32 = (%[[VAL_8]]) to (%[[VAL_9]]) inclusive step (%[[VAL_10]]) {
-! CHECK: fir.store %[[VAL_14]] to %[[VAL_7]] : !fir.ref<i32>
-! CHECK: %[[VAL_15:.*]] = fir.load %[[VAL_11]] : !fir.ref<i32>
-! CHECK: %[[VAL_16:.*]] = fir.load %[[VAL_7]] : !fir.ref<i32>
-! CHECK: %[[VAL_17:.*]] = arith.addi %[[VAL_15]], %[[VAL_16]] : i32
-! CHECK: fir.store %[[VAL_17]] to %[[VAL_11]] : !fir.ref<i32>
-! CHECK: %[[VAL_18:.*]] = fir.load %[[VAL_12]] : !fir.ref<i32>
-! CHECK: %[[VAL_19:.*]] = fir.load %[[VAL_7]] : !fir.ref<i32>
-! CHECK: %[[VAL_20:.*]] = arith.addi %[[VAL_18]], %[[VAL_19]] : i32
-! CHECK: fir.store %[[VAL_20]] to %[[VAL_12]] : !fir.ref<i32>
-! CHECK: %[[VAL_21:.*]] = fir.load %[[VAL_13]] : !fir.ref<i32>
-! CHECK: %[[VAL_22:.*]] = fir.load %[[VAL_7]] : !fir.ref<i32>
-! CHECK: %[[VAL_23:.*]] = arith.addi %[[VAL_21]], %[[VAL_22]] : i32
-! CHECK: fir.store %[[VAL_23]] to %[[VAL_13]] : !fir.ref<i32>
-! CHECK: omp.yield
-! CHECK: }
-! CHECK: omp.terminator
-! CHECK: }
-! CHECK: omp.terminator
-! CHECK: }
-! CHECK: return
-! CHECK: }
-
-subroutine multiple_int_reductions_same_type
- integer :: x,y,z
- x = 0
- y = 0
- z = 0
- !$omp parallel
- !$omp do reduction(+:x,y,z)
- do i=1, 100
- x = x + i
- y = y + i
- z = z + i
- end do
- !$omp end do
- !$omp end parallel
-end subroutine
-
-! CHECK-LABEL: func.func @_QPmultiple_real_reductions_same_type() {
-! CHECK: %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFmultiple_real_reductions_same_typeEi"}
-! CHECK: %[[VAL_1:.*]] = fir.alloca f32 {bindc_name = "x", uniq_name = "_QFmultiple_real_reductions_same_typeEx"}
-! CHECK: %[[VAL_2:.*]] = fir.alloca f32 {bindc_name = "y", uniq_name = "_QFmultiple_real_reductions_same_typeEy"}
-! CHECK: %[[VAL_3:.*]] = fir.alloca f32 {bindc_name = "z", uniq_name = "_QFmultiple_real_reductions_same_typeEz"}
-! CHECK: %[[VAL_4:.*]] = arith.constant 0.000000e+00 : f32
-! CHECK: fir.store %[[VAL_4]] to %[[VAL_1]] : !fir.ref<f32>
-! CHECK: %[[VAL_5:.*]] = arith.constant 0.000000e+00 : f32
-! CHECK: fir.store %[[VAL_5]] to %[[VAL_2]] : !fir.ref<f32>
-! CHECK: %[[VAL_6:.*]] = arith.constant 0.000000e+00 : f32
-! CHECK: fir.store %[[VAL_6]] to %[[VAL_3]] : !fir.ref<f32>
-! CHECK: omp.parallel {
-! CHECK: %[[VAL_7:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
-! CHECK: %[[VAL_8:.*]] = arith.constant 1 : i32
-! CHECK: %[[VAL_9:.*]] = arith.constant 100 : i32
-! CHECK: %[[VAL_10:.*]] = arith.constant 1 : i32
-! CHECK: omp.wsloop byref reduction(@add_reduction_byref_f32 %[[VAL_1]] -> %[[VAL_11:.*]] : !fir.ref<f32>, @add_reduction_byref_f32 %[[VAL_2]] -> %[[VAL_12:.*]] : !fir.ref<f32>, @add_reduction_byref_f32 %[[VAL_3]] -> %[[VAL_13:.*]] : !fir.ref<f32>) {
-! CHECK-NEXT: omp.loop_nest (%[[VAL_14:.*]]) : i32 = (%[[VAL_8]]) to (%[[VAL_9]]) inclusive step (%[[VAL_10]]) {
-! CHECK: fir.store %[[VAL_14]] to %[[VAL_7]] : !fir.ref<i32>
-! CHECK: %[[VAL_15:.*]] = fir.load %[[VAL_11]] : !fir.ref<f32>
-! CHECK: %[[VAL_16:.*]] = fir.load %[[VAL_7]] : !fir.ref<i32>
-! CHECK: %[[VAL_17:.*]] = fir.convert %[[VAL_16]] : (i32) -> f32
-! CHECK: %[[VAL_18:.*]] = arith.addf %[[VAL_15]], %[[VAL_17]] fastmath<contract> : f32
-! CHECK: fir.store %[[VAL_18]] to %[[VAL_11]] : !fir.ref<f32>
-! CHECK: %[[VAL_19:.*]] = fir.load %[[VAL_12]] : !fir.ref<f32>
-! CHECK: %[[VAL_20:.*]] = fir.load %[[VAL_7]] : !fir.ref<i32>
-! CHECK: %[[VAL_21:.*]] = fir.convert %[[VAL_20]] : (i32) -> f32
-! CHECK: %[[VAL_22:.*]] = arith.addf %[[VAL_19]], %[[VAL_21]] fastmath<contract> : f32
-! CHECK: fir.store %[[VAL_22]] to %[[VAL_12]] : !fir.ref<f32>
-! CHECK: %[[VAL_23:.*]] = fir.load %[[VAL_13]] : !fir.ref<f32>
-! CHECK: %[[VAL_24:.*]] = fir.load %[[VAL_7]] : !fir.ref<i32>
-! CHECK: %[[VAL_25:.*]] = fir.convert %[[VAL_24]] : (i32) -> f32
-! CHECK: %[[VAL_26:.*]] = arith.addf %[[VAL_23]], %[[VAL_25]] fastmath<contract> : f32
-! CHECK: fir.store %[[VAL_26]] to %[[VAL_13]] : !fir.ref<f32>
-! CHECK: omp.yield
-! CHECK: }
-! CHECK: omp.terminator
-! CHECK: }
-! CHECK: omp.terminator
-! CHECK: }
-! CHECK: return
-! CHECK: }
-
-subroutine multiple_real_reductions_same_type
- real :: x,y,z
- x = 0.0
- y = 0.0
- z = 0.0
- !$omp parallel
- !$omp do reduction(+:x,y,z)
- do i=1, 100
- x = x + i
- y = y + i
- z = z + i
- end do
- !$omp end do
- !$omp end parallel
-end subroutine
-
-! CHECK-LABEL: func.func @_QPmultiple_reductions_different_type() {
-! CHECK: %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFmultiple_reductions_different_typeEi"}
-! CHECK: %[[VAL_1:.*]] = fir.alloca f64 {bindc_name = "w", uniq_name = "_QFmultiple_reductions_different_typeEw"}
-! CHECK: %[[VAL_2:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFmultiple_reductions_different_typeEx"}
-! CHECK: %[[VAL_3:.*]] = fir.alloca i64 {bindc_name = "y", uniq_name = "_QFmultiple_reductions_different_typeEy"}
-! CHECK: %[[VAL_4:.*]] = fir.alloca f32 {bindc_name = "z", uniq_name = "_QFmultiple_reductions_different_typeEz"}
-! CHECK: %[[VAL_5:.*]] = arith.constant 0 : i32
-! CHECK: fir.store %[[VAL_5]] to %[[VAL_2]] : !fir.ref<i32>
-! CHECK: %[[VAL_6:.*]] = arith.constant 0 : i64
-! CHECK: fir.store %[[VAL_6]] to %[[VAL_3]] : !fir.ref<i64>
-! CHECK: %[[VAL_7:.*]] = arith.constant 0.000000e+00 : f32
-! CHECK: fir.store %[[VAL_7]] to %[[VAL_4]] : !fir.ref<f32>
-! CHECK: %[[VAL_8:.*]] = arith.constant 0.000000e+00 : f64
-! CHECK: fir.store %[[VAL_8]] to %[[VAL_1]] : !fir.ref<f64>
-! CHECK: omp.parallel {
-! CHECK: %[[VAL_9:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
-! CHECK: %[[VAL_10:.*]] = arith.constant 1 : i32
-! CHECK: %[[VAL_11:.*]] = arith.constant 100 : i32
-! CHECK: %[[VAL_12:.*]] = arith.constant 1 : i32
-! CHECK: omp.wsloop byref reduction(@add_reduction_byref_i32 %[[VAL_2]] -> %[[VAL_13:.*]] : !fir.ref<i32>, @add_reduction_byref_i64 %[[VAL_3]] -> %[[VAL_14:.*]] : !fir.ref<i64>, @add_reduction_byref_f32 %[[VAL_4]] -> %[[VAL_15:.*]] : !fir.ref<f32>, @add_reduction_byref_f64 %[[VAL_1]] -> %[[VAL_16:.*]] : !fir.ref<f64>) {
-! CHECK-NEXT: omp.loop_nest (%[[VAL_17:.*]]) : i32 = (%[[VAL_10]]) to (%[[VAL_11]]) inclusive step (%[[VAL_12]]) {
-! CHECK: fir.store %[[VAL_17]] to %[[VAL_9]] : !fir.ref<i32>
-! CHECK: %[[VAL_18:.*]] = fir.load %[[VAL_13]] : !fir.ref<i32>
-! CHECK: %[[VAL_19:.*]] = fir.load %[[VAL_9]] : !fir.ref<i32>
-! CHECK: %[[VAL_20:.*]] = arith.addi %[[VAL_18]], %[[VAL_19]] : i32
-! CHECK: fir.store %[[VAL_20]] to %[[VAL_13]] : !fir.ref<i32>
-! CHECK: %[[VAL_21:.*]] = fir.load %[[VAL_14]] : !fir.ref<i64>
-! CHECK: %[[VAL_22:.*]] = fir.load %[[VAL_9]] : !fir.ref<i32>
-! CHECK: %[[VAL_23:.*]] = fir.convert %[[VAL_22]] : (i32) -> i64
-! CHECK: %[[VAL_24:.*]] = arith.addi %[[VAL_21]], %[[VAL_23]] : i64
-! CHECK: fir.store %[[VAL_24]] to %[[VAL_14]] : !fir.ref<i64>
-! CHECK: %[[VAL_25:.*]] = fir.load %[[VAL_15]] : !fir.ref<f32>
-! CHECK: %[[VAL_26:.*]] = fir.load %[[VAL_9]] : !fir.ref<i32>
-! CHECK: %[[VAL_27:.*]] = fir.convert %[[VAL_26]] : (i32) -> f32
-! CHECK: %[[VAL_28:.*]] = arith.addf %[[VAL_25]], %[[VAL_27]] fastmath<contract> : f32
-! CHECK: fir.store %[[VAL_28]] to %[[VAL_15]] : !fir.ref<f32>
-! CHECK: %[[VAL_29:.*]] = fir.load %[[VAL_16]] : !fir.ref<f64>
-! CHECK: %[[VAL_30:.*]] = fir.load %[[VAL_9]] : !fir.ref<i32>
-! CHECK: %[[VAL_31:.*]] = fir.convert %[[VAL_30]] : (i32) -> f64
-! CHECK: %[[VAL_32:.*]] = arith.addf %[[VAL_29]], %[[VAL_31]] fastmath<contract> : f64
-! CHECK: fir.store %[[VAL_32]] to %[[VAL_16]] : !fir.ref<f64>
-! CHECK: omp.yield
-! CHECK: }
-! CHECK: omp.terminator
-! CHECK: }
-! CHECK: omp.terminator
-! CHECK: }
-! CHECK: return
-! CHECK: }
-
-
-subroutine multiple_reductions_different_type
- integer :: x
- integer(kind=8) :: y
- real :: z
- real(kind=8) :: w
- x = 0
- y = 0
- z = 0.0
- w = 0.0
- !$omp parallel
- !$omp do reduction(+:x,y,z,w)
- do i=1, 100
- x = x + i
- y = y + i
- z = z + i
- w = w + i
- end do
- !$omp end do
- !$omp end parallel
-end subroutine
diff --git a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-add.f90 b/flang/test/Lower/OpenMP/FIR/wsloop-reduction-add.f90
deleted file mode 100644
index e0b9330b1a6d..000000000000
--- a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-add.f90
+++ /dev/null
@@ -1,388 +0,0 @@
-! RUN: bbc -emit-fir -hlfir=false -fopenmp %s -o - | FileCheck %s
-! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp %s -o - | FileCheck %s
-
-! CHECK-LABEL: omp.declare_reduction @add_reduction_f64 : f64 init {
-! CHECK: ^bb0(%[[VAL_0:.*]]: f64):
-! CHECK: %[[VAL_1:.*]] = arith.constant 0.000000e+00 : f64
-! CHECK: omp.yield(%[[VAL_1]] : f64)
-
-! CHECK-LABEL: } combiner {
-! CHECK: ^bb0(%[[VAL_0:.*]]: f64, %[[VAL_1:.*]]: f64):
-! CHECK: %[[VAL_2:.*]] = arith.addf %[[VAL_0]], %[[VAL_1]] fastmath<contract> : f64
-! CHECK: omp.yield(%[[VAL_2]] : f64)
-! CHECK: }
-
-! CHECK-LABEL: omp.declare_reduction @add_reduction_i64 : i64 init {
-! CHECK: ^bb0(%[[VAL_0:.*]]: i64):
-! CHECK: %[[VAL_1:.*]] = arith.constant 0 : i64
-! CHECK: omp.yield(%[[VAL_1]] : i64)
-
-! CHECK-LABEL: } combiner {
-! CHECK: ^bb0(%[[VAL_0:.*]]: i64, %[[VAL_1:.*]]: i64):
-! CHECK: %[[VAL_2:.*]] = arith.addi %[[VAL_0]], %[[VAL_1]] : i64
-! CHECK: omp.yield(%[[VAL_2]] : i64)
-! CHECK: }
-
-! CHECK-LABEL: omp.declare_reduction @add_reduction_f32 : f32 init {
-! CHECK: ^bb0(%[[VAL_0:.*]]: f32):
-! CHECK: %[[VAL_1:.*]] = arith.constant 0.000000e+00 : f32
-! CHECK: omp.yield(%[[VAL_1]] : f32)
-
-! CHECK-LABEL: } combiner {
-! CHECK: ^bb0(%[[VAL_0:.*]]: f32, %[[VAL_1:.*]]: f32):
-! CHECK: %[[VAL_2:.*]] = arith.addf %[[VAL_0]], %[[VAL_1]] fastmath<contract> : f32
-! CHECK: omp.yield(%[[VAL_2]] : f32)
-! CHECK: }
-
-! CHECK-LABEL: omp.declare_reduction @add_reduction_i32 : i32 init {
-! CHECK: ^bb0(%[[VAL_0:.*]]: i32):
-! CHECK: %[[VAL_1:.*]] = arith.constant 0 : i32
-! CHECK: omp.yield(%[[VAL_1]] : i32)
-
-! CHECK-LABEL: } combiner {
-! CHECK: ^bb0(%[[VAL_0:.*]]: i32, %[[VAL_1:.*]]: i32):
-! CHECK: %[[VAL_2:.*]] = arith.addi %[[VAL_0]], %[[VAL_1]] : i32
-! CHECK: omp.yield(%[[VAL_2]] : i32)
-! CHECK: }
-
-! CHECK-LABEL: func.func @_QPsimple_int_reduction() {
-! CHECK: %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_int_reductionEi"}
-! CHECK: %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFsimple_int_reductionEx"}
-! CHECK: %[[VAL_2:.*]] = arith.constant 0 : i32
-! CHECK: fir.store %[[VAL_2]] to %[[VAL_1]] : !fir.ref<i32>
-! CHECK: omp.parallel {
-! CHECK: %[[VAL_3:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
-! CHECK: %[[VAL_4:.*]] = arith.constant 1 : i32
-! CHECK: %[[VAL_5:.*]] = arith.constant 100 : i32
-! CHECK: %[[VAL_6:.*]] = arith.constant 1 : i32
-! CHECK: omp.wsloop reduction(@add_reduction_i32 %[[VAL_1]] -> %[[VAL_7:.*]] : !fir.ref<i32>)
-! CHECK-NEXT: omp.loop_nest (%[[VAL_8:.*]]) : i32 = (%[[VAL_4]]) to (%[[VAL_5]]) inclusive step (%[[VAL_6]]) {
-! CHECK: fir.store %[[VAL_8]] to %[[VAL_3]] : !fir.ref<i32>
-! CHECK: %[[VAL_9:.*]] = fir.load %[[VAL_7]] : !fir.ref<i32>
-! CHECK: %[[VAL_10:.*]] = fir.load %[[VAL_3]] : !fir.ref<i32>
-! CHECK: %[[VAL_11:.*]] = arith.addi %[[VAL_9]], %[[VAL_10]] : i32
-! CHECK: fir.store %[[VAL_11]] to %[[VAL_7]] : !fir.ref<i32>
-! CHECK: omp.yield
-! CHECK: }
-! CHECK: omp.terminator
-! CHECK: }
-! CHECK: omp.terminator
-! CHECK: }
-! CHECK: return
-! CHECK: }
-
-subroutine simple_int_reduction
- integer :: x
- x = 0
- !$omp parallel
- !$omp do reduction(+:x)
- do i=1, 100
- x = x + i
- end do
- !$omp end do
- !$omp end parallel
-end subroutine
-
-
-! CHECK-LABEL: func.func @_QPsimple_real_reduction() {
-! CHECK: %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_real_reductionEi"}
-! CHECK: %[[VAL_1:.*]] = fir.alloca f32 {bindc_name = "x", uniq_name = "_QFsimple_real_reductionEx"}
-! CHECK: %[[VAL_2:.*]] = arith.constant 0.000000e+00 : f32
-! CHECK: fir.store %[[VAL_2]] to %[[VAL_1]] : !fir.ref<f32>
-! CHECK: omp.parallel {
-! CHECK: %[[VAL_3:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
-! CHECK: %[[VAL_4:.*]] = arith.constant 1 : i32
-! CHECK: %[[VAL_5:.*]] = arith.constant 100 : i32
-! CHECK: %[[VAL_6:.*]] = arith.constant 1 : i32
-! CHECK: omp.wsloop reduction(@add_reduction_f32 %[[VAL_1]] -> %[[VAL_7:.*]] : !fir.ref<f32>) {
-! CHECK-NEXT: omp.loop_nest (%[[VAL_8:.*]]) : i32 = (%[[VAL_4]]) to (%[[VAL_5]]) inclusive step (%[[VAL_6]]) {
-! CHECK: fir.store %[[VAL_8]] to %[[VAL_3]] : !fir.ref<i32>
-! CHECK: %[[VAL_9:.*]] = fir.load %[[VAL_7]] : !fir.ref<f32>
-! CHECK: %[[VAL_10:.*]] = fir.load %[[VAL_3]] : !fir.ref<i32>
-! CHECK: %[[VAL_11:.*]] = fir.convert %[[VAL_10]] : (i32) -> f32
-! CHECK: %[[VAL_12:.*]] = arith.addf %[[VAL_9]], %[[VAL_11]] fastmath<contract> : f32
-! CHECK: fir.store %[[VAL_12]] to %[[VAL_7]] : !fir.ref<f32>
-! CHECK: omp.yield
-! CHECK: }
-! CHECK: omp.terminator
-! CHECK: }
-! CHECK: omp.terminator
-! CHECK: }
-! CHECK: return
-! CHECK: }
-
-subroutine simple_real_reduction
- real :: x
- x = 0.0
- !$omp parallel
- !$omp do reduction(+:x)
- do i=1, 100
- x = x + i
- end do
- !$omp end do
- !$omp end parallel
-end subroutine
-
-! CHECK-LABEL: func.func @_QPsimple_int_reduction_switch_order() {
-! CHECK: %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_int_reduction_switch_orderEi"}
-! CHECK: %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFsimple_int_reduction_switch_orderEx"}
-! CHECK: %[[VAL_2:.*]] = arith.constant 0 : i32
-! CHECK: fir.store %[[VAL_2]] to %[[VAL_1]] : !fir.ref<i32>
-! CHECK: omp.parallel {
-! CHECK: %[[VAL_3:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
-! CHECK: %[[VAL_4:.*]] = arith.constant 1 : i32
-! CHECK: %[[VAL_5:.*]] = arith.constant 100 : i32
-! CHECK: %[[VAL_6:.*]] = arith.constant 1 : i32
-! CHECK: omp.wsloop reduction(@add_reduction_i32 %[[VAL_1]] -> %[[VAL_7:.*]] : !fir.ref<i32>) {
-! CHECK-NEXT: omp.loop_nest (%[[VAL_8:.*]]) : i32 = (%[[VAL_4]]) to (%[[VAL_5]]) inclusive step (%[[VAL_6]]) {
-! CHECK: fir.store %[[VAL_8]] to %[[VAL_3]] : !fir.ref<i32>
-! CHECK: %[[VAL_9:.*]] = fir.load %[[VAL_3]] : !fir.ref<i32>
-! CHECK: %[[VAL_10:.*]] = fir.load %[[VAL_7]] : !fir.ref<i32>
-! CHECK: %[[VAL_11:.*]] = arith.addi %[[VAL_9]], %[[VAL_10]] : i32
-! CHECK: fir.store %[[VAL_11]] to %[[VAL_7]] : !fir.ref<i32>
-! CHECK: omp.yield
-! CHECK: }
-! CHECK: omp.terminator
-! CHECK: }
-! CHECK: omp.terminator
-! CHECK: }
-! CHECK: return
-! CHECK: }
-
-subroutine simple_int_reduction_switch_order
- integer :: x
- x = 0
- !$omp parallel
- !$omp do reduction(+:x)
- do i=1, 100
- x = i + x
- end do
- !$omp end do
- !$omp end parallel
-end subroutine
-
-! CHECK-LABEL: func.func @_QPsimple_real_reduction_switch_order() {
-! CHECK: %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_real_reduction_switch_orderEi"}
-! CHECK: %[[VAL_1:.*]] = fir.alloca f32 {bindc_name = "x", uniq_name = "_QFsimple_real_reduction_switch_orderEx"}
-! CHECK: %[[VAL_2:.*]] = arith.constant 0.000000e+00 : f32
-! CHECK: fir.store %[[VAL_2]] to %[[VAL_1]] : !fir.ref<f32>
-! CHECK: omp.parallel {
-! CHECK: %[[VAL_3:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
-! CHECK: %[[VAL_4:.*]] = arith.constant 1 : i32
-! CHECK: %[[VAL_5:.*]] = arith.constant 100 : i32
-! CHECK: %[[VAL_6:.*]] = arith.constant 1 : i32
-! CHECK: omp.wsloop reduction(@add_reduction_f32 %[[VAL_1]] -> %[[VAL_7:.*]] : !fir.ref<f32>) {
-! CHECK-NEXT: omp.loop_nest (%[[VAL_8:.*]]) : i32 = (%[[VAL_4]]) to (%[[VAL_5]]) inclusive step (%[[VAL_6]]) {
-! CHECK: fir.store %[[VAL_8]] to %[[VAL_3]] : !fir.ref<i32>
-! CHECK: %[[VAL_9:.*]] = fir.load %[[VAL_3]] : !fir.ref<i32>
-! CHECK: %[[VAL_10:.*]] = fir.convert %[[VAL_9]] : (i32) -> f32
-! CHECK: %[[VAL_11:.*]] = fir.load %[[VAL_7]] : !fir.ref<f32>
-! CHECK: %[[VAL_12:.*]] = arith.addf %[[VAL_10]], %[[VAL_11]] fastmath<contract> : f32
-! CHECK: fir.store %[[VAL_12]] to %[[VAL_7]] : !fir.ref<f32>
-! CHECK: omp.yield
-! CHECK: }
-! CHECK: omp.terminator
-! CHECK: }
-! CHECK: omp.terminator
-! CHECK: }
-! CHECK: return
-! CHECK: }
-
-subroutine simple_real_reduction_switch_order
- real :: x
- x = 0.0
- !$omp parallel
- !$omp do reduction(+:x)
- do i=1, 100
- x = i + x
- end do
- !$omp end do
- !$omp end parallel
-end subroutine
-
-! CHECK-LABEL: func.func @_QPmultiple_int_reductions_same_type() {
-! CHECK: %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFmultiple_int_reductions_same_typeEi"}
-! CHECK: %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFmultiple_int_reductions_same_typeEx"}
-! CHECK: %[[VAL_2:.*]] = fir.alloca i32 {bindc_name = "y", uniq_name = "_QFmultiple_int_reductions_same_typeEy"}
-! CHECK: %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "z", uniq_name = "_QFmultiple_int_reductions_same_typeEz"}
-! CHECK: %[[VAL_4:.*]] = arith.constant 0 : i32
-! CHECK: fir.store %[[VAL_4]] to %[[VAL_1]] : !fir.ref<i32>
-! CHECK: %[[VAL_5:.*]] = arith.constant 0 : i32
-! CHECK: fir.store %[[VAL_5]] to %[[VAL_2]] : !fir.ref<i32>
-! CHECK: %[[VAL_6:.*]] = arith.constant 0 : i32
-! CHECK: fir.store %[[VAL_6]] to %[[VAL_3]] : !fir.ref<i32>
-! CHECK: omp.parallel {
-! CHECK: %[[VAL_7:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
-! CHECK: %[[VAL_8:.*]] = arith.constant 1 : i32
-! CHECK: %[[VAL_9:.*]] = arith.constant 100 : i32
-! CHECK: %[[VAL_10:.*]] = arith.constant 1 : i32
-! CHECK: omp.wsloop reduction(@add_reduction_i32 %[[VAL_1]] -> %[[VAL_11:.*]] : !fir.ref<i32>, @add_reduction_i32 %[[VAL_2]] -> %[[VAL_12:.*]] : !fir.ref<i32>, @add_reduction_i32 %[[VAL_3]] -> %[[VAL_13:.*]] : !fir.ref<i32>) {
-! CHECK-NEXT: omp.loop_nest (%[[VAL_14:.*]]) : i32 = (%[[VAL_8]]) to (%[[VAL_9]]) inclusive step (%[[VAL_10]]) {
-! CHECK: fir.store %[[VAL_14]] to %[[VAL_7]] : !fir.ref<i32>
-! CHECK: %[[VAL_15:.*]] = fir.load %[[VAL_11]] : !fir.ref<i32>
-! CHECK: %[[VAL_16:.*]] = fir.load %[[VAL_7]] : !fir.ref<i32>
-! CHECK: %[[VAL_17:.*]] = arith.addi %[[VAL_15]], %[[VAL_16]] : i32
-! CHECK: fir.store %[[VAL_17]] to %[[VAL_11]] : !fir.ref<i32>
-! CHECK: %[[VAL_18:.*]] = fir.load %[[VAL_12]] : !fir.ref<i32>
-! CHECK: %[[VAL_19:.*]] = fir.load %[[VAL_7]] : !fir.ref<i32>
-! CHECK: %[[VAL_20:.*]] = arith.addi %[[VAL_18]], %[[VAL_19]] : i32
-! CHECK: fir.store %[[VAL_20]] to %[[VAL_12]] : !fir.ref<i32>
-! CHECK: %[[VAL_21:.*]] = fir.load %[[VAL_13]] : !fir.ref<i32>
-! CHECK: %[[VAL_22:.*]] = fir.load %[[VAL_7]] : !fir.ref<i32>
-! CHECK: %[[VAL_23:.*]] = arith.addi %[[VAL_21]], %[[VAL_22]] : i32
-! CHECK: fir.store %[[VAL_23]] to %[[VAL_13]] : !fir.ref<i32>
-! CHECK: omp.yield
-! CHECK: }
-! CHECK: omp.terminator
-! CHECK: }
-! CHECK: omp.terminator
-! CHECK: }
-! CHECK: return
-! CHECK: }
-
-subroutine multiple_int_reductions_same_type
- integer :: x,y,z
- x = 0
- y = 0
- z = 0
- !$omp parallel
- !$omp do reduction(+:x,y,z)
- do i=1, 100
- x = x + i
- y = y + i
- z = z + i
- end do
- !$omp end do
- !$omp end parallel
-end subroutine
-
-! CHECK-LABEL: func.func @_QPmultiple_real_reductions_same_type() {
-! CHECK: %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFmultiple_real_reductions_same_typeEi"}
-! CHECK: %[[VAL_1:.*]] = fir.alloca f32 {bindc_name = "x", uniq_name = "_QFmultiple_real_reductions_same_typeEx"}
-! CHECK: %[[VAL_2:.*]] = fir.alloca f32 {bindc_name = "y", uniq_name = "_QFmultiple_real_reductions_same_typeEy"}
-! CHECK: %[[VAL_3:.*]] = fir.alloca f32 {bindc_name = "z", uniq_name = "_QFmultiple_real_reductions_same_typeEz"}
-! CHECK: %[[VAL_4:.*]] = arith.constant 0.000000e+00 : f32
-! CHECK: fir.store %[[VAL_4]] to %[[VAL_1]] : !fir.ref<f32>
-! CHECK: %[[VAL_5:.*]] = arith.constant 0.000000e+00 : f32
-! CHECK: fir.store %[[VAL_5]] to %[[VAL_2]] : !fir.ref<f32>
-! CHECK: %[[VAL_6:.*]] = arith.constant 0.000000e+00 : f32
-! CHECK: fir.store %[[VAL_6]] to %[[VAL_3]] : !fir.ref<f32>
-! CHECK: omp.parallel {
-! CHECK: %[[VAL_7:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
-! CHECK: %[[VAL_8:.*]] = arith.constant 1 : i32
-! CHECK: %[[VAL_9:.*]] = arith.constant 100 : i32
-! CHECK: %[[VAL_10:.*]] = arith.constant 1 : i32
-! CHECK: omp.wsloop reduction(@add_reduction_f32 %[[VAL_1]] -> %[[VAL_11:.*]] : !fir.ref<f32>, @add_reduction_f32 %[[VAL_2]] -> %[[VAL_12:.*]] : !fir.ref<f32>, @add_reduction_f32 %[[VAL_3]] -> %[[VAL_13:.*]] : !fir.ref<f32>) {
-! CHECK-NEXT: omp.loop_nest (%[[VAL_14:.*]]) : i32 = (%[[VAL_8]]) to (%[[VAL_9]]) inclusive step (%[[VAL_10]]) {
-! CHECK: fir.store %[[VAL_14]] to %[[VAL_7]] : !fir.ref<i32>
-! CHECK: %[[VAL_15:.*]] = fir.load %[[VAL_11]] : !fir.ref<f32>
-! CHECK: %[[VAL_16:.*]] = fir.load %[[VAL_7]] : !fir.ref<i32>
-! CHECK: %[[VAL_17:.*]] = fir.convert %[[VAL_16]] : (i32) -> f32
-! CHECK: %[[VAL_18:.*]] = arith.addf %[[VAL_15]], %[[VAL_17]] fastmath<contract> : f32
-! CHECK: fir.store %[[VAL_18]] to %[[VAL_11]] : !fir.ref<f32>
-! CHECK: %[[VAL_19:.*]] = fir.load %[[VAL_12]] : !fir.ref<f32>
-! CHECK: %[[VAL_20:.*]] = fir.load %[[VAL_7]] : !fir.ref<i32>
-! CHECK: %[[VAL_21:.*]] = fir.convert %[[VAL_20]] : (i32) -> f32
-! CHECK: %[[VAL_22:.*]] = arith.addf %[[VAL_19]], %[[VAL_21]] fastmath<contract> : f32
-! CHECK: fir.store %[[VAL_22]] to %[[VAL_12]] : !fir.ref<f32>
-! CHECK: %[[VAL_23:.*]] = fir.load %[[VAL_13]] : !fir.ref<f32>
-! CHECK: %[[VAL_24:.*]] = fir.load %[[VAL_7]] : !fir.ref<i32>
-! CHECK: %[[VAL_25:.*]] = fir.convert %[[VAL_24]] : (i32) -> f32
-! CHECK: %[[VAL_26:.*]] = arith.addf %[[VAL_23]], %[[VAL_25]] fastmath<contract> : f32
-! CHECK: fir.store %[[VAL_26]] to %[[VAL_13]] : !fir.ref<f32>
-! CHECK: omp.yield
-! CHECK: }
-! CHECK: omp.terminator
-! CHECK: }
-! CHECK: omp.terminator
-! CHECK: }
-! CHECK: return
-! CHECK: }
-
-subroutine multiple_real_reductions_same_type
- real :: x,y,z
- x = 0.0
- y = 0.0
- z = 0.0
- !$omp parallel
- !$omp do reduction(+:x,y,z)
- do i=1, 100
- x = x + i
- y = y + i
- z = z + i
- end do
- !$omp end do
- !$omp end parallel
-end subroutine
-
-! CHECK-LABEL: func.func @_QPmultiple_reductions_different_type() {
-! CHECK: %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFmultiple_reductions_different_typeEi"}
-! CHECK: %[[VAL_1:.*]] = fir.alloca f64 {bindc_name = "w", uniq_name = "_QFmultiple_reductions_different_typeEw"}
-! CHECK: %[[VAL_2:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFmultiple_reductions_different_typeEx"}
-! CHECK: %[[VAL_3:.*]] = fir.alloca i64 {bindc_name = "y", uniq_name = "_QFmultiple_reductions_different_typeEy"}
-! CHECK: %[[VAL_4:.*]] = fir.alloca f32 {bindc_name = "z", uniq_name = "_QFmultiple_reductions_different_typeEz"}
-! CHECK: %[[VAL_5:.*]] = arith.constant 0 : i32
-! CHECK: fir.store %[[VAL_5]] to %[[VAL_2]] : !fir.ref<i32>
-! CHECK: %[[VAL_6:.*]] = arith.constant 0 : i64
-! CHECK: fir.store %[[VAL_6]] to %[[VAL_3]] : !fir.ref<i64>
-! CHECK: %[[VAL_7:.*]] = arith.constant 0.000000e+00 : f32
-! CHECK: fir.store %[[VAL_7]] to %[[VAL_4]] : !fir.ref<f32>
-! CHECK: %[[VAL_8:.*]] = arith.constant 0.000000e+00 : f64
-! CHECK: fir.store %[[VAL_8]] to %[[VAL_1]] : !fir.ref<f64>
-! CHECK: omp.parallel {
-! CHECK: %[[VAL_9:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
-! CHECK: %[[VAL_10:.*]] = arith.constant 1 : i32
-! CHECK: %[[VAL_11:.*]] = arith.constant 100 : i32
-! CHECK: %[[VAL_12:.*]] = arith.constant 1 : i32
-! CHECK: omp.wsloop reduction(@add_reduction_i32 %[[VAL_2]] -> %[[VAL_13:.*]] : !fir.ref<i32>, @add_reduction_i64 %[[VAL_3]] -> %[[VAL_14:.*]] : !fir.ref<i64>, @add_reduction_f32 %[[VAL_4]] -> %[[VAL_15:.*]] : !fir.ref<f32>, @add_reduction_f64 %[[VAL_1]] -> %[[VAL_16:.*]] : !fir.ref<f64>) {
-! CHECK-NEXT: omp.loop_nest (%[[VAL_17:.*]]) : i32 = (%[[VAL_10]]) to (%[[VAL_11]]) inclusive step (%[[VAL_12]]) {
-! CHECK: fir.store %[[VAL_17]] to %[[VAL_9]] : !fir.ref<i32>
-! CHECK: %[[VAL_18:.*]] = fir.load %[[VAL_13]] : !fir.ref<i32>
-! CHECK: %[[VAL_19:.*]] = fir.load %[[VAL_9]] : !fir.ref<i32>
-! CHECK: %[[VAL_20:.*]] = arith.addi %[[VAL_18]], %[[VAL_19]] : i32
-! CHECK: fir.store %[[VAL_20]] to %[[VAL_13]] : !fir.ref<i32>
-! CHECK: %[[VAL_21:.*]] = fir.load %[[VAL_14]] : !fir.ref<i64>
-! CHECK: %[[VAL_22:.*]] = fir.load %[[VAL_9]] : !fir.ref<i32>
-! CHECK: %[[VAL_23:.*]] = fir.convert %[[VAL_22]] : (i32) -> i64
-! CHECK: %[[VAL_24:.*]] = arith.addi %[[VAL_21]], %[[VAL_23]] : i64
-! CHECK: fir.store %[[VAL_24]] to %[[VAL_14]] : !fir.ref<i64>
-! CHECK: %[[VAL_25:.*]] = fir.load %[[VAL_15]] : !fir.ref<f32>
-! CHECK: %[[VAL_26:.*]] = fir.load %[[VAL_9]] : !fir.ref<i32>
-! CHECK: %[[VAL_27:.*]] = fir.convert %[[VAL_26]] : (i32) -> f32
-! CHECK: %[[VAL_28:.*]] = arith.addf %[[VAL_25]], %[[VAL_27]] fastmath<contract> : f32
-! CHECK: fir.store %[[VAL_28]] to %[[VAL_15]] : !fir.ref<f32>
-! CHECK: %[[VAL_29:.*]] = fir.load %[[VAL_16]] : !fir.ref<f64>
-! CHECK: %[[VAL_30:.*]] = fir.load %[[VAL_9]] : !fir.ref<i32>
-! CHECK: %[[VAL_31:.*]] = fir.convert %[[VAL_30]] : (i32) -> f64
-! CHECK: %[[VAL_32:.*]] = arith.addf %[[VAL_29]], %[[VAL_31]] fastmath<contract> : f64
-! CHECK: fir.store %[[VAL_32]] to %[[VAL_16]] : !fir.ref<f64>
-! CHECK: omp.yield
-! CHECK: }
-! CHECK: omp.terminator
-! CHECK: }
-! CHECK: omp.terminator
-! CHECK: }
-! CHECK: return
-! CHECK: }
-
-
-subroutine multiple_reductions_different_type
- integer :: x
- integer(kind=8) :: y
- real :: z
- real(kind=8) :: w
- x = 0
- y = 0
- z = 0.0
- w = 0.0
- !$omp parallel
- !$omp do reduction(+:x,y,z,w)
- do i=1, 100
- x = x + i
- y = y + i
- z = z + i
- w = w + i
- end do
- !$omp end do
- !$omp end parallel
-end subroutine
diff --git a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-iand-byref.f90 b/flang/test/Lower/OpenMP/FIR/wsloop-reduction-iand-byref.f90
deleted file mode 100644
index b25ab84f60fe..000000000000
--- a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-iand-byref.f90
+++ /dev/null
@@ -1,48 +0,0 @@
-! RUN: bbc -emit-fir -hlfir=false -fopenmp --force-byref-reduction %s -o - | FileCheck %s
-! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp -mmlir --force-byref-reduction %s -o - | FileCheck %s
-
-!CHECK-LABEL: omp.declare_reduction @iand_byref_i32 : !fir.ref<i32>
-!CHECK-SAME: init {
-!CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<i32>):
-!CHECK: %[[C0_1:.*]] = arith.constant -1 : i32
-!CHECK: %[[REF:.*]] = fir.alloca i32
-!CHECK: fir.store %[[C0_1]] to %[[REF]] : !fir.ref<i32>
-!CHECK: omp.yield(%[[REF]] : !fir.ref<i32>)
-
-!CHECK-LABEL: } combiner {
-!CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<i32>, %[[ARG1:.*]]: !fir.ref<i32>):
-!CHECK: %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<i32>
-!CHECK: %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<i32>
-!CHECK: %[[RES:.*]] = arith.andi %[[LD0]], %[[LD1]] : i32
-!CHECK: fir.store %[[RES]] to %[[ARG0]] : !fir.ref<i32>
-!CHECK: omp.yield(%[[ARG0]] : !fir.ref<i32>)
-!CHECK: }
-
-
-!CHECK-LABEL: @_QPreduction_iand
-!CHECK-SAME: %[[Y_BOX:.*]]: !fir.box<!fir.array<?xi32>>
-!CHECK: %[[X_REF:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFreduction_iandEx"}
-!CHECK: omp.parallel
-!CHECK: omp.wsloop byref reduction(@iand_byref_i32 %[[X_REF]] -> %[[PRV:.+]] : !fir.ref<i32>)
-!CHECK-NEXT: omp.loop_nest
-!CHECK: %[[LPRV:.+]] = fir.load %[[PRV]] : !fir.ref<i32>
-!CHECK: %[[Y_I_REF:.*]] = fir.coordinate_of %[[Y_BOX]]
-!CHECK: %[[Y_I:.*]] = fir.load %[[Y_I_REF]] : !fir.ref<i32>
-!CHECK: %[[RES:.+]] = arith.andi %[[LPRV]], %[[Y_I]] : i32
-!CHECK: fir.store %[[RES]] to %[[PRV]] : !fir.ref<i32>
-!CHECK: omp.yield
-!CHECK: omp.terminator
-!CHECK: omp.terminator
-
-subroutine reduction_iand(y)
- integer :: x, y(:)
- x = 0
- !$omp parallel
- !$omp do reduction(iand:x)
- do i=1, 100
- x = iand(x, y(i))
- end do
- !$omp end do
- !$omp end parallel
- print *, x
-end subroutine
diff --git a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-iand.f90 b/flang/test/Lower/OpenMP/FIR/wsloop-reduction-iand.f90
deleted file mode 100644
index dfc140d7d5f6..000000000000
--- a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-iand.f90
+++ /dev/null
@@ -1,38 +0,0 @@
-! RUN: bbc -emit-fir -hlfir=false -fopenmp %s -o - | FileCheck %s
-! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp %s -o - | FileCheck %s
-
-!CHECK: omp.declare_reduction @[[IAND_DECLARE_I:.*]] : i32 init {
-!CHECK: %[[ZERO_VAL_I:.*]] = arith.constant -1 : i32
-!CHECK: omp.yield(%[[ZERO_VAL_I]] : i32)
-!CHECK: combiner
-!CHECK: ^bb0(%[[ARG0_I:.*]]: i32, %[[ARG1_I:.*]]: i32):
-!CHECK: %[[IAND_VAL_I:.*]] = arith.andi %[[ARG0_I]], %[[ARG1_I]] : i32
-!CHECK: omp.yield(%[[IAND_VAL_I]] : i32)
-
-!CHECK-LABEL: @_QPreduction_iand
-!CHECK-SAME: %[[Y_BOX:.*]]: !fir.box<!fir.array<?xi32>>
-!CHECK: %[[X_REF:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFreduction_iandEx"}
-!CHECK: omp.parallel
-!CHECK: omp.wsloop reduction(@[[IAND_DECLARE_I]] %[[X_REF]] -> %[[PRV:.+]] : !fir.ref<i32>)
-!CHECK-NEXT: omp.loop_nest
-!CHECK: %[[LPRV:.+]] = fir.load %[[PRV]] : !fir.ref<i32>
-!CHECK: %[[Y_I_REF:.*]] = fir.coordinate_of %[[Y_BOX]]
-!CHECK: %[[Y_I:.*]] = fir.load %[[Y_I_REF]] : !fir.ref<i32>
-!CHECK: %[[RES:.+]] = arith.andi %[[LPRV]], %[[Y_I]] : i32
-!CHECK: fir.store %[[RES]] to %[[PRV]] : !fir.ref<i32>
-!CHECK: omp.yield
-!CHECK: omp.terminator
-!CHECK: omp.terminator
-
-subroutine reduction_iand(y)
- integer :: x, y(:)
- x = 0
- !$omp parallel
- !$omp do reduction(iand:x)
- do i=1, 100
- x = iand(x, y(i))
- end do
- !$omp end do
- !$omp end parallel
- print *, x
-end subroutine
diff --git a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-ieor-byref.f90 b/flang/test/Lower/OpenMP/FIR/wsloop-reduction-ieor-byref.f90
deleted file mode 100644
index 56eb087bae5a..000000000000
--- a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-ieor-byref.f90
+++ /dev/null
@@ -1,47 +0,0 @@
-! RUN: bbc -emit-fir -hlfir=false -fopenmp --force-byref-reduction %s -o - | FileCheck %s
-! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -mmlir --force-byref-reduction -fopenmp %s -o - | FileCheck %s
-
-! CHECK-LABEL: omp.declare_reduction @ieor_byref_i32 : !fir.ref<i32>
-! CHECK-SAME: init {
-! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<i32>):
-! CHECK: %[[C0_1:.*]] = arith.constant 0 : i32
-! CHECK: %[[REF:.*]] = fir.alloca i32
-! CHECK: fir.store %[[C0_1]] to %[[REF]] : !fir.ref<i32>
-! CHECK: omp.yield(%[[REF]] : !fir.ref<i32>)
-
-! CHECK-LABEL: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<i32>, %[[ARG1:.*]]: !fir.ref<i32>):
-! CHECK: %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<i32>
-! CHECK: %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<i32>
-! CHECK: %[[RES:.*]] = arith.xori %[[LD0]], %[[LD1]] : i32
-! CHECK: fir.store %[[RES]] to %[[ARG0]] : !fir.ref<i32>
-! CHECK: omp.yield(%[[ARG0]] : !fir.ref<i32>)
-! CHECK: }
-
-!CHECK-LABEL: @_QPreduction_ieor
-!CHECK-SAME: %[[Y_BOX:.*]]: !fir.box<!fir.array<?xi32>>
-!CHECK: %[[X_REF:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFreduction_ieorEx"}
-!CHECK: omp.parallel
-!CHECK: omp.wsloop byref reduction(@ieor_byref_i32 %[[X_REF]] -> %[[PRV:.+]] : !fir.ref<i32>)
-!CHECK-NEXT: omp.loop_nest
-!CHECK: %[[LPRV:.+]] = fir.load %[[PRV]] : !fir.ref<i32>
-!CHECK: %[[Y_I_REF:.*]] = fir.coordinate_of %[[Y_BOX]]
-!CHECK: %[[Y_I:.*]] = fir.load %[[Y_I_REF]] : !fir.ref<i32>
-!CHECK: %[[RES:.+]] = arith.xori %[[LPRV]], %[[Y_I]] : i32
-!CHECK: fir.store %[[RES]] to %[[PRV]] : !fir.ref<i32>
-!CHECK: omp.yield
-!CHECK: omp.terminator
-!CHECK: omp.terminator
-
-subroutine reduction_ieor(y)
- integer :: x, y(:)
- x = 0
- !$omp parallel
- !$omp do reduction(ieor:x)
- do i=1, 100
- x = ieor(x, y(i))
- end do
- !$omp end do
- !$omp end parallel
- print *, x
-end subroutine
diff --git a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-ieor.f90 b/flang/test/Lower/OpenMP/FIR/wsloop-reduction-ieor.f90
deleted file mode 100644
index 1ddf82b828cb..000000000000
--- a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-ieor.f90
+++ /dev/null
@@ -1,38 +0,0 @@
-! RUN: bbc -emit-fir -hlfir=false -fopenmp %s -o - | FileCheck %s
-! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp %s -o - | FileCheck %s
-
-!CHECK: omp.declare_reduction @[[IEOR_DECLARE_I:.*]] : i32 init {
-!CHECK: %[[ZERO_VAL_I:.*]] = arith.constant 0 : i32
-!CHECK: omp.yield(%[[ZERO_VAL_I]] : i32)
-!CHECK: combiner
-!CHECK: ^bb0(%[[ARG0_I:.*]]: i32, %[[ARG1_I:.*]]: i32):
-!CHECK: %[[IEOR_VAL_I:.*]] = arith.xori %[[ARG0_I]], %[[ARG1_I]] : i32
-!CHECK: omp.yield(%[[IEOR_VAL_I]] : i32)
-
-!CHECK-LABEL: @_QPreduction_ieor
-!CHECK-SAME: %[[Y_BOX:.*]]: !fir.box<!fir.array<?xi32>>
-!CHECK: %[[X_REF:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFreduction_ieorEx"}
-!CHECK: omp.parallel
-!CHECK: omp.wsloop reduction(@[[IEOR_DECLARE_I]] %[[X_REF]] -> %[[PRV:.+]] : !fir.ref<i32>)
-!CHECK-NEXT: omp.loop_nest
-!CHECK: %[[LPRV:.+]] = fir.load %[[PRV]] : !fir.ref<i32>
-!CHECK: %[[Y_I_REF:.*]] = fir.coordinate_of %[[Y_BOX]]
-!CHECK: %[[Y_I:.*]] = fir.load %[[Y_I_REF]] : !fir.ref<i32>
-!CHECK: %[[RES:.+]] = arith.xori %[[LPRV]], %[[Y_I]] : i32
-!CHECK: fir.store %[[RES]] to %[[PRV]] : !fir.ref<i32>
-!CHECK: omp.yield
-!CHECK: omp.terminator
-!CHECK: omp.terminator
-
-subroutine reduction_ieor(y)
- integer :: x, y(:)
- x = 0
- !$omp parallel
- !$omp do reduction(ieor:x)
- do i=1, 100
- x = ieor(x, y(i))
- end do
- !$omp end do
- !$omp end parallel
- print *, x
-end subroutine
diff --git a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-ior-byref.f90 b/flang/test/Lower/OpenMP/FIR/wsloop-reduction-ior-byref.f90
deleted file mode 100644
index e761d24cd303..000000000000
--- a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-ior-byref.f90
+++ /dev/null
@@ -1,47 +0,0 @@
-! RUN: bbc -emit-fir -hlfir=false -fopenmp --force-byref-reduction %s -o - | FileCheck %s
-! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp -mmlir --force-byref-reduction %s -o - | FileCheck %s
-
-! CHECK-LABEL: omp.declare_reduction @ior_byref_i32 : !fir.ref<i32>
-! CHECK-SAME: init {
-! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<i32>):
-! CHECK: %[[C0_1:.*]] = arith.constant 0 : i32
-! CHECK: %[[REF:.*]] = fir.alloca i32
-! CHECK: fir.store %[[C0_1]] to %[[REF]] : !fir.ref<i32>
-! CHECK: omp.yield(%[[REF]] : !fir.ref<i32>)
-
-! CHECK-LABEL: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<i32>, %[[ARG1:.*]]: !fir.ref<i32>):
-! CHECK: %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<i32>
-! CHECK: %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<i32>
-! CHECK: %[[RES:.*]] = arith.ori %[[LD0]], %[[LD1]] : i32
-! CHECK: fir.store %[[RES]] to %[[ARG0]] : !fir.ref<i32>
-! CHECK: omp.yield(%[[ARG0]] : !fir.ref<i32>)
-! CHECK: }
-
-!CHECK-LABEL: @_QPreduction_ior
-!CHECK-SAME: %[[Y_BOX:.*]]: !fir.box<!fir.array<?xi32>>
-!CHECK: %[[X_REF:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFreduction_iorEx"}
-!CHECK: omp.parallel
-!CHECK: omp.wsloop byref reduction(@ior_byref_i32 %[[X_REF]] -> %[[PRV:.+]] : !fir.ref<i32>)
-!CHECK-NEXT: omp.loop_nest
-!CHECK: %[[LPRV:.+]] = fir.load %[[PRV]] : !fir.ref<i32>
-!CHECK: %[[Y_I_REF:.*]] = fir.coordinate_of %[[Y_BOX]]
-!CHECK: %[[Y_I:.*]] = fir.load %[[Y_I_REF]] : !fir.ref<i32>
-!CHECK: %[[RES:.+]] = arith.ori %[[LPRV]], %[[Y_I]] : i32
-!CHECK: fir.store %[[RES]] to %[[PRV]] : !fir.ref<i32>
-!CHECK: omp.yield
-!CHECK: omp.terminator
-!CHECK: omp.terminator
-
-subroutine reduction_ior(y)
- integer :: x, y(:)
- x = 0
- !$omp parallel
- !$omp do reduction(ior:x)
- do i=1, 100
- x = ior(x, y(i))
- end do
- !$omp end do
- !$omp end parallel
- print *, x
-end subroutine
diff --git a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-ior.f90 b/flang/test/Lower/OpenMP/FIR/wsloop-reduction-ior.f90
deleted file mode 100644
index 148dbc909bab..000000000000
--- a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-ior.f90
+++ /dev/null
@@ -1,38 +0,0 @@
-! RUN: bbc -emit-fir -hlfir=false -fopenmp %s -o - | FileCheck %s
-! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp %s -o - | FileCheck %s
-
-!CHECK: omp.declare_reduction @[[IOR_DECLARE_I:.*]] : i32 init {
-!CHECK: %[[ZERO_VAL_I:.*]] = arith.constant 0 : i32
-!CHECK: omp.yield(%[[ZERO_VAL_I]] : i32)
-!CHECK: combiner
-!CHECK: ^bb0(%[[ARG0_I:.*]]: i32, %[[ARG1_I:.*]]: i32):
-!CHECK: %[[IOR_VAL_I:.*]] = arith.ori %[[ARG0_I]], %[[ARG1_I]] : i32
-!CHECK: omp.yield(%[[IOR_VAL_I]] : i32)
-
-!CHECK-LABEL: @_QPreduction_ior
-!CHECK-SAME: %[[Y_BOX:.*]]: !fir.box<!fir.array<?xi32>>
-!CHECK: %[[X_REF:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFreduction_iorEx"}
-!CHECK: omp.parallel
-!CHECK: omp.wsloop reduction(@[[IOR_DECLARE_I]] %[[X_REF]] -> %[[PRV:.+]] : !fir.ref<i32>)
-!CHECK-NEXT: omp.loop_nest
-!CHECK: %[[LPRV:.+]] = fir.load %[[PRV]] : !fir.ref<i32>
-!CHECK: %[[Y_I_REF:.*]] = fir.coordinate_of %[[Y_BOX]]
-!CHECK: %[[Y_I:.*]] = fir.load %[[Y_I_REF]] : !fir.ref<i32>
-!CHECK: %[[RES:.+]] = arith.ori %[[LPRV]], %[[Y_I]] : i32
-!CHECK: fir.store %[[RES]] to %[[PRV]] : !fir.ref<i32>
-!CHECK: omp.yield
-!CHECK: omp.terminator
-!CHECK: omp.terminator
-
-subroutine reduction_ior(y)
- integer :: x, y(:)
- x = 0
- !$omp parallel
- !$omp do reduction(ior:x)
- do i=1, 100
- x = ior(x, y(i))
- end do
- !$omp end do
- !$omp end parallel
- print *, x
-end subroutine
diff --git a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-logical-eqv-byref.f90 b/flang/test/Lower/OpenMP/FIR/wsloop-reduction-logical-eqv-byref.f90
deleted file mode 100644
index 17cd02a0ca7f..000000000000
--- a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-logical-eqv-byref.f90
+++ /dev/null
@@ -1,193 +0,0 @@
-! RUN: bbc -emit-fir -hlfir=false -fopenmp --force-byref-reduction %s -o - | FileCheck %s
-! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp -mmlir --force-byref-reduction %s -o - | FileCheck %s
-
-! NOTE: Assertions have been autogenerated by utils/generate-test-checks.py
-
-! CHECK-LABEL: omp.declare_reduction @eqv_reduction : !fir.ref<!fir.logical<4>>
-! CHECK-SAME: init {
-! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.logical<4>>):
-! CHECK: %[[VAL_1:.*]] = arith.constant true
-! CHECK: %[[VAL_2:.*]] = fir.convert %[[VAL_1]] : (i1) -> !fir.logical<4>
-! CHECK: %[[REF:.*]] = fir.alloca !fir.logical<4>
-! CHECK: fir.store %[[VAL_2]] to %[[REF]] : !fir.ref<!fir.logical<4>>
-! CHECK: omp.yield(%[[REF]] : !fir.ref<!fir.logical<4>>)
-
-! CHECK-LABEL: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.logical<4>>, %[[ARG1:.*]]: !fir.ref<!fir.logical<4>>):
-! CHECK: %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<!fir.logical<4>>
-! CHECK: %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<!fir.logical<4>>
-! CHECK: %[[VAL_2:.*]] = fir.convert %[[LD0]] : (!fir.logical<4>) -> i1
-! CHECK: %[[VAL_3:.*]] = fir.convert %[[LD1]] : (!fir.logical<4>) -> i1
-! CHECK: %[[RES:.*]] = arith.cmpi eq, %[[VAL_2]], %[[VAL_3]] : i1
-! CHECK: %[[VAL_5:.*]] = fir.convert %[[RES]] : (i1) -> !fir.logical<4>
-! CHECK: fir.store %[[VAL_5]] to %[[ARG0]] : !fir.ref<!fir.logical<4>>
-! CHECK: omp.yield(%[[ARG0]] : !fir.ref<!fir.logical<4>>)
-! CHECK: }
-
-! CHECK-LABEL: func.func @_QPsimple_reduction(
-! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.array<100x!fir.logical<4>>> {fir.bindc_name = "y"}) {
-! CHECK: %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_reductionEi"}
-! CHECK: %[[VAL_2:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFsimple_reductionEx"}
-! CHECK: %[[VAL_3:.*]] = arith.constant true
-! CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (i1) -> !fir.logical<4>
-! CHECK: fir.store %[[VAL_4]] to %[[VAL_2]] : !fir.ref<!fir.logical<4>>
-! CHECK: omp.parallel {
-! CHECK: %[[VAL_5:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
-! CHECK: %[[VAL_6:.*]] = arith.constant 1 : i32
-! CHECK: %[[VAL_7:.*]] = arith.constant 100 : i32
-! CHECK: %[[VAL_8:.*]] = arith.constant 1 : i32
-! CHECK: omp.wsloop byref reduction(@eqv_reduction %[[VAL_2]] -> %[[VAL_9:.*]] : !fir.ref<!fir.logical<4>>) {
-! CHECK-NEXT: omp.loop_nest (%[[VAL_10:.*]]) : i32 = (%[[VAL_6]]) to (%[[VAL_7]]) inclusive step (%[[VAL_8]]) {
-! CHECK: fir.store %[[VAL_10]] to %[[VAL_5]] : !fir.ref<i32>
-! CHECK: %[[VAL_11:.*]] = fir.load %[[VAL_9]] : !fir.ref<!fir.logical<4>>
-! CHECK: %[[VAL_12:.*]] = fir.load %[[VAL_5]] : !fir.ref<i32>
-! CHECK: %[[VAL_13:.*]] = fir.convert %[[VAL_12]] : (i32) -> i64
-! CHECK: %[[VAL_14:.*]] = arith.constant 1 : i64
-! CHECK: %[[VAL_15:.*]] = arith.subi %[[VAL_13]], %[[VAL_14]] : i64
-! CHECK: %[[VAL_16:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_15]] : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
-! CHECK: %[[VAL_17:.*]] = fir.load %[[VAL_16]] : !fir.ref<!fir.logical<4>>
-! CHECK: %[[VAL_18:.*]] = fir.convert %[[VAL_11]] : (!fir.logical<4>) -> i1
-! CHECK: %[[VAL_19:.*]] = fir.convert %[[VAL_17]] : (!fir.logical<4>) -> i1
-! CHECK: %[[VAL_20:.*]] = arith.cmpi eq, %[[VAL_18]], %[[VAL_19]] : i1
-! CHECK: %[[VAL_21:.*]] = fir.convert %[[VAL_20]] : (i1) -> !fir.logical<4>
-! CHECK: fir.store %[[VAL_21]] to %[[VAL_9]] : !fir.ref<!fir.logical<4>>
-! CHECK: omp.yield
-! CHECK: omp.terminator
-! CHECK: omp.terminator
-! CHECK: return
-
-subroutine simple_reduction(y)
- logical :: x, y(100)
- x = .true.
- !$omp parallel
- !$omp do reduction(.eqv.:x)
- do i=1, 100
- x = x .eqv. y(i)
- end do
- !$omp end do
- !$omp end parallel
-end subroutine
-
-! CHECK-LABEL: func.func @_QPsimple_reduction_switch_order(
-! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.array<100x!fir.logical<4>>> {fir.bindc_name = "y"}) {
-! CHECK: %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_reduction_switch_orderEi"}
-! CHECK: %[[VAL_2:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFsimple_reduction_switch_orderEx"}
-! CHECK: %[[VAL_3:.*]] = arith.constant true
-! CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (i1) -> !fir.logical<4>
-! CHECK: fir.store %[[VAL_4]] to %[[VAL_2]] : !fir.ref<!fir.logical<4>>
-! CHECK: omp.parallel {
-! CHECK: %[[VAL_5:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
-! CHECK: %[[VAL_6:.*]] = arith.constant 1 : i32
-! CHECK: %[[VAL_7:.*]] = arith.constant 100 : i32
-! CHECK: %[[VAL_8:.*]] = arith.constant 1 : i32
-! CHECK: omp.wsloop byref reduction(@eqv_reduction %[[VAL_2]] -> %[[VAL_9:.*]] : !fir.ref<!fir.logical<4>>) {
-! CHECK-NEXT: omp.loop_nest (%[[VAL_10:.*]]) : i32 = (%[[VAL_6]]) to (%[[VAL_7]]) inclusive step (%[[VAL_8]]) {
-! CHECK: fir.store %[[VAL_10]] to %[[VAL_5]] : !fir.ref<i32>
-! CHECK: %[[VAL_11:.*]] = fir.load %[[VAL_5]] : !fir.ref<i32>
-! CHECK: %[[VAL_12:.*]] = fir.convert %[[VAL_11]] : (i32) -> i64
-! CHECK: %[[VAL_13:.*]] = arith.constant 1 : i64
-! CHECK: %[[VAL_14:.*]] = arith.subi %[[VAL_12]], %[[VAL_13]] : i64
-! CHECK: %[[VAL_15:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_14]] : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
-! CHECK: %[[VAL_16:.*]] = fir.load %[[VAL_15]] : !fir.ref<!fir.logical<4>>
-! CHECK: %[[VAL_17:.*]] = fir.load %[[VAL_9]] : !fir.ref<!fir.logical<4>>
-! CHECK: %[[VAL_18:.*]] = fir.convert %[[VAL_16]] : (!fir.logical<4>) -> i1
-! CHECK: %[[VAL_19:.*]] = fir.convert %[[VAL_17]] : (!fir.logical<4>) -> i1
-! CHECK: %[[VAL_20:.*]] = arith.cmpi eq, %[[VAL_18]], %[[VAL_19]] : i1
-! CHECK: %[[VAL_21:.*]] = fir.convert %[[VAL_20]] : (i1) -> !fir.logical<4>
-! CHECK: fir.store %[[VAL_21]] to %[[VAL_9]] : !fir.ref<!fir.logical<4>>
-! CHECK: omp.yield
-! CHECK: omp.terminator
-! CHECK: omp.terminator
-! CHECK: return
-
-subroutine simple_reduction_switch_order(y)
- logical :: x, y(100)
- x = .true.
- !$omp parallel
- !$omp do reduction(.eqv.:x)
- do i=1, 100
- x = y(i) .eqv. x
- end do
- !$omp end do
- !$omp end parallel
-end subroutine
-
-! CHECK-LABEL: func.func @_QPmultiple_reductions(
-! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.array<100x!fir.logical<4>>> {fir.bindc_name = "w"}) {
-! CHECK: %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFmultiple_reductionsEi"}
-! CHECK: %[[VAL_2:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFmultiple_reductionsEx"}
-! CHECK: %[[VAL_3:.*]] = fir.alloca !fir.logical<4> {bindc_name = "y", uniq_name = "_QFmultiple_reductionsEy"}
-! CHECK: %[[VAL_4:.*]] = fir.alloca !fir.logical<4> {bindc_name = "z", uniq_name = "_QFmultiple_reductionsEz"}
-! CHECK: %[[VAL_5:.*]] = arith.constant true
-! CHECK: %[[VAL_6:.*]] = fir.convert %[[VAL_5]] : (i1) -> !fir.logical<4>
-! CHECK: fir.store %[[VAL_6]] to %[[VAL_2]] : !fir.ref<!fir.logical<4>>
-! CHECK: %[[VAL_7:.*]] = arith.constant true
-! CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_7]] : (i1) -> !fir.logical<4>
-! CHECK: fir.store %[[VAL_8]] to %[[VAL_3]] : !fir.ref<!fir.logical<4>>
-! CHECK: %[[VAL_9:.*]] = arith.constant true
-! CHECK: %[[VAL_10:.*]] = fir.convert %[[VAL_9]] : (i1) -> !fir.logical<4>
-! CHECK: fir.store %[[VAL_10]] to %[[VAL_4]] : !fir.ref<!fir.logical<4>>
-! CHECK: omp.parallel {
-! CHECK: %[[VAL_11:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
-! CHECK: %[[VAL_12:.*]] = arith.constant 1 : i32
-! CHECK: %[[VAL_13:.*]] = arith.constant 100 : i32
-! CHECK: %[[VAL_14:.*]] = arith.constant 1 : i32
-! CHECK: omp.wsloop byref reduction(@eqv_reduction %[[VAL_2]] -> %[[VAL_15:.*]] : !fir.ref<!fir.logical<4>>, @eqv_reduction %[[VAL_3]] -> %[[VAL_16:.*]] : !fir.ref<!fir.logical<4>>, @eqv_reduction %[[VAL_4]] -> %[[VAL_17:.*]] : !fir.ref<!fir.logical<4>>) {
-! CHECK-NEXT: omp.loop_nest (%[[VAL_18:.*]]) : i32 = (%[[VAL_12]]) to (%[[VAL_13]]) inclusive step (%[[VAL_14]]) {
-! CHECK: fir.store %[[VAL_18]] to %[[VAL_11]] : !fir.ref<i32>
-! CHECK: %[[VAL_19:.*]] = fir.load %[[VAL_15]] : !fir.ref<!fir.logical<4>>
-! CHECK: %[[VAL_20:.*]] = fir.load %[[VAL_11]] : !fir.ref<i32>
-! CHECK: %[[VAL_21:.*]] = fir.convert %[[VAL_20]] : (i32) -> i64
-! CHECK: %[[VAL_22:.*]] = arith.constant 1 : i64
-! CHECK: %[[VAL_23:.*]] = arith.subi %[[VAL_21]], %[[VAL_22]] : i64
-! CHECK: %[[VAL_24:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_23]] : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
-! CHECK: %[[VAL_25:.*]] = fir.load %[[VAL_24]] : !fir.ref<!fir.logical<4>>
-! CHECK: %[[VAL_26:.*]] = fir.convert %[[VAL_19]] : (!fir.logical<4>) -> i1
-! CHECK: %[[VAL_27:.*]] = fir.convert %[[VAL_25]] : (!fir.logical<4>) -> i1
-! CHECK: %[[VAL_28:.*]] = arith.cmpi eq, %[[VAL_26]], %[[VAL_27]] : i1
-! CHECK: %[[VAL_29:.*]] = fir.convert %[[VAL_28]] : (i1) -> !fir.logical<4>
-! CHECK: fir.store %[[VAL_29]] to %[[VAL_15]] : !fir.ref<!fir.logical<4>>
-! CHECK: %[[VAL_30:.*]] = fir.load %[[VAL_16]] : !fir.ref<!fir.logical<4>>
-! CHECK: %[[VAL_31:.*]] = fir.load %[[VAL_11]] : !fir.ref<i32>
-! CHECK: %[[VAL_32:.*]] = fir.convert %[[VAL_31]] : (i32) -> i64
-! CHECK: %[[VAL_33:.*]] = arith.constant 1 : i64
-! CHECK: %[[VAL_34:.*]] = arith.subi %[[VAL_32]], %[[VAL_33]] : i64
-! CHECK: %[[VAL_35:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_34]] : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
-! CHECK: %[[VAL_36:.*]] = fir.load %[[VAL_35]] : !fir.ref<!fir.logical<4>>
-! CHECK: %[[VAL_37:.*]] = fir.convert %[[VAL_30]] : (!fir.logical<4>) -> i1
-! CHECK: %[[VAL_38:.*]] = fir.convert %[[VAL_36]] : (!fir.logical<4>) -> i1
-! CHECK: %[[VAL_39:.*]] = arith.cmpi eq, %[[VAL_37]], %[[VAL_38]] : i1
-! CHECK: %[[VAL_40:.*]] = fir.convert %[[VAL_39]] : (i1) -> !fir.logical<4>
-! CHECK: fir.store %[[VAL_40]] to %[[VAL_16]] : !fir.ref<!fir.logical<4>>
-! CHECK: %[[VAL_41:.*]] = fir.load %[[VAL_17]] : !fir.ref<!fir.logical<4>>
-! CHECK: %[[VAL_42:.*]] = fir.load %[[VAL_11]] : !fir.ref<i32>
-! CHECK: %[[VAL_43:.*]] = fir.convert %[[VAL_42]] : (i32) -> i64
-! CHECK: %[[VAL_44:.*]] = arith.constant 1 : i64
-! CHECK: %[[VAL_45:.*]] = arith.subi %[[VAL_43]], %[[VAL_44]] : i64
-! CHECK: %[[VAL_46:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_45]] : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
-! CHECK: %[[VAL_47:.*]] = fir.load %[[VAL_46]] : !fir.ref<!fir.logical<4>>
-! CHECK: %[[VAL_48:.*]] = fir.convert %[[VAL_41]] : (!fir.logical<4>) -> i1
-! CHECK: %[[VAL_49:.*]] = fir.convert %[[VAL_47]] : (!fir.logical<4>) -> i1
-! CHECK: %[[VAL_50:.*]] = arith.cmpi eq, %[[VAL_48]], %[[VAL_49]] : i1
-! CHECK: %[[VAL_51:.*]] = fir.convert %[[VAL_50]] : (i1) -> !fir.logical<4>
-! CHECK: fir.store %[[VAL_51]] to %[[VAL_17]] : !fir.ref<!fir.logical<4>>
-! CHECK: omp.yield
-! CHECK: omp.terminator
-! CHECK: omp.terminator
-! CHECK: return
-
-subroutine multiple_reductions(w)
- logical :: x,y,z,w(100)
- x = .true.
- y = .true.
- z = .true.
- !$omp parallel
- !$omp do reduction(.eqv.:x,y,z)
- do i=1, 100
- x = x .eqv. w(i)
- y = y .eqv. w(i)
- z = z .eqv. w(i)
- end do
- !$omp end do
- !$omp end parallel
-end subroutine
diff --git a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-logical-eqv.f90 b/flang/test/Lower/OpenMP/FIR/wsloop-reduction-logical-eqv.f90
deleted file mode 100644
index e714e45540c3..000000000000
--- a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-logical-eqv.f90
+++ /dev/null
@@ -1,187 +0,0 @@
-! RUN: bbc -emit-fir -hlfir=false -fopenmp %s -o - | FileCheck %s
-! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp %s -o - | FileCheck %s
-
-! NOTE: Assertions have been autogenerated by utils/generate-test-checks.py
-
-! CHECK-LABEL: omp.declare_reduction @eqv_reduction : !fir.logical<4> init {
-! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.logical<4>):
-! CHECK: %[[VAL_1:.*]] = arith.constant true
-! CHECK: %[[VAL_2:.*]] = fir.convert %[[VAL_1]] : (i1) -> !fir.logical<4>
-! CHECK: omp.yield(%[[VAL_2]] : !fir.logical<4>)
-
-! CHECK-LABEL: } combiner {
-! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.logical<4>, %[[VAL_1:.*]]: !fir.logical<4>):
-! CHECK: %[[VAL_2:.*]] = fir.convert %[[VAL_0]] : (!fir.logical<4>) -> i1
-! CHECK: %[[VAL_3:.*]] = fir.convert %[[VAL_1]] : (!fir.logical<4>) -> i1
-! CHECK: %[[VAL_4:.*]] = arith.cmpi eq, %[[VAL_2]], %[[VAL_3]] : i1
-! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_4]] : (i1) -> !fir.logical<4>
-! CHECK: omp.yield(%[[VAL_5]] : !fir.logical<4>)
-! CHECK: }
-
-! CHECK-LABEL: func.func @_QPsimple_reduction(
-! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.array<100x!fir.logical<4>>> {fir.bindc_name = "y"}) {
-! CHECK: %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_reductionEi"}
-! CHECK: %[[VAL_2:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFsimple_reductionEx"}
-! CHECK: %[[VAL_3:.*]] = arith.constant true
-! CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (i1) -> !fir.logical<4>
-! CHECK: fir.store %[[VAL_4]] to %[[VAL_2]] : !fir.ref<!fir.logical<4>>
-! CHECK: omp.parallel {
-! CHECK: %[[VAL_5:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
-! CHECK: %[[VAL_6:.*]] = arith.constant 1 : i32
-! CHECK: %[[VAL_7:.*]] = arith.constant 100 : i32
-! CHECK: %[[VAL_8:.*]] = arith.constant 1 : i32
-! CHECK: omp.wsloop reduction(@eqv_reduction %[[VAL_2]] -> %[[VAL_9:.*]] : !fir.ref<!fir.logical<4>>) {
-! CHECK-NEXT: omp.loop_nest (%[[VAL_10:.*]]) : i32 = (%[[VAL_6]]) to (%[[VAL_7]]) inclusive step (%[[VAL_8]]) {
-! CHECK: fir.store %[[VAL_10]] to %[[VAL_5]] : !fir.ref<i32>
-! CHECK: %[[VAL_11:.*]] = fir.load %[[VAL_9]] : !fir.ref<!fir.logical<4>>
-! CHECK: %[[VAL_12:.*]] = fir.load %[[VAL_5]] : !fir.ref<i32>
-! CHECK: %[[VAL_13:.*]] = fir.convert %[[VAL_12]] : (i32) -> i64
-! CHECK: %[[VAL_14:.*]] = arith.constant 1 : i64
-! CHECK: %[[VAL_15:.*]] = arith.subi %[[VAL_13]], %[[VAL_14]] : i64
-! CHECK: %[[VAL_16:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_15]] : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
-! CHECK: %[[VAL_17:.*]] = fir.load %[[VAL_16]] : !fir.ref<!fir.logical<4>>
-! CHECK: %[[VAL_18:.*]] = fir.convert %[[VAL_11]] : (!fir.logical<4>) -> i1
-! CHECK: %[[VAL_19:.*]] = fir.convert %[[VAL_17]] : (!fir.logical<4>) -> i1
-! CHECK: %[[VAL_20:.*]] = arith.cmpi eq, %[[VAL_18]], %[[VAL_19]] : i1
-! CHECK: %[[VAL_21:.*]] = fir.convert %[[VAL_20]] : (i1) -> !fir.logical<4>
-! CHECK: fir.store %[[VAL_21]] to %[[VAL_9]] : !fir.ref<!fir.logical<4>>
-! CHECK: omp.yield
-! CHECK: omp.terminator
-! CHECK: omp.terminator
-! CHECK: return
-
-subroutine simple_reduction(y)
- logical :: x, y(100)
- x = .true.
- !$omp parallel
- !$omp do reduction(.eqv.:x)
- do i=1, 100
- x = x .eqv. y(i)
- end do
- !$omp end do
- !$omp end parallel
-end subroutine
-
-! CHECK-LABEL: func.func @_QPsimple_reduction_switch_order(
-! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.array<100x!fir.logical<4>>> {fir.bindc_name = "y"}) {
-! CHECK: %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_reduction_switch_orderEi"}
-! CHECK: %[[VAL_2:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFsimple_reduction_switch_orderEx"}
-! CHECK: %[[VAL_3:.*]] = arith.constant true
-! CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (i1) -> !fir.logical<4>
-! CHECK: fir.store %[[VAL_4]] to %[[VAL_2]] : !fir.ref<!fir.logical<4>>
-! CHECK: omp.parallel {
-! CHECK: %[[VAL_5:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
-! CHECK: %[[VAL_6:.*]] = arith.constant 1 : i32
-! CHECK: %[[VAL_7:.*]] = arith.constant 100 : i32
-! CHECK: %[[VAL_8:.*]] = arith.constant 1 : i32
-! CHECK: omp.wsloop reduction(@eqv_reduction %[[VAL_2]] -> %[[VAL_9:.*]] : !fir.ref<!fir.logical<4>>) {
-! CHECK-NEXT: omp.loop_nest (%[[VAL_10:.*]]) : i32 = (%[[VAL_6]]) to (%[[VAL_7]]) inclusive step (%[[VAL_8]]) {
-! CHECK: fir.store %[[VAL_10]] to %[[VAL_5]] : !fir.ref<i32>
-! CHECK: %[[VAL_11:.*]] = fir.load %[[VAL_5]] : !fir.ref<i32>
-! CHECK: %[[VAL_12:.*]] = fir.convert %[[VAL_11]] : (i32) -> i64
-! CHECK: %[[VAL_13:.*]] = arith.constant 1 : i64
-! CHECK: %[[VAL_14:.*]] = arith.subi %[[VAL_12]], %[[VAL_13]] : i64
-! CHECK: %[[VAL_15:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_14]] : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
-! CHECK: %[[VAL_16:.*]] = fir.load %[[VAL_15]] : !fir.ref<!fir.logical<4>>
-! CHECK: %[[VAL_17:.*]] = fir.load %[[VAL_9]] : !fir.ref<!fir.logical<4>>
-! CHECK: %[[VAL_18:.*]] = fir.convert %[[VAL_16]] : (!fir.logical<4>) -> i1
-! CHECK: %[[VAL_19:.*]] = fir.convert %[[VAL_17]] : (!fir.logical<4>) -> i1
-! CHECK: %[[VAL_20:.*]] = arith.cmpi eq, %[[VAL_18]], %[[VAL_19]] : i1
-! CHECK: %[[VAL_21:.*]] = fir.convert %[[VAL_20]] : (i1) -> !fir.logical<4>
-! CHECK: fir.store %[[VAL_21]] to %[[VAL_9]] : !fir.ref<!fir.logical<4>>
-! CHECK: omp.yield
-! CHECK: omp.terminator
-! CHECK: omp.terminator
-! CHECK: return
-
-subroutine simple_reduction_switch_order(y)
- logical :: x, y(100)
- x = .true.
- !$omp parallel
- !$omp do reduction(.eqv.:x)
- do i=1, 100
- x = y(i) .eqv. x
- end do
- !$omp end do
- !$omp end parallel
-end subroutine
-
-! CHECK-LABEL: func.func @_QPmultiple_reductions(
-! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.array<100x!fir.logical<4>>> {fir.bindc_name = "w"}) {
-! CHECK: %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFmultiple_reductionsEi"}
-! CHECK: %[[VAL_2:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFmultiple_reductionsEx"}
-! CHECK: %[[VAL_3:.*]] = fir.alloca !fir.logical<4> {bindc_name = "y", uniq_name = "_QFmultiple_reductionsEy"}
-! CHECK: %[[VAL_4:.*]] = fir.alloca !fir.logical<4> {bindc_name = "z", uniq_name = "_QFmultiple_reductionsEz"}
-! CHECK: %[[VAL_5:.*]] = arith.constant true
-! CHECK: %[[VAL_6:.*]] = fir.convert %[[VAL_5]] : (i1) -> !fir.logical<4>
-! CHECK: fir.store %[[VAL_6]] to %[[VAL_2]] : !fir.ref<!fir.logical<4>>
-! CHECK: %[[VAL_7:.*]] = arith.constant true
-! CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_7]] : (i1) -> !fir.logical<4>
-! CHECK: fir.store %[[VAL_8]] to %[[VAL_3]] : !fir.ref<!fir.logical<4>>
-! CHECK: %[[VAL_9:.*]] = arith.constant true
-! CHECK: %[[VAL_10:.*]] = fir.convert %[[VAL_9]] : (i1) -> !fir.logical<4>
-! CHECK: fir.store %[[VAL_10]] to %[[VAL_4]] : !fir.ref<!fir.logical<4>>
-! CHECK: omp.parallel {
-! CHECK: %[[VAL_11:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
-! CHECK: %[[VAL_12:.*]] = arith.constant 1 : i32
-! CHECK: %[[VAL_13:.*]] = arith.constant 100 : i32
-! CHECK: %[[VAL_14:.*]] = arith.constant 1 : i32
-! CHECK: omp.wsloop reduction(@eqv_reduction %[[VAL_2]] -> %[[VAL_15:.*]] : !fir.ref<!fir.logical<4>>, @eqv_reduction %[[VAL_3]] -> %[[VAL_16:.*]] : !fir.ref<!fir.logical<4>>, @eqv_reduction %[[VAL_4]] -> %[[VAL_17:.*]] : !fir.ref<!fir.logical<4>>) {
-! CHECK-NEXT: omp.loop_nest (%[[VAL_18:.*]]) : i32 = (%[[VAL_12]]) to (%[[VAL_13]]) inclusive step (%[[VAL_14]]) {
-! CHECK: fir.store %[[VAL_18]] to %[[VAL_11]] : !fir.ref<i32>
-! CHECK: %[[VAL_19:.*]] = fir.load %[[VAL_15]] : !fir.ref<!fir.logical<4>>
-! CHECK: %[[VAL_20:.*]] = fir.load %[[VAL_11]] : !fir.ref<i32>
-! CHECK: %[[VAL_21:.*]] = fir.convert %[[VAL_20]] : (i32) -> i64
-! CHECK: %[[VAL_22:.*]] = arith.constant 1 : i64
-! CHECK: %[[VAL_23:.*]] = arith.subi %[[VAL_21]], %[[VAL_22]] : i64
-! CHECK: %[[VAL_24:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_23]] : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
-! CHECK: %[[VAL_25:.*]] = fir.load %[[VAL_24]] : !fir.ref<!fir.logical<4>>
-! CHECK: %[[VAL_26:.*]] = fir.convert %[[VAL_19]] : (!fir.logical<4>) -> i1
-! CHECK: %[[VAL_27:.*]] = fir.convert %[[VAL_25]] : (!fir.logical<4>) -> i1
-! CHECK: %[[VAL_28:.*]] = arith.cmpi eq, %[[VAL_26]], %[[VAL_27]] : i1
-! CHECK: %[[VAL_29:.*]] = fir.convert %[[VAL_28]] : (i1) -> !fir.logical<4>
-! CHECK: fir.store %[[VAL_29]] to %[[VAL_15]] : !fir.ref<!fir.logical<4>>
-! CHECK: %[[VAL_30:.*]] = fir.load %[[VAL_16]] : !fir.ref<!fir.logical<4>>
-! CHECK: %[[VAL_31:.*]] = fir.load %[[VAL_11]] : !fir.ref<i32>
-! CHECK: %[[VAL_32:.*]] = fir.convert %[[VAL_31]] : (i32) -> i64
-! CHECK: %[[VAL_33:.*]] = arith.constant 1 : i64
-! CHECK: %[[VAL_34:.*]] = arith.subi %[[VAL_32]], %[[VAL_33]] : i64
-! CHECK: %[[VAL_35:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_34]] : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
-! CHECK: %[[VAL_36:.*]] = fir.load %[[VAL_35]] : !fir.ref<!fir.logical<4>>
-! CHECK: %[[VAL_37:.*]] = fir.convert %[[VAL_30]] : (!fir.logical<4>) -> i1
-! CHECK: %[[VAL_38:.*]] = fir.convert %[[VAL_36]] : (!fir.logical<4>) -> i1
-! CHECK: %[[VAL_39:.*]] = arith.cmpi eq, %[[VAL_37]], %[[VAL_38]] : i1
-! CHECK: %[[VAL_40:.*]] = fir.convert %[[VAL_39]] : (i1) -> !fir.logical<4>
-! CHECK: fir.store %[[VAL_40]] to %[[VAL_16]] : !fir.ref<!fir.logical<4>>
-! CHECK: %[[VAL_41:.*]] = fir.load %[[VAL_17]] : !fir.ref<!fir.logical<4>>
-! CHECK: %[[VAL_42:.*]] = fir.load %[[VAL_11]] : !fir.ref<i32>
-! CHECK: %[[VAL_43:.*]] = fir.convert %[[VAL_42]] : (i32) -> i64
-! CHECK: %[[VAL_44:.*]] = arith.constant 1 : i64
-! CHECK: %[[VAL_45:.*]] = arith.subi %[[VAL_43]], %[[VAL_44]] : i64
-! CHECK: %[[VAL_46:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_45]] : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
-! CHECK: %[[VAL_47:.*]] = fir.load %[[VAL_46]] : !fir.ref<!fir.logical<4>>
-! CHECK: %[[VAL_48:.*]] = fir.convert %[[VAL_41]] : (!fir.logical<4>) -> i1
-! CHECK: %[[VAL_49:.*]] = fir.convert %[[VAL_47]] : (!fir.logical<4>) -> i1
-! CHECK: %[[VAL_50:.*]] = arith.cmpi eq, %[[VAL_48]], %[[VAL_49]] : i1
-! CHECK: %[[VAL_51:.*]] = fir.convert %[[VAL_50]] : (i1) -> !fir.logical<4>
-! CHECK: fir.store %[[VAL_51]] to %[[VAL_17]] : !fir.ref<!fir.logical<4>>
-! CHECK: omp.yield
-! CHECK: omp.terminator
-! CHECK: omp.terminator
-! CHECK: return
-
-subroutine multiple_reductions(w)
- logical :: x,y,z,w(100)
- x = .true.
- y = .true.
- z = .true.
- !$omp parallel
- !$omp do reduction(.eqv.:x,y,z)
- do i=1, 100
- x = x .eqv. w(i)
- y = y .eqv. w(i)
- z = z .eqv. w(i)
- end do
- !$omp end do
- !$omp end parallel
-end subroutine
diff --git a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-logical-neqv-byref.f90 b/flang/test/Lower/OpenMP/FIR/wsloop-reduction-logical-neqv-byref.f90
deleted file mode 100644
index 89d16c3191b2..000000000000
--- a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-logical-neqv-byref.f90
+++ /dev/null
@@ -1,195 +0,0 @@
-! RUN: bbc -emit-fir -hlfir=false -fopenmp --force-byref-reduction %s -o - | FileCheck %s
-! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp -mmlir --force-byref-reduction %s -o - | FileCheck %s
-
-! NOTE: Assertions have been autogenerated by utils/generate-test-checks.py
-
-
-! CHECK-LABEL: omp.declare_reduction @neqv_reduction : !fir.ref<!fir.logical<4>>
-! CHECK-SAME: init {
-! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.logical<4>>):
-! CHECK: %[[VAL_1:.*]] = arith.constant false
-! CHECK: %[[VAL_2:.*]] = fir.convert %[[VAL_1]] : (i1) -> !fir.logical<4>
-! CHECK: %[[REF:.*]] = fir.alloca !fir.logical<4>
-! CHECK: fir.store %[[VAL_2]] to %[[REF]] : !fir.ref<!fir.logical<4>>
-! CHECK: omp.yield(%[[REF]] : !fir.ref<!fir.logical<4>>)
-
-! CHECK-LABEL: } combiner {
-! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.logical<4>>, %[[ARG1:.*]]: !fir.ref<!fir.logical<4>>):
-! CHECK: %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<!fir.logical<4>>
-! CHECK: %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<!fir.logical<4>>
-! CHECK: %[[VAL_2:.*]] = fir.convert %[[LD0]] : (!fir.logical<4>) -> i1
-! CHECK: %[[VAL_3:.*]] = fir.convert %[[LD1]] : (!fir.logical<4>) -> i1
-! CHECK: %[[RES:.*]] = arith.cmpi ne, %[[VAL_2]], %[[VAL_3]] : i1
-! CHECK: %[[VAL_5:.*]] = fir.convert %[[RES]] : (i1) -> !fir.logical<4>
-! CHECK: fir.store %[[VAL_5]] to %[[ARG0]] : !fir.ref<!fir.logical<4>>
-! CHECK: omp.yield(%[[ARG0]] : !fir.ref<!fir.logical<4>>)
-! CHECK: }
-
-! CHECK-LABEL: func.func @_QPsimple_reduction(
-! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.array<100x!fir.logical<4>>> {fir.bindc_name = "y"}) {
-! CHECK: %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_reductionEi"}
-! CHECK: %[[VAL_2:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFsimple_reductionEx"}
-! CHECK: %[[VAL_3:.*]] = arith.constant true
-! CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (i1) -> !fir.logical<4>
-! CHECK: fir.store %[[VAL_4]] to %[[VAL_2]] : !fir.ref<!fir.logical<4>>
-! CHECK: omp.parallel {
-! CHECK: %[[VAL_5:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
-! CHECK: %[[VAL_6:.*]] = arith.constant 1 : i32
-! CHECK: %[[VAL_7:.*]] = arith.constant 100 : i32
-! CHECK: %[[VAL_8:.*]] = arith.constant 1 : i32
-! CHECK: omp.wsloop byref reduction(@neqv_reduction %[[VAL_2]] -> %[[VAL_9:.*]] : !fir.ref<!fir.logical<4>>) {
-! CHECK-NEXT: omp.loop_nest (%[[VAL_10:.*]]) : i32 = (%[[VAL_6]]) to (%[[VAL_7]]) inclusive step (%[[VAL_8]]) {
-! CHECK: fir.store %[[VAL_10]] to %[[VAL_5]] : !fir.ref<i32>
-! CHECK: %[[VAL_11:.*]] = fir.load %[[VAL_9]] : !fir.ref<!fir.logical<4>>
-! CHECK: %[[VAL_12:.*]] = fir.load %[[VAL_5]] : !fir.ref<i32>
-! CHECK: %[[VAL_13:.*]] = fir.convert %[[VAL_12]] : (i32) -> i64
-! CHECK: %[[VAL_14:.*]] = arith.constant 1 : i64
-! CHECK: %[[VAL_15:.*]] = arith.subi %[[VAL_13]], %[[VAL_14]] : i64
-! CHECK: %[[VAL_16:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_15]] : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
-! CHECK: %[[VAL_17:.*]] = fir.load %[[VAL_16]] : !fir.ref<!fir.logical<4>>
-! CHECK: %[[VAL_18:.*]] = fir.convert %[[VAL_11]] : (!fir.logical<4>) -> i1
-! CHECK: %[[VAL_19:.*]] = fir.convert %[[VAL_17]] : (!fir.logical<4>) -> i1
-! CHECK: %[[VAL_20:.*]] = arith.cmpi ne, %[[VAL_18]], %[[VAL_19]] : i1
-! CHECK: %[[VAL_21:.*]] = fir.convert %[[VAL_20]] : (i1) -> !fir.logical<4>
-! CHECK: fir.store %[[VAL_21]] to %[[VAL_9]] : !fir.ref<!fir.logical<4>>
-! CHECK: omp.yield
-! CHECK: omp.terminator
-! CHECK: omp.terminator
-! CHECK: return
-
-subroutine simple_reduction(y)
- logical :: x, y(100)
- x = .true.
- !$omp parallel
- !$omp do reduction(.neqv.:x)
- do i=1, 100
- x = x .neqv. y(i)
- end do
- !$omp end do
- !$omp end parallel
-end subroutine
-
-! CHECK-LABEL: func.func @_QPsimple_reduction_switch_order(
-! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.array<100x!fir.logical<4>>> {fir.bindc_name = "y"}) {
-! CHECK: %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_reduction_switch_orderEi"}
-! CHECK: %[[VAL_2:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFsimple_reduction_switch_orderEx"}
-! CHECK: %[[VAL_3:.*]] = arith.constant true
-! CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (i1) -> !fir.logical<4>
-! CHECK: fir.store %[[VAL_4]] to %[[VAL_2]] : !fir.ref<!fir.logical<4>>
-! CHECK: omp.parallel {
-! CHECK: %[[VAL_5:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
-! CHECK: %[[VAL_6:.*]] = arith.constant 1 : i32
-! CHECK: %[[VAL_7:.*]] = arith.constant 100 : i32
-! CHECK: %[[VAL_8:.*]] = arith.constant 1 : i32
-! CHECK: omp.wsloop byref reduction(@neqv_reduction %[[VAL_2]] -> %[[VAL_9:.*]] : !fir.ref<!fir.logical<4>>) {
-! CHECK-NEXT: omp.loop_nest (%[[VAL_10:.*]]) : i32 = (%[[VAL_6]]) to (%[[VAL_7]]) inclusive step (%[[VAL_8]]) {
-! CHECK: fir.store %[[VAL_10]] to %[[VAL_5]] : !fir.ref<i32>
-! CHECK: %[[VAL_11:.*]] = fir.load %[[VAL_5]] : !fir.ref<i32>
-! CHECK: %[[VAL_12:.*]] = fir.convert %[[VAL_11]] : (i32) -> i64
-! CHECK: %[[VAL_13:.*]] = arith.constant 1 : i64
-! CHECK: %[[VAL_14:.*]] = arith.subi %[[VAL_12]], %[[VAL_13]] : i64
-! CHECK: %[[VAL_15:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_14]] : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
-! CHECK: %[[VAL_16:.*]] = fir.load %[[VAL_15]] : !fir.ref<!fir.logical<4>>
-! CHECK: %[[VAL_17:.*]] = fir.load %[[VAL_9]] : !fir.ref<!fir.logical<4>>
-! CHECK: %[[VAL_18:.*]] = fir.convert %[[VAL_16]] : (!fir.logical<4>) -> i1
-! CHECK: %[[VAL_19:.*]] = fir.convert %[[VAL_17]] : (!fir.logical<4>) -> i1
-! CHECK: %[[VAL_20:.*]] = arith.cmpi ne, %[[VAL_18]], %[[VAL_19]] : i1
-! CHECK: %[[VAL_21:.*]] = fir.convert %[[VAL_20]] : (i1) -> !fir.logical<4>
-! CHECK: fir.store %[[VAL_21]] to %[[VAL_9]] : !fir.ref<!fir.logical<4>>
-! CHECK: omp.yield
-! CHECK: omp.terminator
-! CHECK: omp.terminator
-! CHECK: return
-
-subroutine simple_reduction_switch_order(y)
- logical :: x, y(100)
- x = .true.
- !$omp parallel
- !$omp do reduction(.neqv.:x)
- do i=1, 100
- x = y(i) .neqv. x
- end do
- !$omp end do
- !$omp end parallel
-end subroutine
-
-! CHECK-LABEL: func.func @_QPmultiple_reductions(
-! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.array<100x!fir.logical<4>>> {fir.bindc_name = "w"}) {
-! CHECK: %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFmultiple_reductionsEi"}
-! CHECK: %[[VAL_2:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFmultiple_reductionsEx"}
-! CHECK: %[[VAL_3:.*]] = fir.alloca !fir.logical<4> {bindc_name = "y", uniq_name = "_QFmultiple_reductionsEy"}
-! CHECK: %[[VAL_4:.*]] = fir.alloca !fir.logical<4> {bindc_name = "z", uniq_name = "_QFmultiple_reductionsEz"}
-! CHECK: %[[VAL_5:.*]] = arith.constant true
-! CHECK: %[[VAL_6:.*]] = fir.convert %[[VAL_5]] : (i1) -> !fir.logical<4>
-! CHECK: fir.store %[[VAL_6]] to %[[VAL_2]] : !fir.ref<!fir.logical<4>>
-! CHECK: %[[VAL_7:.*]] = arith.constant true
-! CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_7]] : (i1) -> !fir.logical<4>
-! CHECK: fir.store %[[VAL_8]] to %[[VAL_3]] : !fir.ref<!fir.logical<4>>
-! CHECK: %[[VAL_9:.*]] = arith.constant true
-! CHECK: %[[VAL_10:.*]] = fir.convert %[[VAL_9]] : (i1) -> !fir.logical<4>
-! CHECK: fir.store %[[VAL_10]] to %[[VAL_4]] : !fir.ref<!fir.logical<4>>
-! CHECK: omp.parallel {
-! CHECK: %[[VAL_11:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
-! CHECK: %[[VAL_12:.*]] = arith.constant 1 : i32
-! CHECK: %[[VAL_13:.*]] = arith.constant 100 : i32
-! CHECK: %[[VAL_14:.*]] = arith.constant 1 : i32
-! CHECK: omp.wsloop byref reduction(@neqv_reduction %[[VAL_2]] -> %[[VAL_15:.*]] : !fir.ref<!fir.logical<4>>, @neqv_reduction %[[VAL_3]] -> %[[VAL_16:.*]] : !fir.ref<!fir.logical<4>>, @neqv_reduction %[[VAL_4]] -> %[[VAL_17:.*]] : !fir.ref<!fir.logical<4>>) {
-! CHECK-NEXT: omp.loop_nest (%[[VAL_18:.*]]) : i32 = (%[[VAL_12]]) to (%[[VAL_13]]) inclusive step (%[[VAL_14]]) {
-! CHECK: fir.store %[[VAL_18]] to %[[VAL_11]] : !fir.ref<i32>
-! CHECK: %[[VAL_19:.*]] = fir.load %[[VAL_15]] : !fir.ref<!fir.logical<4>>
-! CHECK: %[[VAL_20:.*]] = fir.load %[[VAL_11]] : !fir.ref<i32>
-! CHECK: %[[VAL_21:.*]] = fir.convert %[[VAL_20]] : (i32) -> i64
-! CHECK: %[[VAL_22:.*]] = arith.constant 1 : i64
-! CHECK: %[[VAL_23:.*]] = arith.subi %[[VAL_21]], %[[VAL_22]] : i64
-! CHECK: %[[VAL_24:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_23]] : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
-! CHECK: %[[VAL_25:.*]] = fir.load %[[VAL_24]] : !fir.ref<!fir.logical<4>>
-! CHECK: %[[VAL_26:.*]] = fir.convert %[[VAL_19]] : (!fir.logical<4>) -> i1
-! CHECK: %[[VAL_27:.*]] = fir.convert %[[VAL_25]] : (!fir.logical<4>) -> i1
-! CHECK: %[[VAL_28:.*]] = arith.cmpi ne, %[[VAL_26]], %[[VAL_27]] : i1
-! CHECK: %[[VAL_29:.*]] = fir.convert %[[VAL_28]] : (i1) -> !fir.logical<4>
-! CHECK: fir.store %[[VAL_29]] to %[[VAL_15]] : !fir.ref<!fir.logical<4>>
-! CHECK: %[[VAL_30:.*]] = fir.load %[[VAL_16]] : !fir.ref<!fir.logical<4>>
-! CHECK: %[[VAL_31:.*]] = fir.load %[[VAL_11]] : !fir.ref<i32>
-! CHECK: %[[VAL_32:.*]] = fir.convert %[[VAL_31]] : (i32) -> i64
-! CHECK: %[[VAL_33:.*]] = arith.constant 1 : i64
-! CHECK: %[[VAL_34:.*]] = arith.subi %[[VAL_32]], %[[VAL_33]] : i64
-! CHECK: %[[VAL_35:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_34]] : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
-! CHECK: %[[VAL_36:.*]] = fir.load %[[VAL_35]] : !fir.ref<!fir.logical<4>>
-! CHECK: %[[VAL_37:.*]] = fir.convert %[[VAL_30]] : (!fir.logical<4>) -> i1
-! CHECK: %[[VAL_38:.*]] = fir.convert %[[VAL_36]] : (!fir.logical<4>) -> i1
-! CHECK: %[[VAL_39:.*]] = arith.cmpi ne, %[[VAL_37]], %[[VAL_38]] : i1
-! CHECK: %[[VAL_40:.*]] = fir.convert %[[VAL_39]] : (i1) -> !fir.logical<4>
-! CHECK: fir.store %[[VAL_40]] to %[[VAL_16]] : !fir.ref<!fir.logical<4>>
-! CHECK: %[[VAL_41:.*]] = fir.load %[[VAL_17]] : !fir.ref<!fir.logical<4>>
-! CHECK: %[[VAL_42:.*]] = fir.load %[[VAL_11]] : !fir.ref<i32>
-! CHECK: %[[VAL_43:.*]] = fir.convert %[[VAL_42]] : (i32) -> i64
-! CHECK: %[[VAL_44:.*]] = arith.constant 1 : i64
-! CHECK: %[[VAL_45:.*]] = arith.subi %[[VAL_43]], %[[VAL_44]] : i64
-! CHECK: %[[VAL_46:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_45]] : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
-! CHECK: %[[VAL_47:.*]] = fir.load %[[VAL_46]] : !fir.ref<!fir.logical<4>>
-! CHECK: %[[VAL_48:.*]] = fir.convert %[[VAL_41]] : (!fir.logical<4>) -> i1
-! CHECK: %[[VAL_49:.*]] = fir.convert %[[VAL_47]] : (!fir.logical<4>) -> i1
-! CHECK: %[[VAL_50:.*]] = arith.cmpi ne, %[[VAL_48]], %[[VAL_49]] : i1
-! CHECK: %[[VAL_51:.*]] = fir.convert %[[VAL_50]] : (i1) -> !fir.logical<4>
-! CHECK: fir.store %[[VAL_51]] to %[[VAL_17]] : !fir.ref<!fir.logical<4>>
-! CHECK: omp.yield
-! CHECK: omp.terminator
-! CHECK: omp.terminator
-! CHECK: return
-
-
-subroutine multiple_reductions(w)
- logical :: x,y,z,w(100)
- x = .true.
- y = .true.
- z = .true.
- !$omp parallel
- !$omp do reduction(.neqv.:x,y,z)
- do i=1, 100
- x = x .neqv. w(i)
- y = y .neqv. w(i)
- z = z .neqv. w(i)
- end do
- !$omp end do
- !$omp end parallel
-end subroutine
diff --git a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-logical-neqv.f90 b/flang/test/Lower/OpenMP/FIR/wsloop-reduction-logical-neqv.f90
deleted file mode 100644
index 106e867f367b..000000000000
--- a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-logical-neqv.f90
+++ /dev/null
@@ -1,189 +0,0 @@
-! RUN: bbc -emit-fir -hlfir=false -fopenmp %s -o - | FileCheck %s
-! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp %s -o - | FileCheck %s
-
-! NOTE: Assertions have been autogenerated by utils/generate-test-checks.py
-
-
-! CHECK-LABEL: omp.declare_reduction @neqv_reduction : !fir.logical<4> init {
-! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.logical<4>):
-! CHECK: %[[VAL_1:.*]] = arith.constant false
-! CHECK: %[[VAL_2:.*]] = fir.convert %[[VAL_1]] : (i1) -> !fir.logical<4>
-! CHECK: omp.yield(%[[VAL_2]] : !fir.logical<4>)
-
-! CHECK-LABEL: } combiner {
-! CHECK: ^bb0(%[[VAL_0:.*]]: !fir.logical<4>, %[[VAL_1:.*]]: !fir.logical<4>):
-! CHECK: %[[VAL_2:.*]] = fir.convert %[[VAL_0]] : (!fir.logical<4>) -> i1
-! CHECK: %[[VAL_3:.*]] = fir.convert %[[VAL_1]] : (!fir.logical<4>) -> i1
-! CHECK: %[[VAL_4:.*]] = arith.cmpi ne, %[[VAL_2]], %[[VAL_3]] : i1
-! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_4]] : (i1) -> !fir.logical<4>
-! CHECK: omp.yield(%[[VAL_5]] : !fir.logical<4>)
-! CHECK: }
-
-! CHECK-LABEL: func.func @_QPsimple_reduction(
-! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.array<100x!fir.logical<4>>> {fir.bindc_name = "y"}) {
-! CHECK: %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_reductionEi"}
-! CHECK: %[[VAL_2:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFsimple_reductionEx"}
-! CHECK: %[[VAL_3:.*]] = arith.constant true
-! CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (i1) -> !fir.logical<4>
-! CHECK: fir.store %[[VAL_4]] to %[[VAL_2]] : !fir.ref<!fir.logical<4>>
-! CHECK: omp.parallel {
-! CHECK: %[[VAL_5:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
-! CHECK: %[[VAL_6:.*]] = arith.constant 1 : i32
-! CHECK: %[[VAL_7:.*]] = arith.constant 100 : i32
-! CHECK: %[[VAL_8:.*]] = arith.constant 1 : i32
-! CHECK: omp.wsloop reduction(@neqv_reduction %[[VAL_2]] -> %[[VAL_9:.*]] : !fir.ref<!fir.logical<4>>) {
-! CHECK-NEXT: omp.loop_nest (%[[VAL_10:.*]]) : i32 = (%[[VAL_6]]) to (%[[VAL_7]]) inclusive step (%[[VAL_8]]) {
-! CHECK: fir.store %[[VAL_10]] to %[[VAL_5]] : !fir.ref<i32>
-! CHECK: %[[VAL_11:.*]] = fir.load %[[VAL_9]] : !fir.ref<!fir.logical<4>>
-! CHECK: %[[VAL_12:.*]] = fir.load %[[VAL_5]] : !fir.ref<i32>
-! CHECK: %[[VAL_13:.*]] = fir.convert %[[VAL_12]] : (i32) -> i64
-! CHECK: %[[VAL_14:.*]] = arith.constant 1 : i64
-! CHECK: %[[VAL_15:.*]] = arith.subi %[[VAL_13]], %[[VAL_14]] : i64
-! CHECK: %[[VAL_16:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_15]] : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
-! CHECK: %[[VAL_17:.*]] = fir.load %[[VAL_16]] : !fir.ref<!fir.logical<4>>
-! CHECK: %[[VAL_18:.*]] = fir.convert %[[VAL_11]] : (!fir.logical<4>) -> i1
-! CHECK: %[[VAL_19:.*]] = fir.convert %[[VAL_17]] : (!fir.logical<4>) -> i1
-! CHECK: %[[VAL_20:.*]] = arith.cmpi ne, %[[VAL_18]], %[[VAL_19]] : i1
-! CHECK: %[[VAL_21:.*]] = fir.convert %[[VAL_20]] : (i1) -> !fir.logical<4>
-! CHECK: fir.store %[[VAL_21]] to %[[VAL_9]] : !fir.ref<!fir.logical<4>>
-! CHECK: omp.yield
-! CHECK: omp.terminator
-! CHECK: omp.terminator
-! CHECK: return
-
-subroutine simple_reduction(y)
- logical :: x, y(100)
- x = .true.
- !$omp parallel
- !$omp do reduction(.neqv.:x)
- do i=1, 100
- x = x .neqv. y(i)
- end do
- !$omp end do
- !$omp end parallel
-end subroutine
-
-! CHECK-LABEL: func.func @_QPsimple_reduction_switch_order(
-! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.array<100x!fir.logical<4>>> {fir.bindc_name = "y"}) {
-! CHECK: %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_reduction_switch_orderEi"}
-! CHECK: %[[VAL_2:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFsimple_reduction_switch_orderEx"}
-! CHECK: %[[VAL_3:.*]] = arith.constant true
-! CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (i1) -> !fir.logical<4>
-! CHECK: fir.store %[[VAL_4]] to %[[VAL_2]] : !fir.ref<!fir.logical<4>>
-! CHECK: omp.parallel {
-! CHECK: %[[VAL_5:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
-! CHECK: %[[VAL_6:.*]] = arith.constant 1 : i32
-! CHECK: %[[VAL_7:.*]] = arith.constant 100 : i32
-! CHECK: %[[VAL_8:.*]] = arith.constant 1 : i32
-! CHECK: omp.wsloop reduction(@neqv_reduction %[[VAL_2]] -> %[[VAL_9:.*]] : !fir.ref<!fir.logical<4>>) {
-! CHECK-NEXT: omp.loop_nest (%[[VAL_10:.*]]) : i32 = (%[[VAL_6]]) to (%[[VAL_7]]) inclusive step (%[[VAL_8]]) {
-! CHECK: fir.store %[[VAL_10]] to %[[VAL_5]] : !fir.ref<i32>
-! CHECK: %[[VAL_11:.*]] = fir.load %[[VAL_5]] : !fir.ref<i32>
-! CHECK: %[[VAL_12:.*]] = fir.convert %[[VAL_11]] : (i32) -> i64
-! CHECK: %[[VAL_13:.*]] = arith.constant 1 : i64
-! CHECK: %[[VAL_14:.*]] = arith.subi %[[VAL_12]], %[[VAL_13]] : i64
-! CHECK: %[[VAL_15:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_14]] : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
-! CHECK: %[[VAL_16:.*]] = fir.load %[[VAL_15]] : !fir.ref<!fir.logical<4>>
-! CHECK: %[[VAL_17:.*]] = fir.load %[[VAL_9]] : !fir.ref<!fir.logical<4>>
-! CHECK: %[[VAL_18:.*]] = fir.convert %[[VAL_16]] : (!fir.logical<4>) -> i1
-! CHECK: %[[VAL_19:.*]] = fir.convert %[[VAL_17]] : (!fir.logical<4>) -> i1
-! CHECK: %[[VAL_20:.*]] = arith.cmpi ne, %[[VAL_18]], %[[VAL_19]] : i1
-! CHECK: %[[VAL_21:.*]] = fir.convert %[[VAL_20]] : (i1) -> !fir.logical<4>
-! CHECK: fir.store %[[VAL_21]] to %[[VAL_9]] : !fir.ref<!fir.logical<4>>
-! CHECK: omp.yield
-! CHECK: omp.terminator
-! CHECK: omp.terminator
-! CHECK: return
-
-subroutine simple_reduction_switch_order(y)
- logical :: x, y(100)
- x = .true.
- !$omp parallel
- !$omp do reduction(.neqv.:x)
- do i=1, 100
- x = y(i) .neqv. x
- end do
- !$omp end do
- !$omp end parallel
-end subroutine
-
-! CHECK-LABEL: func.func @_QPmultiple_reductions(
-! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.array<100x!fir.logical<4>>> {fir.bindc_name = "w"}) {
-! CHECK: %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFmultiple_reductionsEi"}
-! CHECK: %[[VAL_2:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFmultiple_reductionsEx"}
-! CHECK: %[[VAL_3:.*]] = fir.alloca !fir.logical<4> {bindc_name = "y", uniq_name = "_QFmultiple_reductionsEy"}
-! CHECK: %[[VAL_4:.*]] = fir.alloca !fir.logical<4> {bindc_name = "z", uniq_name = "_QFmultiple_reductionsEz"}
-! CHECK: %[[VAL_5:.*]] = arith.constant true
-! CHECK: %[[VAL_6:.*]] = fir.convert %[[VAL_5]] : (i1) -> !fir.logical<4>
-! CHECK: fir.store %[[VAL_6]] to %[[VAL_2]] : !fir.ref<!fir.logical<4>>
-! CHECK: %[[VAL_7:.*]] = arith.constant true
-! CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_7]] : (i1) -> !fir.logical<4>
-! CHECK: fir.store %[[VAL_8]] to %[[VAL_3]] : !fir.ref<!fir.logical<4>>
-! CHECK: %[[VAL_9:.*]] = arith.constant true
-! CHECK: %[[VAL_10:.*]] = fir.convert %[[VAL_9]] : (i1) -> !fir.logical<4>
-! CHECK: fir.store %[[VAL_10]] to %[[VAL_4]] : !fir.ref<!fir.logical<4>>
-! CHECK: omp.parallel {
-! CHECK: %[[VAL_11:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
-! CHECK: %[[VAL_12:.*]] = arith.constant 1 : i32
-! CHECK: %[[VAL_13:.*]] = arith.constant 100 : i32
-! CHECK: %[[VAL_14:.*]] = arith.constant 1 : i32
-! CHECK: omp.wsloop reduction(@neqv_reduction %[[VAL_2]] -> %[[VAL_15:.*]] : !fir.ref<!fir.logical<4>>, @neqv_reduction %[[VAL_3]] -> %[[VAL_16:.*]] : !fir.ref<!fir.logical<4>>, @neqv_reduction %[[VAL_4]] -> %[[VAL_17:.*]] : !fir.ref<!fir.logical<4>>) {
-! CHECK-NEXT: omp.loop_nest (%[[VAL_18:.*]]) : i32 = (%[[VAL_12]]) to (%[[VAL_13]]) inclusive step (%[[VAL_14]]) {
-! CHECK: fir.store %[[VAL_18]] to %[[VAL_11]] : !fir.ref<i32>
-! CHECK: %[[VAL_19:.*]] = fir.load %[[VAL_15]] : !fir.ref<!fir.logical<4>>
-! CHECK: %[[VAL_20:.*]] = fir.load %[[VAL_11]] : !fir.ref<i32>
-! CHECK: %[[VAL_21:.*]] = fir.convert %[[VAL_20]] : (i32) -> i64
-! CHECK: %[[VAL_22:.*]] = arith.constant 1 : i64
-! CHECK: %[[VAL_23:.*]] = arith.subi %[[VAL_21]], %[[VAL_22]] : i64
-! CHECK: %[[VAL_24:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_23]] : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
-! CHECK: %[[VAL_25:.*]] = fir.load %[[VAL_24]] : !fir.ref<!fir.logical<4>>
-! CHECK: %[[VAL_26:.*]] = fir.convert %[[VAL_19]] : (!fir.logical<4>) -> i1
-! CHECK: %[[VAL_27:.*]] = fir.convert %[[VAL_25]] : (!fir.logical<4>) -> i1
-! CHECK: %[[VAL_28:.*]] = arith.cmpi ne, %[[VAL_26]], %[[VAL_27]] : i1
-! CHECK: %[[VAL_29:.*]] = fir.convert %[[VAL_28]] : (i1) -> !fir.logical<4>
-! CHECK: fir.store %[[VAL_29]] to %[[VAL_15]] : !fir.ref<!fir.logical<4>>
-! CHECK: %[[VAL_30:.*]] = fir.load %[[VAL_16]] : !fir.ref<!fir.logical<4>>
-! CHECK: %[[VAL_31:.*]] = fir.load %[[VAL_11]] : !fir.ref<i32>
-! CHECK: %[[VAL_32:.*]] = fir.convert %[[VAL_31]] : (i32) -> i64
-! CHECK: %[[VAL_33:.*]] = arith.constant 1 : i64
-! CHECK: %[[VAL_34:.*]] = arith.subi %[[VAL_32]], %[[VAL_33]] : i64
-! CHECK: %[[VAL_35:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_34]] : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
-! CHECK: %[[VAL_36:.*]] = fir.load %[[VAL_35]] : !fir.ref<!fir.logical<4>>
-! CHECK: %[[VAL_37:.*]] = fir.convert %[[VAL_30]] : (!fir.logical<4>) -> i1
-! CHECK: %[[VAL_38:.*]] = fir.convert %[[VAL_36]] : (!fir.logical<4>) -> i1
-! CHECK: %[[VAL_39:.*]] = arith.cmpi ne, %[[VAL_37]], %[[VAL_38]] : i1
-! CHECK: %[[VAL_40:.*]] = fir.convert %[[VAL_39]] : (i1) -> !fir.logical<4>
-! CHECK: fir.store %[[VAL_40]] to %[[VAL_16]] : !fir.ref<!fir.logical<4>>
-! CHECK: %[[VAL_41:.*]] = fir.load %[[VAL_17]] : !fir.ref<!fir.logical<4>>
-! CHECK: %[[VAL_42:.*]] = fir.load %[[VAL_11]] : !fir.ref<i32>
-! CHECK: %[[VAL_43:.*]] = fir.convert %[[VAL_42]] : (i32) -> i64
-! CHECK: %[[VAL_44:.*]] = arith.constant 1 : i64
-! CHECK: %[[VAL_45:.*]] = arith.subi %[[VAL_43]], %[[VAL_44]] : i64
-! CHECK: %[[VAL_46:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_45]] : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
-! CHECK: %[[VAL_47:.*]] = fir.load %[[VAL_46]] : !fir.ref<!fir.logical<4>>
-! CHECK: %[[VAL_48:.*]] = fir.convert %[[VAL_41]] : (!fir.logical<4>) -> i1
-! CHECK: %[[VAL_49:.*]] = fir.convert %[[VAL_47]] : (!fir.logical<4>) -> i1
-! CHECK: %[[VAL_50:.*]] = arith.cmpi ne, %[[VAL_48]], %[[VAL_49]] : i1
-! CHECK: %[[VAL_51:.*]] = fir.convert %[[VAL_50]] : (i1) -> !fir.logical<4>
-! CHECK: fir.store %[[VAL_51]] to %[[VAL_17]] : !fir.ref<!fir.logical<4>>
-! CHECK: omp.yield
-! CHECK: omp.terminator
-! CHECK: omp.terminator
-! CHECK: return
-
-
-subroutine multiple_reductions(w)
- logical :: x,y,z,w(100)
- x = .true.
- y = .true.
- z = .true.
- !$omp parallel
- !$omp do reduction(.neqv.:x,y,z)
- do i=1, 100
- x = x .neqv. w(i)
- y = y .neqv. w(i)
- z = z .neqv. w(i)
- end do
- !$omp end do
- !$omp end parallel
-end subroutine
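The two deleted `.neqv.` tests pin down the same lowering; they differ only in whether the reduction travels by value or, under --force-byref-reduction, by reference. A condensed sketch of the two init-region shapes, distilled from the CHECK lines above (value names shortened for illustration):

  // by-value form (wsloop-reduction-logical-neqv.f90): yield the initial value
  %false = arith.constant false
  %init  = fir.convert %false : (i1) -> !fir.logical<4>
  omp.yield(%init : !fir.logical<4>)

  // by-ref form (wsloop-reduction-logical-neqv-byref.f90): park the value in a
  // temporary and yield its address instead
  %ref = fir.alloca !fir.logical<4>
  fir.store %init to %ref : !fir.ref<!fir.logical<4>>
  omp.yield(%ref : !fir.ref<!fir.logical<4>>)

The combiners follow the same split: the by-ref variant first loads both !fir.ref arguments, performs the arith.cmpi ne on the converted i1 values, and stores the result back through the first reference before yielding it.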
diff --git a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-max-byref.f90 b/flang/test/Lower/OpenMP/FIR/wsloop-reduction-max-byref.f90
deleted file mode 100644
index a4244d11a558..000000000000
--- a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-max-byref.f90
+++ /dev/null
@@ -1,95 +0,0 @@
-! RUN: bbc -emit-fir -hlfir=false -fopenmp --force-byref-reduction -o - %s 2>&1 | FileCheck %s
-! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp -mmlir --force-byref-reduction -o - %s 2>&1 | FileCheck %s
-
-!CHECK: omp.declare_reduction @max_byref_f32 : !fir.ref<f32>
-!CHECK-SAME: init {
-!CHECK: %[[MINIMUM_VAL:.*]] = arith.constant -3.40282347E+38 : f32
-!CHECK: %[[REF:.*]] = fir.alloca f32
-!CHECK: fir.store %[[MINIMUM_VAL]] to %[[REF]] : !fir.ref<f32>
-!CHECK: omp.yield(%[[REF]] : !fir.ref<f32>)
-!CHECK: combiner
-!CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<f32>, %[[ARG1:.*]]: !fir.ref<f32>):
-!CHECK: %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<f32>
-!CHECK: %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<f32>
-!CHECK: %[[RES:.*]] = arith.maxnumf %[[LD0]], %[[LD1]] {{.*}}: f32
-!CHECK: fir.store %[[RES]] to %[[ARG0]] : !fir.ref<f32>
-!CHECK: omp.yield(%[[ARG0]] : !fir.ref<f32>)
-
-!CHECK-LABEL: omp.declare_reduction @max_byref_i32 : !fir.ref<i32>
-!CHECK-SAME: init {
-!CHECK: %[[MINIMUM_VAL:.*]] = arith.constant -2147483648 : i32
-!CHECK: fir.store %[[MINIMUM_VAL]] to %[[REF]] : !fir.ref<i32>
-!CHECK: omp.yield(%[[REF]] : !fir.ref<i32>)
-!CHECK: combiner
-!CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<i32>, %[[ARG1:.*]]: !fir.ref<i32>):
-!CHECK: %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<i32>
-!CHECK: %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<i32>
-!CHECK: %[[RES:.*]] = arith.maxsi %[[LD0]], %[[LD1]] : i32
-!CHECK: fir.store %[[RES]] to %[[ARG0]] : !fir.ref<i32>
-!CHECK: omp.yield(%[[ARG0]] : !fir.ref<i32>)
-
-!CHECK-LABEL: @_QPreduction_max_int
-!CHECK-SAME: %[[Y_BOX:.*]]: !fir.box<!fir.array<?xi32>>
-!CHECK: %[[X_REF:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFreduction_max_intEx"}
-!CHECK: omp.parallel
-!CHECK: omp.wsloop byref reduction(@max_byref_i32 %[[X_REF]] -> %[[PRV:.+]] : !fir.ref<i32>)
-!CHECK-NEXT: omp.loop_nest
-!CHECK: %[[LPRV:.+]] = fir.load %[[PRV]] : !fir.ref<i32>
-!CHECK: %[[Y_I_REF:.*]] = fir.coordinate_of %[[Y_BOX]]
-!CHECK: %[[Y_I:.*]] = fir.load %[[Y_I_REF]] : !fir.ref<i32>
-!CHECK: %[[RES:.+]] = arith.cmpi sgt, %[[LPRV]], %[[Y_I]] : i32
-!CHECK: %[[SEL:.+]] = arith.select %[[RES]], %[[LPRV]], %[[Y_I]]
-!CHECK: fir.store %[[SEL]] to %[[PRV]] : !fir.ref<i32>
-!CHECK: omp.yield
-!CHECK: omp.terminator
-!CHECK: omp.terminator
-
-!CHECK-LABEL: @_QPreduction_max_real
-!CHECK-SAME: %[[Y_BOX:.*]]: !fir.box<!fir.array<?xf32>>
-!CHECK: %[[X_REF:.*]] = fir.alloca f32 {bindc_name = "x", uniq_name = "_QFreduction_max_realEx"}
-!CHECK: omp.parallel
-!CHECK: omp.wsloop byref reduction(@max_byref_f32 %[[X_REF]] -> %[[PRV:.+]] : !fir.ref<f32>)
-!CHECK-NEXT: omp.loop_nest
-!CHECK: %[[LPRV:.+]] = fir.load %[[PRV]] : !fir.ref<f32>
-!CHECK: %[[Y_I_REF:.*]] = fir.coordinate_of %[[Y_BOX]]
-!CHECK: %[[Y_I:.*]] = fir.load %[[Y_I_REF]] : !fir.ref<f32>
-!CHECK: %[[RES:.+]] = arith.cmpf ogt, %[[Y_I]], %[[LPRV]] {{.*}} : f32
-!CHECK: omp.yield
-!CHECK: omp.terminator
-!CHECK: omp.terminator
-
-subroutine reduction_max_int(y)
- integer :: x, y(:)
- x = 0
- !$omp parallel
- !$omp do reduction(max:x)
- do i=1, 100
- x = max(x, y(i))
- end do
- !$omp end do
- !$omp end parallel
- print *, x
-end subroutine
-
-subroutine reduction_max_real(y)
- real :: x, y(:)
- x = 0.0
- !$omp parallel
- !$omp do reduction(max:x)
- do i=1, 100
- x = max(y(i), x)
- end do
- !$omp end do
- !$omp end parallel
- print *, x
-
- !$omp parallel
- !$omp do reduction(max:x)
- do i=1, 100
- !CHECK-NOT: omp.reduction
- if (y(i) .gt. x) x = y(i)
- end do
- !$omp end do
- !$omp end parallel
- print *, x
-end subroutine
diff --git a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-max.f90 b/flang/test/Lower/OpenMP/FIR/wsloop-reduction-max.f90
deleted file mode 100644
index e000bc36ca3f..000000000000
--- a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-max.f90
+++ /dev/null
@@ -1,84 +0,0 @@
-! RUN: bbc -emit-fir -hlfir=false -fopenmp -o - %s 2>&1 | FileCheck %s
-! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s
-
-!CHECK: omp.declare_reduction @[[MAX_DECLARE_F:.*]] : f32 init {
-!CHECK: %[[MINIMUM_VAL_F:.*]] = arith.constant -3.40282347E+38 : f32
-!CHECK: omp.yield(%[[MINIMUM_VAL_F]] : f32)
-!CHECK: combiner
-!CHECK: ^bb0(%[[ARG0_F:.*]]: f32, %[[ARG1_F:.*]]: f32):
-!CHECK: %[[COMB_VAL_F:.*]] = arith.maxnumf %[[ARG0_F]], %[[ARG1_F]] {{.*}}: f32
-!CHECK: omp.yield(%[[COMB_VAL_F]] : f32)
-
-!CHECK: omp.declare_reduction @[[MAX_DECLARE_I:.*]] : i32 init {
-!CHECK: %[[MINIMUM_VAL_I:.*]] = arith.constant -2147483648 : i32
-!CHECK: omp.yield(%[[MINIMUM_VAL_I]] : i32)
-!CHECK: combiner
-!CHECK: ^bb0(%[[ARG0_I:.*]]: i32, %[[ARG1_I:.*]]: i32):
-!CHECK: %[[COMB_VAL_I:.*]] = arith.maxsi %[[ARG0_I]], %[[ARG1_I]] : i32
-!CHECK: omp.yield(%[[COMB_VAL_I]] : i32)
-
-!CHECK-LABEL: @_QPreduction_max_int
-!CHECK-SAME: %[[Y_BOX:.*]]: !fir.box<!fir.array<?xi32>>
-!CHECK: %[[X_REF:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFreduction_max_intEx"}
-!CHECK: omp.parallel
-!CHECK: omp.wsloop reduction(@[[MAX_DECLARE_I]] %[[X_REF]] -> %[[PRV:.+]] : !fir.ref<i32>)
-!CHECK-NEXT: omp.loop_nest
-!CHECK: %[[LPRV:.+]] = fir.load %[[PRV]] : !fir.ref<i32>
-!CHECK: %[[Y_I_REF:.*]] = fir.coordinate_of %[[Y_BOX]]
-!CHECK: %[[Y_I:.*]] = fir.load %[[Y_I_REF]] : !fir.ref<i32>
-!CHECK: %[[RES:.+]] = arith.cmpi sgt, %[[LPRV]], %[[Y_I]] : i32
-!CHECK: %[[SEL:.+]] = arith.select %[[RES]], %[[LPRV]], %[[Y_I]]
-!CHECK: fir.store %[[SEL]] to %[[PRV]] : !fir.ref<i32>
-!CHECK: omp.yield
-!CHECK: omp.terminator
-!CHECK: omp.terminator
-
-!CHECK-LABEL: @_QPreduction_max_real
-!CHECK-SAME: %[[Y_BOX:.*]]: !fir.box<!fir.array<?xf32>>
-!CHECK: %[[X_REF:.*]] = fir.alloca f32 {bindc_name = "x", uniq_name = "_QFreduction_max_realEx"}
-!CHECK: omp.parallel
-!CHECK: omp.wsloop reduction(@[[MAX_DECLARE_F]] %[[X_REF]] -> %[[PRV:.+]] : !fir.ref<f32>)
-!CHECK-NEXT: omp.loop_nest
-!CHECK: %[[LPRV:.+]] = fir.load %[[PRV]] : !fir.ref<f32>
-!CHECK: %[[Y_I_REF:.*]] = fir.coordinate_of %[[Y_BOX]]
-!CHECK: %[[Y_I:.*]] = fir.load %[[Y_I_REF]] : !fir.ref<f32>
-!CHECK: %[[RES:.+]] = arith.cmpf ogt, %[[Y_I]], %[[LPRV]] {{.*}} : f32
-!CHECK: omp.yield
-!CHECK: omp.terminator
-!CHECK: omp.terminator
-
-subroutine reduction_max_int(y)
- integer :: x, y(:)
- x = 0
- !$omp parallel
- !$omp do reduction(max:x)
- do i=1, 100
- x = max(x, y(i))
- end do
- !$omp end do
- !$omp end parallel
- print *, x
-end subroutine
-
-subroutine reduction_max_real(y)
- real :: x, y(:)
- x = 0.0
- !$omp parallel
- !$omp do reduction(max:x)
- do i=1, 100
- x = max(y(i), x)
- end do
- !$omp end do
- !$omp end parallel
- print *, x
-
- !$omp parallel
- !$omp do reduction(max:x)
- do i=1, 100
- !CHECK-NOT: omp.reduction
- if (y(i) .gt. x) x = y(i)
- end do
- !$omp end do
- !$omp end parallel
- print *, x
-end subroutine
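Both max tests (by-ref above, by-value here) assert the same two ingredients: the reduction is seeded with the type's most negative value and combined with a max operation. A condensed by-value sketch for f32, distilled from the CHECK lines (names shortened for illustration):

  omp.declare_reduction @max_f32 : f32 init {
  ^bb0(%arg: f32):
    // most negative finite f32, so any input can only raise the result
    %seed = arith.constant -3.40282347E+38 : f32
    omp.yield(%seed : f32)
  } combiner {
  ^bb0(%a: f32, %b: f32):
    %r = arith.maxnumf %a, %b : f32
    omp.yield(%r : f32)
  }

The i32 variant seeds with -2147483648 and combines with arith.maxsi. Inside the loop body the max intrinsic is materialized as arith.cmpi sgt plus arith.select, which is also why both tests check that no leftover omp.reduction op appears.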
diff --git a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-min-byref.f90 b/flang/test/Lower/OpenMP/FIR/wsloop-reduction-min-byref.f90
deleted file mode 100644
index 17435e1a194c..000000000000
--- a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-min-byref.f90
+++ /dev/null
@@ -1,95 +0,0 @@
-! RUN: bbc -emit-fir -hlfir=false -fopenmp --force-byref-reduction -o - %s 2>&1 | FileCheck %s
-! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -mmlir --force-byref-reduction -fopenmp -o - %s 2>&1 | FileCheck %s
-
-!CHECK: omp.declare_reduction @min_byref_f32 : !fir.ref<f32>
-!CHECK-SAME: init {
-!CHECK: %[[MAXIMUM_VAL:.*]] = arith.constant 3.40282347E+38 : f32
-!CHECK: %[[REF:.*]] = fir.alloca f32
-!CHECK: fir.store %[[MAXIMUM_VAL]] to %[[REF]] : !fir.ref<f32>
-!CHECK: omp.yield(%[[REF]] : !fir.ref<f32>)
-!CHECK: combiner
-!CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<f32>, %[[ARG1:.*]]: !fir.ref<f32>):
-!CHECK: %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<f32>
-!CHECK: %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<f32>
-!CHECK: %[[RES:.*]] = arith.minnumf %[[LD0]], %[[LD1]] {{.*}}: f32
-!CHECK: fir.store %[[RES]] to %[[ARG0]] : !fir.ref<f32>
-!CHECK: omp.yield(%[[ARG0]] : !fir.ref<f32>)
-
-!CHECK-LABEL: omp.declare_reduction @min_byref_i32 : !fir.ref<i32>
-!CHECK-SAME: init {
-!CHECK: %[[MAXIMUM_VAL:.*]] = arith.constant 2147483647 : i32
-!CHECK: fir.store %[[MAXIMUM_VAL]] to %[[REF]] : !fir.ref<i32>
-!CHECK: omp.yield(%[[REF]] : !fir.ref<i32>)
-!CHECK: combiner
-!CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<i32>, %[[ARG1:.*]]: !fir.ref<i32>):
-!CHECK: %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<i32>
-!CHECK: %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<i32>
-!CHECK: %[[RES:.*]] = arith.minsi %[[LD0]], %[[LD1]] : i32
-!CHECK: fir.store %[[RES]] to %[[ARG0]] : !fir.ref<i32>
-!CHECK: omp.yield(%[[ARG0]] : !fir.ref<i32>)
-
-!CHECK-LABEL: @_QPreduction_min_int
-!CHECK-SAME: %[[Y_BOX:.*]]: !fir.box<!fir.array<?xi32>>
-!CHECK: %[[X_REF:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFreduction_min_intEx"}
-!CHECK: omp.parallel
-!CHECK: omp.wsloop byref reduction(@min_byref_i32 %[[X_REF]] -> %[[PRV:.+]] : !fir.ref<i32>)
-!CHECK-NEXT: omp.loop_nest
-!CHECK: %[[LPRV:.+]] = fir.load %[[PRV]] : !fir.ref<i32>
-!CHECK: %[[Y_I_REF:.*]] = fir.coordinate_of %[[Y_BOX]]
-!CHECK: %[[Y_I:.*]] = fir.load %[[Y_I_REF]] : !fir.ref<i32>
-!CHECK: %[[RES:.+]] = arith.cmpi slt, %[[LPRV]], %[[Y_I]] : i32
-!CHECK: %[[SEL:.+]] = arith.select %[[RES]], %[[LPRV]], %[[Y_I]]
-!CHECK: fir.store %[[SEL]] to %[[PRV]] : !fir.ref<i32>
-!CHECK: omp.yield
-!CHECK: omp.terminator
-!CHECK: omp.terminator
-
-!CHECK-LABEL: @_QPreduction_min_real
-!CHECK-SAME: %[[Y_BOX:.*]]: !fir.box<!fir.array<?xf32>>
-!CHECK: %[[X_REF:.*]] = fir.alloca f32 {bindc_name = "x", uniq_name = "_QFreduction_min_realEx"}
-!CHECK: omp.parallel
-!CHECK: omp.wsloop byref reduction(@min_byref_f32 %[[X_REF]] -> %[[PRV:.+]] : !fir.ref<f32>)
-!CHECK-NEXT: omp.loop_nest
-!CHECK: %[[LPRV:.+]] = fir.load %[[PRV]] : !fir.ref<f32>
-!CHECK: %[[Y_I_REF:.*]] = fir.coordinate_of %[[Y_BOX]]
-!CHECK: %[[Y_I:.*]] = fir.load %[[Y_I_REF]] : !fir.ref<f32>
-!CHECK: %[[RES:.+]] = arith.cmpf ogt, %[[Y_I]], %[[LPRV]] {{.*}} : f32
-!CHECK: omp.yield
-!CHECK: omp.terminator
-!CHECK: omp.terminator
-
-subroutine reduction_min_int(y)
- integer :: x, y(:)
- x = 0
- !$omp parallel
- !$omp do reduction(min:x)
- do i=1, 100
- x = min(x, y(i))
- end do
- !$omp end do
- !$omp end parallel
- print *, x
-end subroutine
-
-subroutine reduction_min_real(y)
- real :: x, y(:)
- x = 0.0
- !$omp parallel
- !$omp do reduction(min:x)
- do i=1, 100
- x = min(y(i), x)
- end do
- !$omp end do
- !$omp end parallel
- print *, x
-
- !$omp parallel
- !$omp do reduction(min:x)
- do i=1, 100
- !CHECK-NOT: omp.reduction
- if (y(i) .gt. x) x = y(i)
- end do
- !$omp end do
- !$omp end parallel
- print *, x
-end subroutine
diff --git a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-min.f90 b/flang/test/Lower/OpenMP/FIR/wsloop-reduction-min.f90
deleted file mode 100644
index 1d18ece7297d..000000000000
--- a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-min.f90
+++ /dev/null
@@ -1,84 +0,0 @@
-! RUN: bbc -emit-fir -hlfir=false -fopenmp -o - %s 2>&1 | FileCheck %s
-! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s
-
-!CHECK: omp.declare_reduction @[[MIN_DECLARE_F:.*]] : f32 init {
-!CHECK: %[[MAXIMUM_VAL_F:.*]] = arith.constant 3.40282347E+38 : f32
-!CHECK: omp.yield(%[[MAXIMUM_VAL_F]] : f32)
-!CHECK: combiner
-!CHECK: ^bb0(%[[ARG0_F:.*]]: f32, %[[ARG1_F:.*]]: f32):
-!CHECK: %[[COMB_VAL_F:.*]] = arith.minnumf %[[ARG0_F]], %[[ARG1_F]] {{.*}}: f32
-!CHECK: omp.yield(%[[COMB_VAL_F]] : f32)
-
-!CHECK: omp.declare_reduction @[[MIN_DECLARE_I:.*]] : i32 init {
-!CHECK: %[[MAXIMUM_VAL_I:.*]] = arith.constant 2147483647 : i32
-!CHECK: omp.yield(%[[MAXIMUM_VAL_I]] : i32)
-!CHECK: combiner
-!CHECK: ^bb0(%[[ARG0_I:.*]]: i32, %[[ARG1_I:.*]]: i32):
-!CHECK: %[[COMB_VAL_I:.*]] = arith.minsi %[[ARG0_I]], %[[ARG1_I]] : i32
-!CHECK: omp.yield(%[[COMB_VAL_I]] : i32)
-
-!CHECK-LABEL: @_QPreduction_min_int
-!CHECK-SAME: %[[Y_BOX:.*]]: !fir.box<!fir.array<?xi32>>
-!CHECK: %[[X_REF:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFreduction_min_intEx"}
-!CHECK: omp.parallel
-!CHECK: omp.wsloop reduction(@[[MIN_DECLARE_I]] %[[X_REF]] -> %[[PRV:.+]] : !fir.ref<i32>)
-!CHECK-NEXT: omp.loop_nest
-!CHECK: %[[LPRV:.+]] = fir.load %[[PRV]] : !fir.ref<i32>
-!CHECK: %[[Y_I_REF:.*]] = fir.coordinate_of %[[Y_BOX]]
-!CHECK: %[[Y_I:.*]] = fir.load %[[Y_I_REF]] : !fir.ref<i32>
-!CHECK: %[[RES:.+]] = arith.cmpi slt, %[[LPRV]], %[[Y_I]] : i32
-!CHECK: %[[SEL:.+]] = arith.select %[[RES]], %[[LPRV]], %[[Y_I]]
-!CHECK: fir.store %[[SEL]] to %[[PRV]] : !fir.ref<i32>
-!CHECK: omp.yield
-!CHECK: omp.terminator
-!CHECK: omp.terminator
-
-!CHECK-LABEL: @_QPreduction_min_real
-!CHECK-SAME: %[[Y_BOX:.*]]: !fir.box<!fir.array<?xf32>>
-!CHECK: %[[X_REF:.*]] = fir.alloca f32 {bindc_name = "x", uniq_name = "_QFreduction_min_realEx"}
-!CHECK: omp.parallel
-!CHECK: omp.wsloop reduction(@[[MIN_DECLARE_F]] %[[X_REF]] -> %[[PRV:.+]] : !fir.ref<f32>)
-!CHECK-NEXT: omp.loop_nest
-!CHECK: %[[LPRV:.+]] = fir.load %[[PRV]] : !fir.ref<f32>
-!CHECK: %[[Y_I_REF:.*]] = fir.coordinate_of %[[Y_BOX]]
-!CHECK: %[[Y_I:.*]] = fir.load %[[Y_I_REF]] : !fir.ref<f32>
-!CHECK: %[[RES:.+]] = arith.cmpf ogt, %[[Y_I]], %[[LPRV]] {{.*}} : f32
-!CHECK: omp.yield
-!CHECK: omp.terminator
-!CHECK: omp.terminator
-
-subroutine reduction_min_int(y)
- integer :: x, y(:)
- x = 0
- !$omp parallel
- !$omp do reduction(min:x)
- do i=1, 100
- x = min(x, y(i))
- end do
- !$omp end do
- !$omp end parallel
- print *, x
-end subroutine
-
-subroutine reduction_min_real(y)
- real :: x, y(:)
- x = 0.0
- !$omp parallel
- !$omp do reduction(min:x)
- do i=1, 100
- x = min(y(i), x)
- end do
- !$omp end do
- !$omp end parallel
- print *, x
-
- !$omp parallel
- !$omp do reduction(min:x)
- do i=1, 100
- !CHECK-NOT: omp.reduction
- if (y(i) .gt. x) x = y(i)
- end do
- !$omp end do
- !$omp end parallel
- print *, x
-end subroutine
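The min tests mirror the max pair with seed and combiner flipped; for completeness, a sketch of the i32 by-value declaration (the f32 one seeds with 3.40282347E+38 and combines with arith.minnumf):

  omp.declare_reduction @min_i32 : i32 init {
  ^bb0(%arg: i32):
    %seed = arith.constant 2147483647 : i32  // largest i32: any input can only lower it
    omp.yield(%seed : i32)
  } combiner {
  ^bb0(%a: i32, %b: i32):
    %r = arith.minsi %a, %b : i32
    omp.yield(%r : i32)
  }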
diff --git a/flang/test/Lower/OpenMP/FIR/wsloop-simd.f90 b/flang/test/Lower/OpenMP/FIR/wsloop-simd.f90
deleted file mode 100644
index 751e4c8c5709..000000000000
--- a/flang/test/Lower/OpenMP/FIR/wsloop-simd.f90
+++ /dev/null
@@ -1,37 +0,0 @@
-! This test checks lowering of OpenMP DO Directive (Worksharing) with
-! simd schedule modifier.
-
-! RUN: bbc -fopenmp -emit-fir -hlfir=false %s -o - | FileCheck %s
-
-program wsloop_dynamic
- integer :: i
-!CHECK-LABEL: func @_QQmain()
-
-!$OMP PARALLEL
-!CHECK: omp.parallel {
-
-!$OMP DO SCHEDULE(simd: runtime)
-!CHECK: %[[WS_LB:.*]] = arith.constant 1 : i32
-!CHECK: %[[WS_UB:.*]] = arith.constant 9 : i32
-!CHECK: %[[WS_STEP:.*]] = arith.constant 1 : i32
-!CHECK: omp.wsloop schedule(runtime, simd) nowait {
-!CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[WS_LB]]) to (%[[WS_UB]]) inclusive step (%[[WS_STEP]]) {
-!CHECK: fir.store %[[I]] to %[[STORE:.*]] : !fir.ref<i32>
-
- do i=1, 9
- print*, i
-!CHECK: %[[RTBEGIN:.*]] = fir.call @_FortranAioBeginExternalListOutput
-!CHECK: %[[LOAD:.*]] = fir.load %[[STORE]] : !fir.ref<i32>
-!CHECK: fir.call @_FortranAioOutputInteger32(%[[RTBEGIN]], %[[LOAD]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
-!CHECK: fir.call @_FortranAioEndIoStatement(%[[RTBEGIN]]) {{.*}}: (!fir.ref<i8>) -> i32
- end do
-!CHECK: omp.yield
-!CHECK: }
-!CHECK: omp.terminator
-!CHECK: }
-!CHECK: omp.terminator
-!CHECK: }
-
-!$OMP END DO NOWAIT
-!$OMP END PARALLEL
-end
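The deleted wsloop-simd test established where the schedule(simd: runtime) modifier ends up: the schedule pair and nowait sit on the worksharing op, while the bounds belong to the nested loop op. A minimal sketch of the expected shape, assuming i32 bounds as in the test:

  omp.wsloop schedule(runtime, simd) nowait {
    omp.loop_nest (%i) : i32 = (%lb) to (%ub) inclusive step (%step) {
      // body: store %i to the pinned alloca, then the lowered PRINT statement
      omp.yield
    }
    omp.terminator
  }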
diff --git a/flang/test/Lower/OpenMP/FIR/wsloop-variable.f90 b/flang/test/Lower/OpenMP/FIR/wsloop-variable.f90
deleted file mode 100644
index 4bd876012278..000000000000
--- a/flang/test/Lower/OpenMP/FIR/wsloop-variable.f90
+++ /dev/null
@@ -1,187 +0,0 @@
-! This test checks lowering of OpenMP DO Directive (Worksharing) for different
-! types of loop iteration variable, lower bound, upper bound, and step.
-
-!REQUIRES: shell
-!RUN: bbc -fopenmp -emit-fir -hlfir=false %s -o - 2>&1 | FileCheck %s
-
-!CHECK: OpenMP loop iteration variable cannot have more than 64 bits size and will be narrowed into 64 bits.
-
-program wsloop_variable
- integer(kind=1) :: i1_lb, i1_ub
- integer(kind=2) :: i2, i2_ub, i2_s
- integer(kind=4) :: i4_s
- integer(kind=8) :: i8, i8_s
- integer(kind=16) :: i16, i16_lb
- real :: x
-
-!CHECK: %[[TMP0:.*]] = arith.constant 1 : i32
-!CHECK: %[[TMP1:.*]] = arith.constant 100 : i32
-!CHECK: %[[TMP2:.*]] = fir.convert %[[TMP0]] : (i32) -> i64
-!CHECK: %[[TMP3:.*]] = fir.convert %{{.*}} : (i8) -> i64
-!CHECK: %[[TMP4:.*]] = fir.convert %{{.*}} : (i16) -> i64
-!CHECK: %[[TMP5:.*]] = fir.convert %{{.*}} : (i128) -> i64
-!CHECK: %[[TMP6:.*]] = fir.convert %[[TMP1]] : (i32) -> i64
-!CHECK: %[[TMP7:.*]] = fir.convert %{{.*}} : (i32) -> i64
-!CHECK: omp.wsloop {
-!CHECK-NEXT: omp.loop_nest (%[[ARG0:.*]], %[[ARG1:.*]]) : i64 = (%[[TMP2]], %[[TMP5]]) to (%[[TMP3]], %[[TMP6]]) inclusive step (%[[TMP4]], %[[TMP7]]) {
-!CHECK: %[[ARG0_I16:.*]] = fir.convert %[[ARG0]] : (i64) -> i16
-!CHECK: fir.store %[[ARG0_I16]] to %[[STORE_IV0:.*]] : !fir.ref<i16>
-!CHECK: fir.store %[[ARG1]] to %[[STORE_IV1:.*]] : !fir.ref<i64>
-!CHECK: %[[LOAD_IV0:.*]] = fir.load %[[STORE_IV0]] : !fir.ref<i16>
-!CHECK: %[[LOAD_IV0_I64:.*]] = fir.convert %[[LOAD_IV0]] : (i16) -> i64
-!CHECK: %[[LOAD_IV1:.*]] = fir.load %[[STORE_IV1]] : !fir.ref<i64>
-!CHECK: %[[TMP10:.*]] = arith.addi %[[LOAD_IV0_I64]], %[[LOAD_IV1]] : i64
-!CHECK: %[[TMP11:.*]] = fir.convert %[[TMP10]] : (i64) -> f32
-!CHECK: fir.store %[[TMP11]] to %{{.*}} : !fir.ref<f32>
-!CHECK: omp.yield
-!CHECK: }
-!CHECK: omp.terminator
-!CHECK: }
-
- !$omp do collapse(2)
- do i2 = 1, i1_ub, i2_s
- do i8 = i16_lb, 100, i4_s
- x = i2 + i8
- end do
- end do
- !$omp end do
-
-!CHECK: %[[TMP12:.*]] = arith.constant 1 : i32
-!CHECK: %[[TMP13:.*]] = fir.convert %{{.*}} : (i8) -> i32
-!CHECK: %[[TMP14:.*]] = fir.convert %{{.*}} : (i64) -> i32
-!CHECK: omp.wsloop {
-!CHECK-NEXT: omp.loop_nest (%[[ARG0:.*]]) : i32 = (%[[TMP12]]) to (%[[TMP13]]) inclusive step (%[[TMP14]]) {
-!CHECK: %[[ARG0_I16:.*]] = fir.convert %[[ARG0]] : (i32) -> i16
-!CHECK: fir.store %[[ARG0_I16]] to %[[STORE3:.*]] : !fir.ref<i16>
-!CHECK: %[[LOAD3:.*]] = fir.load %[[STORE3]] : !fir.ref<i16>
-!CHECK: %[[TMP16:.*]] = fir.convert %[[LOAD3]] : (i16) -> f32
-!CHECK: fir.store %[[TMP16]] to %{{.*}} : !fir.ref<f32>
-!CHECK: omp.yield
-!CHECK: }
-!CHECK: omp.terminator
-!CHECK: }
-
- !$omp do
- do i2 = 1, i1_ub, i8_s
- x = i2
- end do
- !$omp end do
-
-!CHECK: %[[TMP17:.*]] = fir.convert %{{.*}} : (i8) -> i64
-!CHECK: %[[TMP18:.*]] = fir.convert %{{.*}} : (i16) -> i64
-!CHECK: %[[TMP19:.*]] = fir.convert %{{.*}} : (i32) -> i64
-!CHECK: omp.wsloop {
-!CHECK-NEXT: omp.loop_nest (%[[ARG1:.*]]) : i64 = (%[[TMP17]]) to (%[[TMP18]]) inclusive step (%[[TMP19]]) {
-!CHECK: %[[ARG1_I128:.*]] = fir.convert %[[ARG1]] : (i64) -> i128
-!CHECK: fir.store %[[ARG1_I128]] to %[[STORE4:.*]] : !fir.ref<i128>
-!CHECK: %[[LOAD4:.*]] = fir.load %[[STORE4]] : !fir.ref<i128>
-!CHECK: %[[TMP21:.*]] = fir.convert %[[LOAD4]] : (i128) -> f32
-!CHECK: fir.store %[[TMP21]] to %{{.*}} : !fir.ref<f32>
-!CHECK: omp.yield
-!CHECK: }
-!CHECK: omp.terminator
-!CHECK: }
-
- !$omp do
- do i16 = i1_lb, i2_ub, i4_s
- x = i16
- end do
- !$omp end do
-
-end program wsloop_variable
-
-!CHECK-LABEL: func.func @_QPwsloop_variable_sub() {
-!CHECK: %[[IV2:.*]] = fir.alloca i8 {adapt.valuebyref, pinned}
-!CHECK: %[[VAL_0:.*]] = fir.alloca i128 {bindc_name = "i16_lb", uniq_name = "_QFwsloop_variable_subEi16_lb"}
-!CHECK: %[[VAL_1:.*]] = fir.alloca i8 {bindc_name = "i1_ub", uniq_name = "_QFwsloop_variable_subEi1_ub"}
-!CHECK: %[[VAL_2:.*]] = fir.alloca i16 {bindc_name = "i2", uniq_name = "_QFwsloop_variable_subEi2"}
-!CHECK: %[[VAL_3:.*]] = fir.alloca i16 {bindc_name = "i2_s", uniq_name = "_QFwsloop_variable_subEi2_s"}
-!CHECK: %[[VAL_4:.*]] = fir.alloca i32 {bindc_name = "i4_s", uniq_name = "_QFwsloop_variable_subEi4_s"}
-!CHECK: %[[VAL_5:.*]] = fir.alloca i64 {bindc_name = "i8", uniq_name = "_QFwsloop_variable_subEi8"}
-!CHECK: %[[J1:.*]] = fir.alloca i8 {bindc_name = "j1", uniq_name = "_QFwsloop_variable_subEj1"}
-!CHECK: %[[VAL_6:.*]] = fir.alloca f32 {bindc_name = "x", uniq_name = "_QFwsloop_variable_subEx"}
-!CHECK: %[[VAL_7:.*]] = arith.constant 1 : i32
-!CHECK: %[[VAL_8:.*]] = fir.load %[[VAL_1]] : !fir.ref<i8>
-!CHECK: %[[VAL_9:.*]] = fir.load %[[VAL_3]] : !fir.ref<i16>
-!CHECK: %[[VAL_10:.*]] = fir.convert %[[VAL_8]] : (i8) -> i32
-!CHECK: %[[VAL_11:.*]] = fir.convert %[[VAL_9]] : (i16) -> i32
-!CHECK: omp.wsloop {
-!CHECK-NEXT: omp.loop_nest (%[[ARG0:.*]]) : i32 = (%[[VAL_7]]) to (%[[VAL_10]]) inclusive step (%[[VAL_11]]) {
-!CHECK: %[[ARG0_I16:.*]] = fir.convert %[[ARG0]] : (i32) -> i16
-!CHECK: fir.store %[[ARG0_I16]] to %[[STORE_IV:.*]] : !fir.ref<i16>
-!CHECK: %[[VAL_13:.*]] = fir.load %[[VAL_0]] : !fir.ref<i128>
-!CHECK: %[[VAL_14:.*]] = fir.convert %[[VAL_13]] : (i128) -> index
-!CHECK: %[[VAL_15:.*]] = arith.constant 100 : i32
-!CHECK: %[[VAL_16:.*]] = fir.convert %[[VAL_15]] : (i32) -> index
-!CHECK: %[[VAL_17:.*]] = fir.load %[[VAL_4]] : !fir.ref<i32>
-!CHECK: %[[VAL_18:.*]] = fir.convert %[[VAL_17]] : (i32) -> index
-!CHECK: %[[LB:.*]] = fir.convert %[[VAL_14]] : (index) -> i64
-!CHECK: %[[VAL_19:.*]]:2 = fir.do_loop %[[VAL_20:[^ ]*]] =
-!CHECK-SAME: %[[VAL_14]] to %[[VAL_16]] step %[[VAL_18]]
-!CHECK-SAME: iter_args(%[[IV:.*]] = %[[LB]]) -> (index, i64) {
-!CHECK: fir.store %[[IV]] to %[[VAL_5]] : !fir.ref<i64>
-!CHECK: %[[LOAD_IV:.*]] = fir.load %[[STORE_IV]] : !fir.ref<i16>
-!CHECK: %[[VAL_22:.*]] = fir.convert %[[LOAD_IV]] : (i16) -> i64
-!CHECK: %[[VAL_23:.*]] = fir.load %[[VAL_5]] : !fir.ref<i64>
-!CHECK: %[[VAL_24:.*]] = arith.addi %[[VAL_22]], %[[VAL_23]] : i64
-!CHECK: %[[VAL_25:.*]] = fir.convert %[[VAL_24]] : (i64) -> f32
-!CHECK: fir.store %[[VAL_25]] to %[[VAL_6]] : !fir.ref<f32>
-!CHECK: %[[VAL_26:.*]] = arith.addi %[[VAL_20]], %[[VAL_18]] : index
-!CHECK: %[[STEPCAST:.*]] = fir.convert %[[VAL_18]] : (index) -> i64
-!CHECK: %[[IVLOAD:.*]] = fir.load %[[VAL_5]] : !fir.ref<i64>
-!CHECK: %[[IVINC:.*]] = arith.addi %[[IVLOAD]], %[[STEPCAST]]
-!CHECK: fir.result %[[VAL_26]], %[[IVINC]] : index, i64
-!CHECK: }
-!CHECK: fir.store %[[VAL_19]]#1 to %[[VAL_5]] : !fir.ref<i64>
-!CHECK: omp.yield
-!CHECK: }
-!CHECK: omp.terminator
-!CHECK: }
-
-subroutine wsloop_variable_sub
- integer(kind=1) :: i1, i1_ub, j1
- integer(kind=2) :: i2, i2_s
- integer(kind=4) :: i4_s
- integer(kind=8) :: i8
- integer(kind=16) :: i16_lb
- real :: x
-
- !$omp do
- do i2 = 1, i1_ub, i2_s
- do i8 = i16_lb, 100, i4_s
- x = i2 + i8
- end do
- end do
- !$omp end do
-
-!CHECK: %[[C1:.*]] = arith.constant 1 : i32
-!CHECK: %[[C10:.*]] = arith.constant 10 : i32
-!CHECK: %[[C1_2:.*]] = arith.constant 1 : i32
-!CHECK: omp.wsloop {
-!CHECK-NEXT: omp.loop_nest (%[[ARG0:.*]]) : i32 = (%[[C1]]) to (%[[C10]]) inclusive step (%[[C1_2]]) {
-!CHECK: %[[ARG0_I8:.*]] = fir.convert %[[ARG0]] : (i32) -> i8
-!CHECK: fir.store %[[ARG0_I8]] to %[[IV2]] : !fir.ref<i8>
-!CHECK: %[[IV2LOAD:.*]] = fir.load %[[IV2]] : !fir.ref<i8>
-!CHECK: %[[J1LOAD:.*]] = fir.load %[[J1]] : !fir.ref<i8>
-!CHECK: %[[VAL_27:.*]] = arith.cmpi eq, %[[IV2LOAD]], %[[J1LOAD]] : i8
-!CHECK: fir.if %[[VAL_27]] {
-!CHECK: } else {
-!CHECK: }
-!CHECK: omp.yield
-!CHECK: }
-!CHECK: omp.terminator
-!CHECK: }
-
- j1 = 5
- !$omp do
- do i1 = 1, 10
- if (i1 .eq. j1) then
- print *, "EQ"
- end if
- end do
- !$omp end do
-
-!CHECK: return
-!CHECK: }
-
-end
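The wsloop-variable test covered mixed integer kinds: every bound and step is widened to the common iteration-variable type before the loop is formed, a kind=16 variable is narrowed to 64 bits (with the warning checked at the top of the file), and each iteration converts the value back to the declared kind. Sketched for the collapse(2) loop, with illustrative value names:

  // bounds/steps of both nested loops widened to the i64 IV type
  %lb0 = fir.convert %c1_i32 : (i32)  -> i64
  %ub0 = fir.convert %i1_ub  : (i8)   -> i64
  %st0 = fir.convert %i2_s   : (i16)  -> i64
  %lb1 = fir.convert %i16_lb : (i128) -> i64   // kind=16 narrowed into 64 bits
  // %ub1 and %st1 are widened the same way
  omp.wsloop {
    omp.loop_nest (%iv0, %iv1) : i64 = (%lb0, %lb1) to (%ub0, %ub1) inclusive step (%st0, %st1) {
      %iv0_i16 = fir.convert %iv0 : (i64) -> i16  // back to the declared kind=2
      fir.store %iv0_i16 to %i2_ref : !fir.ref<i16>
      omp.yield
    }
    omp.terminator
  }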
diff --git a/flang/test/Lower/OpenMP/FIR/wsloop.f90 b/flang/test/Lower/OpenMP/FIR/wsloop.f90
deleted file mode 100644
index c9e428abdb44..000000000000
--- a/flang/test/Lower/OpenMP/FIR/wsloop.f90
+++ /dev/null
@@ -1,78 +0,0 @@
-! This test checks lowering of OpenMP DO Directive (Worksharing).
-
-! RUN: bbc -fopenmp -emit-fir -hlfir=false %s -o - | FileCheck %s
-
-!CHECK-LABEL: func @_QPsimple_loop()
-subroutine simple_loop
- integer :: i
- ! CHECK: omp.parallel
- !$OMP PARALLEL
- ! CHECK: %[[ALLOCA_IV:.*]] = fir.alloca i32 {{{.*}}, pinned}
- ! CHECK: %[[WS_LB:.*]] = arith.constant 1 : i32
- ! CHECK: %[[WS_UB:.*]] = arith.constant 9 : i32
- ! CHECK: %[[WS_STEP:.*]] = arith.constant 1 : i32
- ! CHECK: omp.wsloop {
- ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[WS_LB]]) to (%[[WS_UB]]) inclusive step (%[[WS_STEP]]) {
- !$OMP DO
- do i=1, 9
- ! CHECK: fir.store %[[I]] to %[[ALLOCA_IV:.*]] : !fir.ref<i32>
- ! CHECK: %[[LOAD_IV:.*]] = fir.load %[[ALLOCA_IV]] : !fir.ref<i32>
- ! CHECK: fir.call @_FortranAioOutputInteger32({{.*}}, %[[LOAD_IV]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
- print*, i
- end do
- ! CHECK: omp.yield
- ! CHECK: omp.terminator
- !$OMP END DO
- ! CHECK: omp.terminator
- !$OMP END PARALLEL
-end subroutine
-
-!CHECK-LABEL: func @_QPsimple_loop_with_step()
-subroutine simple_loop_with_step
- integer :: i
- ! CHECK: omp.parallel
- !$OMP PARALLEL
- ! CHECK: %[[ALLOCA_IV:.*]] = fir.alloca i32 {{{.*}}, pinned}
- ! CHECK: %[[WS_LB:.*]] = arith.constant 1 : i32
- ! CHECK: %[[WS_UB:.*]] = arith.constant 9 : i32
- ! CHECK: %[[WS_STEP:.*]] = arith.constant 2 : i32
- ! CHECK: omp.wsloop {
- ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[WS_LB]]) to (%[[WS_UB]]) inclusive step (%[[WS_STEP]]) {
- ! CHECK: fir.store %[[I]] to %[[ALLOCA_IV]] : !fir.ref<i32>
- ! CHECK: %[[LOAD_IV:.*]] = fir.load %[[ALLOCA_IV]] : !fir.ref<i32>
- !$OMP DO
- do i=1, 9, 2
- ! CHECK: fir.call @_FortranAioOutputInteger32({{.*}}, %[[LOAD_IV]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
- print*, i
- end do
- ! CHECK: omp.yield
- ! CHECK: omp.terminator
- !$OMP END DO
- ! CHECK: omp.terminator
- !$OMP END PARALLEL
-end subroutine
-
-!CHECK-LABEL: func @_QPloop_with_schedule_nowait()
-subroutine loop_with_schedule_nowait
- integer :: i
- ! CHECK: omp.parallel
- !$OMP PARALLEL
- ! CHECK: %[[ALLOCA_IV:.*]] = fir.alloca i32 {{{.*}}, pinned}
- ! CHECK: %[[WS_LB:.*]] = arith.constant 1 : i32
- ! CHECK: %[[WS_UB:.*]] = arith.constant 9 : i32
- ! CHECK: %[[WS_STEP:.*]] = arith.constant 1 : i32
- ! CHECK: omp.wsloop schedule(runtime) nowait {
- ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[WS_LB]]) to (%[[WS_UB]]) inclusive step (%[[WS_STEP]]) {
- !$OMP DO SCHEDULE(runtime)
- do i=1, 9
- ! CHECK: fir.store %[[I]] to %[[ALLOCA_IV]] : !fir.ref<i32>
- ! CHECK: %[[LOAD_IV:.*]] = fir.load %[[ALLOCA_IV]] : !fir.ref<i32>
- ! CHECK: fir.call @_FortranAioOutputInteger32({{.*}}, %[[LOAD_IV]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
- print*, i
- end do
- ! CHECK: omp.yield
- ! CHECK: omp.terminator
- !$OMP END DO NOWAIT
- ! CHECK: omp.terminator
- !$OMP END PARALLEL
-end subroutine
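Common to all of the deleted wsloop tests is the two-level structure the checks assert: omp.wsloop carries the worksharing clauses and closes with omp.terminator, while bounds and induction variable live on the nested omp.loop_nest, whose body ends in omp.yield. The skeleton, condensed from the CHECK lines of @_QPsimple_loop:

  omp.parallel {
    %iv_ref = fir.alloca i32 {adapt.valuebyref, pinned}
    omp.wsloop {
      omp.loop_nest (%i) : i32 = (%lb) to (%ub) inclusive step (%step) {
        fir.store %i to %iv_ref : !fir.ref<i32>
        // body reads the induction variable back through %iv_ref
        omp.yield
      }
      omp.terminator
    }
    omp.terminator
  }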
diff --git a/flang/test/Lower/OpenMP/cfg-conversion-omp.private.f90 b/flang/test/Lower/OpenMP/cfg-conversion-omp.private.f90
new file mode 100644
index 000000000000..7f1087a7ebe3
--- /dev/null
+++ b/flang/test/Lower/OpenMP/cfg-conversion-omp.private.f90
@@ -0,0 +1,54 @@
+! Tests that CFG & LLVM conversion is applied to `omp.private` ops.
+
+! RUN: split-file %s %t && cd %t
+
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -mmlir --openmp-enable-delayed-privatization \
+! RUN: -o - test.f90 2>&1 | \
+! RUN: fir-opt --cfg-conversion -o test.cfg-conv.mlir
+! RUN: FileCheck --input-file=test.cfg-conv.mlir %s --check-prefix="CFGConv"
+
+! RUN: fir-opt --convert-hlfir-to-fir --cg-rewrite --fir-to-llvm-ir test.cfg-conv.mlir -o - | \
+! RUN: FileCheck %s --check-prefix="LLVMDialect"
+
+!--- test.f90
+subroutine delayed_privatization_allocatable
+ implicit none
+ integer, allocatable :: var1
+
+!$omp parallel private(var1)
+ var1 = 10
+!$omp end parallel
+end subroutine
+
+! CFGConv-LABEL: omp.private {type = private}
+! CFGConv-SAME: @[[PRIVATIZER_SYM:.*]] : [[TYPE:!fir.ref<!fir.box<!fir.heap<i32>>>]] alloc {
+
+! CFGConv-NEXT: ^bb0(%[[PRIV_ARG:.*]]: [[TYPE]]):
+
+! CFGConv-NEXT: %[[PRIV_ALLOC:.*]] = fir.alloca !fir.box<!fir.heap<i32>> {bindc_name = "var1", pinned, uniq_name = "_QFdelayed_privatization_allocatableEvar1"}
+
+! CFGConv-NEXT: %[[PRIV_ARG_VAL:.*]] = fir.load %[[PRIV_ARG]] : !fir.ref<!fir.box<!fir.heap<i32>>>
+! CFGConv-NEXT: %[[PRIV_ARG_BOX:.*]] = fir.box_addr %[[PRIV_ARG_VAL]] : (!fir.box<!fir.heap<i32>>) -> !fir.heap<i32>
+! CFGConv-NEXT: %[[PRIV_ARG_ADDR:.*]] = fir.convert %[[PRIV_ARG_BOX]] : (!fir.heap<i32>) -> i64
+! CFGConv-NEXT: %[[C0:.*]] = arith.constant 0 : i64
+! CFGConv-NEXT: %[[ALLOC_COND:.*]] = arith.cmpi ne, %[[PRIV_ARG_ADDR]], %[[C0]] : i64
+! CFGConv-NEXT: cf.cond_br %[[ALLOC_COND]], ^[[ALLOC_MEM_BB:.*]], ^[[ZERO_MEM_BB:.*]]
+! CFGConv-NEXT: ^[[ALLOC_MEM_BB]]:
+! CFGConv-NEXT: fir.allocmem
+! CFGConv: cf.br ^[[DECL_BB:.*]]
+! CFGConv: ^[[ZERO_MEM_BB]]:
+! CFGConv-NEXT: fir.zero_bits
+! CFGConv: cf.br ^[[DECL_BB:.*]]
+! CFGConv-NEXT: ^[[DECL_BB]]:
+! CFGConv-NEXT: hlfir.declare
+! CFGConv-NEXT: omp.yield
+
+
+! LLVMDialect-LABEL: omp.private {type = private}
+! LLVMDialect-SAME: @[[PRIVATIZER_SYM:.*]] : [[TYPE:!llvm.ptr]] alloc {
+
+! LLVMDialect-NEXT: ^bb0(%[[PRIV_ARG:.*]]: [[TYPE]]):
+! LLVMDialect: llvm.alloca
+! LLVMDialect: llvm.call @malloc
+
+! LLVMDialect-NOT: hlfir.declare
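After --cfg-conversion the structured control flow inside the privatizer's alloc region becomes explicit blocks: the allocation-status test of the incoming box turns into a conditional branch whose arms either allocate heap memory or produce zeroed bits, rejoining before the declare. Condensed from the CFGConv lines above, with block names matching the check prefixes:

  %c0 = arith.constant 0 : i64
  %alloc_cond = arith.cmpi ne, %box_addr_i64, %c0 : i64
  cf.cond_br %alloc_cond, ^alloc_mem, ^zero_mem
  ^alloc_mem:
    %mem = fir.allocmem i32
    cf.br ^decl
  ^zero_mem:
    %null = fir.zero_bits !fir.heap<i32>
    cf.br ^decl
  ^decl:
    // hlfir.declare of the private box, then omp.yield

In the LLVM-dialect output the same region surfaces as llvm.alloca plus a call to malloc, with no hlfir.declare left behind.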
diff --git a/flang/test/Lower/OpenMP/default-clause-implied-do-fix.f90 b/flang/test/Lower/OpenMP/default-clause-implied-do-fix.f90
new file mode 100644
index 000000000000..25579272a6e0
--- /dev/null
+++ b/flang/test/Lower/OpenMP/default-clause-implied-do-fix.f90
@@ -0,0 +1,11 @@
+!RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - | FileCheck %s
+
+!CHECK: @_QPsb
+subroutine sb(a)
+ integer :: a(:)
+!CHECK: omp.parallel
+ !$omp parallel default(private)
+!CHECK: hlfir.elemental
+ if (any(a/=(/(100,i=1,5)/))) print *, "OK"
+ !$omp end parallel
+end subroutine
diff --git a/flang/test/Lower/OpenMP/threadprivate-real-logical-complex-derivedtype.f90 b/flang/test/Lower/OpenMP/threadprivate-real-logical-complex-derivedtype.f90
index 55f806962a60..0a249ff101a0 100644
--- a/flang/test/Lower/OpenMP/threadprivate-real-logical-complex-derivedtype.f90
+++ b/flang/test/Lower/OpenMP/threadprivate-real-logical-complex-derivedtype.f90
@@ -21,6 +21,7 @@ module test
!CHECK-DAG: fir.global @_QMtestEz : !fir.logical<4> {
contains
+!CHECK-LABEL: func.func @_QMtestPsub
subroutine sub()
!CHECK-DAG: %[[T:.*]] = fir.address_of(@_QMtestEt) : !fir.ref<!fir.type<_QMtestTmy_type{t_i:i32,t_arr:!fir.array<5xf32>}>>
!CHECK-DAG: %[[T_DECL:.*]]:2 = hlfir.declare %[[T]] {uniq_name = "_QMtestEt"} : (!fir.ref<!fir.type<_QMtestTmy_type{t_i:i32,t_arr:!fir.array<5xf32>}>>) -> (!fir.ref<!fir.type<_QMtestTmy_type{t_i:i32,t_arr:!fir.array<5xf32>}>>, !fir.ref<!fir.type<_QMtestTmy_type{t_i:i32,t_arr:!fir.array<5xf32>}>>)
diff --git a/flang/tools/flang-driver/CMakeLists.txt b/flang/tools/flang-driver/CMakeLists.txt
index 3ce8b407450d..ce30ecff028d 100644
--- a/flang/tools/flang-driver/CMakeLists.txt
+++ b/flang/tools/flang-driver/CMakeLists.txt
@@ -21,7 +21,6 @@ add_flang_tool(flang-new
# unable to generate executables.
FortranRuntime
FortranDecimal
- Fortran_main
)
target_link_libraries(flang-new
diff --git a/flang/unittests/Optimizer/Builder/ComplexTest.cpp b/flang/unittests/Optimizer/Builder/ComplexTest.cpp
index 5364eec904ff..17171512470a 100644
--- a/flang/unittests/Optimizer/Builder/ComplexTest.cpp
+++ b/flang/unittests/Optimizer/Builder/ComplexTest.cpp
@@ -96,6 +96,6 @@ TEST_F(ComplexTest, verifyConvertWithSemantics) {
// Convert complex to integer
mlir::Value v2 = firBuilder->convertWithSemantics(loc, integerTy1, v1);
- EXPECT_TRUE(v2.getType().isa<mlir::IntegerType>());
+ EXPECT_TRUE(mlir::isa<mlir::IntegerType>(v2.getType()));
EXPECT_TRUE(mlir::dyn_cast<fir::ConvertOp>(v2.getDefiningOp()));
}
diff --git a/flang/unittests/Optimizer/Builder/DoLoopHelperTest.cpp b/flang/unittests/Optimizer/Builder/DoLoopHelperTest.cpp
index 7e7206dbf934..d0a9342914a3 100644
--- a/flang/unittests/Optimizer/Builder/DoLoopHelperTest.cpp
+++ b/flang/unittests/Optimizer/Builder/DoLoopHelperTest.cpp
@@ -34,7 +34,7 @@ public:
void checkConstantValue(const mlir::Value &value, int64_t v) {
EXPECT_TRUE(mlir::isa<mlir::arith::ConstantOp>(value.getDefiningOp()));
auto cstOp = dyn_cast<mlir::arith::ConstantOp>(value.getDefiningOp());
- auto valueAttr = cstOp.getValue().dyn_cast_or_null<IntegerAttr>();
+ auto valueAttr = dyn_cast_or_null<IntegerAttr>(cstOp.getValue());
EXPECT_EQ(v, valueAttr.getInt());
}
diff --git a/flang/unittests/Optimizer/Builder/FIRBuilderTest.cpp b/flang/unittests/Optimizer/Builder/FIRBuilderTest.cpp
index b6a1f9c9db8f..e5e5454ee88a 100644
--- a/flang/unittests/Optimizer/Builder/FIRBuilderTest.cpp
+++ b/flang/unittests/Optimizer/Builder/FIRBuilderTest.cpp
@@ -54,7 +54,7 @@ static void checkIntegerConstant(mlir::Value value, mlir::Type ty, int64_t v) {
EXPECT_TRUE(mlir::isa<mlir::arith::ConstantOp>(value.getDefiningOp()));
auto cstOp = dyn_cast<mlir::arith::ConstantOp>(value.getDefiningOp());
EXPECT_EQ(ty, cstOp.getType());
- auto valueAttr = cstOp.getValue().dyn_cast_or_null<IntegerAttr>();
+ auto valueAttr = mlir::dyn_cast_or_null<IntegerAttr>(cstOp.getValue());
EXPECT_EQ(v, valueAttr.getInt());
}
@@ -151,7 +151,7 @@ TEST_F(FIRBuilderTest, createRealZeroConstant) {
auto cstOp = dyn_cast<arith::ConstantOp>(cst.getDefiningOp());
EXPECT_EQ(realTy, cstOp.getType());
EXPECT_EQ(
- 0u, cstOp.getValue().cast<FloatAttr>().getValue().convertToDouble());
+ 0u, mlir::cast<FloatAttr>(cstOp.getValue()).getValue().convertToDouble());
}
TEST_F(FIRBuilderTest, createBool) {
@@ -164,8 +164,8 @@ TEST_F(FIRBuilderTest, createBool) {
TEST_F(FIRBuilderTest, getVarLenSeqTy) {
auto builder = getBuilder();
auto ty = builder.getVarLenSeqTy(builder.getI64Type());
- EXPECT_TRUE(ty.isa<fir::SequenceType>());
- fir::SequenceType seqTy = ty.dyn_cast<fir::SequenceType>();
+ EXPECT_TRUE(mlir::isa<fir::SequenceType>(ty));
+ fir::SequenceType seqTy = mlir::dyn_cast<fir::SequenceType>(ty);
EXPECT_EQ(1u, seqTy.getDimension());
EXPECT_TRUE(fir::unwrapSequenceType(ty).isInteger(64));
}
@@ -216,9 +216,9 @@ TEST_F(FIRBuilderTest, createGlobal2) {
EXPECT_FALSE(global.getConstant().has_value());
EXPECT_EQ(i32Type, global.getType());
EXPECT_TRUE(global.getInitVal().has_value());
- EXPECT_TRUE(global.getInitVal().value().isa<mlir::IntegerAttr>());
- EXPECT_EQ(
- 16, global.getInitVal().value().cast<mlir::IntegerAttr>().getValue());
+ EXPECT_TRUE(mlir::isa<mlir::IntegerAttr>(global.getInitVal().value()));
+ EXPECT_EQ(16,
+ mlir::cast<mlir::IntegerAttr>(global.getInitVal().value()).getValue());
EXPECT_TRUE(global.getLinkName().has_value());
EXPECT_EQ(
builder.createLinkOnceLinkage().getValue(), global.getLinkName().value());
@@ -271,12 +271,12 @@ TEST_F(FIRBuilderTest, locationToFilename) {
auto stringLitOps = global.getRegion().front().getOps<fir::StringLitOp>();
EXPECT_TRUE(llvm::hasSingleElement(stringLitOps));
for (auto stringLit : stringLitOps) {
- EXPECT_EQ(10, stringLit.getSize().cast<mlir::IntegerAttr>().getValue());
- EXPECT_TRUE(stringLit.getValue().isa<StringAttr>());
+ EXPECT_EQ(
+ 10, mlir::cast<mlir::IntegerAttr>(stringLit.getSize()).getValue());
+ EXPECT_TRUE(mlir::isa<StringAttr>(stringLit.getValue()));
EXPECT_EQ(0,
strcmp("file1.f90\0",
- stringLit.getValue()
- .dyn_cast<StringAttr>()
+ mlir::dyn_cast<StringAttr>(stringLit.getValue())
.getValue()
.str()
.c_str()));
@@ -288,9 +288,9 @@ TEST_F(FIRBuilderTest, createStringLitOp) {
llvm::StringRef data("mystringlitdata");
auto loc = builder.getUnknownLoc();
auto op = builder.createStringLitOp(loc, data);
- EXPECT_EQ(15, op.getSize().cast<mlir::IntegerAttr>().getValue());
- EXPECT_TRUE(op.getValue().isa<StringAttr>());
- EXPECT_EQ(data, op.getValue().dyn_cast<StringAttr>().getValue());
+ EXPECT_EQ(15, mlir::cast<mlir::IntegerAttr>(op.getSize()).getValue());
+ EXPECT_TRUE(mlir::isa<StringAttr>(op.getValue()));
+ EXPECT_EQ(data, mlir::dyn_cast<StringAttr>(op.getValue()).getValue());
}
TEST_F(FIRBuilderTest, createStringLiteral) {
@@ -318,9 +318,11 @@ TEST_F(FIRBuilderTest, createStringLiteral) {
auto stringLitOps = global.getRegion().front().getOps<fir::StringLitOp>();
EXPECT_TRUE(llvm::hasSingleElement(stringLitOps));
for (auto stringLit : stringLitOps) {
- EXPECT_EQ(16, stringLit.getSize().cast<mlir::IntegerAttr>().getValue());
- EXPECT_TRUE(stringLit.getValue().isa<StringAttr>());
- EXPECT_EQ(strValue, stringLit.getValue().dyn_cast<StringAttr>().getValue());
+ EXPECT_EQ(
+ 16, mlir::cast<mlir::IntegerAttr>(stringLit.getSize()).getValue());
+ EXPECT_TRUE(mlir::isa<StringAttr>(stringLit.getValue()));
+ EXPECT_EQ(
+ strValue, mlir::dyn_cast<StringAttr>(stringLit.getValue()).getValue());
}
}
@@ -344,7 +346,7 @@ TEST_F(FIRBuilderTest, allocateLocal) {
static void checkShapeOp(mlir::Value shape, mlir::Value c10, mlir::Value c100) {
EXPECT_TRUE(mlir::isa<fir::ShapeOp>(shape.getDefiningOp()));
fir::ShapeOp op = dyn_cast<fir::ShapeOp>(shape.getDefiningOp());
- auto shapeTy = op.getType().dyn_cast<fir::ShapeType>();
+ auto shapeTy = mlir::dyn_cast<fir::ShapeType>(op.getType());
EXPECT_EQ(2u, shapeTy.getRank());
EXPECT_EQ(2u, op.getExtents().size());
EXPECT_EQ(c10, op.getExtents()[0]);
@@ -372,7 +374,7 @@ TEST_F(FIRBuilderTest, genShapeWithExtentsAndShapeShift) {
auto shape = builder.genShape(loc, shifts, extents);
EXPECT_TRUE(mlir::isa<fir::ShapeShiftOp>(shape.getDefiningOp()));
fir::ShapeShiftOp op = dyn_cast<fir::ShapeShiftOp>(shape.getDefiningOp());
- auto shapeTy = op.getType().dyn_cast<fir::ShapeShiftType>();
+ auto shapeTy = mlir::dyn_cast<fir::ShapeShiftType>(op.getType());
EXPECT_EQ(2u, shapeTy.getRank());
EXPECT_EQ(2u, op.getExtents().size());
EXPECT_EQ(2u, op.getOrigins().size());
@@ -428,7 +430,7 @@ TEST_F(FIRBuilderTest, createZeroValue) {
auto cst =
mlir::dyn_cast_or_null<mlir::arith::ConstantOp>(zeroInt.getDefiningOp());
EXPECT_TRUE(cst);
- auto intAttr = cst.getValue().dyn_cast<mlir::IntegerAttr>();
+ auto intAttr = mlir::dyn_cast<mlir::IntegerAttr>(cst.getValue());
EXPECT_TRUE(intAttr && intAttr.getInt() == 0);
mlir::Type f32Ty = mlir::FloatType::getF32(builder.getContext());
@@ -437,7 +439,7 @@ TEST_F(FIRBuilderTest, createZeroValue) {
auto cst2 = mlir::dyn_cast_or_null<mlir::arith::ConstantOp>(
zeroFloat.getDefiningOp());
EXPECT_TRUE(cst2);
- auto floatAttr = cst2.getValue().dyn_cast<mlir::FloatAttr>();
+ auto floatAttr = mlir::dyn_cast<mlir::FloatAttr>(cst2.getValue());
EXPECT_TRUE(floatAttr && floatAttr.getValueAsDouble() == 0.);
mlir::Type boolTy = mlir::IntegerType::get(builder.getContext(), 1);
@@ -446,7 +448,7 @@ TEST_F(FIRBuilderTest, createZeroValue) {
auto cst3 = mlir::dyn_cast_or_null<mlir::arith::ConstantOp>(
flaseBool.getDefiningOp());
EXPECT_TRUE(cst3);
- auto intAttr2 = cst.getValue().dyn_cast<mlir::IntegerAttr>();
+ auto intAttr2 = mlir::dyn_cast<mlir::IntegerAttr>(cst3.getValue());
EXPECT_TRUE(intAttr2 && intAttr2.getInt() == 0);
}
@@ -482,7 +484,7 @@ TEST_F(FIRBuilderTest, getBaseTypeOf) {
llvm::SmallVector<fir::ExtendedValue, 4> arrays;
auto extent = builder.create<fir::UndefOp>(loc, builder.getIndexType());
llvm::SmallVector<mlir::Value> extents(
- arrayType.dyn_cast<fir::SequenceType>().getDimension(),
+ mlir::dyn_cast<fir::SequenceType>(arrayType).getDimension(),
extent.getResult());
arrays.emplace_back(fir::ArrayBoxValue(ptrValArray, extents));
arrays.emplace_back(fir::BoxValue(boxValArray));
diff --git a/flang/unittests/Optimizer/RTBuilder.cpp b/flang/unittests/Optimizer/RTBuilder.cpp
index 7fff7f71fc3b..d6cf96c4351c 100644
--- a/flang/unittests/Optimizer/RTBuilder.cpp
+++ b/flang/unittests/Optimizer/RTBuilder.cpp
@@ -27,7 +27,7 @@ TEST(RTBuilderTest, ComplexRuntimeInterface) {
mlir::Type c99_cacosf_signature{
fir::runtime::RuntimeTableKey<decltype(c99_cacosf)>::getTypeModel()(
&ctx)};
- auto c99_cacosf_funcTy = c99_cacosf_signature.cast<mlir::FunctionType>();
+ auto c99_cacosf_funcTy = mlir::cast<mlir::FunctionType>(c99_cacosf_signature);
EXPECT_EQ(c99_cacosf_funcTy.getNumInputs(), 1u);
EXPECT_EQ(c99_cacosf_funcTy.getNumResults(), 1u);
auto cplx_ty = fir::ComplexType::get(&ctx, 4);
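
The flang unittest hunks above all apply the same mechanical migration: MLIR's deprecated member-style casts (v.cast<T>(), v.isa<T>(), v.dyn_cast<T>()) become the free functions mlir::cast<T>(v), mlir::isa<T>(v), mlir::dyn_cast<T>(v). A minimal sketch of the pattern, assuming an mlir::Attribute value in scope (the function name and control flow here are illustrative only, not from this patch):

    #include "mlir/IR/BuiltinAttributes.h"

    // Illustrative only: the three free-function spellings side by side.
    void castMigrationExample(mlir::Attribute attr) {
      // before: attr.isa<mlir::IntegerAttr>() -- boolean type check
      if (mlir::isa<mlir::IntegerAttr>(attr)) {
        // before: attr.cast<mlir::IntegerAttr>() -- asserts on mismatch
        auto intAttr = mlir::cast<mlir::IntegerAttr>(attr);
        (void)intAttr.getInt();
      }
      // before: attr.dyn_cast<mlir::StringAttr>() -- returns null on mismatch
      if (auto strAttr = mlir::dyn_cast<mlir::StringAttr>(attr))
        (void)strAttr.getValue();
    }
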
diff --git a/libcxx/CMakeLists.txt b/libcxx/CMakeLists.txt
index 2977c26646cb..f34cb178e076 100644
--- a/libcxx/CMakeLists.txt
+++ b/libcxx/CMakeLists.txt
@@ -178,7 +178,7 @@ set(LIBCXX_LIBDIR_SUFFIX "${LLVM_LIBDIR_SUFFIX}" CACHE STRING
option(LIBCXX_INSTALL_HEADERS "Install the libc++ headers." ON)
option(LIBCXX_INSTALL_LIBRARY "Install the libc++ library." ON)
option(LIBCXX_INSTALL_MODULES
- "Install the libc++ C++20 module source files (experimental)." OFF
+ "Install the libc++ C++20 module source files (experimental)." ON
)
cmake_dependent_option(LIBCXX_INSTALL_STATIC_LIBRARY
"Install the static libc++ library." ON
diff --git a/libcxx/cmake/caches/Generic-cxx20.cmake b/libcxx/cmake/caches/Generic-cxx20.cmake
index 641c131a737b..3c44fdaf0e42 100644
--- a/libcxx/cmake/caches/Generic-cxx20.cmake
+++ b/libcxx/cmake/caches/Generic-cxx20.cmake
@@ -1,3 +1,2 @@
-set(LIBCXX_INSTALL_MODULES ON CACHE BOOL "") # TODO MODULES Remove when enabled automatically.
set(LIBCXX_TEST_PARAMS "std=c++20" CACHE STRING "")
set(LIBCXXABI_TEST_PARAMS "${LIBCXX_TEST_PARAMS}" CACHE STRING "")
diff --git a/libcxx/cmake/caches/Generic-cxx23.cmake b/libcxx/cmake/caches/Generic-cxx23.cmake
index f5409e4652e4..bf88abf56ca6 100644
--- a/libcxx/cmake/caches/Generic-cxx23.cmake
+++ b/libcxx/cmake/caches/Generic-cxx23.cmake
@@ -1,3 +1,2 @@
-set(LIBCXX_INSTALL_MODULES ON CACHE BOOL "") # TODO MODULES Remove when enabled automatically.
set(LIBCXX_TEST_PARAMS "std=c++23" CACHE STRING "")
set(LIBCXXABI_TEST_PARAMS "${LIBCXX_TEST_PARAMS}" CACHE STRING "")
diff --git a/libcxx/cmake/caches/Generic-cxx26.cmake b/libcxx/cmake/caches/Generic-cxx26.cmake
index 2d9c018a4ff5..6ba9482af578 100644
--- a/libcxx/cmake/caches/Generic-cxx26.cmake
+++ b/libcxx/cmake/caches/Generic-cxx26.cmake
@@ -1,3 +1,2 @@
-set(LIBCXX_INSTALL_MODULES ON CACHE BOOL "") # TODO MODULES Remove when enabled automatically.
set(LIBCXX_TEST_PARAMS "std=c++26" CACHE STRING "")
set(LIBCXXABI_TEST_PARAMS "${LIBCXX_TEST_PARAMS}" CACHE STRING "")
diff --git a/libcxx/cmake/caches/Generic-hardening-mode-extensive.cmake b/libcxx/cmake/caches/Generic-hardening-mode-extensive.cmake
index 9542dcdbf778..72263dfd8463 100644
--- a/libcxx/cmake/caches/Generic-hardening-mode-extensive.cmake
+++ b/libcxx/cmake/caches/Generic-hardening-mode-extensive.cmake
@@ -1,2 +1 @@
-set(LIBCXX_INSTALL_MODULES ON CACHE BOOL "") # TODO MODULES Remove when enabled automatically.
set(LIBCXX_HARDENING_MODE "extensive" CACHE STRING "")
diff --git a/libcxx/cmake/caches/Generic-no-exceptions.cmake b/libcxx/cmake/caches/Generic-no-exceptions.cmake
index c68adfc1276b..f0dffef60dba 100644
--- a/libcxx/cmake/caches/Generic-no-exceptions.cmake
+++ b/libcxx/cmake/caches/Generic-no-exceptions.cmake
@@ -1,3 +1,2 @@
-set(LIBCXX_INSTALL_MODULES ON CACHE BOOL "") # TODO MODULES Remove when enabled automatically.
set(LIBCXX_ENABLE_EXCEPTIONS OFF CACHE BOOL "")
set(LIBCXXABI_ENABLE_EXCEPTIONS OFF CACHE BOOL "")
diff --git a/libcxx/cmake/caches/Generic-no-experimental.cmake b/libcxx/cmake/caches/Generic-no-experimental.cmake
index 62b7d7373d44..f33ed0141899 100644
--- a/libcxx/cmake/caches/Generic-no-experimental.cmake
+++ b/libcxx/cmake/caches/Generic-no-experimental.cmake
@@ -1,3 +1,2 @@
-set(LIBCXX_INSTALL_MODULES ON CACHE BOOL "") # TODO MODULES Remove when enabled automatically.
set(LIBCXX_TEST_PARAMS "enable_experimental=False" CACHE STRING "")
set(LIBCXXABI_TEST_PARAMS "${LIBCXX_TEST_PARAMS}" CACHE STRING "")
diff --git a/libcxx/cmake/caches/Generic-no-filesystem.cmake b/libcxx/cmake/caches/Generic-no-filesystem.cmake
index 01ae7e68f12c..4000f3a3e8ef 100644
--- a/libcxx/cmake/caches/Generic-no-filesystem.cmake
+++ b/libcxx/cmake/caches/Generic-no-filesystem.cmake
@@ -1,2 +1 @@
-set(LIBCXX_INSTALL_MODULES ON CACHE BOOL "") # TODO MODULES Remove when enabled automatically.
set(LIBCXX_ENABLE_FILESYSTEM OFF CACHE BOOL "")
diff --git a/libcxx/cmake/caches/Generic-no-localization.cmake b/libcxx/cmake/caches/Generic-no-localization.cmake
index fc4957b2d53a..79d6b44c7139 100644
--- a/libcxx/cmake/caches/Generic-no-localization.cmake
+++ b/libcxx/cmake/caches/Generic-no-localization.cmake
@@ -1,2 +1 @@
-set(LIBCXX_INSTALL_MODULES ON CACHE BOOL "") # TODO MODULES Remove when enabled automatically.
set(LIBCXX_ENABLE_LOCALIZATION OFF CACHE BOOL "")
diff --git a/libcxx/cmake/caches/Generic-no-random_device.cmake b/libcxx/cmake/caches/Generic-no-random_device.cmake
index ddf479add626..e9b4cc60cc80 100644
--- a/libcxx/cmake/caches/Generic-no-random_device.cmake
+++ b/libcxx/cmake/caches/Generic-no-random_device.cmake
@@ -1,2 +1 @@
-set(LIBCXX_INSTALL_MODULES ON CACHE BOOL "") # TODO MODULES Remove when enabled automatically.
set(LIBCXX_ENABLE_RANDOM_DEVICE OFF CACHE BOOL "")
diff --git a/libcxx/cmake/caches/Generic-no-threads.cmake b/libcxx/cmake/caches/Generic-no-threads.cmake
index 724fbc466b58..616baef1be7b 100644
--- a/libcxx/cmake/caches/Generic-no-threads.cmake
+++ b/libcxx/cmake/caches/Generic-no-threads.cmake
@@ -1,4 +1,3 @@
-set(LIBCXX_INSTALL_MODULES ON CACHE BOOL "") # TODO MODULES Remove when enabled automatically.
set(LIBCXX_ENABLE_THREADS OFF CACHE BOOL "")
set(LIBCXXABI_ENABLE_THREADS OFF CACHE BOOL "")
set(LIBCXX_ENABLE_MONOTONIC_CLOCK OFF CACHE BOOL "")
diff --git a/libcxx/cmake/caches/Generic-no-unicode.cmake b/libcxx/cmake/caches/Generic-no-unicode.cmake
index a4cf7dd73772..01160bf21898 100644
--- a/libcxx/cmake/caches/Generic-no-unicode.cmake
+++ b/libcxx/cmake/caches/Generic-no-unicode.cmake
@@ -1,2 +1 @@
-set(LIBCXX_INSTALL_MODULES ON CACHE BOOL "") # TODO MODULES Remove when enabled automatically.
set(LIBCXX_ENABLE_UNICODE OFF CACHE BOOL "")
diff --git a/libcxx/cmake/caches/Generic-no-wide-characters.cmake b/libcxx/cmake/caches/Generic-no-wide-characters.cmake
index dc19389bb5ae..728d41086a38 100644
--- a/libcxx/cmake/caches/Generic-no-wide-characters.cmake
+++ b/libcxx/cmake/caches/Generic-no-wide-characters.cmake
@@ -1,2 +1 @@
-set(LIBCXX_INSTALL_MODULES ON CACHE BOOL "") # TODO MODULES Remove when enabled automatically.
set(LIBCXX_ENABLE_WIDE_CHARACTERS OFF CACHE BOOL "")
diff --git a/libcxx/docs/BuildingLibcxx.rst b/libcxx/docs/BuildingLibcxx.rst
index a0a0cdb43397..e425b9dadfe7 100644
--- a/libcxx/docs/BuildingLibcxx.rst
+++ b/libcxx/docs/BuildingLibcxx.rst
@@ -208,7 +208,7 @@ libc++ specific options
.. option:: LIBCXX_INSTALL_MODULES:BOOL
- **Default**: ``OFF``
+ **Default**: ``ON``
Toggle the installation of the experimental libc++ module sources.
diff --git a/libcxx/docs/ReleaseNotes/19.rst b/libcxx/docs/ReleaseNotes/19.rst
index 938ab76c6ecb..ac4fd0ecc122 100644
--- a/libcxx/docs/ReleaseNotes/19.rst
+++ b/libcxx/docs/ReleaseNotes/19.rst
@@ -149,3 +149,5 @@ Build System Changes
- The CMake variable ``LIBCXX_ENABLE_CLANG_TIDY`` has been removed. The build system has been changed
to automatically detect the presence of ``clang-tidy`` and the required ``Clang`` libraries.
+
+- The CMake option ``LIBCXX_INSTALL_MODULES`` now defaults to ``ON``.
diff --git a/libcxx/docs/Status/Cxx2cIssues.csv b/libcxx/docs/Status/Cxx2cIssues.csv
index 666be319757c..eb99414c48be 100644
--- a/libcxx/docs/Status/Cxx2cIssues.csv
+++ b/libcxx/docs/Status/Cxx2cIssues.csv
@@ -44,7 +44,7 @@
"`3919 <https://wg21.link/LWG3919>`__","``enumerate_view`` may invoke UB for sized common non-forward underlying ranges","Tokyo March 2024","","","|ranges|"
"`3950 <https://wg21.link/LWG3950>`__","``std::basic_string_view`` comparison operators are overspecified","Tokyo March 2024","|Complete|","18.0",""
"`3975 <https://wg21.link/LWG3975>`__","Specializations of ``basic_format_context`` should not be permitted","Tokyo March 2024","|Nothing To Do|","","|format|"
-"`3984 <https://wg21.link/LWG3984>`__","``ranges::to``'s recursion branch may be ill-formed","Tokyo March 2024","","","|ranges|"
+"`3984 <https://wg21.link/LWG3984>`__","``ranges::to``'s recursion branch may be ill-formed","Tokyo March 2024","|Complete|","19.0","|ranges|"
"`4011 <https://wg21.link/LWG4011>`__","``""Effects: Equivalent to return""`` in ``[span.elem]``","Tokyo March 2024","|Nothing To Do|","",""
"`4012 <https://wg21.link/LWG4012>`__","``common_view::begin/end`` are missing the ``simple-view`` check","Tokyo March 2024","","","|ranges|"
"`4013 <https://wg21.link/LWG4013>`__","``lazy_split_view::outer-iterator::value_type`` should not provide default constructor","Tokyo March 2024","","","|ranges|"
diff --git a/libcxx/include/__algorithm/find.h b/libcxx/include/__algorithm/find.h
index d60356873132..7f58dbb13a57 100644
--- a/libcxx/include/__algorithm/find.h
+++ b/libcxx/include/__algorithm/find.h
@@ -43,7 +43,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD
// generic implementation
template <class _Iter, class _Sent, class _Tp, class _Proj>
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Iter
-__find_impl(_Iter __first, _Sent __last, const _Tp& __value, _Proj& __proj) {
+__find(_Iter __first, _Sent __last, const _Tp& __value, _Proj& __proj) {
for (; __first != __last; ++__first)
if (std::__invoke(__proj, *__first) == __value)
break;
@@ -57,8 +57,7 @@ template <class _Tp,
__enable_if_t<__is_identity<_Proj>::value && __libcpp_is_trivially_equality_comparable<_Tp, _Up>::value &&
sizeof(_Tp) == 1,
int> = 0>
-_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp*
-__find_impl(_Tp* __first, _Tp* __last, const _Up& __value, _Proj&) {
+_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp* __find(_Tp* __first, _Tp* __last, const _Up& __value, _Proj&) {
if (auto __ret = std::__constexpr_memchr(__first, __value, __last - __first))
return __ret;
return __last;
@@ -71,8 +70,7 @@ template <class _Tp,
__enable_if_t<__is_identity<_Proj>::value && __libcpp_is_trivially_equality_comparable<_Tp, _Up>::value &&
sizeof(_Tp) == sizeof(wchar_t) && _LIBCPP_ALIGNOF(_Tp) >= _LIBCPP_ALIGNOF(wchar_t),
int> = 0>
-_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp*
-__find_impl(_Tp* __first, _Tp* __last, const _Up& __value, _Proj&) {
+_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp* __find(_Tp* __first, _Tp* __last, const _Up& __value, _Proj&) {
if (auto __ret = std::__constexpr_wmemchr(__first, __value, __last - __first))
return __ret;
return __last;
@@ -89,10 +87,10 @@ template <class _Tp,
is_signed<_Tp>::value == is_signed<_Up>::value,
int> = 0>
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp*
-__find_impl(_Tp* __first, _Tp* __last, const _Up& __value, _Proj& __proj) {
+__find(_Tp* __first, _Tp* __last, const _Up& __value, _Proj& __proj) {
if (__value < numeric_limits<_Tp>::min() || __value > numeric_limits<_Tp>::max())
return __last;
- return std::__find_impl(__first, __last, _Tp(__value), __proj);
+ return std::__find(__first, __last, _Tp(__value), __proj);
}
// __bit_iterator implementation
@@ -134,7 +132,7 @@ __find_bool(__bit_iterator<_Cp, _IsConst> __first, typename _Cp::size_type __n)
template <class _Cp, bool _IsConst, class _Tp, class _Proj, __enable_if_t<__is_identity<_Proj>::value, int> = 0>
inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __bit_iterator<_Cp, _IsConst>
-__find_impl(__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, const _Tp& __value, _Proj&) {
+__find(__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, const _Tp& __value, _Proj&) {
if (static_cast<bool>(__value))
return std::__find_bool<true>(__first, static_cast<typename _Cp::size_type>(__last - __first));
return std::__find_bool<false>(__first, static_cast<typename _Cp::size_type>(__last - __first));
@@ -150,7 +148,7 @@ template <class _SegmentedIterator,
class _Proj,
__enable_if_t<__is_segmented_iterator<_SegmentedIterator>::value, int> = 0>
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _SegmentedIterator
-__find_impl(_SegmentedIterator __first, _SegmentedIterator __last, const _Tp& __value, _Proj& __proj) {
+__find(_SegmentedIterator __first, _SegmentedIterator __last, const _Tp& __value, _Proj& __proj) {
return std::__find_segment_if(std::move(__first), std::move(__last), __find_segment<_Tp>(__value), __proj);
}
@@ -163,7 +161,7 @@ struct __find_segment {
template <class _InputIterator, class _Proj>
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR _InputIterator
operator()(_InputIterator __first, _InputIterator __last, _Proj& __proj) const {
- return std::__find_impl(__first, __last, __value_, __proj);
+ return std::__find(__first, __last, __value_, __proj);
}
};
@@ -173,7 +171,7 @@ _LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _In
find(_InputIterator __first, _InputIterator __last, const _Tp& __value) {
__identity __proj;
return std::__rewrap_iter(
- __first, std::__find_impl(std::__unwrap_iter(__first), std::__unwrap_iter(__last), __value, __proj));
+ __first, std::__find(std::__unwrap_iter(__first), std::__unwrap_iter(__last), __value, __proj));
}
_LIBCPP_END_NAMESPACE_STD
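
The find.h hunks above only rename the internal entry point from __find_impl to __find; the overload set's dispatch (memchr for byte-sized trivially equality comparable types, wmemchr for wchar_t-sized types, an early "not found" return for out-of-range integral values) is unchanged. A caller-side sketch using only standard std::find, nothing libc++-internal -- which overload fires is an implementation detail:

    #include <algorithm>

    int main() {
      char buf[] = {'a', 'b', 'c', 'd'};
      // byte-sized, trivially equality comparable: may lower to memchr
      char* p = std::find(buf, buf + 4, 'c');

      int ints[] = {1, 2, 3};
      long big = 1L << 40; // cannot be represented as int
      // the integral overload returns "not found" without scanning
      int* q = std::find(ints, ints + 3, big);

      return (p == buf + 2 && q == ints + 3) ? 0 : 1;
    }
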
diff --git a/libcxx/include/__algorithm/ranges_find.h b/libcxx/include/__algorithm/ranges_find.h
index e1383eb4b071..6b0d5efe37ab 100644
--- a/libcxx/include/__algorithm/ranges_find.h
+++ b/libcxx/include/__algorithm/ranges_find.h
@@ -44,9 +44,9 @@ struct __fn {
if constexpr (forward_iterator<_Iter>) {
auto [__first_un, __last_un] = std::__unwrap_range(__first, std::move(__last));
return std::__rewrap_range<_Sent>(
- std::move(__first), std::__find_impl(std::move(__first_un), std::move(__last_un), __value, __proj));
+ std::move(__first), std::__find(std::move(__first_un), std::move(__last_un), __value, __proj));
} else {
- return std::__find_impl(std::move(__first), std::move(__last), __value, __proj);
+ return std::__find(std::move(__first), std::move(__last), __value, __proj);
}
}
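
The escaped_output_table diff that follows widens the packed size field: each uint32_t entry now stores the range size in bits [0, 13] and the lower-bound code point in bits [14, 31] (previously [0, 10] and [11, 31]), with the upper bound being lower bound + size. A minimal decode sketch for the new layout (the helper names are hypothetical, not part of libc++):

    #include <cstdint>

    struct Range { std::uint32_t lower, upper; };

    constexpr Range decodeEntry(std::uint32_t entry) {
      std::uint32_t lower = entry >> 14;     // 18-bit lower bound
      std::uint32_t size  = entry & 0x3FFF;  // range covers size + 1 code points
      return {lower, lower + size};          // upper bound = lower + size
    }

    // First entry below: 0x00000020 decodes to [0x0000, 0x0020], 33 code points.
    static_assert(decodeEntry(0x00000020).lower == 0x0000, "");
    static_assert(decodeEntry(0x00000020).upper == 0x0020, "");
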
diff --git a/libcxx/include/__format/escaped_output_table.h b/libcxx/include/__format/escaped_output_table.h
index a4c4c366cf24..6aa91c89defa 100644
--- a/libcxx/include/__format/escaped_output_table.h
+++ b/libcxx/include/__format/escaped_output_table.h
@@ -105,1110 +105,751 @@ namespace __escaped_output_table {
/// table lacks a property, thus having more bits available for the size.
///
/// The data has 2 values:
-/// - bits [0, 10] The size of the range, allowing 2048 elements.
-/// - bits [11, 31] The lower bound code point of the range. The upper bound of
-/// the range is lower bound + size.
-_LIBCPP_HIDE_FROM_ABI inline constexpr uint32_t __entries[1077] = {
+/// - bits [0, 13] The size of the range, allowing 16384 elements.
+/// - bits [14, 31] The lower bound code point of the range. The upper bound of
+/// the range is lower bound + size. Note the code expects code points that fit
+/// into 18 bits, instead of the 21 bits needed for the full Unicode range.
+_LIBCPP_HIDE_FROM_ABI inline constexpr uint32_t __entries[711] = {
0x00000020 /* 00000000 - 00000020 [ 33] */,
- 0x0003f821 /* 0000007f - 000000a0 [ 34] */,
- 0x00056800 /* 000000ad - 000000ad [ 1] */,
- 0x001bc001 /* 00000378 - 00000379 [ 2] */,
- 0x001c0003 /* 00000380 - 00000383 [ 4] */,
- 0x001c5800 /* 0000038b - 0000038b [ 1] */,
- 0x001c6800 /* 0000038d - 0000038d [ 1] */,
- 0x001d1000 /* 000003a2 - 000003a2 [ 1] */,
- 0x00298000 /* 00000530 - 00000530 [ 1] */,
- 0x002ab801 /* 00000557 - 00000558 [ 2] */,
- 0x002c5801 /* 0000058b - 0000058c [ 2] */,
- 0x002c8000 /* 00000590 - 00000590 [ 1] */,
- 0x002e4007 /* 000005c8 - 000005cf [ 8] */,
- 0x002f5803 /* 000005eb - 000005ee [ 4] */,
- 0x002fa810 /* 000005f5 - 00000605 [ 17] */,
- 0x0030e000 /* 0000061c - 0000061c [ 1] */,
- 0x0036e800 /* 000006dd - 000006dd [ 1] */,
- 0x00387001 /* 0000070e - 0000070f [ 2] */,
- 0x003a5801 /* 0000074b - 0000074c [ 2] */,
- 0x003d900d /* 000007b2 - 000007bf [ 14] */,
- 0x003fd801 /* 000007fb - 000007fc [ 2] */,
- 0x00417001 /* 0000082e - 0000082f [ 2] */,
- 0x0041f800 /* 0000083f - 0000083f [ 1] */,
- 0x0042e001 /* 0000085c - 0000085d [ 2] */,
- 0x0042f800 /* 0000085f - 0000085f [ 1] */,
- 0x00435804 /* 0000086b - 0000086f [ 5] */,
- 0x00447808 /* 0000088f - 00000897 [ 9] */,
- 0x00471000 /* 000008e2 - 000008e2 [ 1] */,
- 0x004c2000 /* 00000984 - 00000984 [ 1] */,
- 0x004c6801 /* 0000098d - 0000098e [ 2] */,
- 0x004c8801 /* 00000991 - 00000992 [ 2] */,
- 0x004d4800 /* 000009a9 - 000009a9 [ 1] */,
- 0x004d8800 /* 000009b1 - 000009b1 [ 1] */,
- 0x004d9802 /* 000009b3 - 000009b5 [ 3] */,
- 0x004dd001 /* 000009ba - 000009bb [ 2] */,
- 0x004e2801 /* 000009c5 - 000009c6 [ 2] */,
- 0x004e4801 /* 000009c9 - 000009ca [ 2] */,
- 0x004e7807 /* 000009cf - 000009d6 [ 8] */,
- 0x004ec003 /* 000009d8 - 000009db [ 4] */,
- 0x004ef000 /* 000009de - 000009de [ 1] */,
- 0x004f2001 /* 000009e4 - 000009e5 [ 2] */,
- 0x004ff801 /* 000009ff - 00000a00 [ 2] */,
- 0x00502000 /* 00000a04 - 00000a04 [ 1] */,
- 0x00505803 /* 00000a0b - 00000a0e [ 4] */,
- 0x00508801 /* 00000a11 - 00000a12 [ 2] */,
- 0x00514800 /* 00000a29 - 00000a29 [ 1] */,
- 0x00518800 /* 00000a31 - 00000a31 [ 1] */,
- 0x0051a000 /* 00000a34 - 00000a34 [ 1] */,
- 0x0051b800 /* 00000a37 - 00000a37 [ 1] */,
- 0x0051d001 /* 00000a3a - 00000a3b [ 2] */,
- 0x0051e800 /* 00000a3d - 00000a3d [ 1] */,
- 0x00521803 /* 00000a43 - 00000a46 [ 4] */,
- 0x00524801 /* 00000a49 - 00000a4a [ 2] */,
- 0x00527002 /* 00000a4e - 00000a50 [ 3] */,
- 0x00529006 /* 00000a52 - 00000a58 [ 7] */,
- 0x0052e800 /* 00000a5d - 00000a5d [ 1] */,
- 0x0052f806 /* 00000a5f - 00000a65 [ 7] */,
- 0x0053b809 /* 00000a77 - 00000a80 [ 10] */,
- 0x00542000 /* 00000a84 - 00000a84 [ 1] */,
- 0x00547000 /* 00000a8e - 00000a8e [ 1] */,
- 0x00549000 /* 00000a92 - 00000a92 [ 1] */,
- 0x00554800 /* 00000aa9 - 00000aa9 [ 1] */,
- 0x00558800 /* 00000ab1 - 00000ab1 [ 1] */,
- 0x0055a000 /* 00000ab4 - 00000ab4 [ 1] */,
- 0x0055d001 /* 00000aba - 00000abb [ 2] */,
- 0x00563000 /* 00000ac6 - 00000ac6 [ 1] */,
- 0x00565000 /* 00000aca - 00000aca [ 1] */,
- 0x00567001 /* 00000ace - 00000acf [ 2] */,
- 0x0056880e /* 00000ad1 - 00000adf [ 15] */,
- 0x00572001 /* 00000ae4 - 00000ae5 [ 2] */,
- 0x00579006 /* 00000af2 - 00000af8 [ 7] */,
- 0x00580000 /* 00000b00 - 00000b00 [ 1] */,
- 0x00582000 /* 00000b04 - 00000b04 [ 1] */,
- 0x00586801 /* 00000b0d - 00000b0e [ 2] */,
- 0x00588801 /* 00000b11 - 00000b12 [ 2] */,
- 0x00594800 /* 00000b29 - 00000b29 [ 1] */,
- 0x00598800 /* 00000b31 - 00000b31 [ 1] */,
- 0x0059a000 /* 00000b34 - 00000b34 [ 1] */,
- 0x0059d001 /* 00000b3a - 00000b3b [ 2] */,
- 0x005a2801 /* 00000b45 - 00000b46 [ 2] */,
- 0x005a4801 /* 00000b49 - 00000b4a [ 2] */,
- 0x005a7006 /* 00000b4e - 00000b54 [ 7] */,
- 0x005ac003 /* 00000b58 - 00000b5b [ 4] */,
- 0x005af000 /* 00000b5e - 00000b5e [ 1] */,
- 0x005b2001 /* 00000b64 - 00000b65 [ 2] */,
- 0x005bc009 /* 00000b78 - 00000b81 [ 10] */,
- 0x005c2000 /* 00000b84 - 00000b84 [ 1] */,
- 0x005c5802 /* 00000b8b - 00000b8d [ 3] */,
- 0x005c8800 /* 00000b91 - 00000b91 [ 1] */,
- 0x005cb002 /* 00000b96 - 00000b98 [ 3] */,
- 0x005cd800 /* 00000b9b - 00000b9b [ 1] */,
- 0x005ce800 /* 00000b9d - 00000b9d [ 1] */,
- 0x005d0002 /* 00000ba0 - 00000ba2 [ 3] */,
- 0x005d2802 /* 00000ba5 - 00000ba7 [ 3] */,
- 0x005d5802 /* 00000bab - 00000bad [ 3] */,
- 0x005dd003 /* 00000bba - 00000bbd [ 4] */,
- 0x005e1802 /* 00000bc3 - 00000bc5 [ 3] */,
- 0x005e4800 /* 00000bc9 - 00000bc9 [ 1] */,
- 0x005e7001 /* 00000bce - 00000bcf [ 2] */,
- 0x005e8805 /* 00000bd1 - 00000bd6 [ 6] */,
- 0x005ec00d /* 00000bd8 - 00000be5 [ 14] */,
- 0x005fd804 /* 00000bfb - 00000bff [ 5] */,
- 0x00606800 /* 00000c0d - 00000c0d [ 1] */,
- 0x00608800 /* 00000c11 - 00000c11 [ 1] */,
- 0x00614800 /* 00000c29 - 00000c29 [ 1] */,
- 0x0061d001 /* 00000c3a - 00000c3b [ 2] */,
- 0x00622800 /* 00000c45 - 00000c45 [ 1] */,
- 0x00624800 /* 00000c49 - 00000c49 [ 1] */,
- 0x00627006 /* 00000c4e - 00000c54 [ 7] */,
- 0x0062b800 /* 00000c57 - 00000c57 [ 1] */,
- 0x0062d801 /* 00000c5b - 00000c5c [ 2] */,
- 0x0062f001 /* 00000c5e - 00000c5f [ 2] */,
- 0x00632001 /* 00000c64 - 00000c65 [ 2] */,
- 0x00638006 /* 00000c70 - 00000c76 [ 7] */,
- 0x00646800 /* 00000c8d - 00000c8d [ 1] */,
- 0x00648800 /* 00000c91 - 00000c91 [ 1] */,
- 0x00654800 /* 00000ca9 - 00000ca9 [ 1] */,
- 0x0065a000 /* 00000cb4 - 00000cb4 [ 1] */,
- 0x0065d001 /* 00000cba - 00000cbb [ 2] */,
- 0x00662800 /* 00000cc5 - 00000cc5 [ 1] */,
- 0x00664800 /* 00000cc9 - 00000cc9 [ 1] */,
- 0x00667006 /* 00000cce - 00000cd4 [ 7] */,
- 0x0066b805 /* 00000cd7 - 00000cdc [ 6] */,
- 0x0066f800 /* 00000cdf - 00000cdf [ 1] */,
- 0x00672001 /* 00000ce4 - 00000ce5 [ 2] */,
- 0x00678000 /* 00000cf0 - 00000cf0 [ 1] */,
- 0x0067a00b /* 00000cf4 - 00000cff [ 12] */,
- 0x00686800 /* 00000d0d - 00000d0d [ 1] */,
- 0x00688800 /* 00000d11 - 00000d11 [ 1] */,
- 0x006a2800 /* 00000d45 - 00000d45 [ 1] */,
- 0x006a4800 /* 00000d49 - 00000d49 [ 1] */,
- 0x006a8003 /* 00000d50 - 00000d53 [ 4] */,
- 0x006b2001 /* 00000d64 - 00000d65 [ 2] */,
- 0x006c0000 /* 00000d80 - 00000d80 [ 1] */,
- 0x006c2000 /* 00000d84 - 00000d84 [ 1] */,
- 0x006cb802 /* 00000d97 - 00000d99 [ 3] */,
- 0x006d9000 /* 00000db2 - 00000db2 [ 1] */,
- 0x006de000 /* 00000dbc - 00000dbc [ 1] */,
- 0x006df001 /* 00000dbe - 00000dbf [ 2] */,
- 0x006e3802 /* 00000dc7 - 00000dc9 [ 3] */,
- 0x006e5803 /* 00000dcb - 00000dce [ 4] */,
- 0x006ea800 /* 00000dd5 - 00000dd5 [ 1] */,
- 0x006eb800 /* 00000dd7 - 00000dd7 [ 1] */,
- 0x006f0005 /* 00000de0 - 00000de5 [ 6] */,
- 0x006f8001 /* 00000df0 - 00000df1 [ 2] */,
- 0x006fa80b /* 00000df5 - 00000e00 [ 12] */,
- 0x0071d803 /* 00000e3b - 00000e3e [ 4] */,
- 0x0072e024 /* 00000e5c - 00000e80 [ 37] */,
- 0x00741800 /* 00000e83 - 00000e83 [ 1] */,
- 0x00742800 /* 00000e85 - 00000e85 [ 1] */,
- 0x00745800 /* 00000e8b - 00000e8b [ 1] */,
- 0x00752000 /* 00000ea4 - 00000ea4 [ 1] */,
- 0x00753000 /* 00000ea6 - 00000ea6 [ 1] */,
- 0x0075f001 /* 00000ebe - 00000ebf [ 2] */,
- 0x00762800 /* 00000ec5 - 00000ec5 [ 1] */,
- 0x00763800 /* 00000ec7 - 00000ec7 [ 1] */,
- 0x00767800 /* 00000ecf - 00000ecf [ 1] */,
- 0x0076d001 /* 00000eda - 00000edb [ 2] */,
- 0x0077001f /* 00000ee0 - 00000eff [ 32] */,
- 0x007a4000 /* 00000f48 - 00000f48 [ 1] */,
- 0x007b6803 /* 00000f6d - 00000f70 [ 4] */,
- 0x007cc000 /* 00000f98 - 00000f98 [ 1] */,
- 0x007de800 /* 00000fbd - 00000fbd [ 1] */,
- 0x007e6800 /* 00000fcd - 00000fcd [ 1] */,
- 0x007ed824 /* 00000fdb - 00000fff [ 37] */,
- 0x00863000 /* 000010c6 - 000010c6 [ 1] */,
- 0x00864004 /* 000010c8 - 000010cc [ 5] */,
- 0x00867001 /* 000010ce - 000010cf [ 2] */,
- 0x00924800 /* 00001249 - 00001249 [ 1] */,
- 0x00927001 /* 0000124e - 0000124f [ 2] */,
- 0x0092b800 /* 00001257 - 00001257 [ 1] */,
- 0x0092c800 /* 00001259 - 00001259 [ 1] */,
- 0x0092f001 /* 0000125e - 0000125f [ 2] */,
- 0x00944800 /* 00001289 - 00001289 [ 1] */,
- 0x00947001 /* 0000128e - 0000128f [ 2] */,
- 0x00958800 /* 000012b1 - 000012b1 [ 1] */,
- 0x0095b001 /* 000012b6 - 000012b7 [ 2] */,
- 0x0095f800 /* 000012bf - 000012bf [ 1] */,
- 0x00960800 /* 000012c1 - 000012c1 [ 1] */,
- 0x00963001 /* 000012c6 - 000012c7 [ 2] */,
- 0x0096b800 /* 000012d7 - 000012d7 [ 1] */,
- 0x00988800 /* 00001311 - 00001311 [ 1] */,
- 0x0098b001 /* 00001316 - 00001317 [ 2] */,
- 0x009ad801 /* 0000135b - 0000135c [ 2] */,
- 0x009be802 /* 0000137d - 0000137f [ 3] */,
- 0x009cd005 /* 0000139a - 0000139f [ 6] */,
- 0x009fb001 /* 000013f6 - 000013f7 [ 2] */,
- 0x009ff001 /* 000013fe - 000013ff [ 2] */,
- 0x00b40000 /* 00001680 - 00001680 [ 1] */,
- 0x00b4e802 /* 0000169d - 0000169f [ 3] */,
- 0x00b7c806 /* 000016f9 - 000016ff [ 7] */,
- 0x00b8b008 /* 00001716 - 0000171e [ 9] */,
- 0x00b9b808 /* 00001737 - 0000173f [ 9] */,
- 0x00baa00b /* 00001754 - 0000175f [ 12] */,
- 0x00bb6800 /* 0000176d - 0000176d [ 1] */,
- 0x00bb8800 /* 00001771 - 00001771 [ 1] */,
- 0x00bba00b /* 00001774 - 0000177f [ 12] */,
- 0x00bef001 /* 000017de - 000017df [ 2] */,
- 0x00bf5005 /* 000017ea - 000017ef [ 6] */,
- 0x00bfd005 /* 000017fa - 000017ff [ 6] */,
- 0x00c07000 /* 0000180e - 0000180e [ 1] */,
- 0x00c0d005 /* 0000181a - 0000181f [ 6] */,
- 0x00c3c806 /* 00001879 - 0000187f [ 7] */,
- 0x00c55804 /* 000018ab - 000018af [ 5] */,
- 0x00c7b009 /* 000018f6 - 000018ff [ 10] */,
- 0x00c8f800 /* 0000191f - 0000191f [ 1] */,
- 0x00c96003 /* 0000192c - 0000192f [ 4] */,
- 0x00c9e003 /* 0000193c - 0000193f [ 4] */,
- 0x00ca0802 /* 00001941 - 00001943 [ 3] */,
- 0x00cb7001 /* 0000196e - 0000196f [ 2] */,
- 0x00cba80a /* 00001975 - 0000197f [ 11] */,
- 0x00cd6003 /* 000019ac - 000019af [ 4] */,
- 0x00ce5005 /* 000019ca - 000019cf [ 6] */,
- 0x00ced802 /* 000019db - 000019dd [ 3] */,
- 0x00d0e001 /* 00001a1c - 00001a1d [ 2] */,
- 0x00d2f800 /* 00001a5f - 00001a5f [ 1] */,
- 0x00d3e801 /* 00001a7d - 00001a7e [ 2] */,
- 0x00d45005 /* 00001a8a - 00001a8f [ 6] */,
- 0x00d4d005 /* 00001a9a - 00001a9f [ 6] */,
- 0x00d57001 /* 00001aae - 00001aaf [ 2] */,
- 0x00d67830 /* 00001acf - 00001aff [ 49] */,
- 0x00da6802 /* 00001b4d - 00001b4f [ 3] */,
- 0x00dbf800 /* 00001b7f - 00001b7f [ 1] */,
- 0x00dfa007 /* 00001bf4 - 00001bfb [ 8] */,
- 0x00e1c002 /* 00001c38 - 00001c3a [ 3] */,
- 0x00e25002 /* 00001c4a - 00001c4c [ 3] */,
- 0x00e44806 /* 00001c89 - 00001c8f [ 7] */,
- 0x00e5d801 /* 00001cbb - 00001cbc [ 2] */,
- 0x00e64007 /* 00001cc8 - 00001ccf [ 8] */,
- 0x00e7d804 /* 00001cfb - 00001cff [ 5] */,
- 0x00f8b001 /* 00001f16 - 00001f17 [ 2] */,
- 0x00f8f001 /* 00001f1e - 00001f1f [ 2] */,
- 0x00fa3001 /* 00001f46 - 00001f47 [ 2] */,
- 0x00fa7001 /* 00001f4e - 00001f4f [ 2] */,
- 0x00fac000 /* 00001f58 - 00001f58 [ 1] */,
- 0x00fad000 /* 00001f5a - 00001f5a [ 1] */,
- 0x00fae000 /* 00001f5c - 00001f5c [ 1] */,
- 0x00faf000 /* 00001f5e - 00001f5e [ 1] */,
- 0x00fbf001 /* 00001f7e - 00001f7f [ 2] */,
- 0x00fda800 /* 00001fb5 - 00001fb5 [ 1] */,
- 0x00fe2800 /* 00001fc5 - 00001fc5 [ 1] */,
- 0x00fea001 /* 00001fd4 - 00001fd5 [ 2] */,
- 0x00fee000 /* 00001fdc - 00001fdc [ 1] */,
- 0x00ff8001 /* 00001ff0 - 00001ff1 [ 2] */,
- 0x00ffa800 /* 00001ff5 - 00001ff5 [ 1] */,
- 0x00fff810 /* 00001fff - 0000200f [ 17] */,
- 0x01014007 /* 00002028 - 0000202f [ 8] */,
- 0x0102f810 /* 0000205f - 0000206f [ 17] */,
- 0x01039001 /* 00002072 - 00002073 [ 2] */,
- 0x01047800 /* 0000208f - 0000208f [ 1] */,
- 0x0104e802 /* 0000209d - 0000209f [ 3] */,
- 0x0106080e /* 000020c1 - 000020cf [ 15] */,
- 0x0107880e /* 000020f1 - 000020ff [ 15] */,
- 0x010c6003 /* 0000218c - 0000218f [ 4] */,
- 0x01213818 /* 00002427 - 0000243f [ 25] */,
- 0x01225814 /* 0000244b - 0000245f [ 21] */,
- 0x015ba001 /* 00002b74 - 00002b75 [ 2] */,
- 0x015cb000 /* 00002b96 - 00002b96 [ 1] */,
- 0x0167a004 /* 00002cf4 - 00002cf8 [ 5] */,
- 0x01693000 /* 00002d26 - 00002d26 [ 1] */,
- 0x01694004 /* 00002d28 - 00002d2c [ 5] */,
- 0x01697001 /* 00002d2e - 00002d2f [ 2] */,
- 0x016b4006 /* 00002d68 - 00002d6e [ 7] */,
- 0x016b880d /* 00002d71 - 00002d7e [ 14] */,
- 0x016cb808 /* 00002d97 - 00002d9f [ 9] */,
- 0x016d3800 /* 00002da7 - 00002da7 [ 1] */,
- 0x016d7800 /* 00002daf - 00002daf [ 1] */,
- 0x016db800 /* 00002db7 - 00002db7 [ 1] */,
- 0x016df800 /* 00002dbf - 00002dbf [ 1] */,
- 0x016e3800 /* 00002dc7 - 00002dc7 [ 1] */,
- 0x016e7800 /* 00002dcf - 00002dcf [ 1] */,
- 0x016eb800 /* 00002dd7 - 00002dd7 [ 1] */,
- 0x016ef800 /* 00002ddf - 00002ddf [ 1] */,
- 0x0172f021 /* 00002e5e - 00002e7f [ 34] */,
- 0x0174d000 /* 00002e9a - 00002e9a [ 1] */,
- 0x0177a00b /* 00002ef4 - 00002eff [ 12] */,
- 0x017eb019 /* 00002fd6 - 00002fef [ 26] */,
- 0x01800000 /* 00003000 - 00003000 [ 1] */,
- 0x01820000 /* 00003040 - 00003040 [ 1] */,
- 0x0184b801 /* 00003097 - 00003098 [ 2] */,
- 0x01880004 /* 00003100 - 00003104 [ 5] */,
- 0x01898000 /* 00003130 - 00003130 [ 1] */,
- 0x018c7800 /* 0000318f - 0000318f [ 1] */,
- 0x018f200a /* 000031e4 - 000031ee [ 11] */,
- 0x0190f800 /* 0000321f - 0000321f [ 1] */,
- 0x05246802 /* 0000a48d - 0000a48f [ 3] */,
- 0x05263808 /* 0000a4c7 - 0000a4cf [ 9] */,
- 0x05316013 /* 0000a62c - 0000a63f [ 20] */,
- 0x0537c007 /* 0000a6f8 - 0000a6ff [ 8] */,
- 0x053e5804 /* 0000a7cb - 0000a7cf [ 5] */,
- 0x053e9000 /* 0000a7d2 - 0000a7d2 [ 1] */,
- 0x053ea000 /* 0000a7d4 - 0000a7d4 [ 1] */,
- 0x053ed017 /* 0000a7da - 0000a7f1 [ 24] */,
- 0x05416802 /* 0000a82d - 0000a82f [ 3] */,
- 0x0541d005 /* 0000a83a - 0000a83f [ 6] */,
- 0x0543c007 /* 0000a878 - 0000a87f [ 8] */,
- 0x05463007 /* 0000a8c6 - 0000a8cd [ 8] */,
- 0x0546d005 /* 0000a8da - 0000a8df [ 6] */,
- 0x054aa00a /* 0000a954 - 0000a95e [ 11] */,
- 0x054be802 /* 0000a97d - 0000a97f [ 3] */,
- 0x054e7000 /* 0000a9ce - 0000a9ce [ 1] */,
- 0x054ed003 /* 0000a9da - 0000a9dd [ 4] */,
- 0x054ff800 /* 0000a9ff - 0000a9ff [ 1] */,
- 0x0551b808 /* 0000aa37 - 0000aa3f [ 9] */,
- 0x05527001 /* 0000aa4e - 0000aa4f [ 2] */,
- 0x0552d001 /* 0000aa5a - 0000aa5b [ 2] */,
- 0x05561817 /* 0000aac3 - 0000aada [ 24] */,
- 0x0557b809 /* 0000aaf7 - 0000ab00 [ 10] */,
- 0x05583801 /* 0000ab07 - 0000ab08 [ 2] */,
- 0x05587801 /* 0000ab0f - 0000ab10 [ 2] */,
- 0x0558b808 /* 0000ab17 - 0000ab1f [ 9] */,
- 0x05593800 /* 0000ab27 - 0000ab27 [ 1] */,
- 0x05597800 /* 0000ab2f - 0000ab2f [ 1] */,
- 0x055b6003 /* 0000ab6c - 0000ab6f [ 4] */,
- 0x055f7001 /* 0000abee - 0000abef [ 2] */,
- 0x055fd005 /* 0000abfa - 0000abff [ 6] */,
- 0x06bd200b /* 0000d7a4 - 0000d7af [ 12] */,
- 0x06be3803 /* 0000d7c7 - 0000d7ca [ 4] */,
- 0x06bfe7ff /* 0000d7fc - 0000dffb [ 2048] */,
- 0x06ffe7ff /* 0000dffc - 0000e7fb [ 2048] */,
- 0x073fe7ff /* 0000e7fc - 0000effb [ 2048] */,
- 0x077fe7ff /* 0000effc - 0000f7fb [ 2048] */,
- 0x07bfe103 /* 0000f7fc - 0000f8ff [ 260] */,
- 0x07d37001 /* 0000fa6e - 0000fa6f [ 2] */,
- 0x07d6d025 /* 0000fada - 0000faff [ 38] */,
- 0x07d8380b /* 0000fb07 - 0000fb12 [ 12] */,
- 0x07d8c004 /* 0000fb18 - 0000fb1c [ 5] */,
- 0x07d9b800 /* 0000fb37 - 0000fb37 [ 1] */,
- 0x07d9e800 /* 0000fb3d - 0000fb3d [ 1] */,
- 0x07d9f800 /* 0000fb3f - 0000fb3f [ 1] */,
- 0x07da1000 /* 0000fb42 - 0000fb42 [ 1] */,
- 0x07da2800 /* 0000fb45 - 0000fb45 [ 1] */,
- 0x07de180f /* 0000fbc3 - 0000fbd2 [ 16] */,
- 0x07ec8001 /* 0000fd90 - 0000fd91 [ 2] */,
- 0x07ee4006 /* 0000fdc8 - 0000fdce [ 7] */,
- 0x07ee801f /* 0000fdd0 - 0000fdef [ 32] */,
- 0x07f0d005 /* 0000fe1a - 0000fe1f [ 6] */,
- 0x07f29800 /* 0000fe53 - 0000fe53 [ 1] */,
- 0x07f33800 /* 0000fe67 - 0000fe67 [ 1] */,
- 0x07f36003 /* 0000fe6c - 0000fe6f [ 4] */,
- 0x07f3a800 /* 0000fe75 - 0000fe75 [ 1] */,
- 0x07f7e803 /* 0000fefd - 0000ff00 [ 4] */,
- 0x07fdf802 /* 0000ffbf - 0000ffc1 [ 3] */,
- 0x07fe4001 /* 0000ffc8 - 0000ffc9 [ 2] */,
- 0x07fe8001 /* 0000ffd0 - 0000ffd1 [ 2] */,
- 0x07fec001 /* 0000ffd8 - 0000ffd9 [ 2] */,
- 0x07fee802 /* 0000ffdd - 0000ffdf [ 3] */,
- 0x07ff3800 /* 0000ffe7 - 0000ffe7 [ 1] */,
- 0x07ff780c /* 0000ffef - 0000fffb [ 13] */,
- 0x07fff001 /* 0000fffe - 0000ffff [ 2] */,
- 0x08006000 /* 0001000c - 0001000c [ 1] */,
- 0x08013800 /* 00010027 - 00010027 [ 1] */,
- 0x0801d800 /* 0001003b - 0001003b [ 1] */,
- 0x0801f000 /* 0001003e - 0001003e [ 1] */,
- 0x08027001 /* 0001004e - 0001004f [ 2] */,
- 0x0802f021 /* 0001005e - 0001007f [ 34] */,
- 0x0807d804 /* 000100fb - 000100ff [ 5] */,
- 0x08081803 /* 00010103 - 00010106 [ 4] */,
- 0x0809a002 /* 00010134 - 00010136 [ 3] */,
- 0x080c7800 /* 0001018f - 0001018f [ 1] */,
- 0x080ce802 /* 0001019d - 0001019f [ 3] */,
- 0x080d082e /* 000101a1 - 000101cf [ 47] */,
- 0x080ff081 /* 000101fe - 0001027f [ 130] */,
- 0x0814e802 /* 0001029d - 0001029f [ 3] */,
- 0x0816880e /* 000102d1 - 000102df [ 15] */,
- 0x0817e003 /* 000102fc - 000102ff [ 4] */,
- 0x08192008 /* 00010324 - 0001032c [ 9] */,
- 0x081a5804 /* 0001034b - 0001034f [ 5] */,
- 0x081bd804 /* 0001037b - 0001037f [ 5] */,
- 0x081cf000 /* 0001039e - 0001039e [ 1] */,
- 0x081e2003 /* 000103c4 - 000103c7 [ 4] */,
- 0x081eb029 /* 000103d6 - 000103ff [ 42] */,
- 0x0824f001 /* 0001049e - 0001049f [ 2] */,
- 0x08255005 /* 000104aa - 000104af [ 6] */,
- 0x0826a003 /* 000104d4 - 000104d7 [ 4] */,
- 0x0827e003 /* 000104fc - 000104ff [ 4] */,
- 0x08294007 /* 00010528 - 0001052f [ 8] */,
- 0x082b200a /* 00010564 - 0001056e [ 11] */,
- 0x082bd800 /* 0001057b - 0001057b [ 1] */,
- 0x082c5800 /* 0001058b - 0001058b [ 1] */,
- 0x082c9800 /* 00010593 - 00010593 [ 1] */,
- 0x082cb000 /* 00010596 - 00010596 [ 1] */,
- 0x082d1000 /* 000105a2 - 000105a2 [ 1] */,
- 0x082d9000 /* 000105b2 - 000105b2 [ 1] */,
- 0x082dd000 /* 000105ba - 000105ba [ 1] */,
- 0x082de842 /* 000105bd - 000105ff [ 67] */,
- 0x0839b808 /* 00010737 - 0001073f [ 9] */,
- 0x083ab009 /* 00010756 - 0001075f [ 10] */,
- 0x083b4017 /* 00010768 - 0001077f [ 24] */,
- 0x083c3000 /* 00010786 - 00010786 [ 1] */,
- 0x083d8800 /* 000107b1 - 000107b1 [ 1] */,
- 0x083dd844 /* 000107bb - 000107ff [ 69] */,
- 0x08403001 /* 00010806 - 00010807 [ 2] */,
- 0x08404800 /* 00010809 - 00010809 [ 1] */,
- 0x0841b000 /* 00010836 - 00010836 [ 1] */,
- 0x0841c802 /* 00010839 - 0001083b [ 3] */,
- 0x0841e801 /* 0001083d - 0001083e [ 2] */,
- 0x0842b000 /* 00010856 - 00010856 [ 1] */,
- 0x0844f807 /* 0001089f - 000108a6 [ 8] */,
- 0x0845802f /* 000108b0 - 000108df [ 48] */,
- 0x08479800 /* 000108f3 - 000108f3 [ 1] */,
- 0x0847b004 /* 000108f6 - 000108fa [ 5] */,
- 0x0848e002 /* 0001091c - 0001091e [ 3] */,
- 0x0849d004 /* 0001093a - 0001093e [ 5] */,
- 0x084a003f /* 00010940 - 0001097f [ 64] */,
- 0x084dc003 /* 000109b8 - 000109bb [ 4] */,
- 0x084e8001 /* 000109d0 - 000109d1 [ 2] */,
- 0x08502000 /* 00010a04 - 00010a04 [ 1] */,
- 0x08503804 /* 00010a07 - 00010a0b [ 5] */,
- 0x0850a000 /* 00010a14 - 00010a14 [ 1] */,
- 0x0850c000 /* 00010a18 - 00010a18 [ 1] */,
- 0x0851b001 /* 00010a36 - 00010a37 [ 2] */,
- 0x0851d803 /* 00010a3b - 00010a3e [ 4] */,
- 0x08524806 /* 00010a49 - 00010a4f [ 7] */,
- 0x0852c806 /* 00010a59 - 00010a5f [ 7] */,
- 0x0855001f /* 00010aa0 - 00010abf [ 32] */,
- 0x08573803 /* 00010ae7 - 00010aea [ 4] */,
- 0x0857b808 /* 00010af7 - 00010aff [ 9] */,
- 0x0859b002 /* 00010b36 - 00010b38 [ 3] */,
- 0x085ab001 /* 00010b56 - 00010b57 [ 2] */,
- 0x085b9804 /* 00010b73 - 00010b77 [ 5] */,
- 0x085c9006 /* 00010b92 - 00010b98 [ 7] */,
- 0x085ce80b /* 00010b9d - 00010ba8 [ 12] */,
- 0x085d804f /* 00010bb0 - 00010bff [ 80] */,
- 0x08624836 /* 00010c49 - 00010c7f [ 55] */,
- 0x0865980c /* 00010cb3 - 00010cbf [ 13] */,
- 0x08679806 /* 00010cf3 - 00010cf9 [ 7] */,
- 0x08694007 /* 00010d28 - 00010d2f [ 8] */,
- 0x0869d125 /* 00010d3a - 00010e5f [ 294] */,
- 0x0873f800 /* 00010e7f - 00010e7f [ 1] */,
- 0x08755000 /* 00010eaa - 00010eaa [ 1] */,
- 0x08757001 /* 00010eae - 00010eaf [ 2] */,
- 0x0875904a /* 00010eb2 - 00010efc [ 75] */,
- 0x08794007 /* 00010f28 - 00010f2f [ 8] */,
- 0x087ad015 /* 00010f5a - 00010f6f [ 22] */,
- 0x087c5025 /* 00010f8a - 00010faf [ 38] */,
- 0x087e6013 /* 00010fcc - 00010fdf [ 20] */,
- 0x087fb808 /* 00010ff7 - 00010fff [ 9] */,
- 0x08827003 /* 0001104e - 00011051 [ 4] */,
- 0x0883b008 /* 00011076 - 0001107e [ 9] */,
- 0x0885e800 /* 000110bd - 000110bd [ 1] */,
- 0x0886180c /* 000110c3 - 000110cf [ 13] */,
- 0x08874806 /* 000110e9 - 000110ef [ 7] */,
- 0x0887d005 /* 000110fa - 000110ff [ 6] */,
- 0x0889a800 /* 00011135 - 00011135 [ 1] */,
- 0x088a4007 /* 00011148 - 0001114f [ 8] */,
- 0x088bb808 /* 00011177 - 0001117f [ 9] */,
- 0x088f0000 /* 000111e0 - 000111e0 [ 1] */,
- 0x088fa80a /* 000111f5 - 000111ff [ 11] */,
- 0x08909000 /* 00011212 - 00011212 [ 1] */,
- 0x0892103d /* 00011242 - 0001127f [ 62] */,
- 0x08943800 /* 00011287 - 00011287 [ 1] */,
- 0x08944800 /* 00011289 - 00011289 [ 1] */,
- 0x08947000 /* 0001128e - 0001128e [ 1] */,
- 0x0894f000 /* 0001129e - 0001129e [ 1] */,
- 0x08955005 /* 000112aa - 000112af [ 6] */,
- 0x08975804 /* 000112eb - 000112ef [ 5] */,
- 0x0897d005 /* 000112fa - 000112ff [ 6] */,
- 0x08982000 /* 00011304 - 00011304 [ 1] */,
- 0x08986801 /* 0001130d - 0001130e [ 2] */,
- 0x08988801 /* 00011311 - 00011312 [ 2] */,
- 0x08994800 /* 00011329 - 00011329 [ 1] */,
- 0x08998800 /* 00011331 - 00011331 [ 1] */,
- 0x0899a000 /* 00011334 - 00011334 [ 1] */,
- 0x0899d000 /* 0001133a - 0001133a [ 1] */,
- 0x089a2801 /* 00011345 - 00011346 [ 2] */,
- 0x089a4801 /* 00011349 - 0001134a [ 2] */,
- 0x089a7001 /* 0001134e - 0001134f [ 2] */,
- 0x089a8805 /* 00011351 - 00011356 [ 6] */,
- 0x089ac004 /* 00011358 - 0001135c [ 5] */,
- 0x089b2001 /* 00011364 - 00011365 [ 2] */,
- 0x089b6802 /* 0001136d - 0001136f [ 3] */,
- 0x089ba88a /* 00011375 - 000113ff [ 139] */,
- 0x08a2e000 /* 0001145c - 0001145c [ 1] */,
- 0x08a3101d /* 00011462 - 0001147f [ 30] */,
- 0x08a64007 /* 000114c8 - 000114cf [ 8] */,
- 0x08a6d0a5 /* 000114da - 0001157f [ 166] */,
- 0x08adb001 /* 000115b6 - 000115b7 [ 2] */,
- 0x08aef021 /* 000115de - 000115ff [ 34] */,
- 0x08b2280a /* 00011645 - 0001164f [ 11] */,
- 0x08b2d005 /* 0001165a - 0001165f [ 6] */,
- 0x08b36812 /* 0001166d - 0001167f [ 19] */,
- 0x08b5d005 /* 000116ba - 000116bf [ 6] */,
- 0x08b65035 /* 000116ca - 000116ff [ 54] */,
- 0x08b8d801 /* 0001171b - 0001171c [ 2] */,
- 0x08b96003 /* 0001172c - 0001172f [ 4] */,
- 0x08ba38b8 /* 00011747 - 000117ff [ 185] */,
- 0x08c1e063 /* 0001183c - 0001189f [ 100] */,
- 0x08c7980b /* 000118f3 - 000118fe [ 12] */,
- 0x08c83801 /* 00011907 - 00011908 [ 2] */,
- 0x08c85001 /* 0001190a - 0001190b [ 2] */,
- 0x08c8a000 /* 00011914 - 00011914 [ 1] */,
- 0x08c8b800 /* 00011917 - 00011917 [ 1] */,
- 0x08c9b000 /* 00011936 - 00011936 [ 1] */,
- 0x08c9c801 /* 00011939 - 0001193a [ 2] */,
- 0x08ca3808 /* 00011947 - 0001194f [ 9] */,
- 0x08cad045 /* 0001195a - 0001199f [ 70] */,
- 0x08cd4001 /* 000119a8 - 000119a9 [ 2] */,
- 0x08cec001 /* 000119d8 - 000119d9 [ 2] */,
- 0x08cf281a /* 000119e5 - 000119ff [ 27] */,
- 0x08d24007 /* 00011a48 - 00011a4f [ 8] */,
- 0x08d5180c /* 00011aa3 - 00011aaf [ 13] */,
- 0x08d7c806 /* 00011af9 - 00011aff [ 7] */,
- 0x08d850f5 /* 00011b0a - 00011bff [ 246] */,
- 0x08e04800 /* 00011c09 - 00011c09 [ 1] */,
- 0x08e1b800 /* 00011c37 - 00011c37 [ 1] */,
- 0x08e23009 /* 00011c46 - 00011c4f [ 10] */,
- 0x08e36802 /* 00011c6d - 00011c6f [ 3] */,
- 0x08e48001 /* 00011c90 - 00011c91 [ 2] */,
- 0x08e54000 /* 00011ca8 - 00011ca8 [ 1] */,
- 0x08e5b848 /* 00011cb7 - 00011cff [ 73] */,
- 0x08e83800 /* 00011d07 - 00011d07 [ 1] */,
- 0x08e85000 /* 00011d0a - 00011d0a [ 1] */,
- 0x08e9b802 /* 00011d37 - 00011d39 [ 3] */,
- 0x08e9d800 /* 00011d3b - 00011d3b [ 1] */,
- 0x08e9f000 /* 00011d3e - 00011d3e [ 1] */,
- 0x08ea4007 /* 00011d48 - 00011d4f [ 8] */,
- 0x08ead005 /* 00011d5a - 00011d5f [ 6] */,
- 0x08eb3000 /* 00011d66 - 00011d66 [ 1] */,
- 0x08eb4800 /* 00011d69 - 00011d69 [ 1] */,
- 0x08ec7800 /* 00011d8f - 00011d8f [ 1] */,
- 0x08ec9000 /* 00011d92 - 00011d92 [ 1] */,
- 0x08ecc806 /* 00011d99 - 00011d9f [ 7] */,
- 0x08ed5135 /* 00011daa - 00011edf [ 310] */,
- 0x08f7c806 /* 00011ef9 - 00011eff [ 7] */,
- 0x08f88800 /* 00011f11 - 00011f11 [ 1] */,
- 0x08f9d802 /* 00011f3b - 00011f3d [ 3] */,
- 0x08fad055 /* 00011f5a - 00011faf [ 86] */,
- 0x08fd880e /* 00011fb1 - 00011fbf [ 15] */,
- 0x08ff900c /* 00011ff2 - 00011ffe [ 13] */,
- 0x091cd065 /* 0001239a - 000123ff [ 102] */,
- 0x09237800 /* 0001246f - 0001246f [ 1] */,
- 0x0923a80a /* 00012475 - 0001247f [ 11] */,
- 0x092a27ff /* 00012544 - 00012d43 [ 2048] */,
- 0x096a224b /* 00012d44 - 00012f8f [ 588] */,
- 0x097f980c /* 00012ff3 - 00012fff [ 13] */,
- 0x09a1800f /* 00013430 - 0001343f [ 16] */,
- 0x09a2b7ff /* 00013456 - 00013c55 [ 2048] */,
- 0x09e2b7a9 /* 00013c56 - 000143ff [ 1962] */,
- 0x0a323fff /* 00014647 - 00014e46 [ 2048] */,
- 0x0a723fff /* 00014e47 - 00015646 [ 2048] */,
- 0x0ab23fff /* 00015647 - 00015e46 [ 2048] */,
- 0x0af23fff /* 00015e47 - 00016646 [ 2048] */,
- 0x0b3239b8 /* 00016647 - 000167ff [ 441] */,
- 0x0b51c806 /* 00016a39 - 00016a3f [ 7] */,
- 0x0b52f800 /* 00016a5f - 00016a5f [ 1] */,
- 0x0b535003 /* 00016a6a - 00016a6d [ 4] */,
- 0x0b55f800 /* 00016abf - 00016abf [ 1] */,
- 0x0b565005 /* 00016aca - 00016acf [ 6] */,
- 0x0b577001 /* 00016aee - 00016aef [ 2] */,
- 0x0b57b009 /* 00016af6 - 00016aff [ 10] */,
- 0x0b5a3009 /* 00016b46 - 00016b4f [ 10] */,
- 0x0b5ad000 /* 00016b5a - 00016b5a [ 1] */,
- 0x0b5b1000 /* 00016b62 - 00016b62 [ 1] */,
- 0x0b5bc004 /* 00016b78 - 00016b7c [ 5] */,
- 0x0b5c82af /* 00016b90 - 00016e3f [ 688] */,
- 0x0b74d864 /* 00016e9b - 00016eff [ 101] */,
- 0x0b7a5803 /* 00016f4b - 00016f4e [ 4] */,
- 0x0b7c4006 /* 00016f88 - 00016f8e [ 7] */,
- 0x0b7d003f /* 00016fa0 - 00016fdf [ 64] */,
- 0x0b7f280a /* 00016fe5 - 00016fef [ 11] */,
- 0x0b7f900d /* 00016ff2 - 00016fff [ 14] */,
- 0x0c3fc007 /* 000187f8 - 000187ff [ 8] */,
- 0x0c66b029 /* 00018cd6 - 00018cff [ 42] */,
- 0x0c684fff /* 00018d09 - 00019508 [ 2048] */,
- 0x0ca84fff /* 00019509 - 00019d08 [ 2048] */,
- 0x0ce84fff /* 00019d09 - 0001a508 [ 2048] */,
- 0x0d284fff /* 0001a509 - 0001ad08 [ 2048] */,
- 0x0d684ae6 /* 0001ad09 - 0001afef [ 743] */,
- 0x0d7fa000 /* 0001aff4 - 0001aff4 [ 1] */,
- 0x0d7fe000 /* 0001affc - 0001affc [ 1] */,
- 0x0d7ff800 /* 0001afff - 0001afff [ 1] */,
- 0x0d89180e /* 0001b123 - 0001b131 [ 15] */,
- 0x0d89981c /* 0001b133 - 0001b14f [ 29] */,
- 0x0d8a9801 /* 0001b153 - 0001b154 [ 2] */,
- 0x0d8ab00d /* 0001b156 - 0001b163 [ 14] */,
- 0x0d8b4007 /* 0001b168 - 0001b16f [ 8] */,
- 0x0d97e7ff /* 0001b2fc - 0001bafb [ 2048] */,
- 0x0dd7e103 /* 0001bafc - 0001bbff [ 260] */,
- 0x0de35804 /* 0001bc6b - 0001bc6f [ 5] */,
- 0x0de3e802 /* 0001bc7d - 0001bc7f [ 3] */,
- 0x0de44806 /* 0001bc89 - 0001bc8f [ 7] */,
- 0x0de4d001 /* 0001bc9a - 0001bc9b [ 2] */,
- 0x0de507ff /* 0001bca0 - 0001c49f [ 2048] */,
- 0x0e2507ff /* 0001c4a0 - 0001cc9f [ 2048] */,
- 0x0e65025f /* 0001cca0 - 0001ceff [ 608] */,
- 0x0e797001 /* 0001cf2e - 0001cf2f [ 2] */,
- 0x0e7a3808 /* 0001cf47 - 0001cf4f [ 9] */,
- 0x0e7e203b /* 0001cfc4 - 0001cfff [ 60] */,
- 0x0e87b009 /* 0001d0f6 - 0001d0ff [ 10] */,
- 0x0e893801 /* 0001d127 - 0001d128 [ 2] */,
- 0x0e8b9807 /* 0001d173 - 0001d17a [ 8] */,
- 0x0e8f5814 /* 0001d1eb - 0001d1ff [ 21] */,
- 0x0e923079 /* 0001d246 - 0001d2bf [ 122] */,
- 0x0e96a00b /* 0001d2d4 - 0001d2df [ 12] */,
- 0x0e97a00b /* 0001d2f4 - 0001d2ff [ 12] */,
- 0x0e9ab808 /* 0001d357 - 0001d35f [ 9] */,
- 0x0e9bc886 /* 0001d379 - 0001d3ff [ 135] */,
- 0x0ea2a800 /* 0001d455 - 0001d455 [ 1] */,
- 0x0ea4e800 /* 0001d49d - 0001d49d [ 1] */,
- 0x0ea50001 /* 0001d4a0 - 0001d4a1 [ 2] */,
- 0x0ea51801 /* 0001d4a3 - 0001d4a4 [ 2] */,
- 0x0ea53801 /* 0001d4a7 - 0001d4a8 [ 2] */,
- 0x0ea56800 /* 0001d4ad - 0001d4ad [ 1] */,
- 0x0ea5d000 /* 0001d4ba - 0001d4ba [ 1] */,
- 0x0ea5e000 /* 0001d4bc - 0001d4bc [ 1] */,
- 0x0ea62000 /* 0001d4c4 - 0001d4c4 [ 1] */,
- 0x0ea83000 /* 0001d506 - 0001d506 [ 1] */,
- 0x0ea85801 /* 0001d50b - 0001d50c [ 2] */,
- 0x0ea8a800 /* 0001d515 - 0001d515 [ 1] */,
- 0x0ea8e800 /* 0001d51d - 0001d51d [ 1] */,
- 0x0ea9d000 /* 0001d53a - 0001d53a [ 1] */,
- 0x0ea9f800 /* 0001d53f - 0001d53f [ 1] */,
- 0x0eaa2800 /* 0001d545 - 0001d545 [ 1] */,
- 0x0eaa3802 /* 0001d547 - 0001d549 [ 3] */,
- 0x0eaa8800 /* 0001d551 - 0001d551 [ 1] */,
- 0x0eb53001 /* 0001d6a6 - 0001d6a7 [ 2] */,
- 0x0ebe6001 /* 0001d7cc - 0001d7cd [ 2] */,
- 0x0ed4600e /* 0001da8c - 0001da9a [ 15] */,
- 0x0ed50000 /* 0001daa0 - 0001daa0 [ 1] */,
- 0x0ed5844f /* 0001dab0 - 0001deff [ 1104] */,
- 0x0ef8f805 /* 0001df1f - 0001df24 [ 6] */,
- 0x0ef958d4 /* 0001df2b - 0001dfff [ 213] */,
- 0x0f003800 /* 0001e007 - 0001e007 [ 1] */,
- 0x0f00c801 /* 0001e019 - 0001e01a [ 2] */,
- 0x0f011000 /* 0001e022 - 0001e022 [ 1] */,
- 0x0f012800 /* 0001e025 - 0001e025 [ 1] */,
- 0x0f015804 /* 0001e02b - 0001e02f [ 5] */,
- 0x0f037020 /* 0001e06e - 0001e08e [ 33] */,
- 0x0f04806f /* 0001e090 - 0001e0ff [ 112] */,
- 0x0f096802 /* 0001e12d - 0001e12f [ 3] */,
- 0x0f09f001 /* 0001e13e - 0001e13f [ 2] */,
- 0x0f0a5003 /* 0001e14a - 0001e14d [ 4] */,
- 0x0f0a813f /* 0001e150 - 0001e28f [ 320] */,
- 0x0f157810 /* 0001e2af - 0001e2bf [ 17] */,
- 0x0f17d004 /* 0001e2fa - 0001e2fe [ 5] */,
- 0x0f1801cf /* 0001e300 - 0001e4cf [ 464] */,
- 0x0f27d2e5 /* 0001e4fa - 0001e7df [ 742] */,
- 0x0f3f3800 /* 0001e7e7 - 0001e7e7 [ 1] */,
- 0x0f3f6000 /* 0001e7ec - 0001e7ec [ 1] */,
- 0x0f3f7800 /* 0001e7ef - 0001e7ef [ 1] */,
- 0x0f3ff800 /* 0001e7ff - 0001e7ff [ 1] */,
- 0x0f462801 /* 0001e8c5 - 0001e8c6 [ 2] */,
- 0x0f46b828 /* 0001e8d7 - 0001e8ff [ 41] */,
- 0x0f4a6003 /* 0001e94c - 0001e94f [ 4] */,
- 0x0f4ad003 /* 0001e95a - 0001e95d [ 4] */,
- 0x0f4b0310 /* 0001e960 - 0001ec70 [ 785] */,
- 0x0f65a84b /* 0001ecb5 - 0001ed00 [ 76] */,
- 0x0f69f0c1 /* 0001ed3e - 0001edff [ 194] */,
- 0x0f702000 /* 0001ee04 - 0001ee04 [ 1] */,
- 0x0f710000 /* 0001ee20 - 0001ee20 [ 1] */,
- 0x0f711800 /* 0001ee23 - 0001ee23 [ 1] */,
- 0x0f712801 /* 0001ee25 - 0001ee26 [ 2] */,
- 0x0f714000 /* 0001ee28 - 0001ee28 [ 1] */,
- 0x0f719800 /* 0001ee33 - 0001ee33 [ 1] */,
- 0x0f71c000 /* 0001ee38 - 0001ee38 [ 1] */,
- 0x0f71d000 /* 0001ee3a - 0001ee3a [ 1] */,
- 0x0f71e005 /* 0001ee3c - 0001ee41 [ 6] */,
- 0x0f721803 /* 0001ee43 - 0001ee46 [ 4] */,
- 0x0f724000 /* 0001ee48 - 0001ee48 [ 1] */,
- 0x0f725000 /* 0001ee4a - 0001ee4a [ 1] */,
- 0x0f726000 /* 0001ee4c - 0001ee4c [ 1] */,
- 0x0f728000 /* 0001ee50 - 0001ee50 [ 1] */,
- 0x0f729800 /* 0001ee53 - 0001ee53 [ 1] */,
- 0x0f72a801 /* 0001ee55 - 0001ee56 [ 2] */,
- 0x0f72c000 /* 0001ee58 - 0001ee58 [ 1] */,
- 0x0f72d000 /* 0001ee5a - 0001ee5a [ 1] */,
- 0x0f72e000 /* 0001ee5c - 0001ee5c [ 1] */,
- 0x0f72f000 /* 0001ee5e - 0001ee5e [ 1] */,
- 0x0f730000 /* 0001ee60 - 0001ee60 [ 1] */,
- 0x0f731800 /* 0001ee63 - 0001ee63 [ 1] */,
- 0x0f732801 /* 0001ee65 - 0001ee66 [ 2] */,
- 0x0f735800 /* 0001ee6b - 0001ee6b [ 1] */,
- 0x0f739800 /* 0001ee73 - 0001ee73 [ 1] */,
- 0x0f73c000 /* 0001ee78 - 0001ee78 [ 1] */,
- 0x0f73e800 /* 0001ee7d - 0001ee7d [ 1] */,
- 0x0f73f800 /* 0001ee7f - 0001ee7f [ 1] */,
- 0x0f745000 /* 0001ee8a - 0001ee8a [ 1] */,
- 0x0f74e004 /* 0001ee9c - 0001eea0 [ 5] */,
- 0x0f752000 /* 0001eea4 - 0001eea4 [ 1] */,
- 0x0f755000 /* 0001eeaa - 0001eeaa [ 1] */,
- 0x0f75e033 /* 0001eebc - 0001eeef [ 52] */,
- 0x0f77910d /* 0001eef2 - 0001efff [ 270] */,
- 0x0f816003 /* 0001f02c - 0001f02f [ 4] */,
- 0x0f84a00b /* 0001f094 - 0001f09f [ 12] */,
- 0x0f857801 /* 0001f0af - 0001f0b0 [ 2] */,
- 0x0f860000 /* 0001f0c0 - 0001f0c0 [ 1] */,
- 0x0f868000 /* 0001f0d0 - 0001f0d0 [ 1] */,
- 0x0f87b009 /* 0001f0f6 - 0001f0ff [ 10] */,
- 0x0f8d7037 /* 0001f1ae - 0001f1e5 [ 56] */,
- 0x0f90180c /* 0001f203 - 0001f20f [ 13] */,
- 0x0f91e003 /* 0001f23c - 0001f23f [ 4] */,
- 0x0f924806 /* 0001f249 - 0001f24f [ 7] */,
- 0x0f92900d /* 0001f252 - 0001f25f [ 14] */,
- 0x0f933099 /* 0001f266 - 0001f2ff [ 154] */,
- 0x0fb6c003 /* 0001f6d8 - 0001f6db [ 4] */,
- 0x0fb76802 /* 0001f6ed - 0001f6ef [ 3] */,
- 0x0fb7e802 /* 0001f6fd - 0001f6ff [ 3] */,
- 0x0fbbb803 /* 0001f777 - 0001f77a [ 4] */,
- 0x0fbed005 /* 0001f7da - 0001f7df [ 6] */,
- 0x0fbf6003 /* 0001f7ec - 0001f7ef [ 4] */,
- 0x0fbf880e /* 0001f7f1 - 0001f7ff [ 15] */,
- 0x0fc06003 /* 0001f80c - 0001f80f [ 4] */,
- 0x0fc24007 /* 0001f848 - 0001f84f [ 8] */,
- 0x0fc2d005 /* 0001f85a - 0001f85f [ 6] */,
- 0x0fc44007 /* 0001f888 - 0001f88f [ 8] */,
- 0x0fc57001 /* 0001f8ae - 0001f8af [ 2] */,
- 0x0fc5904d /* 0001f8b2 - 0001f8ff [ 78] */,
- 0x0fd2a00b /* 0001fa54 - 0001fa5f [ 12] */,
- 0x0fd37001 /* 0001fa6e - 0001fa6f [ 2] */,
- 0x0fd3e802 /* 0001fa7d - 0001fa7f [ 3] */,
- 0x0fd44806 /* 0001fa89 - 0001fa8f [ 7] */,
- 0x0fd5f000 /* 0001fabe - 0001fabe [ 1] */,
- 0x0fd63007 /* 0001fac6 - 0001facd [ 8] */,
- 0x0fd6e003 /* 0001fadc - 0001fadf [ 4] */,
- 0x0fd74806 /* 0001fae9 - 0001faef [ 7] */,
- 0x0fd7c806 /* 0001faf9 - 0001faff [ 7] */,
- 0x0fdc9800 /* 0001fb93 - 0001fb93 [ 1] */,
- 0x0fde5824 /* 0001fbcb - 0001fbef [ 37] */,
- 0x0fdfd405 /* 0001fbfa - 0001ffff [ 1030] */,
- 0x1537001f /* 0002a6e0 - 0002a6ff [ 32] */,
- 0x15b9d005 /* 0002b73a - 0002b73f [ 6] */,
- 0x15c0f001 /* 0002b81e - 0002b81f [ 2] */,
- 0x1675100d /* 0002cea2 - 0002ceaf [ 14] */,
- 0x175f080e /* 0002ebe1 - 0002ebef [ 15] */,
- 0x1772f7ff /* 0002ee5e - 0002f65d [ 2048] */,
- 0x17b2f1a1 /* 0002f65e - 0002f7ff [ 418] */,
- 0x17d0f5e1 /* 0002fa1e - 0002ffff [ 1506] */,
- 0x189a5804 /* 0003134b - 0003134f [ 5] */,
- 0x191d87ff /* 000323b0 - 00032baf [ 2048] */,
- 0x195d87ff /* 00032bb0 - 000333af [ 2048] */,
- 0x199d87ff /* 000333b0 - 00033baf [ 2048] */,
- 0x19dd87ff /* 00033bb0 - 000343af [ 2048] */,
- 0x1a1d87ff /* 000343b0 - 00034baf [ 2048] */,
- 0x1a5d87ff /* 00034bb0 - 000353af [ 2048] */,
- 0x1a9d87ff /* 000353b0 - 00035baf [ 2048] */,
- 0x1add87ff /* 00035bb0 - 000363af [ 2048] */,
- 0x1b1d87ff /* 000363b0 - 00036baf [ 2048] */,
- 0x1b5d87ff /* 00036bb0 - 000373af [ 2048] */,
- 0x1b9d87ff /* 000373b0 - 00037baf [ 2048] */,
- 0x1bdd87ff /* 00037bb0 - 000383af [ 2048] */,
- 0x1c1d87ff /* 000383b0 - 00038baf [ 2048] */,
- 0x1c5d87ff /* 00038bb0 - 000393af [ 2048] */,
- 0x1c9d87ff /* 000393b0 - 00039baf [ 2048] */,
- 0x1cdd87ff /* 00039bb0 - 0003a3af [ 2048] */,
- 0x1d1d87ff /* 0003a3b0 - 0003abaf [ 2048] */,
- 0x1d5d87ff /* 0003abb0 - 0003b3af [ 2048] */,
- 0x1d9d87ff /* 0003b3b0 - 0003bbaf [ 2048] */,
- 0x1ddd87ff /* 0003bbb0 - 0003c3af [ 2048] */,
- 0x1e1d87ff /* 0003c3b0 - 0003cbaf [ 2048] */,
- 0x1e5d87ff /* 0003cbb0 - 0003d3af [ 2048] */,
- 0x1e9d87ff /* 0003d3b0 - 0003dbaf [ 2048] */,
- 0x1edd87ff /* 0003dbb0 - 0003e3af [ 2048] */,
- 0x1f1d87ff /* 0003e3b0 - 0003ebaf [ 2048] */,
- 0x1f5d87ff /* 0003ebb0 - 0003f3af [ 2048] */,
- 0x1f9d87ff /* 0003f3b0 - 0003fbaf [ 2048] */,
- 0x1fdd87ff /* 0003fbb0 - 000403af [ 2048] */,
- 0x201d87ff /* 000403b0 - 00040baf [ 2048] */,
- 0x205d87ff /* 00040bb0 - 000413af [ 2048] */,
- 0x209d87ff /* 000413b0 - 00041baf [ 2048] */,
- 0x20dd87ff /* 00041bb0 - 000423af [ 2048] */,
- 0x211d87ff /* 000423b0 - 00042baf [ 2048] */,
- 0x215d87ff /* 00042bb0 - 000433af [ 2048] */,
- 0x219d87ff /* 000433b0 - 00043baf [ 2048] */,
- 0x21dd87ff /* 00043bb0 - 000443af [ 2048] */,
- 0x221d87ff /* 000443b0 - 00044baf [ 2048] */,
- 0x225d87ff /* 00044bb0 - 000453af [ 2048] */,
- 0x229d87ff /* 000453b0 - 00045baf [ 2048] */,
- 0x22dd87ff /* 00045bb0 - 000463af [ 2048] */,
- 0x231d87ff /* 000463b0 - 00046baf [ 2048] */,
- 0x235d87ff /* 00046bb0 - 000473af [ 2048] */,
- 0x239d87ff /* 000473b0 - 00047baf [ 2048] */,
- 0x23dd87ff /* 00047bb0 - 000483af [ 2048] */,
- 0x241d87ff /* 000483b0 - 00048baf [ 2048] */,
- 0x245d87ff /* 00048bb0 - 000493af [ 2048] */,
- 0x249d87ff /* 000493b0 - 00049baf [ 2048] */,
- 0x24dd87ff /* 00049bb0 - 0004a3af [ 2048] */,
- 0x251d87ff /* 0004a3b0 - 0004abaf [ 2048] */,
- 0x255d87ff /* 0004abb0 - 0004b3af [ 2048] */,
- 0x259d87ff /* 0004b3b0 - 0004bbaf [ 2048] */,
- 0x25dd87ff /* 0004bbb0 - 0004c3af [ 2048] */,
- 0x261d87ff /* 0004c3b0 - 0004cbaf [ 2048] */,
- 0x265d87ff /* 0004cbb0 - 0004d3af [ 2048] */,
- 0x269d87ff /* 0004d3b0 - 0004dbaf [ 2048] */,
- 0x26dd87ff /* 0004dbb0 - 0004e3af [ 2048] */,
- 0x271d87ff /* 0004e3b0 - 0004ebaf [ 2048] */,
- 0x275d87ff /* 0004ebb0 - 0004f3af [ 2048] */,
- 0x279d87ff /* 0004f3b0 - 0004fbaf [ 2048] */,
- 0x27dd87ff /* 0004fbb0 - 000503af [ 2048] */,
- 0x281d87ff /* 000503b0 - 00050baf [ 2048] */,
- 0x285d87ff /* 00050bb0 - 000513af [ 2048] */,
- 0x289d87ff /* 000513b0 - 00051baf [ 2048] */,
- 0x28dd87ff /* 00051bb0 - 000523af [ 2048] */,
- 0x291d87ff /* 000523b0 - 00052baf [ 2048] */,
- 0x295d87ff /* 00052bb0 - 000533af [ 2048] */,
- 0x299d87ff /* 000533b0 - 00053baf [ 2048] */,
- 0x29dd87ff /* 00053bb0 - 000543af [ 2048] */,
- 0x2a1d87ff /* 000543b0 - 00054baf [ 2048] */,
- 0x2a5d87ff /* 00054bb0 - 000553af [ 2048] */,
- 0x2a9d87ff /* 000553b0 - 00055baf [ 2048] */,
- 0x2add87ff /* 00055bb0 - 000563af [ 2048] */,
- 0x2b1d87ff /* 000563b0 - 00056baf [ 2048] */,
- 0x2b5d87ff /* 00056bb0 - 000573af [ 2048] */,
- 0x2b9d87ff /* 000573b0 - 00057baf [ 2048] */,
- 0x2bdd87ff /* 00057bb0 - 000583af [ 2048] */,
- 0x2c1d87ff /* 000583b0 - 00058baf [ 2048] */,
- 0x2c5d87ff /* 00058bb0 - 000593af [ 2048] */,
- 0x2c9d87ff /* 000593b0 - 00059baf [ 2048] */,
- 0x2cdd87ff /* 00059bb0 - 0005a3af [ 2048] */,
- 0x2d1d87ff /* 0005a3b0 - 0005abaf [ 2048] */,
- 0x2d5d87ff /* 0005abb0 - 0005b3af [ 2048] */,
- 0x2d9d87ff /* 0005b3b0 - 0005bbaf [ 2048] */,
- 0x2ddd87ff /* 0005bbb0 - 0005c3af [ 2048] */,
- 0x2e1d87ff /* 0005c3b0 - 0005cbaf [ 2048] */,
- 0x2e5d87ff /* 0005cbb0 - 0005d3af [ 2048] */,
- 0x2e9d87ff /* 0005d3b0 - 0005dbaf [ 2048] */,
- 0x2edd87ff /* 0005dbb0 - 0005e3af [ 2048] */,
- 0x2f1d87ff /* 0005e3b0 - 0005ebaf [ 2048] */,
- 0x2f5d87ff /* 0005ebb0 - 0005f3af [ 2048] */,
- 0x2f9d87ff /* 0005f3b0 - 0005fbaf [ 2048] */,
- 0x2fdd87ff /* 0005fbb0 - 000603af [ 2048] */,
- 0x301d87ff /* 000603b0 - 00060baf [ 2048] */,
- 0x305d87ff /* 00060bb0 - 000613af [ 2048] */,
- 0x309d87ff /* 000613b0 - 00061baf [ 2048] */,
- 0x30dd87ff /* 00061bb0 - 000623af [ 2048] */,
- 0x311d87ff /* 000623b0 - 00062baf [ 2048] */,
- 0x315d87ff /* 00062bb0 - 000633af [ 2048] */,
- 0x319d87ff /* 000633b0 - 00063baf [ 2048] */,
- 0x31dd87ff /* 00063bb0 - 000643af [ 2048] */,
- 0x321d87ff /* 000643b0 - 00064baf [ 2048] */,
- 0x325d87ff /* 00064bb0 - 000653af [ 2048] */,
- 0x329d87ff /* 000653b0 - 00065baf [ 2048] */,
- 0x32dd87ff /* 00065bb0 - 000663af [ 2048] */,
- 0x331d87ff /* 000663b0 - 00066baf [ 2048] */,
- 0x335d87ff /* 00066bb0 - 000673af [ 2048] */,
- 0x339d87ff /* 000673b0 - 00067baf [ 2048] */,
- 0x33dd87ff /* 00067bb0 - 000683af [ 2048] */,
- 0x341d87ff /* 000683b0 - 00068baf [ 2048] */,
- 0x345d87ff /* 00068bb0 - 000693af [ 2048] */,
- 0x349d87ff /* 000693b0 - 00069baf [ 2048] */,
- 0x34dd87ff /* 00069bb0 - 0006a3af [ 2048] */,
- 0x351d87ff /* 0006a3b0 - 0006abaf [ 2048] */,
- 0x355d87ff /* 0006abb0 - 0006b3af [ 2048] */,
- 0x359d87ff /* 0006b3b0 - 0006bbaf [ 2048] */,
- 0x35dd87ff /* 0006bbb0 - 0006c3af [ 2048] */,
- 0x361d87ff /* 0006c3b0 - 0006cbaf [ 2048] */,
- 0x365d87ff /* 0006cbb0 - 0006d3af [ 2048] */,
- 0x369d87ff /* 0006d3b0 - 0006dbaf [ 2048] */,
- 0x36dd87ff /* 0006dbb0 - 0006e3af [ 2048] */,
- 0x371d87ff /* 0006e3b0 - 0006ebaf [ 2048] */,
- 0x375d87ff /* 0006ebb0 - 0006f3af [ 2048] */,
- 0x379d87ff /* 0006f3b0 - 0006fbaf [ 2048] */,
- 0x37dd87ff /* 0006fbb0 - 000703af [ 2048] */,
- 0x381d87ff /* 000703b0 - 00070baf [ 2048] */,
- 0x385d87ff /* 00070bb0 - 000713af [ 2048] */,
- 0x389d87ff /* 000713b0 - 00071baf [ 2048] */,
- 0x38dd87ff /* 00071bb0 - 000723af [ 2048] */,
- 0x391d87ff /* 000723b0 - 00072baf [ 2048] */,
- 0x395d87ff /* 00072bb0 - 000733af [ 2048] */,
- 0x399d87ff /* 000733b0 - 00073baf [ 2048] */,
- 0x39dd87ff /* 00073bb0 - 000743af [ 2048] */,
- 0x3a1d87ff /* 000743b0 - 00074baf [ 2048] */,
- 0x3a5d87ff /* 00074bb0 - 000753af [ 2048] */,
- 0x3a9d87ff /* 000753b0 - 00075baf [ 2048] */,
- 0x3add87ff /* 00075bb0 - 000763af [ 2048] */,
- 0x3b1d87ff /* 000763b0 - 00076baf [ 2048] */,
- 0x3b5d87ff /* 00076bb0 - 000773af [ 2048] */,
- 0x3b9d87ff /* 000773b0 - 00077baf [ 2048] */,
- 0x3bdd87ff /* 00077bb0 - 000783af [ 2048] */,
- 0x3c1d87ff /* 000783b0 - 00078baf [ 2048] */,
- 0x3c5d87ff /* 00078bb0 - 000793af [ 2048] */,
- 0x3c9d87ff /* 000793b0 - 00079baf [ 2048] */,
- 0x3cdd87ff /* 00079bb0 - 0007a3af [ 2048] */,
- 0x3d1d87ff /* 0007a3b0 - 0007abaf [ 2048] */,
- 0x3d5d87ff /* 0007abb0 - 0007b3af [ 2048] */,
- 0x3d9d87ff /* 0007b3b0 - 0007bbaf [ 2048] */,
- 0x3ddd87ff /* 0007bbb0 - 0007c3af [ 2048] */,
- 0x3e1d87ff /* 0007c3b0 - 0007cbaf [ 2048] */,
- 0x3e5d87ff /* 0007cbb0 - 0007d3af [ 2048] */,
- 0x3e9d87ff /* 0007d3b0 - 0007dbaf [ 2048] */,
- 0x3edd87ff /* 0007dbb0 - 0007e3af [ 2048] */,
- 0x3f1d87ff /* 0007e3b0 - 0007ebaf [ 2048] */,
- 0x3f5d87ff /* 0007ebb0 - 0007f3af [ 2048] */,
- 0x3f9d87ff /* 0007f3b0 - 0007fbaf [ 2048] */,
- 0x3fdd87ff /* 0007fbb0 - 000803af [ 2048] */,
- 0x401d87ff /* 000803b0 - 00080baf [ 2048] */,
- 0x405d87ff /* 00080bb0 - 000813af [ 2048] */,
- 0x409d87ff /* 000813b0 - 00081baf [ 2048] */,
- 0x40dd87ff /* 00081bb0 - 000823af [ 2048] */,
- 0x411d87ff /* 000823b0 - 00082baf [ 2048] */,
- 0x415d87ff /* 00082bb0 - 000833af [ 2048] */,
- 0x419d87ff /* 000833b0 - 00083baf [ 2048] */,
- 0x41dd87ff /* 00083bb0 - 000843af [ 2048] */,
- 0x421d87ff /* 000843b0 - 00084baf [ 2048] */,
- 0x425d87ff /* 00084bb0 - 000853af [ 2048] */,
- 0x429d87ff /* 000853b0 - 00085baf [ 2048] */,
- 0x42dd87ff /* 00085bb0 - 000863af [ 2048] */,
- 0x431d87ff /* 000863b0 - 00086baf [ 2048] */,
- 0x435d87ff /* 00086bb0 - 000873af [ 2048] */,
- 0x439d87ff /* 000873b0 - 00087baf [ 2048] */,
- 0x43dd87ff /* 00087bb0 - 000883af [ 2048] */,
- 0x441d87ff /* 000883b0 - 00088baf [ 2048] */,
- 0x445d87ff /* 00088bb0 - 000893af [ 2048] */,
- 0x449d87ff /* 000893b0 - 00089baf [ 2048] */,
- 0x44dd87ff /* 00089bb0 - 0008a3af [ 2048] */,
- 0x451d87ff /* 0008a3b0 - 0008abaf [ 2048] */,
- 0x455d87ff /* 0008abb0 - 0008b3af [ 2048] */,
- 0x459d87ff /* 0008b3b0 - 0008bbaf [ 2048] */,
- 0x45dd87ff /* 0008bbb0 - 0008c3af [ 2048] */,
- 0x461d87ff /* 0008c3b0 - 0008cbaf [ 2048] */,
- 0x465d87ff /* 0008cbb0 - 0008d3af [ 2048] */,
- 0x469d87ff /* 0008d3b0 - 0008dbaf [ 2048] */,
- 0x46dd87ff /* 0008dbb0 - 0008e3af [ 2048] */,
- 0x471d87ff /* 0008e3b0 - 0008ebaf [ 2048] */,
- 0x475d87ff /* 0008ebb0 - 0008f3af [ 2048] */,
- 0x479d87ff /* 0008f3b0 - 0008fbaf [ 2048] */,
- 0x47dd87ff /* 0008fbb0 - 000903af [ 2048] */,
- 0x481d87ff /* 000903b0 - 00090baf [ 2048] */,
- 0x485d87ff /* 00090bb0 - 000913af [ 2048] */,
- 0x489d87ff /* 000913b0 - 00091baf [ 2048] */,
- 0x48dd87ff /* 00091bb0 - 000923af [ 2048] */,
- 0x491d87ff /* 000923b0 - 00092baf [ 2048] */,
- 0x495d87ff /* 00092bb0 - 000933af [ 2048] */,
- 0x499d87ff /* 000933b0 - 00093baf [ 2048] */,
- 0x49dd87ff /* 00093bb0 - 000943af [ 2048] */,
- 0x4a1d87ff /* 000943b0 - 00094baf [ 2048] */,
- 0x4a5d87ff /* 00094bb0 - 000953af [ 2048] */,
- 0x4a9d87ff /* 000953b0 - 00095baf [ 2048] */,
- 0x4add87ff /* 00095bb0 - 000963af [ 2048] */,
- 0x4b1d87ff /* 000963b0 - 00096baf [ 2048] */,
- 0x4b5d87ff /* 00096bb0 - 000973af [ 2048] */,
- 0x4b9d87ff /* 000973b0 - 00097baf [ 2048] */,
- 0x4bdd87ff /* 00097bb0 - 000983af [ 2048] */,
- 0x4c1d87ff /* 000983b0 - 00098baf [ 2048] */,
- 0x4c5d87ff /* 00098bb0 - 000993af [ 2048] */,
- 0x4c9d87ff /* 000993b0 - 00099baf [ 2048] */,
- 0x4cdd87ff /* 00099bb0 - 0009a3af [ 2048] */,
- 0x4d1d87ff /* 0009a3b0 - 0009abaf [ 2048] */,
- 0x4d5d87ff /* 0009abb0 - 0009b3af [ 2048] */,
- 0x4d9d87ff /* 0009b3b0 - 0009bbaf [ 2048] */,
- 0x4ddd87ff /* 0009bbb0 - 0009c3af [ 2048] */,
- 0x4e1d87ff /* 0009c3b0 - 0009cbaf [ 2048] */,
- 0x4e5d87ff /* 0009cbb0 - 0009d3af [ 2048] */,
- 0x4e9d87ff /* 0009d3b0 - 0009dbaf [ 2048] */,
- 0x4edd87ff /* 0009dbb0 - 0009e3af [ 2048] */,
- 0x4f1d87ff /* 0009e3b0 - 0009ebaf [ 2048] */,
- 0x4f5d87ff /* 0009ebb0 - 0009f3af [ 2048] */,
- 0x4f9d87ff /* 0009f3b0 - 0009fbaf [ 2048] */,
- 0x4fdd87ff /* 0009fbb0 - 000a03af [ 2048] */,
- 0x501d87ff /* 000a03b0 - 000a0baf [ 2048] */,
- 0x505d87ff /* 000a0bb0 - 000a13af [ 2048] */,
- 0x509d87ff /* 000a13b0 - 000a1baf [ 2048] */,
- 0x50dd87ff /* 000a1bb0 - 000a23af [ 2048] */,
- 0x511d87ff /* 000a23b0 - 000a2baf [ 2048] */,
- 0x515d87ff /* 000a2bb0 - 000a33af [ 2048] */,
- 0x519d87ff /* 000a33b0 - 000a3baf [ 2048] */,
- 0x51dd87ff /* 000a3bb0 - 000a43af [ 2048] */,
- 0x521d87ff /* 000a43b0 - 000a4baf [ 2048] */,
- 0x525d87ff /* 000a4bb0 - 000a53af [ 2048] */,
- 0x529d87ff /* 000a53b0 - 000a5baf [ 2048] */,
- 0x52dd87ff /* 000a5bb0 - 000a63af [ 2048] */,
- 0x531d87ff /* 000a63b0 - 000a6baf [ 2048] */,
- 0x535d87ff /* 000a6bb0 - 000a73af [ 2048] */,
- 0x539d87ff /* 000a73b0 - 000a7baf [ 2048] */,
- 0x53dd87ff /* 000a7bb0 - 000a83af [ 2048] */,
- 0x541d87ff /* 000a83b0 - 000a8baf [ 2048] */,
- 0x545d87ff /* 000a8bb0 - 000a93af [ 2048] */,
- 0x549d87ff /* 000a93b0 - 000a9baf [ 2048] */,
- 0x54dd87ff /* 000a9bb0 - 000aa3af [ 2048] */,
- 0x551d87ff /* 000aa3b0 - 000aabaf [ 2048] */,
- 0x555d87ff /* 000aabb0 - 000ab3af [ 2048] */,
- 0x559d87ff /* 000ab3b0 - 000abbaf [ 2048] */,
- 0x55dd87ff /* 000abbb0 - 000ac3af [ 2048] */,
- 0x561d87ff /* 000ac3b0 - 000acbaf [ 2048] */,
- 0x565d87ff /* 000acbb0 - 000ad3af [ 2048] */,
- 0x569d87ff /* 000ad3b0 - 000adbaf [ 2048] */,
- 0x56dd87ff /* 000adbb0 - 000ae3af [ 2048] */,
- 0x571d87ff /* 000ae3b0 - 000aebaf [ 2048] */,
- 0x575d87ff /* 000aebb0 - 000af3af [ 2048] */,
- 0x579d87ff /* 000af3b0 - 000afbaf [ 2048] */,
- 0x57dd87ff /* 000afbb0 - 000b03af [ 2048] */,
- 0x581d87ff /* 000b03b0 - 000b0baf [ 2048] */,
- 0x585d87ff /* 000b0bb0 - 000b13af [ 2048] */,
- 0x589d87ff /* 000b13b0 - 000b1baf [ 2048] */,
- 0x58dd87ff /* 000b1bb0 - 000b23af [ 2048] */,
- 0x591d87ff /* 000b23b0 - 000b2baf [ 2048] */,
- 0x595d87ff /* 000b2bb0 - 000b33af [ 2048] */,
- 0x599d87ff /* 000b33b0 - 000b3baf [ 2048] */,
- 0x59dd87ff /* 000b3bb0 - 000b43af [ 2048] */,
- 0x5a1d87ff /* 000b43b0 - 000b4baf [ 2048] */,
- 0x5a5d87ff /* 000b4bb0 - 000b53af [ 2048] */,
- 0x5a9d87ff /* 000b53b0 - 000b5baf [ 2048] */,
- 0x5add87ff /* 000b5bb0 - 000b63af [ 2048] */,
- 0x5b1d87ff /* 000b63b0 - 000b6baf [ 2048] */,
- 0x5b5d87ff /* 000b6bb0 - 000b73af [ 2048] */,
- 0x5b9d87ff /* 000b73b0 - 000b7baf [ 2048] */,
- 0x5bdd87ff /* 000b7bb0 - 000b83af [ 2048] */,
- 0x5c1d87ff /* 000b83b0 - 000b8baf [ 2048] */,
- 0x5c5d87ff /* 000b8bb0 - 000b93af [ 2048] */,
- 0x5c9d87ff /* 000b93b0 - 000b9baf [ 2048] */,
- 0x5cdd87ff /* 000b9bb0 - 000ba3af [ 2048] */,
- 0x5d1d87ff /* 000ba3b0 - 000babaf [ 2048] */,
- 0x5d5d87ff /* 000babb0 - 000bb3af [ 2048] */,
- 0x5d9d87ff /* 000bb3b0 - 000bbbaf [ 2048] */,
- 0x5ddd87ff /* 000bbbb0 - 000bc3af [ 2048] */,
- 0x5e1d87ff /* 000bc3b0 - 000bcbaf [ 2048] */,
- 0x5e5d87ff /* 000bcbb0 - 000bd3af [ 2048] */,
- 0x5e9d87ff /* 000bd3b0 - 000bdbaf [ 2048] */,
- 0x5edd87ff /* 000bdbb0 - 000be3af [ 2048] */,
- 0x5f1d87ff /* 000be3b0 - 000bebaf [ 2048] */,
- 0x5f5d87ff /* 000bebb0 - 000bf3af [ 2048] */,
- 0x5f9d87ff /* 000bf3b0 - 000bfbaf [ 2048] */,
- 0x5fdd87ff /* 000bfbb0 - 000c03af [ 2048] */,
- 0x601d87ff /* 000c03b0 - 000c0baf [ 2048] */,
- 0x605d87ff /* 000c0bb0 - 000c13af [ 2048] */,
- 0x609d87ff /* 000c13b0 - 000c1baf [ 2048] */,
- 0x60dd87ff /* 000c1bb0 - 000c23af [ 2048] */,
- 0x611d87ff /* 000c23b0 - 000c2baf [ 2048] */,
- 0x615d87ff /* 000c2bb0 - 000c33af [ 2048] */,
- 0x619d87ff /* 000c33b0 - 000c3baf [ 2048] */,
- 0x61dd87ff /* 000c3bb0 - 000c43af [ 2048] */,
- 0x621d87ff /* 000c43b0 - 000c4baf [ 2048] */,
- 0x625d87ff /* 000c4bb0 - 000c53af [ 2048] */,
- 0x629d87ff /* 000c53b0 - 000c5baf [ 2048] */,
- 0x62dd87ff /* 000c5bb0 - 000c63af [ 2048] */,
- 0x631d87ff /* 000c63b0 - 000c6baf [ 2048] */,
- 0x635d87ff /* 000c6bb0 - 000c73af [ 2048] */,
- 0x639d87ff /* 000c73b0 - 000c7baf [ 2048] */,
- 0x63dd87ff /* 000c7bb0 - 000c83af [ 2048] */,
- 0x641d87ff /* 000c83b0 - 000c8baf [ 2048] */,
- 0x645d87ff /* 000c8bb0 - 000c93af [ 2048] */,
- 0x649d87ff /* 000c93b0 - 000c9baf [ 2048] */,
- 0x64dd87ff /* 000c9bb0 - 000ca3af [ 2048] */,
- 0x651d87ff /* 000ca3b0 - 000cabaf [ 2048] */,
- 0x655d87ff /* 000cabb0 - 000cb3af [ 2048] */,
- 0x659d87ff /* 000cb3b0 - 000cbbaf [ 2048] */,
- 0x65dd87ff /* 000cbbb0 - 000cc3af [ 2048] */,
- 0x661d87ff /* 000cc3b0 - 000ccbaf [ 2048] */,
- 0x665d87ff /* 000ccbb0 - 000cd3af [ 2048] */,
- 0x669d87ff /* 000cd3b0 - 000cdbaf [ 2048] */,
- 0x66dd87ff /* 000cdbb0 - 000ce3af [ 2048] */,
- 0x671d87ff /* 000ce3b0 - 000cebaf [ 2048] */,
- 0x675d87ff /* 000cebb0 - 000cf3af [ 2048] */,
- 0x679d87ff /* 000cf3b0 - 000cfbaf [ 2048] */,
- 0x67dd87ff /* 000cfbb0 - 000d03af [ 2048] */,
- 0x681d87ff /* 000d03b0 - 000d0baf [ 2048] */,
- 0x685d87ff /* 000d0bb0 - 000d13af [ 2048] */,
- 0x689d87ff /* 000d13b0 - 000d1baf [ 2048] */,
- 0x68dd87ff /* 000d1bb0 - 000d23af [ 2048] */,
- 0x691d87ff /* 000d23b0 - 000d2baf [ 2048] */,
- 0x695d87ff /* 000d2bb0 - 000d33af [ 2048] */,
- 0x699d87ff /* 000d33b0 - 000d3baf [ 2048] */,
- 0x69dd87ff /* 000d3bb0 - 000d43af [ 2048] */,
- 0x6a1d87ff /* 000d43b0 - 000d4baf [ 2048] */,
- 0x6a5d87ff /* 000d4bb0 - 000d53af [ 2048] */,
- 0x6a9d87ff /* 000d53b0 - 000d5baf [ 2048] */,
- 0x6add87ff /* 000d5bb0 - 000d63af [ 2048] */,
- 0x6b1d87ff /* 000d63b0 - 000d6baf [ 2048] */,
- 0x6b5d87ff /* 000d6bb0 - 000d73af [ 2048] */,
- 0x6b9d87ff /* 000d73b0 - 000d7baf [ 2048] */,
- 0x6bdd87ff /* 000d7bb0 - 000d83af [ 2048] */,
- 0x6c1d87ff /* 000d83b0 - 000d8baf [ 2048] */,
- 0x6c5d87ff /* 000d8bb0 - 000d93af [ 2048] */,
- 0x6c9d87ff /* 000d93b0 - 000d9baf [ 2048] */,
- 0x6cdd87ff /* 000d9bb0 - 000da3af [ 2048] */,
- 0x6d1d87ff /* 000da3b0 - 000dabaf [ 2048] */,
- 0x6d5d87ff /* 000dabb0 - 000db3af [ 2048] */,
- 0x6d9d87ff /* 000db3b0 - 000dbbaf [ 2048] */,
- 0x6ddd87ff /* 000dbbb0 - 000dc3af [ 2048] */,
- 0x6e1d87ff /* 000dc3b0 - 000dcbaf [ 2048] */,
- 0x6e5d87ff /* 000dcbb0 - 000dd3af [ 2048] */,
- 0x6e9d87ff /* 000dd3b0 - 000ddbaf [ 2048] */,
- 0x6edd87ff /* 000ddbb0 - 000de3af [ 2048] */,
- 0x6f1d87ff /* 000de3b0 - 000debaf [ 2048] */,
- 0x6f5d87ff /* 000debb0 - 000df3af [ 2048] */,
- 0x6f9d87ff /* 000df3b0 - 000dfbaf [ 2048] */,
- 0x6fdd854f /* 000dfbb0 - 000e00ff [ 1360] */};
+ 0x001fc021 /* 0000007f - 000000a0 [ 34] */,
+ 0x002b4000 /* 000000ad - 000000ad [ 1] */,
+ 0x00de0001 /* 00000378 - 00000379 [ 2] */,
+ 0x00e00003 /* 00000380 - 00000383 [ 4] */,
+ 0x00e2c000 /* 0000038b - 0000038b [ 1] */,
+ 0x00e34000 /* 0000038d - 0000038d [ 1] */,
+ 0x00e88000 /* 000003a2 - 000003a2 [ 1] */,
+ 0x014c0000 /* 00000530 - 00000530 [ 1] */,
+ 0x0155c001 /* 00000557 - 00000558 [ 2] */,
+ 0x0162c001 /* 0000058b - 0000058c [ 2] */,
+ 0x01640000 /* 00000590 - 00000590 [ 1] */,
+ 0x01720007 /* 000005c8 - 000005cf [ 8] */,
+ 0x017ac003 /* 000005eb - 000005ee [ 4] */,
+ 0x017d4010 /* 000005f5 - 00000605 [ 17] */,
+ 0x01870000 /* 0000061c - 0000061c [ 1] */,
+ 0x01b74000 /* 000006dd - 000006dd [ 1] */,
+ 0x01c38001 /* 0000070e - 0000070f [ 2] */,
+ 0x01d2c001 /* 0000074b - 0000074c [ 2] */,
+ 0x01ec800d /* 000007b2 - 000007bf [ 14] */,
+ 0x01fec001 /* 000007fb - 000007fc [ 2] */,
+ 0x020b8001 /* 0000082e - 0000082f [ 2] */,
+ 0x020fc000 /* 0000083f - 0000083f [ 1] */,
+ 0x02170001 /* 0000085c - 0000085d [ 2] */,
+ 0x0217c000 /* 0000085f - 0000085f [ 1] */,
+ 0x021ac004 /* 0000086b - 0000086f [ 5] */,
+ 0x0223c008 /* 0000088f - 00000897 [ 9] */,
+ 0x02388000 /* 000008e2 - 000008e2 [ 1] */,
+ 0x02610000 /* 00000984 - 00000984 [ 1] */,
+ 0x02634001 /* 0000098d - 0000098e [ 2] */,
+ 0x02644001 /* 00000991 - 00000992 [ 2] */,
+ 0x026a4000 /* 000009a9 - 000009a9 [ 1] */,
+ 0x026c4000 /* 000009b1 - 000009b1 [ 1] */,
+ 0x026cc002 /* 000009b3 - 000009b5 [ 3] */,
+ 0x026e8001 /* 000009ba - 000009bb [ 2] */,
+ 0x02714001 /* 000009c5 - 000009c6 [ 2] */,
+ 0x02724001 /* 000009c9 - 000009ca [ 2] */,
+ 0x0273c007 /* 000009cf - 000009d6 [ 8] */,
+ 0x02760003 /* 000009d8 - 000009db [ 4] */,
+ 0x02778000 /* 000009de - 000009de [ 1] */,
+ 0x02790001 /* 000009e4 - 000009e5 [ 2] */,
+ 0x027fc001 /* 000009ff - 00000a00 [ 2] */,
+ 0x02810000 /* 00000a04 - 00000a04 [ 1] */,
+ 0x0282c003 /* 00000a0b - 00000a0e [ 4] */,
+ 0x02844001 /* 00000a11 - 00000a12 [ 2] */,
+ 0x028a4000 /* 00000a29 - 00000a29 [ 1] */,
+ 0x028c4000 /* 00000a31 - 00000a31 [ 1] */,
+ 0x028d0000 /* 00000a34 - 00000a34 [ 1] */,
+ 0x028dc000 /* 00000a37 - 00000a37 [ 1] */,
+ 0x028e8001 /* 00000a3a - 00000a3b [ 2] */,
+ 0x028f4000 /* 00000a3d - 00000a3d [ 1] */,
+ 0x0290c003 /* 00000a43 - 00000a46 [ 4] */,
+ 0x02924001 /* 00000a49 - 00000a4a [ 2] */,
+ 0x02938002 /* 00000a4e - 00000a50 [ 3] */,
+ 0x02948006 /* 00000a52 - 00000a58 [ 7] */,
+ 0x02974000 /* 00000a5d - 00000a5d [ 1] */,
+ 0x0297c006 /* 00000a5f - 00000a65 [ 7] */,
+ 0x029dc009 /* 00000a77 - 00000a80 [ 10] */,
+ 0x02a10000 /* 00000a84 - 00000a84 [ 1] */,
+ 0x02a38000 /* 00000a8e - 00000a8e [ 1] */,
+ 0x02a48000 /* 00000a92 - 00000a92 [ 1] */,
+ 0x02aa4000 /* 00000aa9 - 00000aa9 [ 1] */,
+ 0x02ac4000 /* 00000ab1 - 00000ab1 [ 1] */,
+ 0x02ad0000 /* 00000ab4 - 00000ab4 [ 1] */,
+ 0x02ae8001 /* 00000aba - 00000abb [ 2] */,
+ 0x02b18000 /* 00000ac6 - 00000ac6 [ 1] */,
+ 0x02b28000 /* 00000aca - 00000aca [ 1] */,
+ 0x02b38001 /* 00000ace - 00000acf [ 2] */,
+ 0x02b4400e /* 00000ad1 - 00000adf [ 15] */,
+ 0x02b90001 /* 00000ae4 - 00000ae5 [ 2] */,
+ 0x02bc8006 /* 00000af2 - 00000af8 [ 7] */,
+ 0x02c00000 /* 00000b00 - 00000b00 [ 1] */,
+ 0x02c10000 /* 00000b04 - 00000b04 [ 1] */,
+ 0x02c34001 /* 00000b0d - 00000b0e [ 2] */,
+ 0x02c44001 /* 00000b11 - 00000b12 [ 2] */,
+ 0x02ca4000 /* 00000b29 - 00000b29 [ 1] */,
+ 0x02cc4000 /* 00000b31 - 00000b31 [ 1] */,
+ 0x02cd0000 /* 00000b34 - 00000b34 [ 1] */,
+ 0x02ce8001 /* 00000b3a - 00000b3b [ 2] */,
+ 0x02d14001 /* 00000b45 - 00000b46 [ 2] */,
+ 0x02d24001 /* 00000b49 - 00000b4a [ 2] */,
+ 0x02d38006 /* 00000b4e - 00000b54 [ 7] */,
+ 0x02d60003 /* 00000b58 - 00000b5b [ 4] */,
+ 0x02d78000 /* 00000b5e - 00000b5e [ 1] */,
+ 0x02d90001 /* 00000b64 - 00000b65 [ 2] */,
+ 0x02de0009 /* 00000b78 - 00000b81 [ 10] */,
+ 0x02e10000 /* 00000b84 - 00000b84 [ 1] */,
+ 0x02e2c002 /* 00000b8b - 00000b8d [ 3] */,
+ 0x02e44000 /* 00000b91 - 00000b91 [ 1] */,
+ 0x02e58002 /* 00000b96 - 00000b98 [ 3] */,
+ 0x02e6c000 /* 00000b9b - 00000b9b [ 1] */,
+ 0x02e74000 /* 00000b9d - 00000b9d [ 1] */,
+ 0x02e80002 /* 00000ba0 - 00000ba2 [ 3] */,
+ 0x02e94002 /* 00000ba5 - 00000ba7 [ 3] */,
+ 0x02eac002 /* 00000bab - 00000bad [ 3] */,
+ 0x02ee8003 /* 00000bba - 00000bbd [ 4] */,
+ 0x02f0c002 /* 00000bc3 - 00000bc5 [ 3] */,
+ 0x02f24000 /* 00000bc9 - 00000bc9 [ 1] */,
+ 0x02f38001 /* 00000bce - 00000bcf [ 2] */,
+ 0x02f44005 /* 00000bd1 - 00000bd6 [ 6] */,
+ 0x02f6000d /* 00000bd8 - 00000be5 [ 14] */,
+ 0x02fec004 /* 00000bfb - 00000bff [ 5] */,
+ 0x03034000 /* 00000c0d - 00000c0d [ 1] */,
+ 0x03044000 /* 00000c11 - 00000c11 [ 1] */,
+ 0x030a4000 /* 00000c29 - 00000c29 [ 1] */,
+ 0x030e8001 /* 00000c3a - 00000c3b [ 2] */,
+ 0x03114000 /* 00000c45 - 00000c45 [ 1] */,
+ 0x03124000 /* 00000c49 - 00000c49 [ 1] */,
+ 0x03138006 /* 00000c4e - 00000c54 [ 7] */,
+ 0x0315c000 /* 00000c57 - 00000c57 [ 1] */,
+ 0x0316c001 /* 00000c5b - 00000c5c [ 2] */,
+ 0x03178001 /* 00000c5e - 00000c5f [ 2] */,
+ 0x03190001 /* 00000c64 - 00000c65 [ 2] */,
+ 0x031c0006 /* 00000c70 - 00000c76 [ 7] */,
+ 0x03234000 /* 00000c8d - 00000c8d [ 1] */,
+ 0x03244000 /* 00000c91 - 00000c91 [ 1] */,
+ 0x032a4000 /* 00000ca9 - 00000ca9 [ 1] */,
+ 0x032d0000 /* 00000cb4 - 00000cb4 [ 1] */,
+ 0x032e8001 /* 00000cba - 00000cbb [ 2] */,
+ 0x03314000 /* 00000cc5 - 00000cc5 [ 1] */,
+ 0x03324000 /* 00000cc9 - 00000cc9 [ 1] */,
+ 0x03338006 /* 00000cce - 00000cd4 [ 7] */,
+ 0x0335c005 /* 00000cd7 - 00000cdc [ 6] */,
+ 0x0337c000 /* 00000cdf - 00000cdf [ 1] */,
+ 0x03390001 /* 00000ce4 - 00000ce5 [ 2] */,
+ 0x033c0000 /* 00000cf0 - 00000cf0 [ 1] */,
+ 0x033d000b /* 00000cf4 - 00000cff [ 12] */,
+ 0x03434000 /* 00000d0d - 00000d0d [ 1] */,
+ 0x03444000 /* 00000d11 - 00000d11 [ 1] */,
+ 0x03514000 /* 00000d45 - 00000d45 [ 1] */,
+ 0x03524000 /* 00000d49 - 00000d49 [ 1] */,
+ 0x03540003 /* 00000d50 - 00000d53 [ 4] */,
+ 0x03590001 /* 00000d64 - 00000d65 [ 2] */,
+ 0x03600000 /* 00000d80 - 00000d80 [ 1] */,
+ 0x03610000 /* 00000d84 - 00000d84 [ 1] */,
+ 0x0365c002 /* 00000d97 - 00000d99 [ 3] */,
+ 0x036c8000 /* 00000db2 - 00000db2 [ 1] */,
+ 0x036f0000 /* 00000dbc - 00000dbc [ 1] */,
+ 0x036f8001 /* 00000dbe - 00000dbf [ 2] */,
+ 0x0371c002 /* 00000dc7 - 00000dc9 [ 3] */,
+ 0x0372c003 /* 00000dcb - 00000dce [ 4] */,
+ 0x03754000 /* 00000dd5 - 00000dd5 [ 1] */,
+ 0x0375c000 /* 00000dd7 - 00000dd7 [ 1] */,
+ 0x03780005 /* 00000de0 - 00000de5 [ 6] */,
+ 0x037c0001 /* 00000df0 - 00000df1 [ 2] */,
+ 0x037d400b /* 00000df5 - 00000e00 [ 12] */,
+ 0x038ec003 /* 00000e3b - 00000e3e [ 4] */,
+ 0x03970024 /* 00000e5c - 00000e80 [ 37] */,
+ 0x03a0c000 /* 00000e83 - 00000e83 [ 1] */,
+ 0x03a14000 /* 00000e85 - 00000e85 [ 1] */,
+ 0x03a2c000 /* 00000e8b - 00000e8b [ 1] */,
+ 0x03a90000 /* 00000ea4 - 00000ea4 [ 1] */,
+ 0x03a98000 /* 00000ea6 - 00000ea6 [ 1] */,
+ 0x03af8001 /* 00000ebe - 00000ebf [ 2] */,
+ 0x03b14000 /* 00000ec5 - 00000ec5 [ 1] */,
+ 0x03b1c000 /* 00000ec7 - 00000ec7 [ 1] */,
+ 0x03b3c000 /* 00000ecf - 00000ecf [ 1] */,
+ 0x03b68001 /* 00000eda - 00000edb [ 2] */,
+ 0x03b8001f /* 00000ee0 - 00000eff [ 32] */,
+ 0x03d20000 /* 00000f48 - 00000f48 [ 1] */,
+ 0x03db4003 /* 00000f6d - 00000f70 [ 4] */,
+ 0x03e60000 /* 00000f98 - 00000f98 [ 1] */,
+ 0x03ef4000 /* 00000fbd - 00000fbd [ 1] */,
+ 0x03f34000 /* 00000fcd - 00000fcd [ 1] */,
+ 0x03f6c024 /* 00000fdb - 00000fff [ 37] */,
+ 0x04318000 /* 000010c6 - 000010c6 [ 1] */,
+ 0x04320004 /* 000010c8 - 000010cc [ 5] */,
+ 0x04338001 /* 000010ce - 000010cf [ 2] */,
+ 0x04924000 /* 00001249 - 00001249 [ 1] */,
+ 0x04938001 /* 0000124e - 0000124f [ 2] */,
+ 0x0495c000 /* 00001257 - 00001257 [ 1] */,
+ 0x04964000 /* 00001259 - 00001259 [ 1] */,
+ 0x04978001 /* 0000125e - 0000125f [ 2] */,
+ 0x04a24000 /* 00001289 - 00001289 [ 1] */,
+ 0x04a38001 /* 0000128e - 0000128f [ 2] */,
+ 0x04ac4000 /* 000012b1 - 000012b1 [ 1] */,
+ 0x04ad8001 /* 000012b6 - 000012b7 [ 2] */,
+ 0x04afc000 /* 000012bf - 000012bf [ 1] */,
+ 0x04b04000 /* 000012c1 - 000012c1 [ 1] */,
+ 0x04b18001 /* 000012c6 - 000012c7 [ 2] */,
+ 0x04b5c000 /* 000012d7 - 000012d7 [ 1] */,
+ 0x04c44000 /* 00001311 - 00001311 [ 1] */,
+ 0x04c58001 /* 00001316 - 00001317 [ 2] */,
+ 0x04d6c001 /* 0000135b - 0000135c [ 2] */,
+ 0x04df4002 /* 0000137d - 0000137f [ 3] */,
+ 0x04e68005 /* 0000139a - 0000139f [ 6] */,
+ 0x04fd8001 /* 000013f6 - 000013f7 [ 2] */,
+ 0x04ff8001 /* 000013fe - 000013ff [ 2] */,
+ 0x05a00000 /* 00001680 - 00001680 [ 1] */,
+ 0x05a74002 /* 0000169d - 0000169f [ 3] */,
+ 0x05be4006 /* 000016f9 - 000016ff [ 7] */,
+ 0x05c58008 /* 00001716 - 0000171e [ 9] */,
+ 0x05cdc008 /* 00001737 - 0000173f [ 9] */,
+ 0x05d5000b /* 00001754 - 0000175f [ 12] */,
+ 0x05db4000 /* 0000176d - 0000176d [ 1] */,
+ 0x05dc4000 /* 00001771 - 00001771 [ 1] */,
+ 0x05dd000b /* 00001774 - 0000177f [ 12] */,
+ 0x05f78001 /* 000017de - 000017df [ 2] */,
+ 0x05fa8005 /* 000017ea - 000017ef [ 6] */,
+ 0x05fe8005 /* 000017fa - 000017ff [ 6] */,
+ 0x06038000 /* 0000180e - 0000180e [ 1] */,
+ 0x06068005 /* 0000181a - 0000181f [ 6] */,
+ 0x061e4006 /* 00001879 - 0000187f [ 7] */,
+ 0x062ac004 /* 000018ab - 000018af [ 5] */,
+ 0x063d8009 /* 000018f6 - 000018ff [ 10] */,
+ 0x0647c000 /* 0000191f - 0000191f [ 1] */,
+ 0x064b0003 /* 0000192c - 0000192f [ 4] */,
+ 0x064f0003 /* 0000193c - 0000193f [ 4] */,
+ 0x06504002 /* 00001941 - 00001943 [ 3] */,
+ 0x065b8001 /* 0000196e - 0000196f [ 2] */,
+ 0x065d400a /* 00001975 - 0000197f [ 11] */,
+ 0x066b0003 /* 000019ac - 000019af [ 4] */,
+ 0x06728005 /* 000019ca - 000019cf [ 6] */,
+ 0x0676c002 /* 000019db - 000019dd [ 3] */,
+ 0x06870001 /* 00001a1c - 00001a1d [ 2] */,
+ 0x0697c000 /* 00001a5f - 00001a5f [ 1] */,
+ 0x069f4001 /* 00001a7d - 00001a7e [ 2] */,
+ 0x06a28005 /* 00001a8a - 00001a8f [ 6] */,
+ 0x06a68005 /* 00001a9a - 00001a9f [ 6] */,
+ 0x06ab8001 /* 00001aae - 00001aaf [ 2] */,
+ 0x06b3c030 /* 00001acf - 00001aff [ 49] */,
+ 0x06d34002 /* 00001b4d - 00001b4f [ 3] */,
+ 0x06dfc000 /* 00001b7f - 00001b7f [ 1] */,
+ 0x06fd0007 /* 00001bf4 - 00001bfb [ 8] */,
+ 0x070e0002 /* 00001c38 - 00001c3a [ 3] */,
+ 0x07128002 /* 00001c4a - 00001c4c [ 3] */,
+ 0x07224006 /* 00001c89 - 00001c8f [ 7] */,
+ 0x072ec001 /* 00001cbb - 00001cbc [ 2] */,
+ 0x07320007 /* 00001cc8 - 00001ccf [ 8] */,
+ 0x073ec004 /* 00001cfb - 00001cff [ 5] */,
+ 0x07c58001 /* 00001f16 - 00001f17 [ 2] */,
+ 0x07c78001 /* 00001f1e - 00001f1f [ 2] */,
+ 0x07d18001 /* 00001f46 - 00001f47 [ 2] */,
+ 0x07d38001 /* 00001f4e - 00001f4f [ 2] */,
+ 0x07d60000 /* 00001f58 - 00001f58 [ 1] */,
+ 0x07d68000 /* 00001f5a - 00001f5a [ 1] */,
+ 0x07d70000 /* 00001f5c - 00001f5c [ 1] */,
+ 0x07d78000 /* 00001f5e - 00001f5e [ 1] */,
+ 0x07df8001 /* 00001f7e - 00001f7f [ 2] */,
+ 0x07ed4000 /* 00001fb5 - 00001fb5 [ 1] */,
+ 0x07f14000 /* 00001fc5 - 00001fc5 [ 1] */,
+ 0x07f50001 /* 00001fd4 - 00001fd5 [ 2] */,
+ 0x07f70000 /* 00001fdc - 00001fdc [ 1] */,
+ 0x07fc0001 /* 00001ff0 - 00001ff1 [ 2] */,
+ 0x07fd4000 /* 00001ff5 - 00001ff5 [ 1] */,
+ 0x07ffc010 /* 00001fff - 0000200f [ 17] */,
+ 0x080a0007 /* 00002028 - 0000202f [ 8] */,
+ 0x0817c010 /* 0000205f - 0000206f [ 17] */,
+ 0x081c8001 /* 00002072 - 00002073 [ 2] */,
+ 0x0823c000 /* 0000208f - 0000208f [ 1] */,
+ 0x08274002 /* 0000209d - 0000209f [ 3] */,
+ 0x0830400e /* 000020c1 - 000020cf [ 15] */,
+ 0x083c400e /* 000020f1 - 000020ff [ 15] */,
+ 0x08630003 /* 0000218c - 0000218f [ 4] */,
+ 0x0909c018 /* 00002427 - 0000243f [ 25] */,
+ 0x0912c014 /* 0000244b - 0000245f [ 21] */,
+ 0x0add0001 /* 00002b74 - 00002b75 [ 2] */,
+ 0x0ae58000 /* 00002b96 - 00002b96 [ 1] */,
+ 0x0b3d0004 /* 00002cf4 - 00002cf8 [ 5] */,
+ 0x0b498000 /* 00002d26 - 00002d26 [ 1] */,
+ 0x0b4a0004 /* 00002d28 - 00002d2c [ 5] */,
+ 0x0b4b8001 /* 00002d2e - 00002d2f [ 2] */,
+ 0x0b5a0006 /* 00002d68 - 00002d6e [ 7] */,
+ 0x0b5c400d /* 00002d71 - 00002d7e [ 14] */,
+ 0x0b65c008 /* 00002d97 - 00002d9f [ 9] */,
+ 0x0b69c000 /* 00002da7 - 00002da7 [ 1] */,
+ 0x0b6bc000 /* 00002daf - 00002daf [ 1] */,
+ 0x0b6dc000 /* 00002db7 - 00002db7 [ 1] */,
+ 0x0b6fc000 /* 00002dbf - 00002dbf [ 1] */,
+ 0x0b71c000 /* 00002dc7 - 00002dc7 [ 1] */,
+ 0x0b73c000 /* 00002dcf - 00002dcf [ 1] */,
+ 0x0b75c000 /* 00002dd7 - 00002dd7 [ 1] */,
+ 0x0b77c000 /* 00002ddf - 00002ddf [ 1] */,
+ 0x0b978021 /* 00002e5e - 00002e7f [ 34] */,
+ 0x0ba68000 /* 00002e9a - 00002e9a [ 1] */,
+ 0x0bbd000b /* 00002ef4 - 00002eff [ 12] */,
+ 0x0bf58019 /* 00002fd6 - 00002fef [ 26] */,
+ 0x0c000000 /* 00003000 - 00003000 [ 1] */,
+ 0x0c100000 /* 00003040 - 00003040 [ 1] */,
+ 0x0c25c001 /* 00003097 - 00003098 [ 2] */,
+ 0x0c400004 /* 00003100 - 00003104 [ 5] */,
+ 0x0c4c0000 /* 00003130 - 00003130 [ 1] */,
+ 0x0c63c000 /* 0000318f - 0000318f [ 1] */,
+ 0x0c79000a /* 000031e4 - 000031ee [ 11] */,
+ 0x0c87c000 /* 0000321f - 0000321f [ 1] */,
+ 0x29234002 /* 0000a48d - 0000a48f [ 3] */,
+ 0x2931c008 /* 0000a4c7 - 0000a4cf [ 9] */,
+ 0x298b0013 /* 0000a62c - 0000a63f [ 20] */,
+ 0x29be0007 /* 0000a6f8 - 0000a6ff [ 8] */,
+ 0x29f2c004 /* 0000a7cb - 0000a7cf [ 5] */,
+ 0x29f48000 /* 0000a7d2 - 0000a7d2 [ 1] */,
+ 0x29f50000 /* 0000a7d4 - 0000a7d4 [ 1] */,
+ 0x29f68017 /* 0000a7da - 0000a7f1 [ 24] */,
+ 0x2a0b4002 /* 0000a82d - 0000a82f [ 3] */,
+ 0x2a0e8005 /* 0000a83a - 0000a83f [ 6] */,
+ 0x2a1e0007 /* 0000a878 - 0000a87f [ 8] */,
+ 0x2a318007 /* 0000a8c6 - 0000a8cd [ 8] */,
+ 0x2a368005 /* 0000a8da - 0000a8df [ 6] */,
+ 0x2a55000a /* 0000a954 - 0000a95e [ 11] */,
+ 0x2a5f4002 /* 0000a97d - 0000a97f [ 3] */,
+ 0x2a738000 /* 0000a9ce - 0000a9ce [ 1] */,
+ 0x2a768003 /* 0000a9da - 0000a9dd [ 4] */,
+ 0x2a7fc000 /* 0000a9ff - 0000a9ff [ 1] */,
+ 0x2a8dc008 /* 0000aa37 - 0000aa3f [ 9] */,
+ 0x2a938001 /* 0000aa4e - 0000aa4f [ 2] */,
+ 0x2a968001 /* 0000aa5a - 0000aa5b [ 2] */,
+ 0x2ab0c017 /* 0000aac3 - 0000aada [ 24] */,
+ 0x2abdc009 /* 0000aaf7 - 0000ab00 [ 10] */,
+ 0x2ac1c001 /* 0000ab07 - 0000ab08 [ 2] */,
+ 0x2ac3c001 /* 0000ab0f - 0000ab10 [ 2] */,
+ 0x2ac5c008 /* 0000ab17 - 0000ab1f [ 9] */,
+ 0x2ac9c000 /* 0000ab27 - 0000ab27 [ 1] */,
+ 0x2acbc000 /* 0000ab2f - 0000ab2f [ 1] */,
+ 0x2adb0003 /* 0000ab6c - 0000ab6f [ 4] */,
+ 0x2afb8001 /* 0000abee - 0000abef [ 2] */,
+ 0x2afe8005 /* 0000abfa - 0000abff [ 6] */,
+ 0x35e9000b /* 0000d7a4 - 0000d7af [ 12] */,
+ 0x35f1c003 /* 0000d7c7 - 0000d7ca [ 4] */,
+ 0x35ff2103 /* 0000d7fc - 0000f8ff [ 8452] */,
+ 0x3e9b8001 /* 0000fa6e - 0000fa6f [ 2] */,
+ 0x3eb68025 /* 0000fada - 0000faff [ 38] */,
+ 0x3ec1c00b /* 0000fb07 - 0000fb12 [ 12] */,
+ 0x3ec60004 /* 0000fb18 - 0000fb1c [ 5] */,
+ 0x3ecdc000 /* 0000fb37 - 0000fb37 [ 1] */,
+ 0x3ecf4000 /* 0000fb3d - 0000fb3d [ 1] */,
+ 0x3ecfc000 /* 0000fb3f - 0000fb3f [ 1] */,
+ 0x3ed08000 /* 0000fb42 - 0000fb42 [ 1] */,
+ 0x3ed14000 /* 0000fb45 - 0000fb45 [ 1] */,
+ 0x3ef0c00f /* 0000fbc3 - 0000fbd2 [ 16] */,
+ 0x3f640001 /* 0000fd90 - 0000fd91 [ 2] */,
+ 0x3f720006 /* 0000fdc8 - 0000fdce [ 7] */,
+ 0x3f74001f /* 0000fdd0 - 0000fdef [ 32] */,
+ 0x3f868005 /* 0000fe1a - 0000fe1f [ 6] */,
+ 0x3f94c000 /* 0000fe53 - 0000fe53 [ 1] */,
+ 0x3f99c000 /* 0000fe67 - 0000fe67 [ 1] */,
+ 0x3f9b0003 /* 0000fe6c - 0000fe6f [ 4] */,
+ 0x3f9d4000 /* 0000fe75 - 0000fe75 [ 1] */,
+ 0x3fbf4003 /* 0000fefd - 0000ff00 [ 4] */,
+ 0x3fefc002 /* 0000ffbf - 0000ffc1 [ 3] */,
+ 0x3ff20001 /* 0000ffc8 - 0000ffc9 [ 2] */,
+ 0x3ff40001 /* 0000ffd0 - 0000ffd1 [ 2] */,
+ 0x3ff60001 /* 0000ffd8 - 0000ffd9 [ 2] */,
+ 0x3ff74002 /* 0000ffdd - 0000ffdf [ 3] */,
+ 0x3ff9c000 /* 0000ffe7 - 0000ffe7 [ 1] */,
+ 0x3ffbc00c /* 0000ffef - 0000fffb [ 13] */,
+ 0x3fff8001 /* 0000fffe - 0000ffff [ 2] */,
+ 0x40030000 /* 0001000c - 0001000c [ 1] */,
+ 0x4009c000 /* 00010027 - 00010027 [ 1] */,
+ 0x400ec000 /* 0001003b - 0001003b [ 1] */,
+ 0x400f8000 /* 0001003e - 0001003e [ 1] */,
+ 0x40138001 /* 0001004e - 0001004f [ 2] */,
+ 0x40178021 /* 0001005e - 0001007f [ 34] */,
+ 0x403ec004 /* 000100fb - 000100ff [ 5] */,
+ 0x4040c003 /* 00010103 - 00010106 [ 4] */,
+ 0x404d0002 /* 00010134 - 00010136 [ 3] */,
+ 0x4063c000 /* 0001018f - 0001018f [ 1] */,
+ 0x40674002 /* 0001019d - 0001019f [ 3] */,
+ 0x4068402e /* 000101a1 - 000101cf [ 47] */,
+ 0x407f8081 /* 000101fe - 0001027f [ 130] */,
+ 0x40a74002 /* 0001029d - 0001029f [ 3] */,
+ 0x40b4400e /* 000102d1 - 000102df [ 15] */,
+ 0x40bf0003 /* 000102fc - 000102ff [ 4] */,
+ 0x40c90008 /* 00010324 - 0001032c [ 9] */,
+ 0x40d2c004 /* 0001034b - 0001034f [ 5] */,
+ 0x40dec004 /* 0001037b - 0001037f [ 5] */,
+ 0x40e78000 /* 0001039e - 0001039e [ 1] */,
+ 0x40f10003 /* 000103c4 - 000103c7 [ 4] */,
+ 0x40f58029 /* 000103d6 - 000103ff [ 42] */,
+ 0x41278001 /* 0001049e - 0001049f [ 2] */,
+ 0x412a8005 /* 000104aa - 000104af [ 6] */,
+ 0x41350003 /* 000104d4 - 000104d7 [ 4] */,
+ 0x413f0003 /* 000104fc - 000104ff [ 4] */,
+ 0x414a0007 /* 00010528 - 0001052f [ 8] */,
+ 0x4159000a /* 00010564 - 0001056e [ 11] */,
+ 0x415ec000 /* 0001057b - 0001057b [ 1] */,
+ 0x4162c000 /* 0001058b - 0001058b [ 1] */,
+ 0x4164c000 /* 00010593 - 00010593 [ 1] */,
+ 0x41658000 /* 00010596 - 00010596 [ 1] */,
+ 0x41688000 /* 000105a2 - 000105a2 [ 1] */,
+ 0x416c8000 /* 000105b2 - 000105b2 [ 1] */,
+ 0x416e8000 /* 000105ba - 000105ba [ 1] */,
+ 0x416f4042 /* 000105bd - 000105ff [ 67] */,
+ 0x41cdc008 /* 00010737 - 0001073f [ 9] */,
+ 0x41d58009 /* 00010756 - 0001075f [ 10] */,
+ 0x41da0017 /* 00010768 - 0001077f [ 24] */,
+ 0x41e18000 /* 00010786 - 00010786 [ 1] */,
+ 0x41ec4000 /* 000107b1 - 000107b1 [ 1] */,
+ 0x41eec044 /* 000107bb - 000107ff [ 69] */,
+ 0x42018001 /* 00010806 - 00010807 [ 2] */,
+ 0x42024000 /* 00010809 - 00010809 [ 1] */,
+ 0x420d8000 /* 00010836 - 00010836 [ 1] */,
+ 0x420e4002 /* 00010839 - 0001083b [ 3] */,
+ 0x420f4001 /* 0001083d - 0001083e [ 2] */,
+ 0x42158000 /* 00010856 - 00010856 [ 1] */,
+ 0x4227c007 /* 0001089f - 000108a6 [ 8] */,
+ 0x422c002f /* 000108b0 - 000108df [ 48] */,
+ 0x423cc000 /* 000108f3 - 000108f3 [ 1] */,
+ 0x423d8004 /* 000108f6 - 000108fa [ 5] */,
+ 0x42470002 /* 0001091c - 0001091e [ 3] */,
+ 0x424e8004 /* 0001093a - 0001093e [ 5] */,
+ 0x4250003f /* 00010940 - 0001097f [ 64] */,
+ 0x426e0003 /* 000109b8 - 000109bb [ 4] */,
+ 0x42740001 /* 000109d0 - 000109d1 [ 2] */,
+ 0x42810000 /* 00010a04 - 00010a04 [ 1] */,
+ 0x4281c004 /* 00010a07 - 00010a0b [ 5] */,
+ 0x42850000 /* 00010a14 - 00010a14 [ 1] */,
+ 0x42860000 /* 00010a18 - 00010a18 [ 1] */,
+ 0x428d8001 /* 00010a36 - 00010a37 [ 2] */,
+ 0x428ec003 /* 00010a3b - 00010a3e [ 4] */,
+ 0x42924006 /* 00010a49 - 00010a4f [ 7] */,
+ 0x42964006 /* 00010a59 - 00010a5f [ 7] */,
+ 0x42a8001f /* 00010aa0 - 00010abf [ 32] */,
+ 0x42b9c003 /* 00010ae7 - 00010aea [ 4] */,
+ 0x42bdc008 /* 00010af7 - 00010aff [ 9] */,
+ 0x42cd8002 /* 00010b36 - 00010b38 [ 3] */,
+ 0x42d58001 /* 00010b56 - 00010b57 [ 2] */,
+ 0x42dcc004 /* 00010b73 - 00010b77 [ 5] */,
+ 0x42e48006 /* 00010b92 - 00010b98 [ 7] */,
+ 0x42e7400b /* 00010b9d - 00010ba8 [ 12] */,
+ 0x42ec004f /* 00010bb0 - 00010bff [ 80] */,
+ 0x43124036 /* 00010c49 - 00010c7f [ 55] */,
+ 0x432cc00c /* 00010cb3 - 00010cbf [ 13] */,
+ 0x433cc006 /* 00010cf3 - 00010cf9 [ 7] */,
+ 0x434a0007 /* 00010d28 - 00010d2f [ 8] */,
+ 0x434e8125 /* 00010d3a - 00010e5f [ 294] */,
+ 0x439fc000 /* 00010e7f - 00010e7f [ 1] */,
+ 0x43aa8000 /* 00010eaa - 00010eaa [ 1] */,
+ 0x43ab8001 /* 00010eae - 00010eaf [ 2] */,
+ 0x43ac804a /* 00010eb2 - 00010efc [ 75] */,
+ 0x43ca0007 /* 00010f28 - 00010f2f [ 8] */,
+ 0x43d68015 /* 00010f5a - 00010f6f [ 22] */,
+ 0x43e28025 /* 00010f8a - 00010faf [ 38] */,
+ 0x43f30013 /* 00010fcc - 00010fdf [ 20] */,
+ 0x43fdc008 /* 00010ff7 - 00010fff [ 9] */,
+ 0x44138003 /* 0001104e - 00011051 [ 4] */,
+ 0x441d8008 /* 00011076 - 0001107e [ 9] */,
+ 0x442f4000 /* 000110bd - 000110bd [ 1] */,
+ 0x4430c00c /* 000110c3 - 000110cf [ 13] */,
+ 0x443a4006 /* 000110e9 - 000110ef [ 7] */,
+ 0x443e8005 /* 000110fa - 000110ff [ 6] */,
+ 0x444d4000 /* 00011135 - 00011135 [ 1] */,
+ 0x44520007 /* 00011148 - 0001114f [ 8] */,
+ 0x445dc008 /* 00011177 - 0001117f [ 9] */,
+ 0x44780000 /* 000111e0 - 000111e0 [ 1] */,
+ 0x447d400a /* 000111f5 - 000111ff [ 11] */,
+ 0x44848000 /* 00011212 - 00011212 [ 1] */,
+ 0x4490803d /* 00011242 - 0001127f [ 62] */,
+ 0x44a1c000 /* 00011287 - 00011287 [ 1] */,
+ 0x44a24000 /* 00011289 - 00011289 [ 1] */,
+ 0x44a38000 /* 0001128e - 0001128e [ 1] */,
+ 0x44a78000 /* 0001129e - 0001129e [ 1] */,
+ 0x44aa8005 /* 000112aa - 000112af [ 6] */,
+ 0x44bac004 /* 000112eb - 000112ef [ 5] */,
+ 0x44be8005 /* 000112fa - 000112ff [ 6] */,
+ 0x44c10000 /* 00011304 - 00011304 [ 1] */,
+ 0x44c34001 /* 0001130d - 0001130e [ 2] */,
+ 0x44c44001 /* 00011311 - 00011312 [ 2] */,
+ 0x44ca4000 /* 00011329 - 00011329 [ 1] */,
+ 0x44cc4000 /* 00011331 - 00011331 [ 1] */,
+ 0x44cd0000 /* 00011334 - 00011334 [ 1] */,
+ 0x44ce8000 /* 0001133a - 0001133a [ 1] */,
+ 0x44d14001 /* 00011345 - 00011346 [ 2] */,
+ 0x44d24001 /* 00011349 - 0001134a [ 2] */,
+ 0x44d38001 /* 0001134e - 0001134f [ 2] */,
+ 0x44d44005 /* 00011351 - 00011356 [ 6] */,
+ 0x44d60004 /* 00011358 - 0001135c [ 5] */,
+ 0x44d90001 /* 00011364 - 00011365 [ 2] */,
+ 0x44db4002 /* 0001136d - 0001136f [ 3] */,
+ 0x44dd408a /* 00011375 - 000113ff [ 139] */,
+ 0x45170000 /* 0001145c - 0001145c [ 1] */,
+ 0x4518801d /* 00011462 - 0001147f [ 30] */,
+ 0x45320007 /* 000114c8 - 000114cf [ 8] */,
+ 0x453680a5 /* 000114da - 0001157f [ 166] */,
+ 0x456d8001 /* 000115b6 - 000115b7 [ 2] */,
+ 0x45778021 /* 000115de - 000115ff [ 34] */,
+ 0x4591400a /* 00011645 - 0001164f [ 11] */,
+ 0x45968005 /* 0001165a - 0001165f [ 6] */,
+ 0x459b4012 /* 0001166d - 0001167f [ 19] */,
+ 0x45ae8005 /* 000116ba - 000116bf [ 6] */,
+ 0x45b28035 /* 000116ca - 000116ff [ 54] */,
+ 0x45c6c001 /* 0001171b - 0001171c [ 2] */,
+ 0x45cb0003 /* 0001172c - 0001172f [ 4] */,
+ 0x45d1c0b8 /* 00011747 - 000117ff [ 185] */,
+ 0x460f0063 /* 0001183c - 0001189f [ 100] */,
+ 0x463cc00b /* 000118f3 - 000118fe [ 12] */,
+ 0x4641c001 /* 00011907 - 00011908 [ 2] */,
+ 0x46428001 /* 0001190a - 0001190b [ 2] */,
+ 0x46450000 /* 00011914 - 00011914 [ 1] */,
+ 0x4645c000 /* 00011917 - 00011917 [ 1] */,
+ 0x464d8000 /* 00011936 - 00011936 [ 1] */,
+ 0x464e4001 /* 00011939 - 0001193a [ 2] */,
+ 0x4651c008 /* 00011947 - 0001194f [ 9] */,
+ 0x46568045 /* 0001195a - 0001199f [ 70] */,
+ 0x466a0001 /* 000119a8 - 000119a9 [ 2] */,
+ 0x46760001 /* 000119d8 - 000119d9 [ 2] */,
+ 0x4679401a /* 000119e5 - 000119ff [ 27] */,
+ 0x46920007 /* 00011a48 - 00011a4f [ 8] */,
+ 0x46a8c00c /* 00011aa3 - 00011aaf [ 13] */,
+ 0x46be4006 /* 00011af9 - 00011aff [ 7] */,
+ 0x46c280f5 /* 00011b0a - 00011bff [ 246] */,
+ 0x47024000 /* 00011c09 - 00011c09 [ 1] */,
+ 0x470dc000 /* 00011c37 - 00011c37 [ 1] */,
+ 0x47118009 /* 00011c46 - 00011c4f [ 10] */,
+ 0x471b4002 /* 00011c6d - 00011c6f [ 3] */,
+ 0x47240001 /* 00011c90 - 00011c91 [ 2] */,
+ 0x472a0000 /* 00011ca8 - 00011ca8 [ 1] */,
+ 0x472dc048 /* 00011cb7 - 00011cff [ 73] */,
+ 0x4741c000 /* 00011d07 - 00011d07 [ 1] */,
+ 0x47428000 /* 00011d0a - 00011d0a [ 1] */,
+ 0x474dc002 /* 00011d37 - 00011d39 [ 3] */,
+ 0x474ec000 /* 00011d3b - 00011d3b [ 1] */,
+ 0x474f8000 /* 00011d3e - 00011d3e [ 1] */,
+ 0x47520007 /* 00011d48 - 00011d4f [ 8] */,
+ 0x47568005 /* 00011d5a - 00011d5f [ 6] */,
+ 0x47598000 /* 00011d66 - 00011d66 [ 1] */,
+ 0x475a4000 /* 00011d69 - 00011d69 [ 1] */,
+ 0x4763c000 /* 00011d8f - 00011d8f [ 1] */,
+ 0x47648000 /* 00011d92 - 00011d92 [ 1] */,
+ 0x47664006 /* 00011d99 - 00011d9f [ 7] */,
+ 0x476a8135 /* 00011daa - 00011edf [ 310] */,
+ 0x47be4006 /* 00011ef9 - 00011eff [ 7] */,
+ 0x47c44000 /* 00011f11 - 00011f11 [ 1] */,
+ 0x47cec002 /* 00011f3b - 00011f3d [ 3] */,
+ 0x47d68055 /* 00011f5a - 00011faf [ 86] */,
+ 0x47ec400e /* 00011fb1 - 00011fbf [ 15] */,
+ 0x47fc800c /* 00011ff2 - 00011ffe [ 13] */,
+ 0x48e68065 /* 0001239a - 000123ff [ 102] */,
+ 0x491bc000 /* 0001246f - 0001246f [ 1] */,
+ 0x491d400a /* 00012475 - 0001247f [ 11] */,
+ 0x49510a4b /* 00012544 - 00012f8f [ 2636] */,
+ 0x4bfcc00c /* 00012ff3 - 00012fff [ 13] */,
+ 0x4d0c000f /* 00013430 - 0001343f [ 16] */,
+ 0x4d158fa9 /* 00013456 - 000143ff [ 4010] */,
+ 0x5191e1b8 /* 00014647 - 000167ff [ 8633] */,
+ 0x5a8e4006 /* 00016a39 - 00016a3f [ 7] */,
+ 0x5a97c000 /* 00016a5f - 00016a5f [ 1] */,
+ 0x5a9a8003 /* 00016a6a - 00016a6d [ 4] */,
+ 0x5aafc000 /* 00016abf - 00016abf [ 1] */,
+ 0x5ab28005 /* 00016aca - 00016acf [ 6] */,
+ 0x5abb8001 /* 00016aee - 00016aef [ 2] */,
+ 0x5abd8009 /* 00016af6 - 00016aff [ 10] */,
+ 0x5ad18009 /* 00016b46 - 00016b4f [ 10] */,
+ 0x5ad68000 /* 00016b5a - 00016b5a [ 1] */,
+ 0x5ad88000 /* 00016b62 - 00016b62 [ 1] */,
+ 0x5ade0004 /* 00016b78 - 00016b7c [ 5] */,
+ 0x5ae402af /* 00016b90 - 00016e3f [ 688] */,
+ 0x5ba6c064 /* 00016e9b - 00016eff [ 101] */,
+ 0x5bd2c003 /* 00016f4b - 00016f4e [ 4] */,
+ 0x5be20006 /* 00016f88 - 00016f8e [ 7] */,
+ 0x5be8003f /* 00016fa0 - 00016fdf [ 64] */,
+ 0x5bf9400a /* 00016fe5 - 00016fef [ 11] */,
+ 0x5bfc800d /* 00016ff2 - 00016fff [ 14] */,
+ 0x61fe0007 /* 000187f8 - 000187ff [ 8] */,
+ 0x63358029 /* 00018cd6 - 00018cff [ 42] */,
+ 0x634262e6 /* 00018d09 - 0001afef [ 8935] */,
+ 0x6bfd0000 /* 0001aff4 - 0001aff4 [ 1] */,
+ 0x6bff0000 /* 0001affc - 0001affc [ 1] */,
+ 0x6bffc000 /* 0001afff - 0001afff [ 1] */,
+ 0x6c48c00e /* 0001b123 - 0001b131 [ 15] */,
+ 0x6c4cc01c /* 0001b133 - 0001b14f [ 29] */,
+ 0x6c54c001 /* 0001b153 - 0001b154 [ 2] */,
+ 0x6c55800d /* 0001b156 - 0001b163 [ 14] */,
+ 0x6c5a0007 /* 0001b168 - 0001b16f [ 8] */,
+ 0x6cbf0903 /* 0001b2fc - 0001bbff [ 2308] */,
+ 0x6f1ac004 /* 0001bc6b - 0001bc6f [ 5] */,
+ 0x6f1f4002 /* 0001bc7d - 0001bc7f [ 3] */,
+ 0x6f224006 /* 0001bc89 - 0001bc8f [ 7] */,
+ 0x6f268001 /* 0001bc9a - 0001bc9b [ 2] */,
+ 0x6f28125f /* 0001bca0 - 0001ceff [ 4704] */,
+ 0x73cb8001 /* 0001cf2e - 0001cf2f [ 2] */,
+ 0x73d1c008 /* 0001cf47 - 0001cf4f [ 9] */,
+ 0x73f1003b /* 0001cfc4 - 0001cfff [ 60] */,
+ 0x743d8009 /* 0001d0f6 - 0001d0ff [ 10] */,
+ 0x7449c001 /* 0001d127 - 0001d128 [ 2] */,
+ 0x745cc007 /* 0001d173 - 0001d17a [ 8] */,
+ 0x747ac014 /* 0001d1eb - 0001d1ff [ 21] */,
+ 0x74918079 /* 0001d246 - 0001d2bf [ 122] */,
+ 0x74b5000b /* 0001d2d4 - 0001d2df [ 12] */,
+ 0x74bd000b /* 0001d2f4 - 0001d2ff [ 12] */,
+ 0x74d5c008 /* 0001d357 - 0001d35f [ 9] */,
+ 0x74de4086 /* 0001d379 - 0001d3ff [ 135] */,
+ 0x75154000 /* 0001d455 - 0001d455 [ 1] */,
+ 0x75274000 /* 0001d49d - 0001d49d [ 1] */,
+ 0x75280001 /* 0001d4a0 - 0001d4a1 [ 2] */,
+ 0x7528c001 /* 0001d4a3 - 0001d4a4 [ 2] */,
+ 0x7529c001 /* 0001d4a7 - 0001d4a8 [ 2] */,
+ 0x752b4000 /* 0001d4ad - 0001d4ad [ 1] */,
+ 0x752e8000 /* 0001d4ba - 0001d4ba [ 1] */,
+ 0x752f0000 /* 0001d4bc - 0001d4bc [ 1] */,
+ 0x75310000 /* 0001d4c4 - 0001d4c4 [ 1] */,
+ 0x75418000 /* 0001d506 - 0001d506 [ 1] */,
+ 0x7542c001 /* 0001d50b - 0001d50c [ 2] */,
+ 0x75454000 /* 0001d515 - 0001d515 [ 1] */,
+ 0x75474000 /* 0001d51d - 0001d51d [ 1] */,
+ 0x754e8000 /* 0001d53a - 0001d53a [ 1] */,
+ 0x754fc000 /* 0001d53f - 0001d53f [ 1] */,
+ 0x75514000 /* 0001d545 - 0001d545 [ 1] */,
+ 0x7551c002 /* 0001d547 - 0001d549 [ 3] */,
+ 0x75544000 /* 0001d551 - 0001d551 [ 1] */,
+ 0x75a98001 /* 0001d6a6 - 0001d6a7 [ 2] */,
+ 0x75f30001 /* 0001d7cc - 0001d7cd [ 2] */,
+ 0x76a3000e /* 0001da8c - 0001da9a [ 15] */,
+ 0x76a80000 /* 0001daa0 - 0001daa0 [ 1] */,
+ 0x76ac044f /* 0001dab0 - 0001deff [ 1104] */,
+ 0x77c7c005 /* 0001df1f - 0001df24 [ 6] */,
+ 0x77cac0d4 /* 0001df2b - 0001dfff [ 213] */,
+ 0x7801c000 /* 0001e007 - 0001e007 [ 1] */,
+ 0x78064001 /* 0001e019 - 0001e01a [ 2] */,
+ 0x78088000 /* 0001e022 - 0001e022 [ 1] */,
+ 0x78094000 /* 0001e025 - 0001e025 [ 1] */,
+ 0x780ac004 /* 0001e02b - 0001e02f [ 5] */,
+ 0x781b8020 /* 0001e06e - 0001e08e [ 33] */,
+ 0x7824006f /* 0001e090 - 0001e0ff [ 112] */,
+ 0x784b4002 /* 0001e12d - 0001e12f [ 3] */,
+ 0x784f8001 /* 0001e13e - 0001e13f [ 2] */,
+ 0x78528003 /* 0001e14a - 0001e14d [ 4] */,
+ 0x7854013f /* 0001e150 - 0001e28f [ 320] */,
+ 0x78abc010 /* 0001e2af - 0001e2bf [ 17] */,
+ 0x78be8004 /* 0001e2fa - 0001e2fe [ 5] */,
+ 0x78c001cf /* 0001e300 - 0001e4cf [ 464] */,
+ 0x793e82e5 /* 0001e4fa - 0001e7df [ 742] */,
+ 0x79f9c000 /* 0001e7e7 - 0001e7e7 [ 1] */,
+ 0x79fb0000 /* 0001e7ec - 0001e7ec [ 1] */,
+ 0x79fbc000 /* 0001e7ef - 0001e7ef [ 1] */,
+ 0x79ffc000 /* 0001e7ff - 0001e7ff [ 1] */,
+ 0x7a314001 /* 0001e8c5 - 0001e8c6 [ 2] */,
+ 0x7a35c028 /* 0001e8d7 - 0001e8ff [ 41] */,
+ 0x7a530003 /* 0001e94c - 0001e94f [ 4] */,
+ 0x7a568003 /* 0001e95a - 0001e95d [ 4] */,
+ 0x7a580310 /* 0001e960 - 0001ec70 [ 785] */,
+ 0x7b2d404b /* 0001ecb5 - 0001ed00 [ 76] */,
+ 0x7b4f80c1 /* 0001ed3e - 0001edff [ 194] */,
+ 0x7b810000 /* 0001ee04 - 0001ee04 [ 1] */,
+ 0x7b880000 /* 0001ee20 - 0001ee20 [ 1] */,
+ 0x7b88c000 /* 0001ee23 - 0001ee23 [ 1] */,
+ 0x7b894001 /* 0001ee25 - 0001ee26 [ 2] */,
+ 0x7b8a0000 /* 0001ee28 - 0001ee28 [ 1] */,
+ 0x7b8cc000 /* 0001ee33 - 0001ee33 [ 1] */,
+ 0x7b8e0000 /* 0001ee38 - 0001ee38 [ 1] */,
+ 0x7b8e8000 /* 0001ee3a - 0001ee3a [ 1] */,
+ 0x7b8f0005 /* 0001ee3c - 0001ee41 [ 6] */,
+ 0x7b90c003 /* 0001ee43 - 0001ee46 [ 4] */,
+ 0x7b920000 /* 0001ee48 - 0001ee48 [ 1] */,
+ 0x7b928000 /* 0001ee4a - 0001ee4a [ 1] */,
+ 0x7b930000 /* 0001ee4c - 0001ee4c [ 1] */,
+ 0x7b940000 /* 0001ee50 - 0001ee50 [ 1] */,
+ 0x7b94c000 /* 0001ee53 - 0001ee53 [ 1] */,
+ 0x7b954001 /* 0001ee55 - 0001ee56 [ 2] */,
+ 0x7b960000 /* 0001ee58 - 0001ee58 [ 1] */,
+ 0x7b968000 /* 0001ee5a - 0001ee5a [ 1] */,
+ 0x7b970000 /* 0001ee5c - 0001ee5c [ 1] */,
+ 0x7b978000 /* 0001ee5e - 0001ee5e [ 1] */,
+ 0x7b980000 /* 0001ee60 - 0001ee60 [ 1] */,
+ 0x7b98c000 /* 0001ee63 - 0001ee63 [ 1] */,
+ 0x7b994001 /* 0001ee65 - 0001ee66 [ 2] */,
+ 0x7b9ac000 /* 0001ee6b - 0001ee6b [ 1] */,
+ 0x7b9cc000 /* 0001ee73 - 0001ee73 [ 1] */,
+ 0x7b9e0000 /* 0001ee78 - 0001ee78 [ 1] */,
+ 0x7b9f4000 /* 0001ee7d - 0001ee7d [ 1] */,
+ 0x7b9fc000 /* 0001ee7f - 0001ee7f [ 1] */,
+ 0x7ba28000 /* 0001ee8a - 0001ee8a [ 1] */,
+ 0x7ba70004 /* 0001ee9c - 0001eea0 [ 5] */,
+ 0x7ba90000 /* 0001eea4 - 0001eea4 [ 1] */,
+ 0x7baa8000 /* 0001eeaa - 0001eeaa [ 1] */,
+ 0x7baf0033 /* 0001eebc - 0001eeef [ 52] */,
+ 0x7bbc810d /* 0001eef2 - 0001efff [ 270] */,
+ 0x7c0b0003 /* 0001f02c - 0001f02f [ 4] */,
+ 0x7c25000b /* 0001f094 - 0001f09f [ 12] */,
+ 0x7c2bc001 /* 0001f0af - 0001f0b0 [ 2] */,
+ 0x7c300000 /* 0001f0c0 - 0001f0c0 [ 1] */,
+ 0x7c340000 /* 0001f0d0 - 0001f0d0 [ 1] */,
+ 0x7c3d8009 /* 0001f0f6 - 0001f0ff [ 10] */,
+ 0x7c6b8037 /* 0001f1ae - 0001f1e5 [ 56] */,
+ 0x7c80c00c /* 0001f203 - 0001f20f [ 13] */,
+ 0x7c8f0003 /* 0001f23c - 0001f23f [ 4] */,
+ 0x7c924006 /* 0001f249 - 0001f24f [ 7] */,
+ 0x7c94800d /* 0001f252 - 0001f25f [ 14] */,
+ 0x7c998099 /* 0001f266 - 0001f2ff [ 154] */,
+ 0x7db60003 /* 0001f6d8 - 0001f6db [ 4] */,
+ 0x7dbb4002 /* 0001f6ed - 0001f6ef [ 3] */,
+ 0x7dbf4002 /* 0001f6fd - 0001f6ff [ 3] */,
+ 0x7dddc003 /* 0001f777 - 0001f77a [ 4] */,
+ 0x7df68005 /* 0001f7da - 0001f7df [ 6] */,
+ 0x7dfb0003 /* 0001f7ec - 0001f7ef [ 4] */,
+ 0x7dfc400e /* 0001f7f1 - 0001f7ff [ 15] */,
+ 0x7e030003 /* 0001f80c - 0001f80f [ 4] */,
+ 0x7e120007 /* 0001f848 - 0001f84f [ 8] */,
+ 0x7e168005 /* 0001f85a - 0001f85f [ 6] */,
+ 0x7e220007 /* 0001f888 - 0001f88f [ 8] */,
+ 0x7e2b8001 /* 0001f8ae - 0001f8af [ 2] */,
+ 0x7e2c804d /* 0001f8b2 - 0001f8ff [ 78] */,
+ 0x7e95000b /* 0001fa54 - 0001fa5f [ 12] */,
+ 0x7e9b8001 /* 0001fa6e - 0001fa6f [ 2] */,
+ 0x7e9f4002 /* 0001fa7d - 0001fa7f [ 3] */,
+ 0x7ea24006 /* 0001fa89 - 0001fa8f [ 7] */,
+ 0x7eaf8000 /* 0001fabe - 0001fabe [ 1] */,
+ 0x7eb18007 /* 0001fac6 - 0001facd [ 8] */,
+ 0x7eb70003 /* 0001fadc - 0001fadf [ 4] */,
+ 0x7eba4006 /* 0001fae9 - 0001faef [ 7] */,
+ 0x7ebe4006 /* 0001faf9 - 0001faff [ 7] */,
+ 0x7ee4c000 /* 0001fb93 - 0001fb93 [ 1] */,
+ 0x7ef2c024 /* 0001fbcb - 0001fbef [ 37] */,
+ 0x7efe8405 /* 0001fbfa - 0001ffff [ 1030] */,
+ 0xa9b8001f /* 0002a6e0 - 0002a6ff [ 32] */,
+ 0xadce8005 /* 0002b73a - 0002b73f [ 6] */,
+ 0xae078001 /* 0002b81e - 0002b81f [ 2] */,
+ 0xb3a8800d /* 0002cea2 - 0002ceaf [ 14] */,
+ 0xbaf8400e /* 0002ebe1 - 0002ebef [ 15] */,
+ 0xbb9789a1 /* 0002ee5e - 0002f7ff [ 2466] */,
+ 0xbe8785e1 /* 0002fa1e - 0002ffff [ 1506] */,
+ 0xc4d2c004 /* 0003134b - 0003134f [ 5] */};
+/// Returns whether the code unit needs to be escaped.
+///
/// At the end of the valid Unicode code points space a lot of code points are
/// either reserved or a noncharacter. Adding all these entries to the
-/// lookup table would add 446 entries to the table (in Unicode 14).
-/// Instead the only the start of the region is stored, every code point in
-/// this region needs to be escaped.
-_LIBCPP_HIDE_FROM_ABI inline constexpr uint32_t __unallocated_region_lower_bound = 0x000e01f0;
+/// lookup table would greatly increase the size of the table. Instead, these
+/// entries are manually processed. In this large area of reserved code points,
+/// there is a small area of extended graphemes that should not be escaped
+/// unconditionally. This is also manually coded. See the generation script for
+/// more details.
-/// Returns whether the code unit needs to be escaped.
///
/// \pre The code point is a valid Unicode code point.
[[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr bool __needs_escape(const char32_t __code_point) noexcept {
- // Since __unallocated_region_lower_bound contains the unshifted range do the
- // comparison without shifting.
- if (__code_point >= __unallocated_region_lower_bound)
+
+ // The entries in the gap at the end.
+ if (__code_point >= 0x000e0100 && __code_point <= 0x000e01ef)
+ return false;
+
+ // The entries at the end.
+ if (__code_point >= 0x000323b0)
return true;
- ptrdiff_t __i = std::ranges::upper_bound(__entries, (__code_point << 11) | 0x7ffu) - __entries;
+ ptrdiff_t __i = std::ranges::upper_bound(__entries, (__code_point << 14) | 0x3fffu) - __entries;
if (__i == 0)
return false;
--__i;
- uint32_t __upper_bound = (__entries[__i] >> 11) + (__entries[__i] & 0x7ffu);
+ uint32_t __upper_bound = (__entries[__i] >> 14) + (__entries[__i] & 0x3fffu);
return __code_point <= __upper_bound;
}
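
Note for readers of this hunk: after the change, each 32-bit table entry packs the first code point of a contiguous range into its upper 18 bits and the range's size minus one into the lower 14 bits (previously 11 bits were used for the size). A minimal decode sketch, using the first new entry above as a worked example; the helper names are illustrative and not part of libc++:

    #include <cassert>
    #include <cstdint>

    // Decode one packed entry: upper 18 bits hold the first code point,
    // lower 14 bits hold (size - 1).
    constexpr std::uint32_t range_first(std::uint32_t entry) { return entry >> 14; }
    constexpr std::uint32_t range_last(std::uint32_t entry) {
      return (entry >> 14) + (entry & 0x3fffu);
    }

    int main() {
      // 0x001fc021 encodes 0000007f - 000000a0, i.e. 34 code points,
      // matching the comment on its table row.
      assert(range_first(0x001fc021u) == 0x7fu);
      assert(range_last(0x001fc021u) == 0xa0u);
    }
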
diff --git a/libcxx/include/__ranges/to.h b/libcxx/include/__ranges/to.h
index 8a815bce5811..e0abe6290b8f 100644
--- a/libcxx/include/__ranges/to.h
+++ b/libcxx/include/__ranges/to.h
@@ -24,6 +24,7 @@
#include <__ranges/concepts.h>
#include <__ranges/from_range.h>
#include <__ranges/range_adaptor.h>
+#include <__ranges/ref_view.h>
#include <__ranges/size.h>
#include <__ranges/transform_view.h>
#include <__type_traits/add_pointer.h>
@@ -129,7 +130,7 @@ template <class _Container, input_range _Range, class... _Args>
// Try the recursive case.
} else if constexpr (input_range<range_reference_t<_Range>>) {
return ranges::to<_Container>(
- __range | views::transform([](auto&& __elem) {
+ ref_view(__range) | views::transform([](auto&& __elem) {
return ranges::to<range_value_t<_Container>>(std::forward<decltype(__elem)>(__elem));
}),
std::forward<_Args>(__args)...);
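
For context, this is the element-wise recursive branch of ranges::to; wrapping the source in ref_view keeps the left operand of | a standard library view type rather than the user's container (presumably so a program-defined operator| cannot interfere — see the commit for the authoritative rationale). A toy usage example of the recursive conversion, not a libc++ test:

    #include <list>
    #include <ranges>
    #include <string>
    #include <vector>

    int main() {
      // A range of ranges: each inner string is converted element-wise to
      // the nested container's value type via the recursive branch above.
      std::vector<std::string> words{"hello", "world"};
      auto lists = std::ranges::to<std::vector<std::list<char>>>(words);
      // lists[0] is std::list<char>{'h', 'e', 'l', 'l', 'o'}.
    }
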
diff --git a/libcxx/include/__string/char_traits.h b/libcxx/include/__string/char_traits.h
index 1fd22d518e1a..9d347b188ee1 100644
--- a/libcxx/include/__string/char_traits.h
+++ b/libcxx/include/__string/char_traits.h
@@ -344,7 +344,7 @@ struct _LIBCPP_TEMPLATE_VIS char_traits<char16_t> {
_LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR_SINCE_CXX17 const char_type*
find(const char_type* __s, size_t __n, const char_type& __a) _NOEXCEPT {
__identity __proj;
- const char_type* __match = std::__find_impl(__s, __s + __n, __a, __proj);
+ const char_type* __match = std::__find(__s, __s + __n, __a, __proj);
if (__match == __s + __n)
return nullptr;
return __match;
@@ -430,7 +430,7 @@ struct _LIBCPP_TEMPLATE_VIS char_traits<char32_t> {
_LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR_SINCE_CXX17 const char_type*
find(const char_type* __s, size_t __n, const char_type& __a) _NOEXCEPT {
__identity __proj;
- const char_type* __match = std::__find_impl(__s, __s + __n, __a, __proj);
+ const char_type* __match = std::__find(__s, __s + __n, __a, __proj);
if (__match == __s + __n)
return nullptr;
return __match;
diff --git a/libcxx/include/__string/constexpr_c_functions.h b/libcxx/include/__string/constexpr_c_functions.h
index 72c6ce69b60b..4da8542e3807 100644
--- a/libcxx/include/__string/constexpr_c_functions.h
+++ b/libcxx/include/__string/constexpr_c_functions.h
@@ -224,7 +224,7 @@ __constexpr_memmove(_Tp* __dest, _Up* __src, __element_count __n) {
std::__assign_trivially_copyable(__dest[__i], __src[__i]);
}
} else if (__count > 0) {
- ::__builtin_memmove(__dest, __src, (__count - 1) * sizeof(_Tp) + __libcpp_datasizeof<_Tp>::value);
+ ::__builtin_memmove(__dest, __src, (__count - 1) * sizeof(_Tp) + __datasizeof_v<_Tp>);
}
return __dest;
}
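
The length expression above copies all but the last element in full and only the data bytes of the last one, so anything living in the final element's tail padding (for example a [[no_unique_address]] neighbor) is left untouched. A tiny self-check of the arithmetic, with assumed sizes:

    #include <cstddef>

    // (count - 1) full objects plus the data-only size of the last one.
    constexpr std::size_t bytes_to_copy(std::size_t count, std::size_t size,
                                        std::size_t datasize) {
      return (count - 1) * size + datasize;
    }

    // E.g. 3 elements with sizeof == 8 and datasize == 5: 2*8 + 5 == 21 bytes.
    static_assert(bytes_to_copy(3, 8, 5) == 21);
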
diff --git a/libcxx/include/__type_traits/datasizeof.h b/libcxx/include/__type_traits/datasizeof.h
index 3a8b15160107..54fde242ebcd 100644
--- a/libcxx/include/__type_traits/datasizeof.h
+++ b/libcxx/include/__type_traits/datasizeof.h
@@ -26,39 +26,38 @@
_LIBCPP_BEGIN_NAMESPACE_STD
-template <class _Tp>
-struct __libcpp_datasizeof {
#if __has_extension(datasizeof)
- static const size_t value = __datasizeof(_Tp);
+template <class _Tp>
+inline const size_t __datasizeof_v = __datasizeof(_Tp);
#else
// NOLINTNEXTLINE(readability-redundant-preprocessor) This is https://llvm.org/PR64825
# if __has_cpp_attribute(__no_unique_address__)
- template <class = char>
- struct _FirstPaddingByte {
- [[__no_unique_address__]] _Tp __v_;
- char __first_padding_byte_;
- };
+template <class _Tp>
+struct _FirstPaddingByte {
+ [[__no_unique_address__]] _Tp __v_;
+ char __first_padding_byte_;
+};
# else
- template <bool = __libcpp_is_final<_Tp>::value || !is_class<_Tp>::value>
- struct _FirstPaddingByte : _Tp {
- char __first_padding_byte_;
- };
+template <class _Tp, bool = __libcpp_is_final<_Tp>::value || !is_class<_Tp>::value>
+struct _FirstPaddingByte : _Tp {
+ char __first_padding_byte_;
+};
- template <>
- struct _FirstPaddingByte<true> {
- _Tp __v_;
- char __first_padding_byte_;
- };
+template <class _Tp>
+struct _FirstPaddingByte<_Tp, true> {
+ _Tp __v_;
+ char __first_padding_byte_;
+};
# endif // __has_cpp_attribute(__no_unique_address__)
- // _FirstPaddingByte<> is sometimes non-standard layout. Using `offsetof` is UB in that case, but GCC and Clang allow
- // the use as an extension.
- _LIBCPP_DIAGNOSTIC_PUSH
- _LIBCPP_CLANG_DIAGNOSTIC_IGNORED("-Winvalid-offsetof")
- static const size_t value = offsetof(_FirstPaddingByte<>, __first_padding_byte_);
- _LIBCPP_DIAGNOSTIC_POP
+// _FirstPaddingByte<> is sometimes non-standard layout. Using `offsetof` is UB in that case, but GCC and Clang allow
+// the use as an extension.
+_LIBCPP_DIAGNOSTIC_PUSH
+_LIBCPP_CLANG_DIAGNOSTIC_IGNORED("-Winvalid-offsetof")
+template <class _Tp>
+inline const size_t __datasizeof_v = offsetof(_FirstPaddingByte<_Tp>, __first_padding_byte_);
+_LIBCPP_DIAGNOSTIC_POP
#endif // __has_extension(datasizeof)
-};
_LIBCPP_END_NAMESPACE_STD
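
For context: __datasizeof_v<T> is sizeof(T) minus trailing padding, computed (absent the __datasizeof builtin) from the offset a hypothetical first padding byte would receive. A compressed sketch of the [[no_unique_address]] branch above; the names and the 4-byte-int ABI are assumptions, not libc++ definitions:

    #include <cstddef>

    struct S {   // on common ABIs: sizeof(S) == 8, data occupies 5 bytes
      int i;     // bytes 0-3
      char c;    // byte 4; bytes 5-7 are tail padding
    };

    template <class T>
    struct FirstPaddingByte {
      [[no_unique_address]] T v;  // tail padding of v may be reused...
      char first_padding_byte;    // ...so this lands right after v's data
    };

    template <class T>
    inline constexpr std::size_t datasizeof_v =
        offsetof(FirstPaddingByte<T>, first_padding_byte);

    static_assert(sizeof(S) == 8);       // assumption: typical 4-byte int ABI
    static_assert(datasizeof_v<S> == 5); // tail padding excluded
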
diff --git a/libcxx/include/__utility/no_destroy.h b/libcxx/include/__utility/no_destroy.h
index f9c1eb7bed45..8edd194577d7 100644
--- a/libcxx/include/__utility/no_destroy.h
+++ b/libcxx/include/__utility/no_destroy.h
@@ -12,6 +12,7 @@
#include <__config>
#include <__type_traits/is_constant_evaluated.h>
#include <__utility/forward.h>
+#include <new>
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
# pragma GCC system_header
@@ -29,33 +30,23 @@ struct __uninitialized_tag {};
// initialization using __emplace.
template <class _Tp>
struct __no_destroy {
- _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI explicit __no_destroy(__uninitialized_tag) : __dummy_() {
- if (__libcpp_is_constant_evaluated()) {
- __dummy_ = char();
- }
- }
- _LIBCPP_HIDE_FROM_ABI ~__no_destroy() {
- // nothing
- }
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR explicit __no_destroy(__uninitialized_tag) : __obj_() {}
template <class... _Args>
- _LIBCPP_CONSTEXPR _LIBCPP_HIDE_FROM_ABI explicit __no_destroy(_Args&&... __args)
- : __obj_(std::forward<_Args>(__args)...) {}
+ _LIBCPP_HIDE_FROM_ABI explicit __no_destroy(_Args&&... __args) {
+ ::new ((void*)__obj_) _Tp(std::forward<_Args>(__args)...);
+ }
template <class... _Args>
- _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI _Tp& __emplace(_Args&&... __args) {
- new (&__obj_) _Tp(std::forward<_Args>(__args)...);
- return __obj_;
+ _LIBCPP_HIDE_FROM_ABI _Tp& __emplace(_Args&&... __args) {
+ return *(::new ((void*)__obj_) _Tp(std::forward<_Args>(__args)...));
}
- _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI _Tp& __get() { return __obj_; }
- _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI _Tp const& __get() const { return __obj_; }
+ _LIBCPP_HIDE_FROM_ABI _Tp& __get() { return *reinterpret_cast<_Tp*>(__obj_); }
+ _LIBCPP_HIDE_FROM_ABI _Tp const& __get() const { return *reinterpret_cast<const _Tp*>(__obj_); }
private:
- union {
- _Tp __obj_;
- char __dummy_; // so we can initialize a member even with __uninitialized_tag for constexpr-friendliness
- };
+ _ALIGNAS_TYPE(_Tp) char __obj_[sizeof(_Tp)];
};
_LIBCPP_END_NAMESPACE_STD
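
A simplified sketch of the pattern __no_destroy implements after this hunk: construct into raw aligned storage and never run the destructor, so function-local "immortal" objects stay usable during program shutdown. This is an illustrative reimplementation under assumed names, not the libc++ header:

    #include <new>
    #include <string>
    #include <utility>

    template <class T>
    struct no_destroy {
      template <class... Args>
      explicit no_destroy(Args&&... args) {
        ::new (static_cast<void*>(buf_)) T(std::forward<Args>(args)...);
      }
      // Strictly conforming code would std::launder here; the sketch mirrors
      // the patched header and keeps it simple.
      T& get() { return *reinterpret_cast<T*>(buf_); }
      // No destructor: the contained T intentionally outlives scope exit.
    private:
      alignas(T) char buf_[sizeof(T)];
    };

    std::string& leaked_name() {
      static no_destroy<std::string> name("example"); // never destroyed at exit
      return name.get();
    }
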
diff --git a/libcxx/include/format b/libcxx/include/format
index f1e87de0f830..07c2ba083199 100644
--- a/libcxx/include/format
+++ b/libcxx/include/format
@@ -193,6 +193,8 @@ namespace std {
#include <__format/concepts.h>
#include <__format/container_adaptor.h>
#include <__format/enable_insertable.h>
+#include <__format/escaped_output_table.h>
+#include <__format/extended_grapheme_cluster_table.h>
#include <__format/format_arg.h>
#include <__format/format_arg_store.h>
#include <__format/format_args.h>
diff --git a/libcxx/modules/std/ranges.inc b/libcxx/modules/std/ranges.inc
index 7d215867a431..80f31c79a1a4 100644
--- a/libcxx/modules/std/ranges.inc
+++ b/libcxx/modules/std/ranges.inc
@@ -141,6 +141,17 @@ export namespace std {
#if _LIBCPP_STD_VER >= 23
// [range.adaptor.object], range adaptor objects
using std::ranges::range_adaptor_closure;
+ // Note: This declaration is not in the synopsis or explicitly in the wording.
+ // However, it is needed for the range adaptors.
+ // [range.adaptor.object]/3
+ // The template parameter D for range_adaptor_closure may be an
+ // incomplete type. If an expression of type cv D is used as an operand
+ // to the | operator, D shall be complete and model
+ // derived_from<range_adaptor_closure<D>>. The behavior of an expression
+ // involving an object of type cv D as an operand to the | operator is
+ // undefined if overload resolution selects a program-defined operator|
+ // function.
+ using std::ranges::operator|;
#endif
// [range.all], all view
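
To see what the exported operator| enables under modules, here is a C++23 sketch of a program-defined range adaptor closure; the adaptor name is hypothetical, and the pipe only compiles when the standard operator| for range_adaptor_closure is visible:

    #include <ranges>
    #include <utility>
    #include <vector>

    struct take_one_fn : std::ranges::range_adaptor_closure<take_one_fn> {
      template <std::ranges::viewable_range R>
      constexpr auto operator()(R&& r) const {
        return std::views::take(std::forward<R>(r), 1);
      }
    };
    inline constexpr take_one_fn take_one;

    int main() {
      std::vector v{1, 2, 3};
      auto first = v | take_one; // pipes through the exported operator|
      (void)first;
    }
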
diff --git a/libcxx/test/libcxx/transitive_includes/cxx20.csv b/libcxx/test/libcxx/transitive_includes/cxx20.csv
index 6b80790a9d19..7d31ba160ee1 100644
--- a/libcxx/test/libcxx/transitive_includes/cxx20.csv
+++ b/libcxx/test/libcxx/transitive_includes/cxx20.csv
@@ -129,6 +129,7 @@ chrono cwchar
chrono forward_list
chrono limits
chrono locale
+chrono new
chrono optional
chrono ostream
chrono ratio
diff --git a/libcxx/test/libcxx/type_traits/datasizeof.compile.pass.cpp b/libcxx/test/libcxx/type_traits/datasizeof.compile.pass.cpp
index 881b0bd85190..03dd0f6eac53 100644
--- a/libcxx/test/libcxx/type_traits/datasizeof.compile.pass.cpp
+++ b/libcxx/test/libcxx/type_traits/datasizeof.compile.pass.cpp
@@ -9,10 +9,10 @@
#include <__type_traits/datasizeof.h>
#include <cstdint>
-static_assert(std::__libcpp_datasizeof<std::int8_t>::value == 1, "");
-static_assert(std::__libcpp_datasizeof<std::int16_t>::value == 2, "");
-static_assert(std::__libcpp_datasizeof<std::int32_t>::value == 4, "");
-static_assert(std::__libcpp_datasizeof<std::int64_t>::value == 8, "");
+static_assert(std::__datasizeof_v<std::int8_t> == 1, "");
+static_assert(std::__datasizeof_v<std::int16_t> == 2, "");
+static_assert(std::__datasizeof_v<std::int32_t> == 4, "");
+static_assert(std::__datasizeof_v<std::int64_t> == 8, "");
struct OneBytePadding {
OneBytePadding() {}
@@ -22,9 +22,9 @@ struct OneBytePadding {
};
#if defined(_WIN32) && !defined(__MINGW32__)
-static_assert(std::__libcpp_datasizeof<OneBytePadding>::value == 4, "");
+static_assert(std::__datasizeof_v<OneBytePadding> == 4, "");
#else
-static_assert(std::__libcpp_datasizeof<OneBytePadding>::value == 3, "");
+static_assert(std::__datasizeof_v<OneBytePadding> == 3, "");
#endif
struct InBetweenPadding {
@@ -35,4 +35,4 @@ struct InBetweenPadding {
std::int16_t c;
};
-static_assert(std::__libcpp_datasizeof<InBetweenPadding>::value == 8, "");
+static_assert(std::__datasizeof_v<InBetweenPadding> == 8, "");
diff --git a/libcxx/test/libcxx/utilities/expected/expected.expected/no_unique_address.compile.pass.cpp b/libcxx/test/libcxx/utilities/expected/expected.expected/no_unique_address.compile.pass.cpp
index cf1909b92873..580c0f4ae10c 100644
--- a/libcxx/test/libcxx/utilities/expected/expected.expected/no_unique_address.compile.pass.cpp
+++ b/libcxx/test/libcxx/utilities/expected/expected.expected/no_unique_address.compile.pass.cpp
@@ -47,28 +47,28 @@ static_assert(sizeof(std::expected<B, B>) == sizeof(B));
// Check that `expected`'s datasize is large enough for the parameter type(s).
static_assert(sizeof(std::expected<BoolWithPadding, Empty>) ==
- std::__libcpp_datasizeof<std::expected<BoolWithPadding, Empty>>::value);
+ std::__datasizeof_v<std::expected<BoolWithPadding, Empty>>);
static_assert(sizeof(std::expected<Empty, BoolWithPadding>) ==
- std::__libcpp_datasizeof<std::expected<Empty, BoolWithPadding>>::value);
+ std::__datasizeof_v<std::expected<Empty, BoolWithPadding>>);
// In this case, there should be tail padding in the `expected` because `A`
// itself does _not_ have tail padding.
-static_assert(sizeof(std::expected<A, A>) > std::__libcpp_datasizeof<std::expected<A, A>>::value);
+static_assert(sizeof(std::expected<A, A>) > std::__datasizeof_v<std::expected<A, A>>);
// Test with some real types.
static_assert(sizeof(std::expected<std::optional<int>, int>) == 8);
-static_assert(std::__libcpp_datasizeof<std::expected<std::optional<int>, int>>::value == 8);
+static_assert(std::__datasizeof_v<std::expected<std::optional<int>, int>> == 8);
static_assert(sizeof(std::expected<int, std::optional<int>>) == 8);
-static_assert(std::__libcpp_datasizeof<std::expected<int, std::optional<int>>>::value == 8);
+static_assert(std::__datasizeof_v<std::expected<int, std::optional<int>>> == 8);
static_assert(sizeof(std::expected<int, int>) == 8);
-static_assert(std::__libcpp_datasizeof<std::expected<int, int>>::value == 5);
+static_assert(std::__datasizeof_v<std::expected<int, int>> == 5);
// clang-format off
-static_assert(std::__libcpp_datasizeof<int>::value == 4);
-static_assert(std::__libcpp_datasizeof<std::expected<int, int>>::value == 5);
-static_assert(std::__libcpp_datasizeof<std::expected<std::expected<int, int>, int>>::value == 8);
-static_assert(std::__libcpp_datasizeof<std::expected<std::expected<std::expected<int, int>, int>, int>>::value == 9);
-static_assert(std::__libcpp_datasizeof<std::expected<std::expected<std::expected<std::expected<int, int>, int>, int>, int>>::value == 12);
+static_assert(std::__datasizeof_v<int> == 4);
+static_assert(std::__datasizeof_v<std::expected<int, int>> == 5);
+static_assert(std::__datasizeof_v<std::expected<std::expected<int, int>, int>> == 8);
+static_assert(std::__datasizeof_v<std::expected<std::expected<std::expected<int, int>, int>, int>> == 9);
+static_assert(std::__datasizeof_v<std::expected<std::expected<std::expected<std::expected<int, int>, int>, int>, int>> == 12);
// clang-format on
diff --git a/libcxx/test/libcxx/utilities/expected/expected.void/no_unique_address.compile.pass.cpp b/libcxx/test/libcxx/utilities/expected/expected.void/no_unique_address.compile.pass.cpp
index fdee8b71e5d9..27da03c54ac4 100644
--- a/libcxx/test/libcxx/utilities/expected/expected.void/no_unique_address.compile.pass.cpp
+++ b/libcxx/test/libcxx/utilities/expected/expected.void/no_unique_address.compile.pass.cpp
@@ -45,23 +45,23 @@ static_assert(sizeof(std::expected<void, B>) == sizeof(B));
// Check that `expected`'s datasize is large enough for the parameter type(s).
static_assert(sizeof(std::expected<void, BoolWithPadding>) ==
- std::__libcpp_datasizeof<std::expected<void, BoolWithPadding>>::value);
+ std::__datasizeof_v<std::expected<void, BoolWithPadding>>);
// In this case, there should be tail padding in the `expected` because `A`
// itself does _not_ have tail padding.
-static_assert(sizeof(std::expected<void, A>) > std::__libcpp_datasizeof<std::expected<void, A>>::value);
+static_assert(sizeof(std::expected<void, A>) > std::__datasizeof_v<std::expected<void, A>>);
// Test with some real types.
static_assert(sizeof(std::expected<void, std::optional<int>>) == 8);
-static_assert(std::__libcpp_datasizeof<std::expected<void, std::optional<int>>>::value == 8);
+static_assert(std::__datasizeof_v<std::expected<void, std::optional<int>>> == 8);
static_assert(sizeof(std::expected<void, int>) == 8);
-static_assert(std::__libcpp_datasizeof<std::expected<void, int>>::value == 5);
+static_assert(std::__datasizeof_v<std::expected<void, int>> == 5);
// clang-format off
-static_assert(std::__libcpp_datasizeof<int>::value == 4);
-static_assert(std::__libcpp_datasizeof<std::expected<void, int>>::value == 5);
-static_assert(std::__libcpp_datasizeof<std::expected<void, std::expected<void, int>>>::value == 8);
-static_assert(std::__libcpp_datasizeof<std::expected<void, std::expected<void, std::expected<void, int>>>>::value == 9);
-static_assert(std::__libcpp_datasizeof<std::expected<void, std::expected<void, std::expected<void, std::expected<void, int>>>>>::value == 12);
+static_assert(std::__datasizeof_v<int> == 4);
+static_assert(std::__datasizeof_v<std::expected<void, int>> == 5);
+static_assert(std::__datasizeof_v<std::expected<void, std::expected<void, int>>> == 8);
+static_assert(std::__datasizeof_v<std::expected<void, std::expected<void, std::expected<void, int>>>> == 9);
+static_assert(std::__datasizeof_v<std::expected<void, std::expected<void, std::expected<void, std::expected<void, int>>>>> == 12);
// clang-format on
diff --git a/libcxx/test/libcxx/utilities/format/format.string/format.string.std/escaped_output.pass.cpp b/libcxx/test/libcxx/utilities/format/format.string/format.string.std/escaped_output.pass.cpp
new file mode 100644
index 000000000000..5b1191642c9a
--- /dev/null
+++ b/libcxx/test/libcxx/utilities/format/format.string/format.string.std/escaped_output.pass.cpp
@@ -0,0 +1,102 @@
+//===----------------------------------------------------------------------===//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+// UNSUPPORTED: GCC-ALWAYS_INLINE-FIXME
+
+// <format>
+
+// Tests the properties of the Unicode escaped output table.
+// The libc++ algorithm has size and speed optimizations based on the properties
+// of Unicode. This means updating the Unicode tables has a likelihood of
+// breaking this test. That is intentional; it requires validating whether the
+// assumptions of the size and speed optimizations are still valid.
+
+#include <algorithm>
+#include <numeric>
+#include <format>
+#include <cassert>
+
+// Contains the entries for [format.string.escaped]/2.2.1.2.1
+// CE is a Unicode encoding and C corresponds to a UCS scalar value whose
+// Unicode property General_Category has a value in the groups Separator (Z)
+// or Other (C), as described by table 12 of UAX #44
+//
+// Separator (Z) consists of General_Category
+// - Zs Space_Separator,
+// - Zl Line_Separator,
+// - Zp Paragraph_Separator.
+//
+// Other (C) consists of General_Category
+// - Cc Control,
+// - Cf Format,
+// - Cs Surrogate,
+// - Co Private_Use,
+// - Cn Unassigned.
+inline constexpr int Zs = 17;
+inline constexpr int Zl = 1;
+inline constexpr int Zp = 1;
+inline constexpr int Z = Zs + Zl + Zp;
+
+inline constexpr int Cc = 65;
+inline constexpr int Cf = 170;
+inline constexpr int Cs = 2'048;
+inline constexpr int Co = 137'468;
+inline constexpr int Cn = 824'718;
+inline constexpr int C = Cc + Cf + Cs + Co + Cn;
+
+// This is the final part of the Unicode properties table:
+//
+// 31350..323AF ; Lo # [4192] CJK UNIFIED IDEOGRAPH-31350..CJK UNIFIED IDEOGRAPH-323AF
+// 323B0..E0000 ; Cn # [711761] <reserved-323B0>..<reserved-E0000>
+// E0001 ; Cf # LANGUAGE TAG
+// E0002..E001F ; Cn # [30] <reserved-E0002>..<reserved-E001F>
+// E0020..E007F ; Cf # [96] TAG SPACE..CANCEL TAG
+// E0080..E00FF ; Cn # [128] <reserved-E0080>..<reserved-E00FF>
+// E0100..E01EF ; Mn # [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256
+// E01F0..EFFFF ; Cn # [65040] <reserved-E01F0>..<noncharacter-EFFFF>
+// F0000..FFFFD ; Co # [65534] <private-use-F0000>..<private-use-FFFFD>
+// FFFFE..FFFFF ; Cn # [2] <noncharacter-FFFFE>..<noncharacter-FFFFF>
+// 100000..10FFFD; Co # [65534] <private-use-100000>..<private-use-10FFFD>
+// 10FFFE..10FFFF; Cn # [2] <noncharacter-10FFFE>..<noncharacter-10FFFF>
+//
+// It can be observed all entries in the range 323B0..10FFFF are in the
+// categories Cf, Co, Cn, except a small range with the property Mn.
+// In order to reduce the size of the table, only the entries in the range
+// [0000, 323B0) are stored in the table. The entries in the range
+// [323B0, 10FFFF] use a hand-crafted algorithm.
+//
+// This means a number of entries are omitted.
+inline constexpr int excluded = ((0x10FFFF - 0x323B0) + 1) - 240;
+
+inline constexpr int entries = Z + C - excluded;
+
+static constexpr int count_entries() {
+ return std::transform_reduce(
+ std::begin(std::__escaped_output_table::__entries),
+ std::end(std::__escaped_output_table::__entries),
+ 0,
+ std::plus{},
+ [](auto entry) { return 1 + static_cast<int>(entry & 0x3fffu); });
+}
+static_assert(count_entries() == entries);
+
+int main(int, char**) {
+ for (char32_t c = 0x31350; c <= 0x323AF; ++c) // 31350..323AF ; Lo # [4192]
+ assert(std::__escaped_output_table::__needs_escape(c) == false);
+
+ for (char32_t c = 0x323B0; c <= 0xE00FF; ++c) // 323B0..E00FF ; C
+ assert(std::__escaped_output_table::__needs_escape(c) == true);
+
+ for (char32_t c = 0xE0100; c <= 0xE01EF; ++c) // E0100..E01EF ; Mn # [240]
+ assert(std::__escaped_output_table::__needs_escape(c) == false);
+
+ for (char32_t c = 0xE01F0; c <= 0x10FFFF; ++c) // E01F0..10FFFF; C
+ assert(std::__escaped_output_table::__needs_escape(c) == true);
+
+ return 0;
+}
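
For reference, each 32-bit entry in the escaped-output table packs the range's lower bound into bits [14, 31] and the range size minus one into bits [0, 13]; `count_entries()` above simply sums the decoded sizes. A minimal Python sketch of that decoding, using a hypothetical entry rather than real table data:

```python
# Decode a packed escaped-output table entry.
# Assumed layout (per the table's documentation): bits [14, 31] hold the
# lower bound, bits [0, 13] hold (size - 1).
def decode(entry: int) -> tuple[int, int]:
    lower = entry >> 14
    size = (entry & 0x3FFF) + 1
    return lower, size

# Hypothetical entry covering U+0000..U+001F (32 code points).
packed = (0x0000 << 14) | (32 - 1)
assert decode(packed) == (0x0000, 32)

# count_entries() in the test is the sum of the sizes over all entries.
assert sum(decode(e)[1] for e in [packed]) == 32
```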
diff --git a/libcxx/test/libcxx/utilities/no_destroy.pass.cpp b/libcxx/test/libcxx/utilities/no_destroy.pass.cpp
new file mode 100644
index 000000000000..9a874a640753
--- /dev/null
+++ b/libcxx/test/libcxx/utilities/no_destroy.pass.cpp
@@ -0,0 +1,31 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <__utility/no_destroy.h>
+#include <cassert>
+
+#include "test_macros.h"
+
+#if TEST_STD_VER > 17
+// Test constexpr-constructibility.
+constinit std::__no_destroy<int> nd_int_const(std::__uninitialized_tag{});
+#endif
+
+struct DestroyLast {
+ ~DestroyLast() { assert(*ptr == 5); }
+
+ int* ptr;
+} last;
+
+static std::__no_destroy<int> nd_int(5);
+
+int main(int, char**) {
+ last.ptr = &nd_int.__get();
+
+ return 0;
+}
diff --git a/libcxx/test/std/containers/sequences/array/size_and_alignment.compile.pass.cpp b/libcxx/test/std/containers/sequences/array/size_and_alignment.compile.pass.cpp
index 209e24964807..7ba56577d1bb 100644
--- a/libcxx/test/std/containers/sequences/array/size_and_alignment.compile.pass.cpp
+++ b/libcxx/test/std/containers/sequences/array/size_and_alignment.compile.pass.cpp
@@ -46,7 +46,7 @@ void test_type() {
static_assert(!std::is_empty<Array>::value, "");
// Make sure empty arrays don't have padding bytes
- LIBCPP_STATIC_ASSERT(std::__libcpp_datasizeof<Array>::value == sizeof(Array), "");
+ LIBCPP_STATIC_ASSERT(std::__datasizeof_v<Array> == sizeof(Array), "");
}
{
diff --git a/libcxx/test/std/language.support/support.dynamic/hardware_inference_size.compile.pass.cpp b/libcxx/test/std/language.support/support.dynamic/hardware_inference_size.compile.pass.cpp
index ae277d53e46f..2656f0595bf5 100644
--- a/libcxx/test/std/language.support/support.dynamic/hardware_inference_size.compile.pass.cpp
+++ b/libcxx/test/std/language.support/support.dynamic/hardware_inference_size.compile.pass.cpp
@@ -7,7 +7,7 @@
//===----------------------------------------------------------------------===//
// UNSUPPORTED: c++03, c++11, c++14
-// XFAIL: (clang || apple-clang) && stdlib=libc++
+// UNSUPPORTED: (clang || apple-clang) && stdlib=libc++
#include <new>
diff --git a/libcxx/test/std/ranges/range.utility/range.utility.conv/to.pass.cpp b/libcxx/test/std/ranges/range.utility/range.utility.conv/to.pass.cpp
index 3df88d6a2dcc..7f816bb21a19 100644
--- a/libcxx/test/std/ranges/range.utility/range.utility.conv/to.pass.cpp
+++ b/libcxx/test/std/ranges/range.utility/range.utility.conv/to.pass.cpp
@@ -560,6 +560,11 @@ constexpr void test_recursive() {
}
assert((in | std::ranges::to<C4>()) == result);
+
+ // LWG3984: ranges::to's recursion branch may be ill-formed
+ auto in_owning_view = std::views::all(std::move(in));
+ static_assert(!std::ranges::viewable_range<decltype((in_owning_view))>);
+ assert(std::ranges::to<C4>(in_owning_view) == result);
}
constexpr bool test() {
diff --git a/libcxx/utils/generate_escaped_output_table.py b/libcxx/utils/generate_escaped_output_table.py
index a11ce259096d..523a0be3a451 100755
--- a/libcxx/utils/generate_escaped_output_table.py
+++ b/libcxx/utils/generate_escaped_output_table.py
@@ -113,34 +113,41 @@ DATA_ARRAY_TEMPLATE = """
/// table lacks a property, thus having more bits available for the size.
///
/// The data has 2 values:
-/// - bits [0, 10] The size of the range, allowing 2048 elements.
-/// - bits [11, 31] The lower bound code point of the range. The upper bound of
-/// the range is lower bound + size.
+/// - bits [0, 13] The size of the range, allowing 16384 elements.
+/// - bits [14, 31] The lower bound code point of the range. The upper bound of
+/// the range is lower bound + size. Note the code expects code units that fit
+/// into 18 bits, instead of the 21 bits needed for the full Unicode range.
_LIBCPP_HIDE_FROM_ABI inline constexpr uint32_t __entries[{size}] = {{
{entries}}};
+/// Returns whether the code unit needs to be escaped.
+///
/// At the end of the valid Unicode code points space a lot of code points are
/// either reserved or a noncharacter. Adding all these entries to the
-/// lookup table would add 446 entries to the table (in Unicode 14).
-/// Instead the only the start of the region is stored, every code point in
-/// this region needs to be escaped.
-_LIBCPP_HIDE_FROM_ABI inline constexpr uint32_t __unallocated_region_lower_bound = 0x{unallocated:08x};
+/// lookup table would greatly increase the size of the table. Instead these
+/// entries are manually processed. In this large area of reserved code points,
+/// there is a small area of extended graphemes that should not be escaped
+/// unconditionally. This is also manually coded. See the generation script for
+/// more details.
-/// Returns whether the code unit needs to be escaped.
///
/// \pre The code point is a valid Unicode code point.
[[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr bool __needs_escape(const char32_t __code_point) noexcept {{
- // Since __unallocated_region_lower_bound contains the unshifted range do the
- // comparison without shifting.
- if (__code_point >= __unallocated_region_lower_bound)
+
+ // The entries in the gap at the end.
+ if (__code_point >= 0x{gap_lower:08x} && __code_point <= 0x{gap_upper:08x})
+ return false;
+
+ // The entries at the end.
+ if (__code_point >= 0x{unallocated:08x})
return true;
- ptrdiff_t __i = std::ranges::upper_bound(__entries, (__code_point << 11) | 0x7ffu) - __entries;
+ ptrdiff_t __i = std::ranges::upper_bound(__entries, (__code_point << 14) | 0x3fffu) - __entries;
if (__i == 0)
return false;
--__i;
- uint32_t __upper_bound = (__entries[__i] >> 11) + (__entries[__i] & 0x7ffu);
+ uint32_t __upper_bound = (__entries[__i] >> 14) + (__entries[__i] & 0x3fffu);
return __code_point <= __upper_bound;
}}
"""
@@ -245,28 +252,33 @@ def property_ranges_to_table(ranges: list[PropertyRange]) -> list[Entry]:
while True:
e = Entry(range.lower, range.upper - range.lower)
- if e.offset <= 2047:
+ if e.offset <= 16383:
result.append(e)
break
- e.offset = 2047
+ e.offset = 16383
result.append(e)
- range.lower += 2048
+ range.lower += 16384
return result
cpp_entrytemplate = " 0x{:08x} /* {:08x} - {:08x} [{:>5}] */"
-def generate_cpp_data(ranges: list[PropertyRange], unallocated: int) -> str:
+def generate_cpp_data(
+ ranges: list[PropertyRange], unallocated: int, gap_lower: int, gap_upper: int
+) -> str:
result = StringIO()
table = property_ranges_to_table(ranges)
+ # Validates all entries fit in 18 bits.
+ for x in table:
+ assert x.lower + x.offset < 0x3FFFF
result.write(
DATA_ARRAY_TEMPLATE.format(
size=len(table),
entries=",\n".join(
[
cpp_entrytemplate.format(
- x.lower << 11 | x.offset,
+ x.lower << 14 | x.offset,
x.lower,
x.lower + x.offset,
x.offset + 1,
@@ -275,6 +287,8 @@ def generate_cpp_data(ranges: list[PropertyRange], unallocated: int) -> str:
]
),
unallocated=unallocated,
+ gap_lower=gap_lower,
+ gap_upper=gap_upper,
)
)
@@ -305,22 +319,28 @@ def generate_data_tables() -> str:
data = compactPropertyRanges(sorted(properties, key=lambda x: x.lower))
- # The last entry is large. In Unicode 14 it contains the entries
- # 3134B..0FFFF 912564 elements
- # This are 446 entries of 1325 entries in the table.
- # Based on the nature of these entries it is expected they remain for the
- # forseeable future. Therefore we only store the lower bound of this section.
- #
- # When this region becomes substantially smaller we need to investigate
- # this design.
- #
- # Due to P2713R1 Escaping improvements in std::format the range
+ # The output table has two large entries at the end, with a small "gap"
# E0100..E01EF ; Grapheme_Extend # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256
- # is no longer part of these entries. This causes an increase in the size
- # of the table.
- assert data[-1].upper == 0x10FFFF
-
- return "\n".join([generate_cpp_data(data[:-1], data[-1].lower)])
+ # Based on Unicode 15.1.0:
+ # - Encoding all these entries in the table requires 1173 entries.
+ # - Manually handling these last two blocks reduces the size to 729 entries.
+ # This not only reduces the binary size, but also improves the performance
+ # by having fewer elements to search.
+ # The exact entries may differ between Unicode versions. When these numbers
+ # change, the test needs to be updated too.
+ # libcxx/test/libcxx/utilities/format/format.string/format.string.std/escaped_output.pass.cpp
+ assert (data[-2].lower) == 0x323B0
+ assert (data[-2].upper) == 0xE00FF
+ assert (data[-1].lower) == 0xE01F0
+ assert (data[-1].upper) == 0x10FFFF
+
+ return "\n".join(
+ [
+ generate_cpp_data(
+ data[:-2], data[-2].lower, data[-2].upper + 1, data[-1].lower - 1
+ )
+ ]
+ )
if __name__ == "__main__":
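
As a cross-check of the new packing described in the template above, here is a small Python sketch (assuming the bits [14, 31] / bits [0, 13] layout) of how an oversized range is split into entries whose size fits in 14 bits, mirroring `property_ranges_to_table`:

```python
MAX_OFFSET = 0x3FFF  # 16383, the largest encodable (size - 1)

def pack_range(lower: int, upper: int) -> list[int]:
    """Split the inclusive range [lower, upper] into packed 32-bit entries."""
    entries = []
    while True:
        offset = upper - lower
        if offset <= MAX_OFFSET:
            entries.append((lower << 14) | offset)
            return entries
        entries.append((lower << 14) | MAX_OFFSET)
        lower += MAX_OFFSET + 1

# A 20000-code-point range no longer fits in one entry and is split in two.
entries = pack_range(0x1000, 0x1000 + 20000 - 1)
assert len(entries) == 2
assert entries[0] >> 14 == 0x1000          # first chunk starts at the lower bound
assert (entries[0] & 0x3FFF) + 1 == 16384  # and covers the maximum 16384 elements
```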
diff --git a/lld/COFF/MinGW.cpp b/lld/COFF/MinGW.cpp
index e46f5277a8c3..29c01da9e28f 100644
--- a/lld/COFF/MinGW.cpp
+++ b/lld/COFF/MinGW.cpp
@@ -50,7 +50,6 @@ AutoExporter::AutoExporter(
"libclang_rt.profile-x86_64",
"libc++",
"libc++abi",
- "libFortran_main",
"libFortranRuntime",
"libFortranDecimal",
"libunwind",
diff --git a/lld/ELF/Arch/RISCV.cpp b/lld/ELF/Arch/RISCV.cpp
index 20088d92bafa..e4d63250135e 100644
--- a/lld/ELF/Arch/RISCV.cpp
+++ b/lld/ELF/Arch/RISCV.cpp
@@ -1057,7 +1057,7 @@ public:
};
} // namespace
-static void mergeArch(RISCVISAInfo::OrderedExtensionMap &mergedExts,
+static void mergeArch(RISCVISAUtils::OrderedExtensionMap &mergedExts,
unsigned &mergedXlen, const InputSectionBase *sec,
StringRef s) {
auto maybeInfo = RISCVISAInfo::parseNormalizedArchString(s);
@@ -1086,7 +1086,7 @@ static void mergeArch(RISCVISAInfo::OrderedExtensionMap &mergedExts,
static RISCVAttributesSection *
mergeAttributesSection(const SmallVector<InputSectionBase *, 0> &sections) {
- RISCVISAInfo::OrderedExtensionMap exts;
+ RISCVISAUtils::OrderedExtensionMap exts;
const InputSectionBase *firstStackAlign = nullptr;
unsigned firstStackAlignValue = 0, xlen = 0;
bool hasArch = false;
diff --git a/lld/test/ELF/riscv-branch.s b/lld/test/ELF/riscv-branch.s
index dbf39dc0bb8f..1a2b446b5a43 100644
--- a/lld/test/ELF/riscv-branch.s
+++ b/lld/test/ELF/riscv-branch.s
@@ -7,19 +7,19 @@
# RUN: ld.lld %t.rv64.o --defsym foo=_start+4 --defsym bar=_start -o %t.rv64
# RUN: llvm-objdump -d %t.rv32 | FileCheck %s --check-prefix=CHECK-32
# RUN: llvm-objdump -d %t.rv64 | FileCheck %s --check-prefix=CHECK-64
-# CHECK-32: 63 02 00 00 beqz zero, 0x110b8
-# CHECK-32: e3 1e 00 fe bnez zero, 0x110b4
-# CHECK-64: 63 02 00 00 beqz zero, 0x11124
-# CHECK-64: e3 1e 00 fe bnez zero, 0x11120
+# CHECK-32: 00000263 beqz zero, 0x110b8
+# CHECK-32: fe001ee3 bnez zero, 0x110b4
+# CHECK-64: 00000263 beqz zero, 0x11124
+# CHECK-64: fe001ee3 bnez zero, 0x11120
#
# RUN: ld.lld %t.rv32.o --defsym foo=_start+0xffe --defsym bar=_start+4-0x1000 -o %t.rv32.limits
# RUN: ld.lld %t.rv64.o --defsym foo=_start+0xffe --defsym bar=_start+4-0x1000 -o %t.rv64.limits
# RUN: llvm-objdump -d %t.rv32.limits | FileCheck --check-prefix=LIMITS-32 %s
# RUN: llvm-objdump -d %t.rv64.limits | FileCheck --check-prefix=LIMITS-64 %s
-# LIMITS-32: e3 0f 00 7e beqz zero, 0x120b2
-# LIMITS-32-NEXT: 63 10 00 80 bnez zero, 0x100b8
-# LIMITS-64: e3 0f 00 7e beqz zero, 0x1211e
-# LIMITS-64-NEXT: 63 10 00 80 bnez zero, 0x10124
+# LIMITS-32: 7e000fe3 beqz zero, 0x120b2
+# LIMITS-32-NEXT: 80001063 bnez zero, 0x100b8
+# LIMITS-64: 7e000fe3 beqz zero, 0x1211e
+# LIMITS-64-NEXT: 80001063 bnez zero, 0x10124
# RUN: not ld.lld %t.rv32.o --defsym foo=_start+0x1000 --defsym bar=_start+4-0x1002 -o /dev/null 2>&1 | FileCheck --check-prefix=ERROR-RANGE %s
# RUN: not ld.lld %t.rv64.o --defsym foo=_start+0x1000 --defsym bar=_start+4-0x1002 -o /dev/null 2>&1 | FileCheck --check-prefix=ERROR-RANGE %s
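
The CHECK-line churn in this and the following RISC-V tests is purely cosmetic: llvm-objdump now prints each instruction as a single 32-bit word instead of a sequence of little-endian bytes. The two spellings denote the same encoding, for example:

```python
# Old CHECK style: little-endian bytes; new style: the 32-bit word.
old_bytes = "63 02 00 00"  # beqz zero, ... (from the old CHECK-32 line)
word = int.from_bytes(bytes.fromhex(old_bytes.replace(" ", "")), "little")
assert f"{word:08x}" == "00000263"  # matches the new CHECK-32 line
```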
diff --git a/lld/test/ELF/riscv-call.s b/lld/test/ELF/riscv-call.s
index 5fef156df0bb..0e81e9b4710e 100644
--- a/lld/test/ELF/riscv-call.s
+++ b/lld/test/ELF/riscv-call.s
@@ -7,19 +7,19 @@
# RUN: ld.lld %t.rv64.o --defsym foo=_start+8 --defsym bar=_start -o %t.rv64
# RUN: llvm-objdump -d %t.rv32 | FileCheck %s
# RUN: llvm-objdump -d %t.rv64 | FileCheck %s
-# CHECK: 97 00 00 00 auipc ra, 0x0
-# CHECK-NEXT: e7 80 80 00 jalr 0x8(ra)
-# CHECK: 97 00 00 00 auipc ra, 0x0
-# CHECK-NEXT: e7 80 80 ff jalr -0x8(ra)
+# CHECK: 00000097 auipc ra, 0x0
+# CHECK-NEXT: 008080e7 jalr 0x8(ra)
+# CHECK: 00000097 auipc ra, 0x0
+# CHECK-NEXT: ff8080e7 jalr -0x8(ra)
# RUN: ld.lld %t.rv32.o --defsym foo=_start+0x7ffff7ff --defsym bar=_start+8-0x80000800 -o %t.rv32.limits
# RUN: ld.lld %t.rv64.o --defsym foo=_start+0x7ffff7ff --defsym bar=_start+8-0x80000800 -o %t.rv64.limits
# RUN: llvm-objdump -d %t.rv32.limits | FileCheck --check-prefix=LIMITS %s
# RUN: llvm-objdump -d %t.rv64.limits | FileCheck --check-prefix=LIMITS %s
-# LIMITS: 97 f0 ff 7f auipc ra, 0x7ffff
-# LIMITS-NEXT: e7 80 f0 7f jalr 0x7ff(ra)
-# LIMITS-NEXT: 97 00 00 80 auipc ra, 0x80000
-# LIMITS-NEXT: e7 80 00 80 jalr -0x800(ra)
+# LIMITS: 7ffff097 auipc ra, 0x7ffff
+# LIMITS-NEXT: 7ff080e7 jalr 0x7ff(ra)
+# LIMITS-NEXT: 80000097 auipc ra, 0x80000
+# LIMITS-NEXT: 800080e7 jalr -0x800(ra)
# RUN: ld.lld %t.rv32.o --defsym foo=_start+0x7ffff800 --defsym bar=_start+8-0x80000801 -o %t
# RUN: not ld.lld %t.rv64.o --defsym foo=_start+0x7ffff800 --defsym bar=_start+8-0x80000801 -o /dev/null 2>&1 | \
diff --git a/lld/test/ELF/riscv-hi20-lo12.s b/lld/test/ELF/riscv-hi20-lo12.s
index 85861432db0b..b9786f563f28 100644
--- a/lld/test/ELF/riscv-hi20-lo12.s
+++ b/lld/test/ELF/riscv-hi20-lo12.s
@@ -7,23 +7,23 @@
# RUN: ld.lld %t.rv64.o --defsym foo=0 --defsym bar=42 -o %t.rv64
# RUN: llvm-objdump -d %t.rv32 | FileCheck %s
# RUN: llvm-objdump -d %t.rv64 | FileCheck %s
-# CHECK: 37 05 00 00 lui a0, 0x0
-# CHECK-NEXT: 13 05 05 00 mv a0, a0
-# CHECK-NEXT: 23 20 a5 00 sw a0, 0x0(a0)
-# CHECK-NEXT: b7 05 00 00 lui a1, 0x0
-# CHECK-NEXT: 93 85 a5 02 addi a1, a1, 0x2a
-# CHECK-NEXT: 23 a5 b5 02 sw a1, 0x2a(a1)
+# CHECK: 00000537 lui a0, 0x0
+# CHECK-NEXT: 00050513 mv a0, a0
+# CHECK-NEXT: 00a52023 sw a0, 0x0(a0)
+# CHECK-NEXT: 000005b7 lui a1, 0x0
+# CHECK-NEXT: 02a58593 addi a1, a1, 0x2a
+# CHECK-NEXT: 02b5a523 sw a1, 0x2a(a1)
# RUN: ld.lld %t.rv32.o --defsym foo=0x7ffff7ff --defsym bar=0x7ffff800 -o %t.rv32.limits
# RUN: ld.lld %t.rv64.o --defsym foo=0x7ffff7ff --defsym bar=0xffffffff7ffff800 -o %t.rv64.limits
# RUN: llvm-objdump -d %t.rv32.limits | FileCheck --check-prefix=LIMITS %s
# RUN: llvm-objdump -d %t.rv64.limits | FileCheck --check-prefix=LIMITS %s
-# LIMITS: 37 f5 ff 7f lui a0, 0x7ffff
-# LIMITS-NEXT: 13 05 f5 7f addi a0, a0, 0x7ff
-# LIMITS-NEXT: a3 2f a5 7e sw a0, 0x7ff(a0)
-# LIMITS-NEXT: b7 05 00 80 lui a1, 0x80000
-# LIMITS-NEXT: 93 85 05 80 addi a1, a1, -0x800
-# LIMITS-NEXT: 23 a0 b5 80 sw a1, -0x800(a1)
+# LIMITS: 7ffff537 lui a0, 0x7ffff
+# LIMITS-NEXT: 7ff50513 addi a0, a0, 0x7ff
+# LIMITS-NEXT: 7ea52fa3 sw a0, 0x7ff(a0)
+# LIMITS-NEXT: 800005b7 lui a1, 0x80000
+# LIMITS-NEXT: 80058593 addi a1, a1, -0x800
+# LIMITS-NEXT: 80b5a023 sw a1, -0x800(a1)
# RUN: not ld.lld %t.rv64.o --defsym foo=0x7ffff800 --defsym bar=0xffffffff7ffff7ff -o /dev/null 2>&1 | FileCheck --check-prefix ERROR %s
# ERROR: relocation R_RISCV_HI20 out of range: 524288 is not in [-524288, 524287]; references 'foo'
diff --git a/lld/test/ELF/riscv-jal.s b/lld/test/ELF/riscv-jal.s
index cd3b842aad60..2129e4454706 100644
--- a/lld/test/ELF/riscv-jal.s
+++ b/lld/test/ELF/riscv-jal.s
@@ -7,19 +7,19 @@
# RUN: ld.lld %t.rv64.o --defsym foo=_start+4 --defsym bar=_start -o %t.rv64
# RUN: llvm-objdump -d %t.rv32 | FileCheck %s --check-prefix=CHECK-32
# RUN: llvm-objdump -d %t.rv64 | FileCheck %s --check-prefix=CHECK-64
-# CHECK-32: 6f 00 40 00 j 0x110b8
-# CHECK-32: ef f0 df ff jal 0x110b4
-# CHECK-64: 6f 00 40 00 j 0x11124
-# CHECK-64: ef f0 df ff jal 0x11120
+# CHECK-32: 0040006f j 0x110b8
+# CHECK-32: ffdff0ef jal 0x110b4
+# CHECK-64: 0040006f j 0x11124
+# CHECK-64: ffdff0ef jal 0x11120
# RUN: ld.lld %t.rv32.o --defsym foo=_start+0xffffe --defsym bar=_start+4-0x100000 -o %t.rv32.limits
# RUN: ld.lld %t.rv64.o --defsym foo=_start+0xffffe --defsym bar=_start+4-0x100000 -o %t.rv64.limits
# RUN: llvm-objdump -d %t.rv32.limits | FileCheck --check-prefix=LIMITS-32 %s
# RUN: llvm-objdump -d %t.rv64.limits | FileCheck --check-prefix=LIMITS-64 %s
-# LIMITS-32: 6f f0 ff 7f j 0x1110b2
-# LIMITS-32-NEXT: ef 00 00 80 jal 0xfff110b8
-# LIMITS-64: 6f f0 ff 7f j 0x11111e
-# LIMITS-64-NEXT: ef 00 00 80 jal 0xfffffffffff11124
+# LIMITS-32: 7ffff06f j 0x1110b2
+# LIMITS-32-NEXT: 800000ef jal 0xfff110b8
+# LIMITS-64: 7ffff06f j 0x11111e
+# LIMITS-64-NEXT: 800000ef jal 0xfffffffffff11124
# RUN: not ld.lld %t.rv32.o --defsym foo=_start+0x100000 --defsym bar=_start+4-0x100002 -o /dev/null 2>&1 | FileCheck --check-prefix=ERROR-RANGE %s
# RUN: not ld.lld %t.rv64.o --defsym foo=_start+0x100000 --defsym bar=_start+4-0x100002 -o /dev/null 2>&1 | FileCheck --check-prefix=ERROR-RANGE %s
diff --git a/lld/test/wasm/init-fini.ll b/lld/test/wasm/init-fini.ll
index 3d2e9a78043e..ef2f41f96e89 100644
--- a/lld/test/wasm/init-fini.ll
+++ b/lld/test/wasm/init-fini.ll
@@ -78,7 +78,7 @@ entry:
; CHECK-NEXT: Body: 10041005100A100F1012100F10141004100C100F10161002100E0B
; CHECK: - Index: 22
; CHECK-NEXT: Locals:
-; CHECK-NEXT: Body: 02404186808080004100418088808000108080808000450D0000000B0B
+; CHECK-NEXT: Body: 02404186808080004100418088808000108080808000450D00000B0B
; CHECK-NEXT: - Type: CUSTOM
; CHECK-NEXT: Name: name
; CHECK-NEXT: FunctionNames:
diff --git a/lldb/bindings/interface/SBValueDocstrings.i b/lldb/bindings/interface/SBValueDocstrings.i
index 6bab923e8b35..59fa807f5ec9 100644
--- a/lldb/bindings/interface/SBValueDocstrings.i
+++ b/lldb/bindings/interface/SBValueDocstrings.i
@@ -135,6 +135,26 @@ linked list."
%feature("docstring", "Expands nested expressions like .a->b[0].c[1]->d."
) lldb::SBValue::GetValueForExpressionPath;
+%feature("docstring", "
+ Return the value as an address. On failure, LLDB_INVALID_ADDRESS
+ will be returned. On architectures like AArch64, where the
+ top (unaddressable) bits can be used for authentication,
+ memory tagging, or top byte ignore, this method will return
+ the value with those top bits cleared.
+
+ GetValueAsUnsigned returns the actual value, with the
+ authentication/Top Byte Ignore/Memory Tagging Extension bits.
+
+ Calling this on a random value which is not a pointer is
+ incorrect. Call GetType().IsPointerType() if in doubt.
+
+ An SB API program may want to show both the literal byte value
+ and the address it refers to in memory. These two SBValue
+ methods allow SB API writers to behave appropriately for their
+ interface."
+) lldb::SBValue::GetValueAsAddress;
+
+
%feature("doctstring", "
Returns the number for children.
diff --git a/lldb/docs/resources/lldbgdbremote.md b/lldb/docs/resources/lldbgdbremote.md
index cbe5c766d61e..a9fa2a432b70 100644
--- a/lldb/docs/resources/lldbgdbremote.md
+++ b/lldb/docs/resources/lldbgdbremote.md
@@ -27,18 +27,8 @@ standard GDB remote protocol packets.
## QStartNoAckMode
-### Brief
-
Try to enable no ACK mode to skip sending ACKs and NACKs.
-### Priority To Implement
-
-High. Any GDB remote server that can implement this should if the
-connection is reliable. This improves packet throughput and increases
-the performance of the connection.
-
-### Description
-
Having to send an ACK/NACK after every packet slows things down a bit, so we
have a way to disable ACK packets to minimize the traffic for reliable
communication interfaces (like sockets). Below GDB or LLDB will send this
@@ -52,17 +42,15 @@ read packet: $OK#9a
send packet: +
```
-## QSupported
-
-### Brief
-
-Query the GDB remote server for features it supports
-
### Priority To Implement
-Optional.
+High. Any GDB remote server that can implement this should if the
+connection is reliable. This improves packet throughput and increases
+the performance of the connection.
+
+## QSupported
-### Description
+Query the GDB remote server for features it supports
QSupported is a standard GDB Remote Serial Protocol packet, but
there are several additions to the response that lldb can parse.
@@ -96,21 +84,14 @@ In the example above, three lldb extensions are shown:
watchpoints, up to a pointer size, `sizeof(void*)`, a reasonable
baseline assumption.
+### Priority To Implement
-## "A" - launch args packet
+Optional.
-### Brief
+## "A" - launch args packet
Launch a program using the supplied arguments
-### Priority To Implement
-
-Low. Only needed if the remote target wants to launch a target after
-making a connection to a GDB server that isn't already connected to
-an inferior process.
-
-### Description
-
We have added support for the "set program arguments" packet where we can
start a connection to a remote server and then later supply the path to the
executable and the arguments to use when executing:
@@ -130,10 +111,25 @@ The above packet helps when you have remote debugging abilities where you
could launch a process on a remote host, this isn't needed for bare board
debugging.
+### Priority To Implement
+
+Low. Only needed if the remote target wants to launch a target after
+making a connection to a GDB server that isn't already connected to
+an inferior process.
+
+## qLaunchSuccess
-## QEnvironment:NAME=VALUE
+Check whether launching a process with the `A` packet succeeded.
+
+Returns the status of the last attempt to launch a process.
+Either `OK` if no error occurred, or `E` followed by a string
+describing the error.
+
+### Priority To Implement
+
+High, launching processes is a key part of LLDB's platform mode.
-### Brief
+## QEnvironment:NAME=VALUE
Setup the environment up for a new child process that will soon be
launched using the "A" packet.
@@ -146,14 +142,6 @@ scan the environment strings before sending, prefer
the `QEnvironmentHexEncoded` packet over `QEnvironment`, if it is
available.
-### Priority To Implement
-
-Low. Only needed if the remote target wants to launch a target after
-making a connection to a GDB server that isn't already connected to
-an inferior process.
-
-### Description
-
Both GDB and LLDB support passing down environment variables. It is ok to
respond with a `$#00` (unimplemented):
```
@@ -162,9 +150,13 @@ read packet: $OK#00
```
This packet can be sent one or more times _prior_ to sending a "A" packet.
-## QEnvironmentHexEncoded:HEX-ENCODING(NAME=VALUE)
+### Priority To Implement
+
+Low. Only needed if the remote target wants to launch a target after
+making a connection to a GDB server that isn't already connected to
+an inferior process.
-### Brief
+## QEnvironmentHexEncoded:HEX-ENCODING(NAME=VALUE)
Setup the environment up for a new child process that will soon be
launched using the "A" packet.
@@ -173,14 +165,6 @@ The only difference between this packet and `QEnvironment` is that the
environment key-value pair is ascii hex encoded for transmission.
This allows values with gdb-remote metacharacters like `#` to be sent.
-### Priority To Implement
-
-Low. Only needed if the remote target wants to launch a target after
-making a connection to a GDB server that isn't already connected to
-an inferior process.
-
-### Description
-
Both GDB and LLDB support passing down environment variables. It is ok to
respond with a `$#00` (unimplemented):
```
@@ -189,9 +173,13 @@ read packet: $OK#00
```
This packet can be sent one or more times _prior_ to sending a "A" packet.
-## QEnableErrorStrings
+### Priority To Implement
+
+Low. Only needed if the remote target wants to launch a target after
+making a connection to a GDB server that isn't already connected to
+an inferior process.
-### Brief
+## QEnableErrorStrings
This packet enables reporting of Error strings in remote packet
replies from the server to client. If the server supports this
@@ -221,19 +209,9 @@ read packet: $OK#00
## QSetSTDIN:\<ascii-hex-path\> / QSetSTDOUT:\<ascii-hex-path\> / QSetSTDERR:\<ascii-hex-path\>
-### Brief
-
Setup where STDIN, STDOUT, and STDERR go prior to sending an "A"
packet.
-### Priority To Implement
-
-Low. Only needed if the remote target wants to launch a target after
-making a connection to a GDB server that isn't already connected to
-an inferior process.
-
-### Description
-
When launching a program through the GDB remote protocol with the "A" packet,
you might also want to specify where stdin/out/err go:
```
@@ -243,19 +221,16 @@ QSetSTDERR:<ascii-hex-path>
```
These packets must be sent _prior_ to sending a "A" packet.
-## QSetWorkingDir:\<ascii-hex-path\>
-
-### Brief
-
-Set the working directory prior to sending an "A" packet.
-
### Priority To Implement
Low. Only needed if the remote target wants to launch a target after
making a connection to a GDB server that isn't already connected to
an inferior process.
-### Description
+
+## QSetWorkingDir:\<ascii-hex-path\>
+
+Set the working directory prior to sending an "A" packet.
Or specify the working directory:
```
@@ -263,20 +238,27 @@ QSetWorkingDir:<ascii-hex-path>
```
This packet must be sent _prior_ to sending a "A" packet.
-## QSetDisableASLR:\<bool\>
-
-### Brief
-
-Enable or disable ASLR on the next "A" packet.
-
### Priority To Implement
Low. Only needed if the remote target wants to launch a target after
making a connection to a GDB server that isn't already connected to
-an inferior process and if the target supports disabling ASLR
-(Address space layout randomization).
+an inferior process.
-### Description
+## qGetWorkingDir
+
+Get the current working directory of the platform stub in
+ASCII hex encoding.
+
+### Example
+
+```
+receive: qGetWorkingDir
+send: 2f4170706c65496e7465726e616c2f6c6c64622f73657474696e67732f342f5465737453657474696e67732e746573745f646973617373656d626c65725f73657474696e6773
+```
+
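The reply is nothing more than the working directory in ASCII hex; decoding the example reply above (a sketch using Python's standard library):

```python
reply = "2f4170706c65496e7465726e616c2f6c6c64622f73657474696e67732f342f5465737453657474696e67732e746573745f646973617373656d626c65725f73657474696e6773"
assert bytes.fromhex(reply).decode("ascii") == (
    "/AppleInternal/lldb/settings/4/TestSettings.test_disassembler_settings"
)
```
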
+## QSetDisableASLR:\<bool\>
+
+Enable or disable ASLR on the next "A" packet.
Or control if ASLR is enabled/disabled:
```
@@ -288,9 +270,14 @@ read packet: OK
```
This packet must be sent _prior_ to sending a "A" packet.
-## QListThreadsInStopReply
+### Priority To Implement
+
+Low. Only needed if the remote target wants to launch a target after
+making a connection to a GDB server that isn't already connected to
+an inferior process and if the target supports disabling ASLR
+(Address space layout randomization).
-### Brief
+## QListThreadsInStopReply
Enable the `threads:` and `thread-pcs:` data in the question-mark packet
("T packet") responses when the stub reports that a program has
@@ -312,8 +299,6 @@ read packet: OK
## jLLDBTraceSupported
-### Brief
-
Get the processor tracing type supported by the gdb-server for the current
inferior. Responses might be different depending on the architecture and
capabilities of the underlying OS.
@@ -347,8 +332,6 @@ read packet: {"name":<name>, "description":<description>}/E<error code>;AAAAAAAA
## jLLDBTraceStart
-### Brief
-
Start tracing a process or its threads using a provided tracing technology.
The input and output are specified as JSON objects. In case of success, an OK
response is returned, or an error otherwise.
@@ -501,8 +484,6 @@ read packet: OK/E<error code>;AAAAAAAAA
## jLLDBTraceStop
-### Brief
-
Stop tracing a process or its threads using a provided tracing technology.
The input and output are specified as JSON objects. In case of success, an OK
response is returned, or an error otherwise.
@@ -554,8 +535,6 @@ read packet: OK/E<error code>;AAAAAAAAA
## jLLDBTraceGetState
-### Brief
-
Get the current state of the process and its threads being traced by
a given trace technology. The response is a JSON object with custom
information depending on the trace technology. In case of errors, an
@@ -661,8 +640,6 @@ read packet: {...object}/E<error code>;AAAAAAAAA
## jLLDBTraceGetBinaryData
-### Brief
-
Get binary data given a trace technology and a data identifier.
The input is specified as a JSON object and the response has the same format
as the "binary memory read" (aka "x") packet. In case of failures, an error
@@ -693,29 +670,8 @@ read packet: <binary data>/E<error code>;AAAAAAAAA
## qRegisterInfo\<hex-reg-id\>
-### Brief
-
Discover register information from the remote GDB server.
-### Priority To Implement
-
-High. Any target that can self describe its registers, should do so.
-This means if new registers are ever added to a remote target, they
-will get picked up automatically, and allows registers to change
-depending on the actual CPU type that is used.
-
-NB: `qRegisterInfo` is deprecated in favor of the standard gdb remote
-serial protocol register description method,
-`qXfer:features:read:target.xml`.
-If `qXfer:features:read:target.xml` is supported, `qRegisterInfo` does
-not need to be implemented. The target.xml format is used by most
-gdb RSP stubs whereas `qRegisterInfo` was an lldb-only design.
-`qRegisterInfo` requires one packet per register and can have undesirable
-performance costs at the start of a debug session, whereas target.xml
-may be able to describe all registers in a single packet.
-
-### Description
-
With LLDB, for register information, remote GDB servers can add
support for the "qRegisterInfoN" packet where "N" is a zero based
base 16 register number that must start at zero and increase by one
@@ -981,19 +937,26 @@ The keys and values are detailed below:
modifying the CPSR register can cause the r8 - r14 and cpsr value to
change depending on if the mode has changed.
+### Priority To Implement
-## qPlatform_shell
-
-### Brief
-
-Run a command in a shell on the connected remote machine.
+High. Any target that can self describe its registers, should do so.
+This means if new registers are ever added to a remote target, they
+will get picked up automatically, and allows registers to change
+depending on the actual CPU type that is used.
-### Priority To Implement
+NB: `qRegisterInfo` is deprecated in favor of the standard gdb remote
+serial protocol register description method,
+`qXfer:features:read:target.xml`.
+If `qXfer:features:read:target.xml` is supported, `qRegisterInfo` does
+not need to be implemented. The target.xml format is used by most
+gdb RSP stubs whereas `qRegisterInfo` was an lldb-only design.
+`qRegisterInfo` requires one packet per register and can have undesirable
+performance costs at the start of a debug session, whereas target.xml
+may be able to describe all registers in a single packet.
-High. This command allows LLDB clients to run arbitrary shell
-commands on a remote host.
+## qPlatform_shell
-### Description
+Run a command in a shell on the connected remote machine.
The request consists of the command to be executed encoded in ASCII characters
converted into hex bytes.
@@ -1014,40 +977,35 @@ drwxrwxr-x 5 username groupname 4096 Aug 15 21:36 source.cpp
-rw-r--r-- 1 username groupname 3190 Aug 12 16:46 Makefile
```
-## qPlatform_mkdir
-
-### Brief
-
-Creates a new directory on the connected remote machine.
-
### Priority To Implement
-Low. This command allows LLDB clients to create new directories on
-a remote host.
+High. This command allows LLDB clients to run arbitrary shell
+commands on a remote host.
+
+## qPlatform_mkdir
-### Description
+Creates a new directory on the connected remote machine.
Request: `qPlatform_mkdir:<hex-file-mode>,<ascii-hex-path>`
+The request packet has the following fields:
+ 1. mode bits in base 16
+ 2. file path in ascii-hex encoding
+
Reply:
* `F<mkdir-return-code>`
(mkdir called successfully and returned with the given return code)
* `Exx` (An error occurred)
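
For instance, creating `/tmp/a` with mode `0755` would be encoded as follows (an illustrative sketch of the encoding, not a captured log):

```python
mode = f"{0o755:x}"                    # "1ed", mode bits in base 16
path = "/tmp/a".encode("ascii").hex()  # "2f746d702f61", path in ASCII hex
assert f"qPlatform_mkdir:{mode},{path}" == "qPlatform_mkdir:1ed,2f746d702f61"
```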
+### Priority To Implement
-## qPlatform_chmod
+Low. This command allows LLDB clients to create new directories on
+a remote host.
-### Brief
+## vFile:chmod / qPlatform_chmod
Change the permissions of a file on the connected remote machine.
-### Priority To Implement
-
-Low. This command allows LLDB clients to change the permissions of
-a file on the remote host.
-
-### Description
-
Request: `qPlatform_chmod:<hex-file-mode>,<ascii-hex-path>`
Reply:
@@ -1055,19 +1013,13 @@ Reply:
(chmod called successfully and returned with the given return code)
* `Exx` (An error occurred)
-## qHostInfo
-
-### Brief
-
-Get information about the host we are remotely connected to.
-
### Priority To Implement
-High. This packet is usually very easy to implement and can help
-LLDB select the correct plug-ins for the job based on the target
-triple information that is supplied.
+Low.
+
+## qHostInfo
-### Description
+Get information about the host we are remotely connected to.
LLDB supports a host info call that gets all sorts of details of the system
that is being debugged:
@@ -1114,20 +1066,16 @@ Key value pairs are one of:
AArch64 can have different page table setups for low and high
memory, and therefore a different number of bits used for addressing.
-## qGDBServerVersion
-
-### Brief
-
-Get version information about this implementation of the gdb-remote
-protocol.
-
### Priority To Implement
High. This packet is usually very easy to implement and can help
-LLDB to work around bugs in a server's implementation when they
-are found.
+LLDB select the correct plug-ins for the job based on the target
+triple information that is supplied.
+
+## qGDBServerVersion
-### Description
+Get version information about this implementation of the gdb-remote
+protocol.
The goal of this packet is to provide enough information about an
implementation of the gdb-remote-protocol server that lldb can
@@ -1159,9 +1107,13 @@ Suggested key names:
* `major_version`: major version number
* `minor_version`: minor version number
-## qProcessInfo
+### Priority To Implement
-### Brief
+High. This packet is usually very easy to implement and can help
+LLDB to work around bugs in a server's implementation when they
+are found.
+
+## qProcessInfo
Get information about the process we are currently debugging.
@@ -1178,8 +1130,6 @@ process to know what you're working with.
All numeric fields return base 16 numbers without any "0x" prefix.
-### Description
-
An i386 process:
```
send packet: $qProcessInfo#00
@@ -1216,24 +1166,9 @@ Key value pairs include:
## qShlibInfoAddr
-### Brief
-
Get an address where the dynamic linker stores information about
where shared libraries are loaded.
-### Priority To Implement
-
-High if you have a dynamic loader plug-in in LLDB for your target
-triple (see the "qHostInfo" packet) that can use this information.
-Many times address load randomization can make it hard to detect
-where the dynamic loader binary and data structures are located and
-some platforms know, or can find out where this information is.
-
-Low if you have a debug target where all object and symbol files
-contain static load addresses.
-
-### Description
-
LLDB and GDB both support the `qShlibInfoAddr` packet which is a hint to each
debugger as to where to find the dynamic loader information. For darwin
binaries that run in user land this is the address of the `all_image_infos`
@@ -1245,12 +1180,29 @@ send packet: $qShlibInfoAddr#00
read packet: $7fff5fc40040#00
```
-## qThreadStopInfo\<tid\>
+### Priority To Implement
+
+High if you have a dynamic loader plug-in in LLDB for your target
+triple (see the "qHostInfo" packet) that can use this information.
+Many times address load randomization can make it hard to detect
+where the dynamic loader binary and data structures are located and
+some platforms know, or can find out where this information is.
+
+Low if you have a debug target where all object and symbol files
+contain static load addresses.
-### Brief
+## qThreadStopInfo\<tid\>
Get information about why a thread, whose ID is `<tid>`, is stopped.
+LLDB tries to use the `qThreadStopInfo` packet which is formatted as
+`qThreadStopInfo%x` where `%x` is the hex thread ID. This requests information
+about why a thread is stopped. The response is the same as the stop reply
+packets and tells us what happened to the other threads. The standard GDB
+remote packets love to think that there is only _one_ reason that _one_ thread
+stops at a time. This allows us to see why all threads stopped and allows us
+to implement better multi-threaded debugging support.
+
### Priority To Implement
High if you need to support multi-threaded or multi-core debugging.
@@ -1261,34 +1213,10 @@ threads (live system debug) / cores (JTAG) in your program have
stopped and allows LLDB to display and control your program
correctly.
-### Description
-
-LLDB tries to use the `qThreadStopInfo` packet which is formatted as
-`qThreadStopInfo%x` where `%x` is the hex thread ID. This requests information
-about why a thread is stopped. The response is the same as the stop reply
-packets and tells us what happened to the other threads. The standard GDB
-remote packets love to think that there is only _one_ reason that _one_ thread
-stops at a time. This allows us to see why all threads stopped and allows us
-to implement better multi-threaded debugging support.
-
## QThreadSuffixSupported
-### Brief
-
Try to enable thread suffix support for the `g`, `G`, `p`, and `P` packets.
-### Priority To Implement
-
-High. Adding a thread suffix allows us to read and write registers
-more efficiently and stops us from having to select a thread with
-one packet and then read registers with a second packet. It also
-makes sure that no errors can occur where the debugger thinks it
-already has a thread selected (see the `Hg` packet from the standard
-GDB remote protocol documentation) yet the remote GDB server actually
-has another thread selected.
-
-### Description
-
When reading thread registers, you currently need to set the current
thread, then read the registers. This is kind of cumbersome, so we added the
ability to query if the remote GDB server supports adding a `thread:<tid>;`
@@ -1326,21 +1254,20 @@ read packet: ....
We also added support for allocating and deallocating memory. We use this to
allocate memory so we can run JITed code.
-## _M\<size\>,\<permissions\>
-
-### Brief
-
-Allocate memory on the remote target with the specified size and
-permissions.
-
### Priority To Implement
-High if you want LLDB to be able to JIT code and run that code. JIT
-code also needs data which is also allocated and tracked.
+High. Adding a thread suffix allows us to read and write registers
+more efficiently and stops us from having to select a thread with
+one packet and then read registers with a second packet. It also
+makes sure that no errors can occur where the debugger thinks it
+already has a thread selected (see the `Hg` packet from the standard
+GDB remote protocol documentation) yet the remote GDB server actually
+has another thread selected.
-Low if you don't support running JIT'ed code.
+## _M\<size\>,\<permissions\>
-### Description
+Allocate memory on the remote target with the specified size and
+permissions.
The allocate memory packet starts with `_M<size>,<permissions>`. It returns a
raw big endian address value, or an empty response for unimplemented, or `EXX` for an error
@@ -1362,13 +1289,6 @@ You request a size and give the permissions. This packet does NOT need to be
implemented if you don't want to support running JITed code. The return value
is just the address of the newly allocated memory as raw big endian hex bytes.
-## _m\<addr\>
-
-### Brief
-
-Deallocate memory that was previously allocated using an allocate
-memory pack.
-
### Priority To Implement
High if you want LLDB to be able to JIT code and run that code. JIT
@@ -1376,30 +1296,26 @@ code also needs data which is also allocated and tracked.
Low if you don't support running JIT'ed code.
-### Description
+## _m\<addr\>
+
+Deallocate memory that was previously allocated using an allocate
+memory pack.
The deallocate memory packet is `_m<addr>` where you pass in the address you
got back from a previous call to the allocate memory packet. It returns `OK`
if the memory was successfully deallocated, or `EXX` for an error, or an
empty response if not supported.
-## qMemoryRegionInfo:\<addr\>
-
-### Brief
+### Priority To Implement
-Get information about the address range that contains `<addr>`.
+High if you want LLDB to be able to JIT code and run that code. JIT
+code also needs data which is also allocated and tracked.
-### Priority To Implement
+Low if you don't support running JIT'ed code.
-Medium. This is nice to have, but it isn't necessary. It helps LLDB
-do stack unwinding when we branch into memory that isn't executable.
-If we can detect that the code we are stopped in isn't executable,
-then we can recover registers for stack frames above the current
-frame. Otherwise we must assume we are in some JIT'ed code (not JIT
-code that LLDB has made) and assume that no registers are available
-in higher stack frames.
+## qMemoryRegionInfo:\<addr\>
-### Description
+Get information about the address range that contains `<addr>`.
We added a way to get information for a memory region. The packet is:
```
@@ -1455,9 +1371,17 @@ For instance, with a macOS process which has nothing mapped in the first
The lack of `permissions:` indicates that none of read/write/execute are valid
for this region.
-## "x" - Binary memory read
+### Priority To Implement
+
+Medium. This is nice to have, but it isn't necessary. It helps LLDB
+do stack unwinding when we branch into memory that isn't executable.
+If we can detect that the code we are stopped in isn't executable,
+then we can recover registers for stack frames above the current
+frame. Otherwise we must assume we are in some JIT'ed code (not JIT
+code that LLDB has made) and assume that no registers are available
+in higher stack frames.
-### Brief
+## "x" - Binary memory read
Like the `m` (read) and `M` (write) packets, this is a partner to the
`X` (write binary data) packet, `x`.
@@ -1491,8 +1415,6 @@ transport layer is assumed.
## Detach and stay stopped
-### Description
-
We extended the "D" packet to specify that the monitor should keep the
target suspended on detach. The normal behavior is to resume execution
on detach. We will send:
@@ -1513,8 +1435,6 @@ D
## QSaveRegisterState / QSaveRegisterState;thread:XXXX;
-### Brief
-
The `QSaveRegisterState` packet tells the remote debugserver to save
all registers and return a non-zero unique integer ID that
represents these save registers. If thread suffixes are enabled the
@@ -1543,8 +1463,6 @@ for the `QRestoreRegisterState` is added.
## QRestoreRegisterState:\<save_id\> / QRestoreRegisterState:\<save_id\>;thread:XXXX;
-### Brief
-
The `QRestoreRegisterState` packet tells the remote debugserver to
restore all registers using the `save_id` which is an unsigned
integer that was returned from a previous call to
@@ -1568,8 +1486,6 @@ for the `QSaveRegisterState` is added.
## qFileLoadAddress:\<file_path\>
-### Brief
-
Get the load address of a memory mapped file.
The load address is defined as the address of the first memory
region what contains data mapped from the specified file.
@@ -1587,8 +1503,6 @@ some object file in the rendezvous data structure.
## qModuleInfo:\<module_path\>;\<arch triple\>
-### Brief
-
Get information for a module by given module path and architecture.
### Response
@@ -1603,8 +1517,6 @@ UUID directly from inferior's memory.
## jModulesInfo:[{"file":"...",triple:"..."}, ...]
-### Brief
-
Get information for a list of modules by given module path and
architecture.
@@ -1631,14 +1543,10 @@ the communication link has a non-negligible latency.
## Stop reply packet extensions
-### Brief
-
This section describes some of the additional information you can
specify in stop reply packets that help LLDB to know more detailed
information about your threads.
-### Description
-
Standard GDB remote stop reply packets are reply packets sent in
response to a packet that made the program run. They come in the
following forms:
@@ -1847,19 +1755,15 @@ your debug session more reliable and informative.
## qfProcessInfo / qsProcessInfo (Platform Extension)
-### Brief
-
Get the first process info (`qfProcessInfo`) or subsequent process
info (`qsProcessInfo`) for one or more processes on the remote
platform. The first call gets the first match and subsequent calls
to `qsProcessInfo` gets the subsequent matches. Return an error `EXX`,
where `XX` are two hex digits, when no more matches are available.
-### Priority To Implement
-Required. The `qfProcessInfo` packet can be followed by a `:` and
+The `qfProcessInfo` packet can be followed by a `:` and
some key value pairs. The key value pairs in the command are:
-
* `name` - `ascii-hex` -
An ASCII hex string that contains the name of the process that will be matched.
* `name_match` - `enum` -
@@ -1900,15 +1804,48 @@ send packet: $qsProcessInfo#00
read packet: $E04#00
```
-## qLaunchGDBServer (Platform Extension)
+### Priority To Implement
-### Brief
+Required.
-Have the remote platform launch a GDB server.
+## qPathComplete (Platform Extension)
-### Priority To Implement
+Get a list of matched disk files/directories by passing a boolean flag
+and a partial path.
+
+### Example
+
+```
+receive: qPathComplete:0,6d61696e
+send: M6d61696e2e637070
+receive: qPathComplete:1,746573
+send: M746573742f,74657374732f
+```
+
+If the first argument is zero, the result should contain all
+files (including directories) starting with the given path. If the
+argument is one, the result should contain only directories.
+
+The result should be a comma-separated list of hex-encoded paths.
+Paths denoting a directory should end with a directory separator (`/` or `\`).
+
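Both the partial path in the request and each path in the reply are plain ASCII hex, so the example exchange above round-trips like this (sketch):

```python
assert "main".encode("ascii").hex() == "6d61696e"                       # request argument
assert bytes.fromhex("6d61696e2e637070").decode("ascii") == "main.cpp"  # reply entry
# Directory-only completion: the reply lists "test/" and "tests/".
dirs = [bytes.fromhex(p).decode("ascii") for p in "746573742f,74657374732f".split(",")]
assert dirs == ["test/", "tests/"]
```
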
+## qKillSpawnedProcess (Platform Extension)
+
+Kill a process running on the target system.
+
+### Example
+
+```
+receive: qKillSpawnedProcess:1337
+send: OK
+```
+The request packet has the process ID in base 10.
+
+## qLaunchGDBServer (Platform Extension)
-Required. The `qLaunchGDBServer` packet must be followed by a `:` and
+Have the remote platform launch a GDB server.
+
+The `qLaunchGDBServer` packet must be followed by a `:` and
some key value pairs. The key value pairs in the command are:
* `port` - `integer` -
A string value containing the decimal port ID or zero if the port should be
@@ -1916,11 +1853,6 @@ some key value pairs. The key value pairs in the command are:
* `host` - `integer` -
The host that connections should be limited to when the GDB server is connected to.
-### Description
-
-The response consists of key/value pairs where the key is separated from the
-values with colons and each pair is terminated with a semi colon.
-
Sample packet/response:
```
send packet: $qLaunchGDBServer:port:0;host:lldb.apple.com;#00
@@ -1934,20 +1866,15 @@ process was separately launched.
The `port` key/value pair in the response lets clients know what port number
to attach to in case zero was specified as the "port" in the sent command.
+### Priority To Implement
-## qProcessInfoPID:PID (Platform Extension)
+Required.
-### Brief
+## qProcessInfoPID:PID (Platform Extension)
Have the remote platform get detailed information on a process by
ID. PID is specified as a decimal integer.
-### Priority To Implement
-
-Optional.
-
-### Description
-
The response consists of key/value pairs where the key is separated from the
values with colons and each pair is terminated with a semi colon.
@@ -1967,9 +1894,11 @@ send packet: $qProcessInfoPID:60050#00
read packet: $pid:60050;ppid:59948;uid:7746;gid:11;euid:7746;egid:11;name:6c6c6462;triple:x86_64-apple-macosx;#00
```
-## vAttachName
+### Priority To Implement
-### Brief
+Optional.
+
+## vAttachName
Same as `vAttach`, except instead of a `pid` you send a process name.
@@ -1981,8 +1910,6 @@ it if attaching to a process by name makes sense for your environment.
## vAttachWait
-### Brief
-
Same as `vAttachName`, except that the stub should wait for the next instance
of a process by that name to be launched and attach to that.
@@ -1993,8 +1920,6 @@ gracefully if the packet is not supported.
## qAttachOrWaitSupported
-### Brief
-
This is a binary "is it supported" query. Return OK if you support
`vAttachOrWait`.
@@ -2006,8 +1931,6 @@ will do the right thing.
## vAttachOrWait
-### Brief
-
Same as `vAttachWait`, except that the stub will attach to a process
by name if it exists, and if it does not, it will wait for a process
of that name to appear and attach to it.
@@ -2024,20 +1947,10 @@ support this packet.
## jThreadExtendedInfo
-### Brief
-
This packet, which takes its arguments as JSON and sends its reply as
JSON, allows the gdb remote stub to provide additional information
about a given thread.
-### Priority To Implement
-
-Low. This packet is only needed if the gdb remote stub wants to
-provide interesting additional information about a thread for the
-user.
-
-### Description
-
This packet takes its arguments in [JSON](http://www.json.org).
At a minimum, a thread must be specified, for example:
```
@@ -2085,9 +1998,13 @@ like:
jThreadExtendedInfo:{"thread":612910}]
```
-## QEnableCompression
+### Priority To Implement
+
+Low. This packet is only needed if the gdb remote stub wants to
+provide interesting additional information about a thread for the
+user.
-### Brief
+## QEnableCompression
This packet enables compression of the packets that the debug stub sends to lldb.
If the debug stub can support compression, it indicates this in the reply of the
@@ -2148,8 +2065,6 @@ Example compression algorithms that may be used include:
## jGetLoadedDynamicLibrariesInfos
-### Brief
-
This packet asks the remote debug stub to send the details about libraries
being added/removed from the process as a performance optimization.
@@ -2223,8 +2138,6 @@ STUB REPLIES: ${"images":
}
```
-### Description
-
This is similar to the `qXfer:libraries:read` packet, and it could
be argued that it should be merged into that packet. A separate
packet was created primarily because lldb needs to specify the
@@ -2243,18 +2156,8 @@ executable loaded.
## jThreadsInfo
-### Brief
-
Ask for the server for thread stop information of all threads.
-### Priority To Implement
-
-Low. This is a performance optimization, which speeds up debugging by avoiding
-multiple round-trips for retrieving thread information. The information from this
-packet can be retrieved using a combination of `qThreadStopInfo` and `m` packets.
-
-### Description
-
The data in this packet is very similar to the stop reply packets, but is packaged in
JSON and uses JSON arrays where applicable. The JSON output looks like:
```
@@ -2309,9 +2212,13 @@ On macOS with debugserver, we expedite the frame pointer backchain for a thread
the previous FP and PC), and follow the backchain. Most backtraces on macOS and
iOS now don't require us to read any memory!
-## jGetSharedCacheInfo
+### Priority To Implement
+
+Low. This is a performance optimization, which speeds up debugging by avoiding
+multiple round-trips for retrieving thread information. The information from this
+packet can be retrieved using a combination of `qThreadStopInfo` and `m` packets.
-### Brief
+## jGetSharedCacheInfo
This packet asks the remote debug stub to send the details about the inferior's
shared cache. The shared cache is a collection of common libraries/frameworks that
@@ -2332,17 +2239,8 @@ them from the inferior process.
## qQueryGDBServer
-### Brief
-
Ask the platform for the list of gdbservers we can connect to.
-### Priority To Implement
-
-Low. The packet is required to support connecting to gdbserver started
-by the platform instance automatically.
-
-### Description
-
If the remote platform automatically started one or more gdbserver instances
(without lldb asking it), then it has to return the list of port numbers or
socket names for each of them, which lldb can use to connect to those instances.
@@ -2360,29 +2258,28 @@ Example packet:
]
```
-## QSetDetachOnError
+### Priority To Implement
+
+Low. The packet is required to support connecting to gdbserver started
+by the platform instance automatically.
-### Brief
+## QSetDetachOnError
Sets what the server should do when the communication channel with LLDB
goes down. Either kill the inferior process (`0`) or remove breakpoints and
detach (`1`).
+The data in this packet is a single character, which should be `0` if the
+inferior process should be killed, or `1` if the server should remove all
+breakpoints and detach from the inferior.
+
### Priority To Implement
Low. Only required if the target wants to keep the inferior process alive
when the communication channel goes down.
-### Description
-
-The data in this packet is a single a character, which should be `0` if the
-inferior process should be killed, or `1` if the server should remove all
-breakpoints and detach from the inferior.
-
## jGetDyldProcessState
-### Brief
-
This packet fetches the process launch state, as reported by libdyld on
Darwin systems, most importantly to indicate when the system libraries
have initialized sufficiently to safely call utility functions.
@@ -2397,3 +2294,180 @@ STUB REPLIES: {"process_state_value":48,"process_state string":"dyld_process_sta
Low. This packet is needed to prevent lldb's utility functions for
scanning the Objective-C class list from running very early in
process startup.
+
+## vFile Packets
+
+Though some of these may match the ones described in GDB's protocol
+documentation, we include our own expectations here in case of
+mismatches or extensions.
+
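+To make those conventions concrete, here is a minimal Python sketch of the
+two things almost every `vFile` packet below shares: ASCII-hex path encoding
+and `F` response parsing. The helper names are illustrative only, not part
+of any LLDB API.
+
+```
+# Hypothetical helpers for the vFile conventions described below.
+def hex_encode(path: str) -> str:
+    # File paths travel as ASCII hex, e.g. "/tmp/a" -> "2f746d702f61".
+    return path.encode().hex()
+
+def parse_f_response(reply: str) -> int:
+    # Replies look like "F<value>" or "F-1,<errno>", both base 16.
+    assert reply.startswith("F")
+    value, _, err = reply[1:].partition(",")
+    if value == "-1":
+        raise OSError(int(err, 16), "remote errno")
+    return int(value, 16)
+```
+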
+### vFile:size
+
+Get the size of a file on the target system, filename in ASCII hex.
+
+#### Example
+
+```
+receive: vFile:size:2f746d702f61
+send: Fc008
+```
+
+Response is `F` followed by the file size in base 16.
+`F-1,errno` with the errno if an error occurs, base 16.
+
+### vFile:mode
+
+Get the mode bits of a file on the target system, filename in ASCII hex.
+
+#### Example
+
+```
+receive: vFile:mode:2f746d702f61
+send: F1ed
+```
+
+Response is `F` followed by the mode bits in base 16; the `0x1ed` here
+corresponds to `0755` in octal.
+`F-1,errno` with the errno if an error occurs, base 16.
+
+### vFile:unlink
+
+Remove a file on the target system.
+
+#### Example
+
+```
+receive: vFile:unlink:2f746d702f61
+send: F0
+```
+
+Argument is a file path in ascii-hex encoding.
+Response is `F` plus the return value of `unlink()`, base 16 encoding.
+The return value may optionally be followed by a comma and the base 16
+value of errno if unlink failed.
+
+### vFile:symlink
+
+Create a symbolic link (symlink, soft-link) on the target system.
+
+#### Example
+
+```
+receive: vFile:symlink:<SRC-FILE>,<DST-NAME>
+send: F0,0
+```
+
+Argument file paths are in ascii-hex encoding.
+Response is `F` plus the return value of `symlink()`, base 16 encoding,
+optionally followed by the value of errno if it failed, also base 16.
+
+### vFile:open
+
+Open a file on the remote system and return the file descriptor of it.
+
+#### Example
+
+```
+receive: vFile:open:2f746d702f61,00000001,00000180
+send: F8
+```
+
+Request packet has the fields:
+ 1. ASCII hex encoded filename
+ 2. Flags passed to the open call, base 16.
+ Note that these are not the `oflags` that `open(2)` takes, but
+ are the constant values in `enum OpenOptions` from LLDB's
+ [`File.h`](https://github.com/llvm/llvm-project/blob/main/lldb/include/lldb/Host/File.h).
+ 3. Mode bits, base 16
+
+Response is `F` followed by the opened file descriptor in base 16.
+`F-1,errno` with the errno if an error occurs, base 16.
+
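+As a worked example, here is a hypothetical sketch of building the request
+shown above; treating flag value `0x1` as "read only" is an assumption taken
+from the example, not a confirmed `OpenOptions` constant:
+
+```
+# Sketch (Python): assemble the vFile:open request from the example.
+# The 0x1 flag value is assumed for illustration; the real values live
+# in LLDB's enum OpenOptions (File.h).
+def build_vfile_open(path: str, flags: int, mode: int) -> str:
+    return "vFile:open:%s,%08x,%08x" % (path.encode().hex(), flags, mode)
+
+print(build_vfile_open("/tmp/a", 0x1, 0o600))
+# -> vFile:open:2f746d702f61,00000001,00000180
+```
+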
+### vFile:close
+
+Close a previously opened file descriptor.
+
+#### Example
+
+```
+receive: vFile:close:7
+send: F0
+```
+
+File descriptor is in base 16. `F-1,errno` with the errno (base 16) if an
+error occurs.
+
+### vFile:pread
+
+Read data from an opened file descriptor.
+
+#### Example
+
+```
+receive: vFile:pread:7,1024,0
+send: F4;a'b\00
+```
+
+Request packet has the fields:
+ 1. File descriptor, base 16
+ 2. Number of bytes to be read, base 16
+ 3. Offset into file to start from, base 16
+
+Response is `F`, followed by the number of bytes read (base 16), a
+semicolon, and the data in the binary-escaped-data encoding.
+
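+The binary-escaped-data encoding is the standard gdb-remote binary escaping,
+where byte `0x7d` (`}`) introduces an escaped byte. A rough sketch of
+decoding a pread reply, offered as an assumption about the framing rather
+than a reference implementation:
+
+```
+# Sketch (Python): split "F<count>;<escaped data>" and undo the
+# gdb-remote binary escaping, where 0x7d ('}') prefixes byte ^ 0x20.
+def parse_pread_reply(reply: bytes) -> bytes:
+    count_hex, _, escaped = reply[1:].partition(b";")
+    out = bytearray()
+    i = 0
+    while i < len(escaped):
+        b = escaped[i]
+        if b == 0x7D:  # escape introducer
+            i += 1
+            b = escaped[i] ^ 0x20
+        out.append(b)
+        i += 1
+    assert len(out) == int(count_hex, 16)
+    return bytes(out)
+```
+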
+### vFile:pwrite
+
+Write data to a previously opened file descriptor.
+
+#### Example
+
+```
+receive: vFile:pwrite:8,0,\cf\fa\ed\fe\0c\00\00
+send: F1024
+```
+
+Request packet has the fields:
+ 1. File descriptor, base 16
+ 2. Offset into file to start from, base 16
+ 3. binary-escaped-data to be written
+
+Response is `F`, followed by the number of bytes written (base 16).
+
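+Conversely, a sketch of the escaping a client would apply before sending
+`vFile:pwrite` data; the set of escaped bytes (`#`, `$`, `*`, `}`) follows
+the common gdb-remote convention and is an assumption here:
+
+```
+# Sketch (Python): apply gdb-remote binary escaping to the payload of a
+# vFile:pwrite packet. Escaped bytes are sent as 0x7d, byte ^ 0x20.
+def build_vfile_pwrite(fd: int, offset: int, data: bytes) -> bytes:
+    escaped = bytearray()
+    for b in data:
+        if b in (0x23, 0x24, 0x2A, 0x7D):  # '#', '$', '*', '}'
+            escaped += bytes([0x7D, b ^ 0x20])
+        else:
+            escaped.append(b)
+    header = "vFile:pwrite:%x,%x," % (fd, offset)
+    return header.encode() + bytes(escaped)
+```
+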
+### vFile:MD5
+
+Generate an MD5 hash of the file at the given path.
+
+#### Example
+
+```
+receive: vFile:MD5:2f746d702f61
+send (success): F,00000000000000001111111111111111
+send (failure): F,x
+```
+
+Request packet contains the ASCII hex encoded filename.
+
+If the hash succeeded, the response is `F,` followed by the low 64
+bits of the result, and finally the high 64 bits of the result. Both are in
+hex format without a prefix.
+
+If the file does not exist or could not be hashed, the response is
+instead `F,x`.
+
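+A hypothetical sketch of reassembling the digest from the reply, assuming
+the two 64-bit halves are zero-padded to 16 hex digits each, as in the
+example above:
+
+```
+# Sketch (Python): parse a vFile:MD5 reply. "F,x" signals failure;
+# otherwise the payload is the low 64 bits then the high 64 bits,
+# 16 hex digits each (assumed from the example's fixed-width format).
+def parse_md5_reply(reply: str):
+    assert reply.startswith("F,")
+    body = reply[2:]
+    if body == "x":
+        return None  # file missing or hash failed
+    low, high = int(body[:16], 16), int(body[16:], 16)
+    return (high << 64) | low
+```
+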
+### vFile:exists
+
+Check whether the file at the given path exists.
+
+#### Example
+
+```
+receive: vFile:exists:2f746d702f61
+send (exists): F,1
+send (does not exist): F,0
+```
+
+Request packet contains the ASCII hex encoded filename.
+
+The response is a return code where 1 means the file exists
+and 0 means it does not.
\ No newline at end of file
diff --git a/lldb/docs/resources/lldbplatformpackets.md b/lldb/docs/resources/lldbplatformpackets.md
index 326dd5669f79..f7262e193d94 100644
--- a/lldb/docs/resources/lldbplatformpackets.md
+++ b/lldb/docs/resources/lldbplatformpackets.md
@@ -1,484 +1,49 @@
# LLDB Platform Packets
-Here is a brief overview of the packets that an lldb platform server
+This is a list of the packets that an lldb platform server
needs to implement for the lldb testsuite to be run on a remote
target device/system.
These are almost all lldb extensions to the gdb-remote serial
-protocol. Many of the `vFile:` packets are also described in the "Host
+protocol. Many of the `vFile:` packets are also described in the "Host
I/O Packets" detailed in the gdb-remote protocol documentation,
although the lldb platform extensions include packets that are not
defined there (`vFile:size:`, `vFile:mode:`, `vFile:symlink`, `vFile:chmod:`).
-Most importantly, the flags that lldb passes to `vFile:open:` are
-incompatible with the flags that gdb specifies.
-
-## QStartNoAckMode
-
-### Brief
-
-A request to stop sending ACK packets for each properly formatted packet.
-
-### Example
-
-A platform session will typically start like this:
-```
-receive: +$QStartNoAckMode#b0
-send: + <-- ACKing the properly formatted QStartNoAckMode packet
-send: $OK#9a
-receive: + <-- Our OK packet getting ACKed
-```
-
-ACK mode is now disabled.
-
-## qHostInfo
-
-### Brief
-
-Describe the hardware and OS of the target system
-
-### Example
-
-```
-receive: qHostInfo
-send: cputype:16777228;cpusubtype:1;ostype:ios;watchpoint_exceptions_received:before;os_version:12.1;vendor:apple;default_packet_timeout:5;
-```
-
-All numbers are base 10, `os_version` is a string that will be parsed as major.minor.patch.
-
-## qModuleInfo
-
-### Brief
-
-Get information for a module by given module path and architecture.
-
-The response is:
-* `(uuid|md5):...;triple:...;file_offset:...;file_size...;` or
-* `EXX` - for any errors
-
-### Example
-
-```
-receive: qModuleInfo:2f62696e2f6c73;
-```
-
-## qGetWorkingDir
-
-### Brief
-
-Get the current working directory of the platform stub in
-ASCII hex encoding.
-
-### Example
-
-```
-receive: qGetWorkingDir
-send: 2f4170706c65496e7465726e616c2f6c6c64622f73657474696e67732f342f5465737453657474696e67732e746573745f646973617373656d626c65725f73657474696e6773
-```
-
-## QSetWorkingDir
-
-### Brief
-
-Set the current working directory of the platform stub in
-ASCII hex encoding.
-
-### Example
-
-```
-receive: QSetWorkingDir:2f4170706c65496e7465726e616c2f6c6c64622f73657474696e67732f342f5465737453657474696e67732e746573745f646973617373656d626c65725f73657474696e6773
-send: OK
-```
-
-## qPlatform_mkdir
-
-### Brief
-
-Create a directory on the target system.
-
-### Example
-
-```
-receive: qPlatform_mkdir:000001fd,2f746d702f6131
-send: F0
-```
-
-request packet has the fields:
- 1. mode bits in base 16
- 2. file path in ASCII hex encoding
-
-response is F followed by the return value of the `mkdir()` call,
-base 16 encoded.
-
-## qPlatform_shell
-
-### Brief
-
-Run a shell command on the target system, return the output.
-
-### Example
-
-```
-receive: qPlatform_shell:6c73202f746d702f,0000000a
-send: F,0,0,<OUTPUT>
-```
-
-request packet has the fields:
- 1. shell command in ASCII hex encoding
- 2. timeout
- 3. working directory in ASCII hex encoding (optional)
-
-Response is `F` followed by the return value of the command (base 16),
-followed by another number, followed by the output of the command
-in binary-escaped-data encoding.
-
-## qLaunchGDBServer
-
-### Brief
-
-Start a gdbserver process (`gdbserver`, `debugserver`, `lldb-server`)
-on the target system.
-
-### Example
-
-```
-receive: qLaunchGDBServer;host:<HOSTNAME_LLDB_IS_ON>;
-send: pid:1337;port:43001;
-```
-
-Request packet hostname field is not ASCII hex encoded. Hostnames
-do not have `$` or `#` characters in them.
-
-Response to the packet is the pid of the newly launched gdbserver,
-and the port it is listening for a connection on.
-
-When the testsuite is running, lldb may use the pid to kill off a
-debugserver that doesn't seem to be responding, etc.
-
-## qKillSpawnedProcess
-
-### Brief
-
-Kill a process running on the target system.
-
-### Example
-
-```
-receive: qKillSpawnedProcess:1337
-send: OK
-```
-The request packet has the process ID in base 10.
-
-## qProcessInfoPID:
-
-### Brief
-
-Gather information about a process running on the target.
-
-### Example
-
-```
-receive: qProcessInfoPID:71964
-send: pid:71964;name:612e6f7574;
-```
-
-The request packet has the pid encoded in base 10.
-
-The reply has semicolon-separated `name:value` fields, two are
-shown here. `pid` is base 10 encoded. `name` is ascii hex encoded.
-lldb-server can reply with many additional fields, but this is probably
-enough for the testsuite.
-
-## qfProcessInfo
-
-### Brief
-
-Search the process table for processes matching criteria,
-respond with them in multiple packets.
-
-### Example
-
-```
-receive: qfProcessInfo:name_match:equals;name:6e6f70726f6365737365786973747377697468746869736e616d65;
-send: pid:3500;name:612e6f7574;
-```
-
-The request packet has a criteria to search for, followed by
-a specific name.
-
-| Key | Value | Description
-| ------------ | --------- | -----------
-| `name` | ascii-hex | An ASCII hex string that contains the name of the process that will be matched.
-| `name_match` | enum | One of: `equals`, `starts_with`, `ends_with`, `contains` or `regex`
-| `pid` | integer | A string value containing the decimal process ID
-| `parent_pid` | integer | A string value containing the decimal parent process ID
-| `uid` | integer | A string value containing the decimal user ID
-| `gid` | integer | A string value containing the decimal group ID
-| `euid` | integer | A string value containing the decimal effective user ID
-| `egid` | integer | A string value containing the decimal effective group ID
-| `all_users` | bool | A boolean value that specifies if processes should be listed for all users, not just the user that the platform is running as
-| `triple` | ascii-hex | An ASCII hex target triple string (`x86_64`, `x86_64-apple-macosx`, `armv7-apple-ios`)
-
-If no criteria is given, `qfProcessInfo` will request a list of every process.
-
-The lldb testsuite currently only uses `name_match:equals` and the
-no-criteria mode to list every process.
-
-The response should include any information about the process that
-can be retrieved in semicolon-separated `name:value` fields.
-In this example, `pid` is base 10, `name` is ASCII hex encoded.
-The testsuite seems to only require these two.
-
-This packet only responds with one process. To get further matches to
-the search, `qsProcessInfo` should be sent.
-
-If no process match is found, `Exx` should be returned.
-
-Sample packet/response:
-```
-send packet: $qfProcessInfo#00
-read packet: $pid:60001;ppid:59948;uid:7746;gid:11;euid:7746;egid:11;name:6c6c6462;triple:7838365f36342d6170706c652d6d61636f7378;#00
-send packet: $qsProcessInfo#00
-read packet: $pid:59992;ppid:192;uid:7746;gid:11;euid:7746;egid:11;name:6d64776f726b6572;triple:7838365f36342d6170706c652d6d61636f7378;#00
-send packet: $qsProcessInfo#00
-read packet: $E04#00
-```
-
-## qsProcessInfo
-
-### Brief
-
-Return the next process info found by the most recent `qfProcessInfo:`
-packet.
-
-### Example
-
-Continues to return the results of the `qfProcessInfo`. Once all matches
-have been sent, `Exx` is returned to indicate end of matches.
-
-## qPathComplete
-
-### Brief
-
-Get a list of matched disk files/directories by passing a boolean flag
-and a partial path.
-
-### Example
-
-```
-receive: qPathComplete:0,6d61696e
-send: M6d61696e2e637070
-receive: qPathComplete:1,746573
-send: M746573742f,74657374732f
-```
-
-If the first argument is zero, the result should contain all
-files (including directories) starting with the given path. If the
-argument is one, the result should contain only directories.
-
-The result should be a comma-separated list of hex-encoded paths.
-Paths denoting a directory should end with a directory separator (`/` or `\`).
-
-## vFile:size
-
-### Brief
-
-Get the size of a file on the target system, filename in ASCII hex encoding.
-
-### Example
-
-```
-receive: vFile:size:2f746d702f61
-send: Fc008
-```
-
-response is `F` followed by the file size in base 16.
-`F-1,errno` with the errno if an error occurs, base 16.
-
-## vFile:mode
-
-### Brief
-
-Get the mode bits of a file on the target system, filename in ASCII hex.
-
-### Example
-
-```
-receive: vFile:mode:2f746d702f61
-send: F1ed
-```
-
-response is `F` followed by the mode bits in base 16, this `0x1ed` would
-correspond to `0755` in octal.
-`F-1,errno` with the errno if an error occurs, base 16.
-
-## vFile:unlink
-
-### Brief
-
-Remove a file on the target system.
-
-### Example
-
-```
-receive: vFile:unlink:2f746d702f61
-send: F0
-```
-
-Argument is a file path in ascii-hex encoding.
-
-Response is `F` plus the return value of `unlink()` in base 16 encoding.
-If unlink failed, the return value may be followed by a comma and the value of
-errno in base 16 encoding.
-
-## vFile:symlink
-
-### Brief
-
-Create a symbolic link (symlink, soft-link) on the target system.
-
-### Example
-
-```
-receive: vFile:symlink:<SRC-FILE>,<DST-NAME>
-send: F0,0
-```
-
-Argument file paths are in ascii-hex encoding.
-Response is `F` plus the return value of `symlink()`, base 16 encoding,
-optionally followed by the value of errno if it failed, also base 16.
-
-## vFile:chmod / qPlatform_chmod
-
-### Brief
-
-Change the permission mode bits on a file on the target
-
-### Example
-
-```
-receive: vFile:chmod:180,2f746d702f61
-send: F0
-```
-
-Arguments are the mode bits to set, base 16, and a file path in
-ascii-hex encoding.
-Response is `F` plus the return value of `chmod()`, base 16 encoding.
-
-These 2 packets do the same thing, it is not known why we ended up with 2.
-
-## vFile:chmod
-
-### Brief
-
-Change the permission mode bits on a file on the target.
-
-### Example
-
-```
-receive: vFile:chmod:180,2f746d702f61
-send: F0
-```
-
-Arguments are the mode bits to set, base 16, and a file path in
-ascii-hex encoding.
-Response is `F` plus the return value of `chmod()`, base 10 encoding.
-
-## vFile:open
-
-### Brief
-
-Open a file on the remote system and return the file descriptor of it.
-
-### Example
-
-```
-receive: vFile:open:2f746d702f61,00000001,00000180
-send: F8
-```
-
-request packet has the fields:
- 1. ASCII hex encoded filename
- 2. Flags passed to the open call, base 16.
- Note that these are not the `oflags` that `open(2)` takes, but
- are the constant values in `enum OpenOptions` from LLDB's
- [`File.h`](https://github.com/llvm/llvm-project/blob/main/lldb/include/lldb/Host/File.h).
- 3. Mode bits, base 16
-
-response is `F` followed by the opened file descriptor in base 16.
-`F-1,errno` with the errno if an error occurs, base 16.
-
-## vFile:close
-
-### Brief
-
-Close a previously opened file descriptor.
-
-### Example
-
-```
-receive: vFile:close:7
-send: F0
-```
-
-File descriptor is in base 16. `F-1,errno` with the errno if an error occurs,
-errno is base 16.
-
-## vFile:pread
-
-### Brief
-
-Read data from an opened file descriptor.
-
-### Example
-
-```
-receive: vFile:pread:7,1024,0
-send: F4;a'b\00
-```
-
-Request packet has the fields:
- 1. File descriptor, base 16
- 2. Number of bytes to be read, base 16
- 3. Offset into file to start from, base 16
-
-Response is `F`, followed by the number of bytes read (base 16 encoded), a
-semicolon, followed by the data in the binary-escaped-data encoding.
-
-## vFile:pwrite
-
-### Brief
-
-Write data to a previously opened file descriptor.
-
-### Example
-
-```
-receive: vFile:pwrite:8,0,\cf\fa\ed\fe\0c\00\00
-send: F1024
-```
-
-Request packet has the fields:
- 1. File descriptor, base 16
- 2. Offset into file to start from, base 16
- 3. binary-escaped-data to be written
-
-Response is `F`, followed by the number of bytes written (base 16 encoded).
-
-## Launching Processes
-
-Finally, the platform must be able to launch processes so that debugserver
-can attach to them. To do this, the following packets should be handled:
-* `QSetDisableASLR`
-* `QSetDetachOnError`
-* `QSetSTDOUT`
-* `QSetSTDERR`
-* `QSetSTDIN`
-* `QEnvironment`
-* `QEnvironmentHexEncoded`
-* `A`
-* `qLaunchSuccess`
-* `qProcessInfo`
-
-Most of these are documented in the standard gdb-remote protocol
-and/or LLDB's [GDB Remote Protocol Extensions](lldbgdbremote).
+Most importantly, the flags that LLDB passes to `vFile:open:` are
+incompatible with the flags that GDB specifies.
+
+* [QStartNoAckMode](./lldbgdbremote.md#qstartnoackmode)
+* [qHostInfo](./lldbgdbremote.md#qhostinfo)
+* [qModuleInfo](./lldbgdbremote.md#qmoduleinfo-module-path-arch-triple)
+* [qGetWorkingDir](./lldbgdbremote.md#qgetworkingdir)
+* [QSetWorkingDir](./lldbgdbremote.md#qsetworkingdir-ascii-hex-path)
+* [qPlatform_mkdir](./lldbgdbremote.md#qplatform-mkdir)
+* [qPlatform_shell](./lldbgdbremote.md#qplatform-shell)
+* [qLaunchGDBServer](./lldbgdbremote.md#qlaunchgdbserver-platform-extension)
+* [qKillSpawnedProcess](./lldbgdbremote.md#qkillspawnedprocess-platform-extension)
+* [qProcessInfoPID](./lldbgdbremote.md#qprocessinfopid-pid-platform-extension)
+ * It is likely that you only need to support the `pid` and `name` fields.
+* [qProcessInfo](./lldbgdbremote.md#qprocessinfo)
+ * The lldb test suite currently only uses `name_match:equals` and the no-criteria mode to list every process.
+* [qPathComplete](./lldbgdbremote.md#qpathcomplete-platform-extension)
+* [vFile:chmod](./lldbgdbremote.md#vfile-chmod-qplatform-chmod)
+* [vFile:size](./lldbgdbremote.md#vfile-size)
+* [vFile:mode](./lldbgdbremote.md#vfile-mode)
+* [vFile:unlink](./lldbgdbremote.md#vfile-unlink)
+* [vFile:symlink](./lldbgdbremote.md#vfile-symlink)
+* [vFile:open](./lldbgdbremote.md#vfile-open)
+* [vFile:close](./lldbgdbremote.md#vfile-close)
+* [vFile:pread](./lldbgdbremote.md#vfile-pread)
+* [vFile:pwrite](./lldbgdbremote.md#vfile-pwrite)
+
+The remote platform must be able to launch processes so that debugserver
+can attach to them. This requires the following packets in addition to the
+previous list:
+* [QSetDisableASLR](./lldbgdbremote.md#qsetdisableaslr-bool)
+* [QSetDetachOnError](./lldbgdbremote.md#qsetdetachonerror)
+* [QSetSTDIN / QSetSTDOUT / QSetSTDERR](./lldbgdbremote.md#qsetstdin-ascii-hex-path-qsetstdout-ascii-hex-path-qsetstderr-ascii-hex-path) (all 3)
+* [QEnvironment](./lldbgdbremote.md#qenvironment-name-value)
+* [QEnvironmentHexEncoded](./lldbgdbremote.md#qenvironmenthexencoded-hex-encoding-name-value)
+* [A](./lldbgdbremote.md#a-launch-args-packet)
+* [qLaunchSuccess](./lldbgdbremote.md#qlaunchsuccess)
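+
+To make the launch flow concrete, here is a speculative sketch of the packet
+sequence, with the argument layout of the `A` packet taken from the common
+gdb-remote convention; treat the exact replies and formats as assumptions to
+verify against the linked pages:
+
+```
+# Speculative sketch (Python) of a minimal launch sequence over an
+# established platform connection. send_packet() (framing, checksums,
+# ACKs) is an assumed helper, not a real API.
+def launch(conn, exe="/bin/ls"):
+    conn.send_packet("QSetDisableASLR:1")    # expect: OK
+    conn.send_packet("QSetDetachOnError:1")  # expect: OK
+    arg = exe.encode().hex()
+    # A packet: arglen,argnum,ascii-hex-argument triples.
+    conn.send_packet("A%d,0,%s" % (len(arg), arg))
+    conn.send_packet("qLaunchSuccess")       # expect: OK or E<message>
+    conn.send_packet("qProcessInfo")         # pid of the launched process
+```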
diff --git a/lldb/include/lldb/API/SBDebugger.h b/lldb/include/lldb/API/SBDebugger.h
index cf5409a12a05..7333cd57ad31 100644
--- a/lldb/include/lldb/API/SBDebugger.h
+++ b/lldb/include/lldb/API/SBDebugger.h
@@ -42,6 +42,13 @@ public:
class LLDB_API SBDebugger {
public:
+ FLAGS_ANONYMOUS_ENUM(){
+ eBroadcastBitProgress = lldb::DebuggerBroadcastBit::eBroadcastBitProgress,
+ eBroadcastBitWarning = lldb::DebuggerBroadcastBit::eBroadcastBitWarning,
+ eBroadcastBitError = lldb::DebuggerBroadcastBit::eBroadcastBitError,
+ eBroadcastBitProgressCategory =
+ lldb::DebuggerBroadcastBit::eBroadcastBitProgressCategory,
+ };
SBDebugger();
SBDebugger(const lldb::SBDebugger &rhs);
diff --git a/lldb/include/lldb/API/SBValue.h b/lldb/include/lldb/API/SBValue.h
index 67f55ce7da28..8f4c4fd56dfb 100644
--- a/lldb/include/lldb/API/SBValue.h
+++ b/lldb/include/lldb/API/SBValue.h
@@ -68,6 +68,8 @@ public:
uint64_t GetValueAsUnsigned(uint64_t fail_value = 0);
+ lldb::addr_t GetValueAsAddress();
+
ValueType GetValueType();
// If you call this on a newly created ValueObject, it will always return
diff --git a/lldb/include/lldb/Utility/ProcessInfo.h b/lldb/include/lldb/Utility/ProcessInfo.h
index e9fe71e1b851..54ac000dc7fc 100644
--- a/lldb/include/lldb/Utility/ProcessInfo.h
+++ b/lldb/include/lldb/Utility/ProcessInfo.h
@@ -234,7 +234,7 @@ public:
bool CumulativeSystemTimeIsValid() const {
return m_cumulative_system_time.tv_sec > 0 ||
- m_cumulative_system_time.tv_sec > 0;
+ m_cumulative_system_time.tv_usec > 0;
}
void Dump(Stream &s, UserIDResolver &resolver) const;
diff --git a/lldb/source/API/SBValue.cpp b/lldb/source/API/SBValue.cpp
index 94a8f3ea319e..c53ec5a74648 100644
--- a/lldb/source/API/SBValue.cpp
+++ b/lldb/source/API/SBValue.cpp
@@ -909,6 +909,25 @@ uint64_t SBValue::GetValueAsUnsigned(uint64_t fail_value) {
return fail_value;
}
+lldb::addr_t SBValue::GetValueAsAddress() {
+ addr_t fail_value = LLDB_INVALID_ADDRESS;
+ ValueLocker locker;
+ lldb::ValueObjectSP value_sp(GetSP(locker));
+ if (value_sp) {
+ bool success = true;
+ uint64_t ret_val = fail_value;
+ ret_val = value_sp->GetValueAsUnsigned(fail_value, &success);
+ if (!success)
+ return fail_value;
+ ProcessSP process_sp = m_opaque_sp->GetProcessSP();
+ if (!process_sp)
+ return ret_val;
+ return process_sp->FixDataAddress(ret_val);
+ }
+
+ return fail_value;
+}
+
bool SBValue::MightHaveChildren() {
LLDB_INSTRUMENT_VA(this);
diff --git a/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCRuntimeV2.cpp b/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCRuntimeV2.cpp
index d3fc487aed43..9409497f1c81 100644
--- a/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCRuntimeV2.cpp
+++ b/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCRuntimeV2.cpp
@@ -1869,15 +1869,15 @@ AppleObjCRuntimeV2::DynamicClassInfoExtractor::ComputeHelper(
if (loader->IsFullyInitialized()) {
switch (exe_ctx.GetTargetRef().GetDynamicClassInfoHelper()) {
case eDynamicClassInfoHelperAuto:
- LLVM_FALLTHROUGH;
+ [[fallthrough]];
case eDynamicClassInfoHelperGetRealizedClassList:
if (m_runtime.m_has_objc_getRealizedClassList_trylock)
return DynamicClassInfoExtractor::objc_getRealizedClassList_trylock;
- LLVM_FALLTHROUGH;
+ [[fallthrough]];
case eDynamicClassInfoHelperCopyRealizedClassList:
if (m_runtime.m_has_objc_copyRealizedClassList)
return DynamicClassInfoExtractor::objc_copyRealizedClassList;
- LLVM_FALLTHROUGH;
+ [[fallthrough]];
case eDynamicClassInfoHelperRealizedClassesStruct:
return DynamicClassInfoExtractor::gdb_objc_realized_classes;
}
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp
index 41d81fbcf1b0..12dafd3f5d5d 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp
@@ -495,6 +495,7 @@ TypeSP DWARFASTParserClang::ParseTypeFromDWARF(const SymbolContext &sc,
case DW_TAG_const_type:
case DW_TAG_restrict_type:
case DW_TAG_volatile_type:
+ case DW_TAG_LLVM_ptrauth_type:
case DW_TAG_atomic_type:
case DW_TAG_unspecified_type: {
type_sp = ParseTypeModifier(sc, die, attrs);
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFCompileUnit.h b/lldb/source/Plugins/SymbolFile/DWARF/DWARFCompileUnit.h
index dd130977d4b1..b8344f548ac3 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFCompileUnit.h
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFCompileUnit.h
@@ -32,7 +32,7 @@ public:
private:
DWARFCompileUnit(SymbolFileDWARF &dwarf, lldb::user_id_t uid,
- const DWARFUnitHeader &header,
+ const llvm::DWARFUnitHeader &header,
const llvm::DWARFAbbreviationDeclarationSet &abbrevs,
DIERef::Section section, bool is_dwo)
: DWARFUnit(dwarf, uid, header, abbrevs, section, is_dwo) {}
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFTypeUnit.h b/lldb/source/Plugins/SymbolFile/DWARF/DWARFTypeUnit.h
index 7b58c632c6c5..8c1f932d8c7f 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFTypeUnit.h
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFTypeUnit.h
@@ -24,15 +24,15 @@ public:
void Dump(Stream *s) const override;
- uint64_t GetTypeHash() { return m_header.GetTypeHash(); }
+ uint64_t GetTypeHash() { return m_header.getTypeHash(); }
- dw_offset_t GetTypeOffset() { return GetOffset() + m_header.GetTypeOffset(); }
+ dw_offset_t GetTypeOffset() { return GetOffset() + m_header.getTypeOffset(); }
static bool classof(const DWARFUnit *unit) { return unit->IsTypeUnit(); }
private:
DWARFTypeUnit(SymbolFileDWARF &dwarf, lldb::user_id_t uid,
- const DWARFUnitHeader &header,
+ const llvm::DWARFUnitHeader &header,
const llvm::DWARFAbbreviationDeclarationSet &abbrevs,
DIERef::Section section, bool is_dwo)
: DWARFUnit(dwarf, uid, header, abbrevs, section, is_dwo) {}
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.cpp
index e28036d34b34..dabc595427df 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.cpp
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.cpp
@@ -33,12 +33,12 @@ using namespace lldb_private::plugin::dwarf;
extern int g_verbose;
DWARFUnit::DWARFUnit(SymbolFileDWARF &dwarf, lldb::user_id_t uid,
- const DWARFUnitHeader &header,
+ const llvm::DWARFUnitHeader &header,
const llvm::DWARFAbbreviationDeclarationSet &abbrevs,
DIERef::Section section, bool is_dwo)
: UserID(uid), m_dwarf(dwarf), m_header(header), m_abbrevs(&abbrevs),
m_cancel_scopes(false), m_section(section), m_is_dwo(is_dwo),
- m_has_parsed_non_skeleton_unit(false), m_dwo_id(header.GetDWOId()) {}
+ m_has_parsed_non_skeleton_unit(false), m_dwo_id(header.getDWOId()) {}
DWARFUnit::~DWARFUnit() = default;
@@ -345,7 +345,7 @@ void DWARFUnit::ExtractDIEsRWLocked() {
void DWARFUnit::SetDwoStrOffsetsBase() {
lldb::offset_t baseOffset = 0;
- if (const llvm::DWARFUnitIndex::Entry *entry = m_header.GetIndexEntry()) {
+ if (const llvm::DWARFUnitIndex::Entry *entry = m_header.getIndexEntry()) {
if (const auto *contribution =
entry->getContribution(llvm::DW_SECT_STR_OFFSETS))
baseOffset = contribution->getOffset();
@@ -489,7 +489,7 @@ ParseListTableHeader(const llvm::DWARFDataExtractor &data, uint64_t offset,
void DWARFUnit::SetLoclistsBase(dw_addr_t loclists_base) {
uint64_t offset = 0;
- if (const llvm::DWARFUnitIndex::Entry *entry = m_header.GetIndexEntry()) {
+ if (const llvm::DWARFUnitIndex::Entry *entry = m_header.getIndexEntry()) {
const auto *contribution = entry->getContribution(llvm::DW_SECT_LOCLISTS);
if (!contribution) {
GetSymbolFileDWARF().GetObjectFile()->GetModule()->ReportError(
@@ -533,7 +533,7 @@ DWARFDataExtractor DWARFUnit::GetLocationData() const {
DWARFContext &Ctx = GetSymbolFileDWARF().GetDWARFContext();
const DWARFDataExtractor &data =
GetVersion() >= 5 ? Ctx.getOrLoadLocListsData() : Ctx.getOrLoadLocData();
- if (const llvm::DWARFUnitIndex::Entry *entry = m_header.GetIndexEntry()) {
+ if (const llvm::DWARFUnitIndex::Entry *entry = m_header.getIndexEntry()) {
if (const auto *contribution = entry->getContribution(
GetVersion() >= 5 ? llvm::DW_SECT_LOCLISTS : llvm::DW_SECT_EXT_LOC))
return DWARFDataExtractor(data, contribution->getOffset(),
@@ -546,7 +546,7 @@ DWARFDataExtractor DWARFUnit::GetLocationData() const {
DWARFDataExtractor DWARFUnit::GetRnglistData() const {
DWARFContext &Ctx = GetSymbolFileDWARF().GetDWARFContext();
const DWARFDataExtractor &data = Ctx.getOrLoadRngListsData();
- if (const llvm::DWARFUnitIndex::Entry *entry = m_header.GetIndexEntry()) {
+ if (const llvm::DWARFUnitIndex::Entry *entry = m_header.getIndexEntry()) {
if (const auto *contribution =
entry->getContribution(llvm::DW_SECT_RNGLISTS))
return DWARFDataExtractor(data, contribution->getOffset(),
@@ -924,84 +924,6 @@ const DWARFDebugAranges &DWARFUnit::GetFunctionAranges() {
return *m_func_aranges_up;
}
-llvm::Error DWARFUnitHeader::ApplyIndexEntry(
- const llvm::DWARFUnitIndex::Entry *index_entry) {
- // We should only be calling this function when the index entry is not set and
- // we have a valid one to set it to.
- assert(index_entry);
- assert(!m_index_entry);
-
- if (m_abbr_offset)
- return llvm::createStringError(
- llvm::inconvertibleErrorCode(),
- "Package unit with a non-zero abbreviation offset");
-
- auto *unit_contrib = index_entry->getContribution();
- if (!unit_contrib || unit_contrib->getLength32() != m_length + 4)
- return llvm::createStringError(llvm::inconvertibleErrorCode(),
- "Inconsistent DWARF package unit index");
-
- auto *abbr_entry = index_entry->getContribution(llvm::DW_SECT_ABBREV);
- if (!abbr_entry)
- return llvm::createStringError(
- llvm::inconvertibleErrorCode(),
- "DWARF package index missing abbreviation column");
-
- m_abbr_offset = abbr_entry->getOffset();
- m_index_entry = index_entry;
- return llvm::Error::success();
-}
-
-llvm::Expected<DWARFUnitHeader>
-DWARFUnitHeader::extract(const DWARFDataExtractor &data,
- DIERef::Section section, DWARFContext &context,
- lldb::offset_t *offset_ptr) {
- DWARFUnitHeader header;
- header.m_offset = *offset_ptr;
- header.m_length = data.GetDWARFInitialLength(offset_ptr);
- header.m_version = data.GetU16(offset_ptr);
- if (header.m_version == 5) {
- header.m_unit_type = data.GetU8(offset_ptr);
- header.m_addr_size = data.GetU8(offset_ptr);
- header.m_abbr_offset = data.GetDWARFOffset(offset_ptr);
- if (header.m_unit_type == llvm::dwarf::DW_UT_skeleton ||
- header.m_unit_type == llvm::dwarf::DW_UT_split_compile)
- header.m_dwo_id = data.GetU64(offset_ptr);
- } else {
- header.m_abbr_offset = data.GetDWARFOffset(offset_ptr);
- header.m_addr_size = data.GetU8(offset_ptr);
- header.m_unit_type =
- section == DIERef::Section::DebugTypes ? DW_UT_type : DW_UT_compile;
- }
-
- if (header.IsTypeUnit()) {
- header.m_type_hash = data.GetU64(offset_ptr);
- header.m_type_offset = data.GetDWARFOffset(offset_ptr);
- }
-
- bool length_OK = data.ValidOffset(header.GetNextUnitOffset() - 1);
- bool version_OK = SymbolFileDWARF::SupportedVersion(header.m_version);
- bool addr_size_OK = (header.m_addr_size == 2) || (header.m_addr_size == 4) ||
- (header.m_addr_size == 8);
- bool type_offset_OK =
- !header.IsTypeUnit() || (header.m_type_offset <= header.GetLength());
-
- if (!length_OK)
- return llvm::make_error<llvm::object::GenericBinaryError>(
- "Invalid unit length");
- if (!version_OK)
- return llvm::make_error<llvm::object::GenericBinaryError>(
- "Unsupported unit version");
- if (!addr_size_OK)
- return llvm::make_error<llvm::object::GenericBinaryError>(
- "Invalid unit address size");
- if (!type_offset_OK)
- return llvm::make_error<llvm::object::GenericBinaryError>(
- "Type offset out of range");
-
- return header;
-}
-
llvm::Expected<DWARFUnitSP>
DWARFUnit::extract(SymbolFileDWARF &dwarf, user_id_t uid,
const DWARFDataExtractor &debug_info,
@@ -1009,26 +931,35 @@ DWARFUnit::extract(SymbolFileDWARF &dwarf, user_id_t uid,
assert(debug_info.ValidOffset(*offset_ptr));
DWARFContext &context = dwarf.GetDWARFContext();
- auto expected_header =
- DWARFUnitHeader::extract(debug_info, section, context, offset_ptr);
- if (!expected_header)
- return expected_header.takeError();
+
+ // FIXME: Either properly map between DIERef::Section and
+ // llvm::DWARFSectionKind or switch to llvm's definition entirely.
+ llvm::DWARFSectionKind section_kind_llvm =
+ section == DIERef::Section::DebugInfo
+ ? llvm::DWARFSectionKind::DW_SECT_INFO
+ : llvm::DWARFSectionKind::DW_SECT_EXT_TYPES;
+
+ llvm::DWARFDataExtractor debug_info_llvm = debug_info.GetAsLLVMDWARF();
+ llvm::DWARFUnitHeader header;
+ if (llvm::Error extract_err = header.extract(
+ context.GetAsLLVM(), debug_info_llvm, offset_ptr, section_kind_llvm))
+ return std::move(extract_err);
if (context.isDwo()) {
const llvm::DWARFUnitIndex::Entry *entry = nullptr;
- const llvm::DWARFUnitIndex &index = expected_header->IsTypeUnit()
+ const llvm::DWARFUnitIndex &index = header.isTypeUnit()
? context.GetAsLLVM().getTUIndex()
: context.GetAsLLVM().getCUIndex();
if (index) {
- if (expected_header->IsTypeUnit())
- entry = index.getFromHash(expected_header->GetTypeHash());
- else if (auto dwo_id = expected_header->GetDWOId())
+ if (header.isTypeUnit())
+ entry = index.getFromHash(header.getTypeHash());
+ else if (auto dwo_id = header.getDWOId())
entry = index.getFromHash(*dwo_id);
}
if (!entry)
- entry = index.getFromOffset(expected_header->GetOffset());
+ entry = index.getFromOffset(header.getOffset());
if (entry)
- if (llvm::Error err = expected_header->ApplyIndexEntry(entry))
+ if (llvm::Error err = header.applyIndexEntry(entry))
return std::move(err);
}
@@ -1039,13 +970,13 @@ DWARFUnit::extract(SymbolFileDWARF &dwarf, user_id_t uid,
bool abbr_offset_OK =
dwarf.GetDWARFContext().getOrLoadAbbrevData().ValidOffset(
- expected_header->GetAbbrOffset());
+ header.getAbbrOffset());
if (!abbr_offset_OK)
return llvm::make_error<llvm::object::GenericBinaryError>(
"Abbreviation offset for unit is not valid");
llvm::Expected<const llvm::DWARFAbbreviationDeclarationSet *> abbrevs_or_err =
- abbr->getAbbreviationDeclarationSet(expected_header->GetAbbrOffset());
+ abbr->getAbbreviationDeclarationSet(header.getAbbrOffset());
if (!abbrevs_or_err)
return abbrevs_or_err.takeError();
@@ -1055,11 +986,11 @@ DWARFUnit::extract(SymbolFileDWARF &dwarf, user_id_t uid,
"No abbrev exists at the specified offset.");
bool is_dwo = dwarf.GetDWARFContext().isDwo();
- if (expected_header->IsTypeUnit())
- return DWARFUnitSP(new DWARFTypeUnit(dwarf, uid, *expected_header, *abbrevs,
- section, is_dwo));
- return DWARFUnitSP(new DWARFCompileUnit(dwarf, uid, *expected_header,
- *abbrevs, section, is_dwo));
+ if (header.isTypeUnit())
+ return DWARFUnitSP(
+ new DWARFTypeUnit(dwarf, uid, header, *abbrevs, section, is_dwo));
+ return DWARFUnitSP(
+ new DWARFCompileUnit(dwarf, uid, header, *abbrevs, section, is_dwo));
}
const lldb_private::DWARFDataExtractor &DWARFUnit::GetData() const {
@@ -1069,7 +1000,7 @@ const lldb_private::DWARFDataExtractor &DWARFUnit::GetData() const {
}
uint32_t DWARFUnit::GetHeaderByteSize() const {
- switch (m_header.GetUnitType()) {
+ switch (m_header.getUnitType()) {
case llvm::dwarf::DW_UT_compile:
case llvm::dwarf::DW_UT_partial:
return GetVersion() < 5 ? 11 : 12;
@@ -1106,7 +1037,7 @@ DWARFUnit::FindRnglistFromOffset(dw_offset_t offset) {
llvm::DWARFDataExtractor data = GetRnglistData().GetAsLLVMDWARF();
// As DW_AT_rnglists_base may be missing we need to call setAddressSize.
- data.setAddressSize(m_header.GetAddressByteSize());
+ data.setAddressSize(m_header.getAddressByteSize());
auto range_list_or_error = GetRnglistTable()->findList(data, offset);
if (!range_list_or_error)
return range_list_or_error.takeError();
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.h b/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.h
index 28981b51bfcb..85c37971ced8 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.h
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.h
@@ -38,54 +38,6 @@ enum DWARFProducer {
eProducerOther
};
-/// Base class describing the header of any kind of "unit." Some information
-/// is specific to certain unit types. We separate this class out so we can
-/// parse the header before deciding what specific kind of unit to construct.
-class DWARFUnitHeader {
- dw_offset_t m_offset = 0;
- dw_offset_t m_length = 0;
- uint16_t m_version = 0;
- dw_offset_t m_abbr_offset = 0;
-
- const llvm::DWARFUnitIndex::Entry *m_index_entry = nullptr;
-
- uint8_t m_unit_type = 0;
- uint8_t m_addr_size = 0;
-
- uint64_t m_type_hash = 0;
- uint32_t m_type_offset = 0;
-
- std::optional<uint64_t> m_dwo_id;
-
- DWARFUnitHeader() = default;
-
-public:
- dw_offset_t GetOffset() const { return m_offset; }
- uint16_t GetVersion() const { return m_version; }
- uint16_t GetAddressByteSize() const { return m_addr_size; }
- dw_offset_t GetLength() const { return m_length; }
- dw_offset_t GetAbbrOffset() const { return m_abbr_offset; }
- uint8_t GetUnitType() const { return m_unit_type; }
- const llvm::DWARFUnitIndex::Entry *GetIndexEntry() const {
- return m_index_entry;
- }
- uint64_t GetTypeHash() const { return m_type_hash; }
- dw_offset_t GetTypeOffset() const { return m_type_offset; }
- std::optional<uint64_t> GetDWOId() const { return m_dwo_id; }
- bool IsTypeUnit() const {
- return m_unit_type == llvm::dwarf::DW_UT_type ||
- m_unit_type == llvm::dwarf::DW_UT_split_type;
- }
- dw_offset_t GetNextUnitOffset() const { return m_offset + m_length + 4; }
-
- llvm::Error ApplyIndexEntry(const llvm::DWARFUnitIndex::Entry *index_entry);
-
- static llvm::Expected<DWARFUnitHeader> extract(const DWARFDataExtractor &data,
- DIERef::Section section,
- DWARFContext &dwarf_context,
- lldb::offset_t *offset_ptr);
-};
-
class DWARFUnit : public UserID {
using die_iterator_range =
llvm::iterator_range<DWARFDebugInfoEntry::collection::iterator>;
@@ -105,7 +57,7 @@ public:
/// the DWO ID in the compile unit header and we sometimes only want to access
/// this cheap value without causing the more expensive attribute fetches that
/// GetDWOId() uses.
- std::optional<uint64_t> GetHeaderDWOId() { return m_header.GetDWOId(); }
+ std::optional<uint64_t> GetHeaderDWOId() { return m_header.getDWOId(); }
void ExtractUnitDIEIfNeeded();
void ExtractUnitDIENoDwoIfNeeded();
void ExtractDIEsIfNeeded();
@@ -143,7 +95,7 @@ public:
uint32_t GetHeaderByteSize() const;
// Offset of the initial length field.
- dw_offset_t GetOffset() const { return m_header.GetOffset(); }
+ dw_offset_t GetOffset() const { return m_header.getOffset(); }
/// Get the size in bytes of the length field in the header.
///
/// In DWARF32 this is just 4 bytes
@@ -159,15 +111,15 @@ public:
dw_offset_t GetFirstDIEOffset() const {
return GetOffset() + GetHeaderByteSize();
}
- dw_offset_t GetNextUnitOffset() const { return m_header.GetNextUnitOffset(); }
+ dw_offset_t GetNextUnitOffset() const { return m_header.getNextUnitOffset(); }
// Size of the CU data (without initial length and without header).
size_t GetDebugInfoSize() const;
// Size of the CU data incl. header but without initial length.
- dw_offset_t GetLength() const { return m_header.GetLength(); }
- uint16_t GetVersion() const { return m_header.GetVersion(); }
+ dw_offset_t GetLength() const { return m_header.getLength(); }
+ uint16_t GetVersion() const { return m_header.getVersion(); }
const llvm::DWARFAbbreviationDeclarationSet *GetAbbreviations() const;
dw_offset_t GetAbbrevOffset() const;
- uint8_t GetAddressByteSize() const { return m_header.GetAddressByteSize(); }
+ uint8_t GetAddressByteSize() const { return m_header.getAddressByteSize(); }
dw_addr_t GetAddrBase() const { return m_addr_base.value_or(0); }
dw_addr_t GetBaseAddress() const { return m_base_addr; }
dw_offset_t GetLineTableOffset();
@@ -250,8 +202,8 @@ public:
DIERef::Section GetDebugSection() const { return m_section; }
- uint8_t GetUnitType() const { return m_header.GetUnitType(); }
- bool IsTypeUnit() const { return m_header.IsTypeUnit(); }
+ uint8_t GetUnitType() const { return m_header.getUnitType(); }
+ bool IsTypeUnit() const { return m_header.isTypeUnit(); }
/// Note that this check only works for DWARF5+.
bool IsSkeletonUnit() const {
return GetUnitType() == llvm::dwarf::DW_UT_skeleton;
@@ -320,7 +272,7 @@ public:
protected:
DWARFUnit(SymbolFileDWARF &dwarf, lldb::user_id_t uid,
- const DWARFUnitHeader &header,
+ const llvm::DWARFUnitHeader &header,
const llvm::DWARFAbbreviationDeclarationSet &abbrevs,
DIERef::Section section, bool is_dwo);
@@ -352,7 +304,7 @@ protected:
SymbolFileDWARF &m_dwarf;
std::shared_ptr<DWARFUnit> m_dwo;
- DWARFUnitHeader m_header;
+ llvm::DWARFUnitHeader m_header;
const llvm::DWARFAbbreviationDeclarationSet *m_abbrevs = nullptr;
lldb_private::CompileUnit *m_lldb_cu = nullptr;
// If this is a DWO file, we have a backlink to our skeleton compile unit.
diff --git a/lldb/test/API/clear-sbvalue-nonaddressable-bits/Makefile b/lldb/test/API/clear-sbvalue-nonaddressable-bits/Makefile
new file mode 100644
index 000000000000..10495940055b
--- /dev/null
+++ b/lldb/test/API/clear-sbvalue-nonaddressable-bits/Makefile
@@ -0,0 +1,3 @@
+C_SOURCES := main.c
+
+include Makefile.rules
diff --git a/lldb/test/API/clear-sbvalue-nonaddressable-bits/TestClearSBValueNonAddressableBits.py b/lldb/test/API/clear-sbvalue-nonaddressable-bits/TestClearSBValueNonAddressableBits.py
new file mode 100644
index 000000000000..382b0e7a81d2
--- /dev/null
+++ b/lldb/test/API/clear-sbvalue-nonaddressable-bits/TestClearSBValueNonAddressableBits.py
@@ -0,0 +1,59 @@
+"""Test that SBValue clears non-addressable bits"""
+
+import lldb
+from lldbsuite.test.decorators import *
+from lldbsuite.test.lldbtest import *
+from lldbsuite.test import lldbutil
+
+
+class TestClearSBValueNonAddressableBits(TestBase):
+ NO_DEBUG_INFO_TESTCASE = True
+
+ # On AArch64 systems, the top bits that are not used for
+ # addressing may be used for TBI, MTE, and/or pointer
+ # authentication.
+ @skipIf(archs=no_match(["aarch64", "arm64", "arm64e"]))
+
+ # Only run this test on systems where TBI is known to be
+ # enabled, so the address mask will clear the TBI bits.
+ @skipUnlessPlatform(["linux"] + lldbplatformutil.getDarwinOSTriples())
+ def test(self):
+ self.source = "main.c"
+ self.build()
+ (target, process, thread, bkpt) = lldbutil.run_to_source_breakpoint(
+ self, "break here", lldb.SBFileSpec(self.source, False)
+ )
+
+ if self.TraceOn():
+ self.runCmd("frame variable")
+ self.runCmd("frame variable &count &global")
+
+ frame = thread.GetFrameAtIndex(0)
+
+ count_p = frame.FindVariable("count_p")
+ count_invalid_p = frame.FindVariable("count_invalid_p")
+ self.assertEqual(
+ count_p.GetValueAsUnsigned(), count_invalid_p.GetValueAsAddress()
+ )
+ self.assertNotEqual(
+ count_invalid_p.GetValueAsUnsigned(), count_invalid_p.GetValueAsAddress()
+ )
+ self.assertEqual(5, count_p.Dereference().GetValueAsUnsigned())
+ self.assertEqual(5, count_invalid_p.Dereference().GetValueAsUnsigned())
+
+ global_p = frame.FindVariable("global_p")
+ global_invalid_p = frame.FindVariable("global_invalid_p")
+ self.assertEqual(
+ global_p.GetValueAsUnsigned(), global_invalid_p.GetValueAsAddress()
+ )
+ self.assertNotEqual(
+ global_invalid_p.GetValueAsUnsigned(), global_invalid_p.GetValueAsAddress()
+ )
+ self.assertEqual(10, global_p.Dereference().GetValueAsUnsigned())
+ self.assertEqual(10, global_invalid_p.Dereference().GetValueAsUnsigned())
+
+ main_p = frame.FindVariable("main_p")
+ main_invalid_p = frame.FindVariable("main_invalid_p")
+ self.assertEqual(
+ main_p.GetValueAsUnsigned(), main_invalid_p.GetValueAsAddress()
+ )
diff --git a/lldb/test/API/clear-sbvalue-nonaddressable-bits/main.c b/lldb/test/API/clear-sbvalue-nonaddressable-bits/main.c
new file mode 100644
index 000000000000..1b0e42c50dd6
--- /dev/null
+++ b/lldb/test/API/clear-sbvalue-nonaddressable-bits/main.c
@@ -0,0 +1,27 @@
+#include <stdint.h>
+
+int global = 10;
+
+int main() {
+ int count = 5;
+ int *count_p = &count;
+
+ // Add some metadata in the top byte (this will crash unless the
+ // test is running with TBI enabled, but we won't dereference it)
+
+ intptr_t scratch = (intptr_t)count_p;
+ scratch |= (3ULL << 60);
+ int *count_invalid_p = (int *)scratch;
+
+ int (*main_p)() = main;
+ scratch = (intptr_t)main_p;
+ scratch |= (3ULL << 60);
+ int (*main_invalid_p)() = (int (*)())scratch;
+
+ int *global_p = &global;
+ scratch = (intptr_t)global_p;
+ scratch |= (3ULL << 60);
+ int *global_invalid_p = (int *)scratch;
+
+ return count; // break here
+}
diff --git a/lldb/test/API/functionalities/diagnostic_reporting/TestDiagnosticReporting.py b/lldb/test/API/functionalities/diagnostic_reporting/TestDiagnosticReporting.py
index 6353e3e8cbed..36a3be695628 100644
--- a/lldb/test/API/functionalities/diagnostic_reporting/TestDiagnosticReporting.py
+++ b/lldb/test/API/functionalities/diagnostic_reporting/TestDiagnosticReporting.py
@@ -15,7 +15,7 @@ class TestDiagnosticReporting(TestBase):
self.broadcaster = self.dbg.GetBroadcaster()
self.listener = lldbutil.start_listening_from(
self.broadcaster,
- lldb.eBroadcastBitWarning | lldb.eBroadcastBitError,
+ lldb.SBDebugger.eBroadcastBitWarning | lldb.SBDebugger.eBroadcastBitError,
)
def test_dwarf_symbol_loading_diagnostic_report(self):
diff --git a/lldb/test/API/functionalities/fork/concurrent_vfork/TestConcurrentVFork.py b/lldb/test/API/functionalities/fork/concurrent_vfork/TestConcurrentVFork.py
index 2dcbb728549f..dd9500c186b2 100644
--- a/lldb/test/API/functionalities/fork/concurrent_vfork/TestConcurrentVFork.py
+++ b/lldb/test/API/functionalities/fork/concurrent_vfork/TestConcurrentVFork.py
@@ -48,6 +48,8 @@ class TestConcurrentVFork(TestBase):
self.expect("continue", patterns=[r"exited with status = 1[0-4]"])
@skipUnlessPlatform(["linux"])
+ # https://github.com/llvm/llvm-project/issues/85084.
+ @skipIf(oslist=["linux"], archs=["aarch64", "arm"])
def test_follow_parent_vfork_no_exec(self):
"""
Make sure that debugging concurrent vfork() from multiple threads won't crash lldb during follow-parent.
@@ -56,6 +58,8 @@ class TestConcurrentVFork(TestBase):
self.follow_parent_helper(use_fork=False, call_exec=False)
@skipUnlessPlatform(["linux"])
+ # https://github.com/llvm/llvm-project/issues/85084.
+ @skipIf(oslist=["linux"], archs=["aarch64", "arm"])
def test_follow_parent_fork_no_exec(self):
"""
Make sure that debugging concurrent fork() from multiple threads won't crash lldb during follow-parent.
@@ -64,6 +68,8 @@ class TestConcurrentVFork(TestBase):
self.follow_parent_helper(use_fork=True, call_exec=False)
@skipUnlessPlatform(["linux"])
+ # https://github.com/llvm/llvm-project/issues/85084.
+ @skipIf(oslist=["linux"], archs=["aarch64", "arm"])
def test_follow_parent_vfork_call_exec(self):
"""
Make sure that debugging concurrent vfork() from multiple threads won't crash lldb during follow-parent.
@@ -72,6 +78,8 @@ class TestConcurrentVFork(TestBase):
self.follow_parent_helper(use_fork=False, call_exec=True)
@skipUnlessPlatform(["linux"])
+ # https://github.com/llvm/llvm-project/issues/85084.
+ @skipIf(oslist=["linux"], archs=["aarch64", "arm"])
def test_follow_parent_fork_call_exec(self):
"""
Make sure that debugging concurrent vfork() from multiple threads won't crash lldb during follow-parent.
@@ -80,6 +88,8 @@ class TestConcurrentVFork(TestBase):
self.follow_parent_helper(use_fork=True, call_exec=True)
@skipUnlessPlatform(["linux"])
+ # https://github.com/llvm/llvm-project/issues/85084.
+ @skipIf(oslist=["linux"], archs=["aarch64", "arm"])
def test_follow_child_vfork_no_exec(self):
"""
Make sure that debugging concurrent vfork() from multiple threads won't crash lldb during follow-child.
@@ -88,6 +98,8 @@ class TestConcurrentVFork(TestBase):
self.follow_child_helper(use_fork=False, call_exec=False)
@skipUnlessPlatform(["linux"])
+ # https://github.com/llvm/llvm-project/issues/85084.
+ @skipIf(oslist=["linux"], archs=["aarch64", "arm"])
def test_follow_child_fork_no_exec(self):
"""
Make sure that debugging concurrent fork() from multiple threads won't crash lldb during follow-child.
@@ -96,6 +108,8 @@ class TestConcurrentVFork(TestBase):
self.follow_child_helper(use_fork=True, call_exec=False)
@skipUnlessPlatform(["linux"])
+ # https://github.com/llvm/llvm-project/issues/85084.
+ @skipIf(oslist=["linux"], archs=["aarch64", "arm"])
def test_follow_child_vfork_call_exec(self):
"""
Make sure that debugging concurrent vfork() from multiple threads won't crash lldb during follow-child.
@@ -104,6 +118,8 @@ class TestConcurrentVFork(TestBase):
self.follow_child_helper(use_fork=False, call_exec=True)
@skipUnlessPlatform(["linux"])
+ # https://github.com/llvm/llvm-project/issues/85084.
+ @skipIf(oslist=["linux"], archs=["aarch64", "arm"])
def test_follow_child_fork_call_exec(self):
"""
Make sure that debugging concurrent fork() from multiple threads won't crash lldb during follow-child.
diff --git a/lldb/test/API/functionalities/progress_reporting/TestProgressReporting.py b/lldb/test/API/functionalities/progress_reporting/TestProgressReporting.py
index 98988d7624da..9af53845ca1b 100644
--- a/lldb/test/API/functionalities/progress_reporting/TestProgressReporting.py
+++ b/lldb/test/API/functionalities/progress_reporting/TestProgressReporting.py
@@ -13,7 +13,7 @@ class TestProgressReporting(TestBase):
TestBase.setUp(self)
self.broadcaster = self.dbg.GetBroadcaster()
self.listener = lldbutil.start_listening_from(
- self.broadcaster, lldb.eBroadcastBitProgress
+ self.broadcaster, lldb.SBDebugger.eBroadcastBitProgress
)
def test_dwarf_symbol_loading_progress_report(self):
diff --git a/lldb/test/API/functionalities/progress_reporting/clang_modules/TestClangModuleBuildProgress.py b/lldb/test/API/functionalities/progress_reporting/clang_modules/TestClangModuleBuildProgress.py
index 33c7c269c081..228f676aedf6 100644
--- a/lldb/test/API/functionalities/progress_reporting/clang_modules/TestClangModuleBuildProgress.py
+++ b/lldb/test/API/functionalities/progress_reporting/clang_modules/TestClangModuleBuildProgress.py
@@ -34,7 +34,7 @@ class TestCase(TestBase):
# other unrelated progress events.
broadcaster = self.dbg.GetBroadcaster()
listener = lldbutil.start_listening_from(
- broadcaster, lldb.eBroadcastBitProgress
+ broadcaster, lldb.SBDebugger.eBroadcastBitProgress
)
# Trigger module builds.
diff --git a/lldb/test/API/macosx/rosetta/TestRosetta.py b/lldb/test/API/macosx/rosetta/TestRosetta.py
index 669db95a1624..ce40de475ef1 100644
--- a/lldb/test/API/macosx/rosetta/TestRosetta.py
+++ b/lldb/test/API/macosx/rosetta/TestRosetta.py
@@ -49,7 +49,7 @@ class TestRosetta(TestBase):
if rosetta_debugserver_installed():
broadcaster = self.dbg.GetBroadcaster()
listener = lldbutil.start_listening_from(
- broadcaster, lldb.eBroadcastBitWarning
+ broadcaster, lldb.SBDebugger.eBroadcastBitWarning
)
target, process, thread, bkpt = lldbutil.run_to_source_breakpoint(
diff --git a/lldb/test/Shell/Unwind/eh-frame-dwarf-unwind-abort.test b/lldb/test/Shell/Unwind/eh-frame-dwarf-unwind-abort.test
index 477a656a711f..d5e66ca5e263 100644
--- a/lldb/test/Shell/Unwind/eh-frame-dwarf-unwind-abort.test
+++ b/lldb/test/Shell/Unwind/eh-frame-dwarf-unwind-abort.test
@@ -9,12 +9,12 @@ process launch
# CHECK: stop reason = signal SIGTRAP
thread backtrace
-# CHECK: frame #0: {{.*}}`asm_main + 23
+# CHECK: frame #0: {{.*}}`asm_main + 19
# CHECK: frame #1: {{.*}}`main + {{.*}}
target modules show-unwind -n asm_main
# CHECK: eh_frame UnwindPlan:
# CHECK: row[0]: 0: CFA=rsp +8 => rip=[CFA-8]
-# CHECK: row[1]: 14: CFA=rsp+16 => rbp=[CFA-16] rip=[CFA-8]
-# CHECK: row[2]: 17: CFA=rbp+16 => rbp=[CFA-16] rip=[CFA-8]
-# CHECK: row[3]: 22: CFA=rsp +8 => rip=[CFA-8]
+# CHECK: row[1]: 10: CFA=rsp+16 => rbp=[CFA-16] rip=[CFA-8]
+# CHECK: row[2]: 13: CFA=rbp+16 => rbp=[CFA-16] rip=[CFA-8]
+# CHECK: row[3]: 18: CFA=rsp +8 => rip=[CFA-8]
diff --git a/lldb/tools/lldb-dap/lldb-dap.cpp b/lldb/tools/lldb-dap/lldb-dap.cpp
index d0fbb9155715..8000d68dea7e 100644
--- a/lldb/tools/lldb-dap/lldb-dap.cpp
+++ b/lldb/tools/lldb-dap/lldb-dap.cpp
@@ -421,8 +421,8 @@ void SendStdOutStdErr(lldb::SBProcess &process) {
void ProgressEventThreadFunction() {
lldb::SBListener listener("lldb-dap.progress.listener");
- g_dap.debugger.GetBroadcaster().AddListener(listener,
- lldb::eBroadcastBitProgress);
+ g_dap.debugger.GetBroadcaster().AddListener(
+ listener, lldb::SBDebugger::eBroadcastBitProgress);
g_dap.broadcaster.AddListener(listener, eBroadcastBitStopProgressThread);
lldb::SBEvent event;
bool done = false;
diff --git a/lldb/unittests/Host/HostTest.cpp b/lldb/unittests/Host/HostTest.cpp
index 5e01a6835c03..a1d8a3b7f485 100644
--- a/lldb/unittests/Host/HostTest.cpp
+++ b/lldb/unittests/Host/HostTest.cpp
@@ -7,6 +7,7 @@
//===----------------------------------------------------------------------===//
#include "lldb/Host/Host.h"
+#include "lldb/Utility/ProcessInfo.h"
#include "gtest/gtest.h"
using namespace lldb_private;
@@ -25,3 +26,23 @@ TEST(Host, GetEnvironment) {
ASSERT_EQ("Host::GetEnvironment",
Host::GetEnvironment().lookup("LLDB_TEST_ENVIRONMENT_VAR"));
}
+
+TEST(Host, ProcessInstanceInfoCumulativeUserTimeIsValid) {
+ ProcessInstanceInfo info;
+ info.SetCumulativeUserTime(ProcessInstanceInfo::timespec{0, 0});
+ EXPECT_FALSE(info.CumulativeUserTimeIsValid());
+ info.SetCumulativeUserTime(ProcessInstanceInfo::timespec{0, 1});
+ EXPECT_TRUE(info.CumulativeUserTimeIsValid());
+ info.SetCumulativeUserTime(ProcessInstanceInfo::timespec{1, 0});
+ EXPECT_TRUE(info.CumulativeUserTimeIsValid());
+}
+
+TEST(Host, ProcessInstanceInfoCumulativeSystemTimeIsValid) {
+ ProcessInstanceInfo info;
+ info.SetCumulativeSystemTime(ProcessInstanceInfo::timespec{0, 0});
+ EXPECT_FALSE(info.CumulativeSystemTimeIsValid());
+ info.SetCumulativeSystemTime(ProcessInstanceInfo::timespec{0, 1});
+ EXPECT_TRUE(info.CumulativeSystemTimeIsValid());
+ info.SetCumulativeSystemTime(ProcessInstanceInfo::timespec{1, 0});
+ EXPECT_TRUE(info.CumulativeSystemTimeIsValid());
+}
\ No newline at end of file
diff --git a/llvm/cmake/modules/llvm-driver-template.cpp.in b/llvm/cmake/modules/llvm-driver-template.cpp.in
index 71aca6cd140c..1470ef1f0616 100644
--- a/llvm/cmake/modules/llvm-driver-template.cpp.in
+++ b/llvm/cmake/modules/llvm-driver-template.cpp.in
@@ -6,9 +6,9 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Support/LLVMDriver.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/Support/InitLLVM.h"
+#include "llvm/Support/LLVMDriver.h"
int @TOOL_NAME@_main(int argc, char **, const llvm::ToolContext &);
diff --git a/llvm/docs/CommandGuide/llvm-objcopy.rst b/llvm/docs/CommandGuide/llvm-objcopy.rst
index 57d6280d57c8..a62acfc8fdcd 100644
--- a/llvm/docs/CommandGuide/llvm-objcopy.rst
+++ b/llvm/docs/CommandGuide/llvm-objcopy.rst
@@ -366,12 +366,12 @@ them.
.. option:: --keep-global-symbol <symbol>, -G
- Make all symbols local in the output, except for symbols with the name
+ Mark all symbols local in the output, except for symbols with the name
``<symbol>``. Can be specified multiple times to ignore multiple symbols.
.. option:: --keep-global-symbols <filename>
- Make all symbols local in the output, except for symbols named in the file
+ Mark all symbols local in the output, except for symbols named in the file
``<filename>``. In the file, each line represents a single symbol, with leading
and trailing whitespace ignored, as is anything following a '#'. Can be
specified multiple times to read names from multiple files.
@@ -395,7 +395,7 @@ them.
.. option:: --localize-hidden
- Make all symbols with hidden or internal visibility local in the output.
+ Mark all symbols with hidden or internal visibility local in the output.
.. option:: --localize-symbol <symbol>, -L
diff --git a/llvm/docs/ConvergenceAndUniformity.rst b/llvm/docs/ConvergenceAndUniformity.rst
index 0e97595508f9..863cebd91a20 100644
--- a/llvm/docs/ConvergenceAndUniformity.rst
+++ b/llvm/docs/ConvergenceAndUniformity.rst
@@ -10,34 +10,61 @@ Convergence And Uniformity
Introduction
============
-Some parallel environments execute threads in groups that allow
-communication within the group using special primitives called
-*convergent* operations. The outcome of a convergent operation is
-sensitive to the set of threads that executes it "together", i.e.,
-convergently.
-
-A value is said to be *uniform* across a set of threads if it is the
-same across those threads, and *divergent* otherwise. Correspondingly,
-a branch is said to be a uniform branch if its condition is uniform,
-and it is a divergent branch otherwise.
-
-Whether threads are *converged* or not depends on the paths they take
-through the control flow graph. Threads take different outgoing edges
-at a *divergent branch*. Divergent branches constrain
+In some environments, groups of threads execute the same program in parallel,
+where efficient communication within a group is established using special
+primitives called :ref:`convergent operations<convergent_operations>`. The
+outcome of a convergent operation is sensitive to the set of threads that
+participate in it.
+
+The intuitive picture of *convergence* is built around threads executing in
+"lock step" --- a set of threads is thought of as *converged* if they are all
+executing "the same sequence of instructions together". Such threads may
+*diverge* at a *divergent branch*, and they may later *reconverge* at some
+common program point.
+
+In this intuitive picture, when converged threads execute an instruction, the
+resulting value is said to be *uniform* if it is the same in those threads, and
+*divergent* otherwise. Correspondingly, a branch is said to be a uniform branch
+if its condition is uniform, and it is a divergent branch otherwise.
+
+But the assumption of lock-step execution is not necessary for describing
+communication at convergent operations. It also constrains the implementation
+(compiler as well as hardware) by overspecifying how threads execute in such a
+parallel environment. To eliminate this assumption:
+
+- We define convergence as a relation between the execution of each instruction
+ by different threads and not as a relation between the threads themselves.
+ This definition is reasonable for known targets and is compatible with the
+ semantics of :ref:`convergent operations<convergent_operations>` in LLVM IR.
+- We also define uniformity in terms of this convergence. The output of an
+ instruction can be examined for uniformity across multiple threads only if the
+ corresponding executions of that instruction are converged.
+
+This document describes a static analysis for determining convergence at each
+instruction in a function. The analysis extends previous work on divergence
+analysis [DivergenceSPMD]_ to cover irreducible control-flow. The described
+analysis is used in LLVM to implement a UniformityAnalysis that determines the
+uniformity of value(s) computed at each instruction in an LLVM IR or MIR
+function.
+
+.. [DivergenceSPMD] Julian Rosemann, Simon Moll, and Sebastian
+ Hack. 2021. An Abstract Interpretation for SPMD Divergence on
+ Reducible Control Flow Graphs. Proc. ACM Program. Lang. 5, POPL,
+ Article 31 (January 2021), 35 pages.
+ https://doi.org/10.1145/3434312
+
+Motivation
+==========
+
+Divergent branches constrain
program transforms such as changing the CFG or moving a convergent
operation to a different point of the CFG. Performing these
transformations across a divergent branch can change the sets of
threads that execute convergent operations convergently. While these
-constraints are out of scope for this document, the described
-*uniformity analysis* allows these transformations to identify
+constraints are out of scope for this document,
+uniformity analysis allows these transformations to identify
uniform branches where these constraints do not hold.
-Convergence and
-uniformity are inter-dependent: When threads diverge at a divergent
-branch, they may later *reconverge* at a common program point.
-Subsequent operations are performed convergently, but the inputs may
-be non-uniform, thus producing divergent outputs.
-
Uniformity is also useful by itself on targets that execute threads in
groups with shared execution resources (e.g. waves, warps, or
subgroups):
@@ -50,18 +77,6 @@ subgroups):
branches, since the whole group of threads follows either one side
of the branch or the other.
-This document presents a definition of convergence that is reasonable
-for real targets and is compatible with the currently implicit
-semantics of convergent operations in LLVM IR. This is accompanied by
-a *uniformity analysis* that extends previous work on divergence analysis
-[DivergenceSPMD]_ to cover irreducible control-flow.
-
-.. [DivergenceSPMD] Julian Rosemann, Simon Moll, and Sebastian
- Hack. 2021. An Abstract Interpretation for SPMD Divergence on
- Reducible Control Flow Graphs. Proc. ACM Program. Lang. 5, POPL,
- Article 31 (January 2021), 35 pages.
- https://doi.org/10.1145/3434312
-
Terminology
===========
@@ -133,12 +148,6 @@ meaning. Dynamic instances listed in the same column are converged.
Convergence
===========
-*Converged-with* is a transitive symmetric relation over dynamic
-instances produced by *different threads* for the *same static
-instance*. Informally, two threads that produce converged dynamic
-instances are said to be *converged*, and they are said to execute
-that static instance *convergently*, at that point in the execution.
-
*Convergence-before* is a strict partial order over dynamic instances
that is defined as the transitive closure of:
@@ -171,11 +180,16 @@ to be converged (i.e., related to each other in the converged-with
relation). The resulting convergence order includes the edges ``P ->
Q2``, ``Q1 -> R``, ``P -> R``, ``P -> T``, etc.
-The fact that *convergence-before* is a strict partial order is a
-constraint on the *converged-with* relation. It is trivially satisfied
-if different dynamic instances are never converged. It is also
-trivially satisfied for all known implementations for which
-convergence plays some role.
+*Converged-with* is a transitive symmetric relation over dynamic instances
+produced by *different threads* for the *same static instance*.
+
+It is impractical to provide any one definition for the *converged-with*
+relation, since different environments may wish to relate dynamic instances in
+different ways. The fact that *convergence-before* is a strict partial order is
+a constraint on the *converged-with* relation. It is trivially satisfied if
+different dynamic instances are never converged. Below, we provide a relation
+called :ref:`maximal converged-with<convergence-maximal>`, which satisfies
+*convergence-before* and is suitable for known targets.
.. _convergence-note-convergence:
@@ -217,14 +231,16 @@ iterations of parent cycles as well.
Dynamic instances ``X1`` and ``X2`` produced by different threads
for the same static instance ``X`` are converged in the maximal
- converged-with relation if and only if for every cycle ``C`` with
- header ``H`` that contains ``X``:
-
- - every dynamic instance ``H1`` of ``H`` that precedes ``X1`` in
- the respective thread is convergence-before ``X2``, and,
- - every dynamic instance ``H2`` of ``H`` that precedes ``X2`` in
- the respective thread is convergence-before ``X1``,
- - without assuming that ``X1`` is converged with ``X2``.
+ converged-with relation if and only if:
+
+ - ``X`` is not contained in any cycle, or,
+ - For every cycle ``C`` with header ``H`` that contains ``X``:
+
+ - every dynamic instance ``H1`` of ``H`` that precedes ``X1`` in
+ the respective thread is convergence-before ``X2``, and,
+ - every dynamic instance ``H2`` of ``H`` that precedes ``X2`` in
+ the respective thread is convergence-before ``X1``,
+ - without assuming that ``X1`` is converged with ``X2``.
.. note::
diff --git a/llvm/docs/ConvergentOperations.rst b/llvm/docs/ConvergentOperations.rst
index 332675f3edef..5081efffc89a 100644
--- a/llvm/docs/ConvergentOperations.rst
+++ b/llvm/docs/ConvergentOperations.rst
@@ -936,7 +936,8 @@ property <uniformity-analysis>` of static instances in the convergence region of
1. Both threads executed converged dynamic instances of every token
definition ``D`` such that ``X`` is in the convergence region of ``D``,
and,
- 2. For every cycle ``C`` with header ``H`` that contains ``X``:
+ 2. Either ``X`` is not contained in any cycle, or, for every cycle ``C``
+ with header ``H`` that contains ``X``:
- every dynamic instance ``H1`` of ``H`` that precedes ``X1`` in the
respective thread is convergence-before ``X2``, and,
diff --git a/llvm/docs/GlobalISel/MIRPatterns.rst b/llvm/docs/GlobalISel/MIRPatterns.rst
index 728e32470144..d7dce1b978cd 100644
--- a/llvm/docs/GlobalISel/MIRPatterns.rst
+++ b/llvm/docs/GlobalISel/MIRPatterns.rst
@@ -514,3 +514,40 @@ of operands.
(match (does_not_bind $tmp, $x)
(G_MUL $dst, $x, $tmp)),
(apply (COPY $dst, $x))>;
+
+
+
+
+Gallery
+=======
+
+We should use precise patterns that state our intentions. Please avoid
+using ``wip_match_opcode`` in patterns.
+
+.. code-block:: text
+ :caption: Example fold zext(trunc:nuw)
+
+ // Imprecise: matches any G_ZEXT
+ def zext : GICombineRule<
+ (defs root:$root),
+ (match (wip_match_opcode G_ZEXT):$root,
+ [{ return Helper.matchZextOfTrunc(*${root}, ${matchinfo}); }]),
+ (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>;
+
+
+ // Imprecise: matches G_ZEXT of G_TRUNC
+ def zext_of_trunc : GICombineRule<
+ (defs root:$root),
+ (match (G_TRUNC $src, $x),
+ (G_ZEXT $root, $src),
+ [{ return Helper.matchZextOfTrunc(${root}, ${matchinfo}); }]),
+ (apply [{ Helper.applyBuildFnMO(${root}, ${matchinfo}); }])>;
+
+
+ // Precise: matches G_ZEXT of G_TRUNC with nuw flag
+ def zext_of_trunc_nuw : GICombineRule<
+ (defs root:$root),
+ (match (G_TRUNC $src, $x, (MIFlags NoUWrap)),
+ (G_ZEXT $root, $src),
+ [{ return Helper.matchZextOfTrunc(${root}, ${matchinfo}); }]),
+ (apply [{ Helper.applyBuildFnMO(${root}, ${matchinfo}); }])>;
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index f169ab941c45..37662f79145d 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -18805,7 +18805,7 @@ runtime, then the result vector is a :ref:`poison value <poisonvalues>`. The
``idx`` parameter must be a vector index constant type (for most targets this
will be an integer pointer type).
-'``llvm.experimental.vector.reverse``' Intrinsic
+'``llvm.vector.reverse``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
@@ -18814,25 +18814,26 @@ This is an overloaded intrinsic.
::
- declare <2 x i8> @llvm.experimental.vector.reverse.v2i8(<2 x i8> %a)
- declare <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
+ declare <2 x i8> @llvm.vector.reverse.v2i8(<2 x i8> %a)
+ declare <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
Overview:
"""""""""
-The '``llvm.experimental.vector.reverse.*``' intrinsics reverse a vector.
+The '``llvm.vector.reverse.*``' intrinsics reverse a vector.
The intrinsic takes a single vector and returns a vector of matching type but
with the original lane order reversed. These intrinsics work for both fixed
-and scalable vectors. While this intrinsic is marked as experimental the
-recommended way to express reverse operations for fixed-width vectors is still
-to use a shufflevector, as that may allow for more optimization opportunities.
+and scalable vectors. While this intrinsic supports all vector types,
+the recommended way to express this operation for fixed-width vectors is
+still to use a shufflevector, as that may allow for more optimization
+opportunities.
Arguments:
""""""""""
The argument to this intrinsic must be a vector.
-'``llvm.experimental.vector.deinterleave2``' Intrinsic
+'``llvm.vector.deinterleave2``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
@@ -18841,13 +18842,13 @@ This is an overloaded intrinsic.
::
- declare {<2 x double>, <2 x double>} @llvm.experimental.vector.deinterleave2.v4f64(<4 x double> %vec1)
- declare {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %vec1)
+ declare {<2 x double>, <2 x double>} @llvm.vector.deinterleave2.v4f64(<4 x double> %vec1)
+ declare {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %vec1)
Overview:
"""""""""
-The '``llvm.experimental.vector.deinterleave2``' intrinsic constructs two
+The '``llvm.vector.deinterleave2``' intrinsic constructs two
vectors by deinterleaving the even and odd lanes of the input vector.
This intrinsic works for both fixed and scalable vectors. While this intrinsic
@@ -18859,7 +18860,7 @@ For example:
.. code-block:: text
- {<2 x i64>, <2 x i64>} llvm.experimental.vector.deinterleave2.v4i64(<4 x i64> <i64 0, i64 1, i64 2, i64 3>); ==> {<2 x i64> <i64 0, i64 2>, <2 x i64> <i64 1, i64 3>}
+ {<2 x i64>, <2 x i64>} llvm.vector.deinterleave2.v4i64(<4 x i64> <i64 0, i64 1, i64 2, i64 3>); ==> {<2 x i64> <i64 0, i64 2>, <2 x i64> <i64 1, i64 3>}
Arguments:
""""""""""
@@ -18867,7 +18868,7 @@ Arguments:
The argument is a vector whose type corresponds to the logical concatenation of
the two result types.
-'``llvm.experimental.vector.interleave2``' Intrinsic
+'``llvm.vector.interleave2``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
@@ -18876,13 +18877,13 @@ This is an overloaded intrinsic.
::
- declare <4 x double> @llvm.experimental.vector.interleave2.v4f64(<2 x double> %vec1, <2 x double> %vec2)
- declare <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> %vec1, <vscale x 4 x i32> %vec2)
+ declare <4 x double> @llvm.vector.interleave2.v4f64(<2 x double> %vec1, <2 x double> %vec2)
+ declare <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %vec1, <vscale x 4 x i32> %vec2)
Overview:
"""""""""
-The '``llvm.experimental.vector.interleave2``' intrinsic constructs a vector
+The '``llvm.vector.interleave2``' intrinsic constructs a vector
by interleaving two input vectors.
This intrinsic works for both fixed and scalable vectors. While this intrinsic
@@ -18894,7 +18895,7 @@ For example:
.. code-block:: text
- <4 x i64> llvm.experimental.vector.interleave2.v4i64(<2 x i64> <i64 0, i64 2>, <2 x i64> <i64 1, i64 3>); ==> <4 x i64> <i64 0, i64 1, i64 2, i64 3>
+ <4 x i64> llvm.vector.interleave2.v4i64(<2 x i64> <i64 0, i64 2>, <2 x i64> <i64 1, i64 3>); ==> <4 x i64> <i64 0, i64 1, i64 2, i64 3>
Arguments:
""""""""""
@@ -18940,7 +18941,7 @@ The '``llvm.experimental.cttz.elts``' intrinsic counts the trailing (least
significant) zero elements in a vector. If ``src == 0`` the result is the
number of elements in the input vector.
-'``llvm.experimental.vector.splice``' Intrinsic
+'``llvm.vector.splice``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
@@ -18949,13 +18950,13 @@ This is an overloaded intrinsic.
::
- declare <2 x double> @llvm.experimental.vector.splice.v2f64(<2 x double> %vec1, <2 x double> %vec2, i32 %imm)
- declare <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %vec1, <vscale x 4 x i32> %vec2, i32 %imm)
+ declare <2 x double> @llvm.vector.splice.v2f64(<2 x double> %vec1, <2 x double> %vec2, i32 %imm)
+ declare <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> %vec1, <vscale x 4 x i32> %vec2, i32 %imm)
Overview:
"""""""""
-The '``llvm.experimental.vector.splice.*``' intrinsics construct a vector by
+The '``llvm.vector.splice.*``' intrinsics construct a vector by
concatenating elements from the first input vector with elements of the second
input vector, returning a vector of the same type as the input vectors. The
signed immediate, modulo the number of elements in the vector, is the index
@@ -18966,7 +18967,7 @@ immediate, it extracts ``-imm`` trailing elements from the first vector, and
the remaining elements from ``%vec2``.
These intrinsics work for both fixed and scalable vectors. While this intrinsic
-is marked as experimental, the recommended way to express this operation for
+supports all vector types, the recommended way to express this operation for
fixed-width vectors is still to use a shufflevector, as that may allow for more
optimization opportunities.
@@ -18974,8 +18975,8 @@ For example:
.. code-block:: text
- llvm.experimental.vector.splice(<A,B,C,D>, <E,F,G,H>, 1); ==> <B, C, D, E> index
- llvm.experimental.vector.splice(<A,B,C,D>, <E,F,G,H>, -3); ==> <B, C, D, E> trailing elements
+ llvm.vector.splice(<A,B,C,D>, <E,F,G,H>, 1); ==> <B, C, D, E> index
+ llvm.vector.splice(<A,B,C,D>, <E,F,G,H>, -3); ==> <B, C, D, E> trailing elements
Arguments:
@@ -22198,7 +22199,7 @@ Overview:
"""""""""
The '``llvm.experimental.vp.splice.*``' intrinsic is the vector length
-predicated version of the '``llvm.experimental.vector.splice.*``' intrinsic.
+predicated version of the '``llvm.vector.splice.*``' intrinsic.
Arguments:
""""""""""
@@ -22257,7 +22258,7 @@ Overview:
"""""""""
The '``llvm.experimental.vp.reverse.*``' intrinsic is the vector length
-predicated version of the '``llvm.experimental.vector.reverse.*``' intrinsic.
+predicated version of the '``llvm.vector.reverse.*``' intrinsic.
Arguments:
""""""""""
diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst
index 6ef6ec20da67..46d79d6c5822 100644
--- a/llvm/docs/ReleaseNotes.rst
+++ b/llvm/docs/ReleaseNotes.rst
@@ -50,7 +50,11 @@ Update on required toolchains to build LLVM
Changes to the LLVM IR
----------------------
-- Added Memory Model Relaxation Annotations (MMRAs).
+* Added Memory Model Relaxation Annotations (MMRAs).
+* Renamed ``llvm.experimental.vector.reverse`` intrinsic to ``llvm.vector.reverse``.
+* Renamed ``llvm.experimental.vector.splice`` intrinsic to ``llvm.vector.splice``.
+* Renamed ``llvm.experimental.vector.interleave2`` intrinsic to ``llvm.vector.interleave2``.
+* Renamed ``llvm.experimental.vector.deinterleave2`` intrinsic to ``llvm.vector.deinterleave2``.
Changes to LLVM infrastructure
------------------------------
@@ -69,7 +73,8 @@ Changes to Interprocedural Optimizations
Changes to the AArch64 Backend
------------------------------
-* Added support for Cortex-A78AE, Cortex-A520AE and Cortex-A720AE CPUs.
+* Added support for Cortex-A78AE, Cortex-A520AE, Cortex-A720AE,
+ Neoverse-N3, Neoverse-V3 and Neoverse-V3AE CPUs.
Changes to the AMDGPU Backend
-----------------------------
@@ -112,6 +117,8 @@ Changes to the RISC-V Backend
* The experimental Ssqosid extension is supported.
* Zacas is no longer experimental.
* Added the CSR names from the Resumable Non-Maskable Interrupts (Smrnmi) extension.
+* llvm-objdump now prints disassembled opcode bytes in groups of 2 or 4 bytes to
+ match GNU objdump. The bytes within the groups are in big endian order.
Changes to the WebAssembly Backend
----------------------------------
diff --git a/llvm/include/llvm-c/DebugInfo.h b/llvm/include/llvm-c/DebugInfo.h
index dab1d697761b..2c3c75e246c0 100644
--- a/llvm/include/llvm-c/DebugInfo.h
+++ b/llvm/include/llvm-c/DebugInfo.h
@@ -125,7 +125,20 @@ typedef enum {
LLVMDWARFSourceLanguageFortran18,
LLVMDWARFSourceLanguageAda2005,
LLVMDWARFSourceLanguageAda2012,
+ LLVMDWARFSourceLanguageHIP,
+ LLVMDWARFSourceLanguageAssembly,
+ LLVMDWARFSourceLanguageC_sharp,
LLVMDWARFSourceLanguageMojo,
+ LLVMDWARFSourceLanguageGLSL,
+ LLVMDWARFSourceLanguageGLSL_ES,
+ LLVMDWARFSourceLanguageHLSL,
+ LLVMDWARFSourceLanguageOpenCL_CPP,
+ LLVMDWARFSourceLanguageCPP_for_OpenCL,
+ LLVMDWARFSourceLanguageSYCL,
+ LLVMDWARFSourceLanguageRuby,
+ LLVMDWARFSourceLanguageMove,
+ LLVMDWARFSourceLanguageHylo,
+
// Vendor extensions:
LLVMDWARFSourceLanguageMips_Assembler,
LLVMDWARFSourceLanguageGOOGLE_RenderScript,
diff --git a/llvm/include/llvm/ADT/StringRef.h b/llvm/include/llvm/ADT/StringRef.h
index 0360174c5231..04496c76e072 100644
--- a/llvm/include/llvm/ADT/StringRef.h
+++ b/llvm/include/llvm/ADT/StringRef.h
@@ -258,6 +258,9 @@ namespace llvm {
return Length >= Prefix.Length &&
compareMemory(Data, Prefix.Data, Prefix.Length) == 0;
}
+  /// Check if this string starts with the given \p Prefix character.
+  [[nodiscard]] bool starts_with(char Prefix) const {
+    return !empty() && front() == Prefix;
+  }
/// Check if this string starts with the given \p Prefix, ignoring case.
[[nodiscard]] bool starts_with_insensitive(StringRef Prefix) const;
@@ -268,6 +271,9 @@ namespace llvm {
compareMemory(end() - Suffix.Length, Suffix.Data, Suffix.Length) ==
0;
}
+  /// Check if this string ends with the given \p Suffix character.
+  [[nodiscard]] bool ends_with(char Suffix) const {
+    return !empty() && back() == Suffix;
+  }
/// Check if this string ends with the given \p Suffix, ignoring case.
[[nodiscard]] bool ends_with_insensitive(StringRef Suffix) const;
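A small usage sketch of the new single-character overloads above; they avoid materializing a one-character ``StringRef`` and reduce to a front/back comparison:

.. code-block:: cpp

   #include "llvm/ADT/StringRef.h"
   #include <cassert>

   int main() {
     llvm::StringRef Path("/usr/lib/");
     assert(Path.starts_with('/')); // same result as starts_with("/")
     assert(Path.ends_with('/'));
     assert(!llvm::StringRef().starts_with('/')); // empty strings never match
     return 0;
   }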
diff --git a/llvm/include/llvm/Analysis/ValueTracking.h b/llvm/include/llvm/Analysis/ValueTracking.h
index 571e44cdac26..afd18e7e56ba 100644
--- a/llvm/include/llvm/Analysis/ValueTracking.h
+++ b/llvm/include/llvm/Analysis/ValueTracking.h
@@ -860,7 +860,8 @@ enum class OverflowResult {
};
OverflowResult computeOverflowForUnsignedMul(const Value *LHS, const Value *RHS,
- const SimplifyQuery &SQ);
+ const SimplifyQuery &SQ,
+ bool IsNSW = false);
OverflowResult computeOverflowForSignedMul(const Value *LHS, const Value *RHS,
const SimplifyQuery &SQ);
OverflowResult
diff --git a/llvm/include/llvm/BinaryFormat/Dwarf.def b/llvm/include/llvm/BinaryFormat/Dwarf.def
index 460a9264536b..adcf24eb83b0 100644
--- a/llvm/include/llvm/BinaryFormat/Dwarf.def
+++ b/llvm/include/llvm/BinaryFormat/Dwarf.def
@@ -11,19 +11,20 @@
//===----------------------------------------------------------------------===//
// TODO: Add other DW-based macros.
-#if !( \
- defined HANDLE_DW_TAG || defined HANDLE_DW_AT || defined HANDLE_DW_FORM || \
- defined HANDLE_DW_OP || defined HANDLE_DW_OP_LLVM_USEROP || \
- defined HANDLE_DW_LANG || defined HANDLE_DW_ATE || \
- defined HANDLE_DW_VIRTUALITY || defined HANDLE_DW_DEFAULTED || \
- defined HANDLE_DW_CC || defined HANDLE_DW_LNS || defined HANDLE_DW_LNE || \
- defined HANDLE_DW_LNCT || defined HANDLE_DW_MACRO || \
- defined HANDLE_DW_MACRO_GNU || defined HANDLE_MACRO_FLAG || \
- defined HANDLE_DW_RLE || defined HANDLE_DW_LLE || \
- (defined HANDLE_DW_CFA && defined HANDLE_DW_CFA_PRED) || \
- defined HANDLE_DW_APPLE_PROPERTY || defined HANDLE_DW_UT || \
- defined HANDLE_DWARF_SECTION || defined HANDLE_DW_IDX || \
- defined HANDLE_DW_END || defined HANDLE_DW_SECT)
+#if !(defined HANDLE_DW_TAG || defined HANDLE_DW_AT || \
+ defined HANDLE_DW_FORM || defined HANDLE_DW_OP || \
+ defined HANDLE_DW_OP_LLVM_USEROP || defined HANDLE_DW_LANG || \
+ defined HANDLE_DW_LNAME || defined HANDLE_DW_ATE || \
+ defined HANDLE_DW_VIRTUALITY || defined HANDLE_DW_DEFAULTED || \
+ defined HANDLE_DW_CC || defined HANDLE_DW_LNS || \
+ defined HANDLE_DW_LNE || defined HANDLE_DW_LNCT || \
+ defined HANDLE_DW_MACRO || defined HANDLE_DW_MACRO_GNU || \
+ defined HANDLE_MACRO_FLAG || defined HANDLE_DW_RLE || \
+ defined HANDLE_DW_LLE || \
+ (defined HANDLE_DW_CFA && defined HANDLE_DW_CFA_PRED) || \
+ defined HANDLE_DW_APPLE_PROPERTY || defined HANDLE_DW_UT || \
+ defined HANDLE_DWARF_SECTION || defined HANDLE_DW_IDX || \
+ defined HANDLE_DW_END || defined HANDLE_DW_SECT)
#error "Missing macro definition of HANDLE_DW*"
#endif
@@ -61,6 +62,10 @@
#define HANDLE_DW_LANG(ID, NAME, LOWER_BOUND, VERSION, VENDOR)
#endif
+#ifndef HANDLE_DW_LNAME
+#define HANDLE_DW_LNAME(ID, NAME, DESC, LOWER_BOUND)
+#endif
+
#ifndef HANDLE_DW_ATE
#define HANDLE_DW_ATE(ID, NAME, VERSION, VENDOR)
#endif
@@ -950,12 +955,81 @@ HANDLE_DW_LANG(0x002c, C17, 0, 0, DWARF)
HANDLE_DW_LANG(0x002d, Fortran18, 0, 0, DWARF)
HANDLE_DW_LANG(0x002e, Ada2005, 0, 0, DWARF)
HANDLE_DW_LANG(0x002f, Ada2012, 0, 0, DWARF)
+HANDLE_DW_LANG(0x0030, HIP, 0, 0, DWARF)
+HANDLE_DW_LANG(0x0031, Assembly, 0, 0, DWARF)
+HANDLE_DW_LANG(0x0032, C_sharp, 0, 0, DWARF)
HANDLE_DW_LANG(0x0033, Mojo, 0, 0, DWARF)
+HANDLE_DW_LANG(0x0034, GLSL, 0, 0, DWARF)
+HANDLE_DW_LANG(0x0035, GLSL_ES, 0, 0, DWARF)
+HANDLE_DW_LANG(0x0036, HLSL, 0, 0, DWARF)
+HANDLE_DW_LANG(0x0037, OpenCL_CPP, 0, 0, DWARF)
+HANDLE_DW_LANG(0x0038, CPP_for_OpenCL, 0, 0, DWARF)
+HANDLE_DW_LANG(0x0039, SYCL, 0, 0, DWARF)
+HANDLE_DW_LANG(0x0040, Ruby, 0, 0, DWARF)
+HANDLE_DW_LANG(0x0041, Move, 0, 0, DWARF)
+HANDLE_DW_LANG(0x0042, Hylo, 0, 0, DWARF)
+
// Vendor extensions:
HANDLE_DW_LANG(0x8001, Mips_Assembler, std::nullopt, 0, MIPS)
HANDLE_DW_LANG(0x8e57, GOOGLE_RenderScript, 0, 0, GOOGLE)
HANDLE_DW_LANG(0xb000, BORLAND_Delphi, 0, 0, BORLAND)
+// Tentative DWARF 6 language codes. This list is subject to change.
+HANDLE_DW_LNAME(0x0001, Ada, "ISO Ada", 1) // YYYY
+HANDLE_DW_LNAME(0x0002, BLISS, "BLISS", 0)
+// YYYYMM
+// K&R 000000
+// C89 198912
+// C99 199901
+// C11 201112
+// C17 201710
+// C23 202311
+HANDLE_DW_LNAME(0x0003, C, "C (K&R and ISO)", 0)
+// YYYYMM
+// C++98 199711
+// C++03 200310
+// C++11 201103
+// C++14 201402
+// C++17 201703
+// C++20 202002
+HANDLE_DW_LNAME(0x0004, C_plus_plus, "ISO C++", 0)
+HANDLE_DW_LNAME(0x0005, Cobol, "ISO Cobol", 1) // YYYY
+HANDLE_DW_LNAME(0x0006, Crystal, "Crystal", 0)
+HANDLE_DW_LNAME(0x0007, D, "D", 0)
+HANDLE_DW_LNAME(0x0008, Dylan, "Dylan", 0)
+HANDLE_DW_LNAME(0x0009, Fortran, "ISO Fortran", 1) // YYYY
+HANDLE_DW_LNAME(0x000a, Go, "Go", 0)
+HANDLE_DW_LNAME(0x000b, Haskell, "Haskell", 0)
+HANDLE_DW_LNAME(0x000c, Java, "Java", 0)
+HANDLE_DW_LNAME(0x000d, Julia, "Julia", 1)
+HANDLE_DW_LNAME(0x000e, Kotlin, "Kotlin", 0)
+HANDLE_DW_LNAME(0x000f, Modula2, "Modula 2", 1)
+HANDLE_DW_LNAME(0x0010, Modula3, "Modula 3", 1)
+HANDLE_DW_LNAME(0x0011, ObjC, "Objective C", 0) // YYYYMM
+HANDLE_DW_LNAME(0x0012, ObjC_plus_plus, "Objective C++", 0) // YYYYMM
+HANDLE_DW_LNAME(0x0013, OCaml, "OCaml", 0)
+HANDLE_DW_LNAME(0x0014, OpenCL_C, "OpenCL C", 0)
+HANDLE_DW_LNAME(0x0015, Pascal, "ISO Pascal", 1) // YYYY
+HANDLE_DW_LNAME(0x0016, PLI, "ANSI PL/I", 1)
+HANDLE_DW_LNAME(0x0017, Python, "Python", 0)
+HANDLE_DW_LNAME(0x0018, RenderScript, "RenderScript Kernel Language", 0)
+HANDLE_DW_LNAME(0x0019, Rust, "Rust", 0)
+HANDLE_DW_LNAME(0x001a, Swift, "Swift", 0) // VVMM
+HANDLE_DW_LNAME(0x001b, UPC, "Unified Parallel C (UPC)", 0)
+HANDLE_DW_LNAME(0x001c, Zig, "Zig", 0)
+HANDLE_DW_LNAME(0x001d, Assembly, "Assembly", 0)
+// Conflict: HANDLE_DW_LNAME(0x001d, HIP, "HIP", 0)
+HANDLE_DW_LNAME(0x001e, C_sharp, "C#", 0)
+HANDLE_DW_LNAME(0x001f, Mojo, "Mojo", 0)
+HANDLE_DW_LNAME(0x0020, GLSL, "OpenGL Shading Language", 0) // VVMMPP
+HANDLE_DW_LNAME(0x0021, GLSL_ES, "OpenGL ES Shading Language", 0) // VVMMPP
+HANDLE_DW_LNAME(0x0022, HLSL, "High Level Shading Language", 0) // YYYY
+HANDLE_DW_LNAME(0x0023, OpenCL_CPP, "OpenCL C++", 0) // VVMM
+HANDLE_DW_LNAME(0x0024, CPP_for_OpenCL, "C++ for OpenCL", 0) // VVMM
+HANDLE_DW_LNAME(0x0025, SYCL, "SYCL", 0) // YYYYRR
+HANDLE_DW_LNAME(0x0026, Ruby, "Ruby", 0) // VVMMPP
+HANDLE_DW_LNAME(0x0027, Move, "Move", 0) // YYYYMM
+HANDLE_DW_LNAME(0x0028, Hylo, "Hylo", 0)
// DWARF attribute type encodings.
HANDLE_DW_ATE(0x01, address, 2, DWARF)
@@ -1267,6 +1341,7 @@ HANDLE_DW_SECT(8, RNGLISTS)
#undef HANDLE_DW_OP
#undef HANDLE_DW_OP_LLVM_USEROP
#undef HANDLE_DW_LANG
+#undef HANDLE_DW_LNAME
#undef HANDLE_DW_ATE
#undef HANDLE_DW_VIRTUALITY
#undef HANDLE_DW_DEFAULTED
diff --git a/llvm/include/llvm/BinaryFormat/Dwarf.h b/llvm/include/llvm/BinaryFormat/Dwarf.h
index 298700c8941e..74c4d6ff3a71 100644
--- a/llvm/include/llvm/BinaryFormat/Dwarf.h
+++ b/llvm/include/llvm/BinaryFormat/Dwarf.h
@@ -209,6 +209,284 @@ enum SourceLanguage {
DW_LANG_hi_user = 0xffff
};
+enum SourceLanguageName : uint16_t {
+#define HANDLE_DW_LNAME(ID, NAME, DESC, LOWER_BOUND) DW_LNAME_##NAME = ID,
+#include "llvm/BinaryFormat/Dwarf.def"
+};
+
+/// Convert a DWARF 6 pair of language name and version to a DWARF 5 DW_LANG.
+/// If the version number doesn't exactly match a known version it is
+/// rounded up to the next-highest known version number.
+inline std::optional<SourceLanguage> toDW_LANG(SourceLanguageName name,
+ uint32_t version) {
+ switch (name) {
+ case DW_LNAME_Ada: // YYYY
+ if (version <= 1983)
+ return DW_LANG_Ada83;
+ if (version <= 1995)
+ return DW_LANG_Ada95;
+ if (version <= 2005)
+ return DW_LANG_Ada2005;
+ if (version <= 2012)
+ return DW_LANG_Ada2012;
+ return {};
+ case DW_LNAME_BLISS:
+ return DW_LANG_BLISS;
+ case DW_LNAME_C: // YYYYMM, K&R 000000
+ if (version == 0)
+ return DW_LANG_C;
+ if (version <= 198912)
+ return DW_LANG_C89;
+ if (version <= 199901)
+ return DW_LANG_C99;
+ if (version <= 201112)
+ return DW_LANG_C11;
+ if (version <= 201710)
+ return DW_LANG_C17;
+ return {};
+ case DW_LNAME_C_plus_plus: // YYYYMM
+ if (version == 0)
+ return DW_LANG_C_plus_plus;
+ if (version <= 199711)
+ return DW_LANG_C_plus_plus;
+ if (version <= 200310)
+ return DW_LANG_C_plus_plus_03;
+ if (version <= 201103)
+ return DW_LANG_C_plus_plus_11;
+ if (version <= 201402)
+ return DW_LANG_C_plus_plus_14;
+ if (version <= 201703)
+ return DW_LANG_C_plus_plus_17;
+ if (version <= 202002)
+ return DW_LANG_C_plus_plus_20;
+ return {};
+ case DW_LNAME_Cobol: // YYYY
+ if (version <= 1974)
+ return DW_LANG_Cobol74;
+ if (version <= 1985)
+ return DW_LANG_Cobol85;
+ return {};
+ case DW_LNAME_Crystal:
+ return DW_LANG_Crystal;
+ case DW_LNAME_D:
+ return DW_LANG_D;
+ case DW_LNAME_Dylan:
+ return DW_LANG_Dylan;
+ case DW_LNAME_Fortran: // YYYY
+ if (version <= 1977)
+ return DW_LANG_Fortran77;
+ if (version <= 1990)
+ return DW_LANG_Fortran90;
+ if (version <= 1995)
+ return DW_LANG_Fortran95;
+ if (version <= 2003)
+ return DW_LANG_Fortran03;
+ if (version <= 2008)
+ return DW_LANG_Fortran08;
+ if (version <= 2018)
+ return DW_LANG_Fortran18;
+ return {};
+ case DW_LNAME_Go:
+ return DW_LANG_Go;
+ case DW_LNAME_Haskell:
+ return DW_LANG_Haskell;
+ // case DW_LNAME_HIP:
+ // return DW_LANG_HIP;
+ case DW_LNAME_Java:
+ return DW_LANG_Java;
+ case DW_LNAME_Julia:
+ return DW_LANG_Julia;
+ case DW_LNAME_Kotlin:
+ return DW_LANG_Kotlin;
+ case DW_LNAME_Modula2:
+ return DW_LANG_Modula2;
+ case DW_LNAME_Modula3:
+ return DW_LANG_Modula3;
+ case DW_LNAME_ObjC:
+ return DW_LANG_ObjC;
+ case DW_LNAME_ObjC_plus_plus:
+ return DW_LANG_ObjC_plus_plus;
+ case DW_LNAME_OCaml:
+ return DW_LANG_OCaml;
+ case DW_LNAME_OpenCL_C:
+ return DW_LANG_OpenCL;
+ case DW_LNAME_Pascal:
+ return DW_LANG_Pascal83;
+ case DW_LNAME_PLI:
+ return DW_LANG_PLI;
+ case DW_LNAME_Python:
+ return DW_LANG_Python;
+ case DW_LNAME_RenderScript:
+ return DW_LANG_RenderScript;
+ case DW_LNAME_Rust:
+ return DW_LANG_Rust;
+ case DW_LNAME_Swift:
+ return DW_LANG_Swift;
+ case DW_LNAME_UPC:
+ return DW_LANG_UPC;
+ case DW_LNAME_Zig:
+ return DW_LANG_Zig;
+ case DW_LNAME_Assembly:
+ return DW_LANG_Assembly;
+ case DW_LNAME_C_sharp:
+ return DW_LANG_C_sharp;
+ case DW_LNAME_Mojo:
+ return DW_LANG_Mojo;
+ case DW_LNAME_GLSL:
+ return DW_LANG_GLSL;
+ case DW_LNAME_GLSL_ES:
+ return DW_LANG_GLSL_ES;
+ case DW_LNAME_HLSL:
+ return DW_LANG_HLSL;
+ case DW_LNAME_OpenCL_CPP:
+ return DW_LANG_OpenCL_CPP;
+ case DW_LNAME_CPP_for_OpenCL:
+ return {};
+ case DW_LNAME_SYCL:
+ return DW_LANG_SYCL;
+ case DW_LNAME_Ruby:
+ return DW_LANG_Ruby;
+ case DW_LNAME_Move:
+ return DW_LANG_Move;
+ case DW_LNAME_Hylo:
+ return DW_LANG_Hylo;
+ }
+ return {};
+}
+
+/// Convert a DWARF 5 DW_LANG to a DWARF 6 pair of language name and version.
+inline std::optional<std::pair<SourceLanguageName, uint32_t>>
+toDW_LNAME(SourceLanguage language) {
+ switch (language) {
+ case DW_LANG_Ada83:
+ return {{DW_LNAME_Ada, 1983}};
+ case DW_LANG_Ada95:
+ return {{DW_LNAME_Ada, 1995}};
+ case DW_LANG_Ada2005:
+ return {{DW_LNAME_Ada, 2005}};
+ case DW_LANG_Ada2012:
+ return {{DW_LNAME_Ada, 2012}};
+ case DW_LANG_BLISS:
+ return {{DW_LNAME_BLISS, 0}};
+ case DW_LANG_C:
+ return {{DW_LNAME_C, 0}};
+ case DW_LANG_C89:
+ return {{DW_LNAME_C, 198912}};
+ case DW_LANG_C99:
+ return {{DW_LNAME_C, 199901}};
+ case DW_LANG_C11:
+ return {{DW_LNAME_C, 201112}};
+ case DW_LANG_C17:
+    return {{DW_LNAME_C, 201710}};
+ case DW_LANG_C_plus_plus:
+ return {{DW_LNAME_C_plus_plus, 0}};
+ case DW_LANG_C_plus_plus_03:
+ return {{DW_LNAME_C_plus_plus, 200310}};
+ case DW_LANG_C_plus_plus_11:
+ return {{DW_LNAME_C_plus_plus, 201103}};
+ case DW_LANG_C_plus_plus_14:
+ return {{DW_LNAME_C_plus_plus, 201402}};
+ case DW_LANG_C_plus_plus_17:
+ return {{DW_LNAME_C_plus_plus, 201703}};
+ case DW_LANG_C_plus_plus_20:
+ return {{DW_LNAME_C_plus_plus, 202002}};
+ case DW_LANG_Cobol74:
+ return {{DW_LNAME_Cobol, 1974}};
+ case DW_LANG_Cobol85:
+ return {{DW_LNAME_Cobol, 1985}};
+ case DW_LANG_Crystal:
+ return {{DW_LNAME_Crystal, 0}};
+ case DW_LANG_D:
+ return {{DW_LNAME_D, 0}};
+ case DW_LANG_Dylan:
+ return {{DW_LNAME_Dylan, 0}};
+ case DW_LANG_Fortran77:
+ return {{DW_LNAME_Fortran, 1977}};
+ case DW_LANG_Fortran90:
+ return {{DW_LNAME_Fortran, 1990}};
+ case DW_LANG_Fortran95:
+ return {{DW_LNAME_Fortran, 1995}};
+ case DW_LANG_Fortran03:
+ return {{DW_LNAME_Fortran, 2003}};
+ case DW_LANG_Fortran08:
+ return {{DW_LNAME_Fortran, 2008}};
+ case DW_LANG_Fortran18:
+ return {{DW_LNAME_Fortran, 2018}};
+ case DW_LANG_Go:
+ return {{DW_LNAME_Go, 0}};
+ case DW_LANG_Haskell:
+ return {{DW_LNAME_Haskell, 0}};
+ case DW_LANG_HIP:
+ return {}; // return {{DW_LNAME_HIP, 0}};
+ case DW_LANG_Java:
+ return {{DW_LNAME_Java, 0}};
+ case DW_LANG_Julia:
+ return {{DW_LNAME_Julia, 0}};
+ case DW_LANG_Kotlin:
+ return {{DW_LNAME_Kotlin, 0}};
+ case DW_LANG_Modula2:
+ return {{DW_LNAME_Modula2, 0}};
+ case DW_LANG_Modula3:
+ return {{DW_LNAME_Modula3, 0}};
+ case DW_LANG_ObjC:
+ return {{DW_LNAME_ObjC, 0}};
+ case DW_LANG_ObjC_plus_plus:
+ return {{DW_LNAME_ObjC_plus_plus, 0}};
+ case DW_LANG_OCaml:
+ return {{DW_LNAME_OCaml, 0}};
+ case DW_LANG_OpenCL:
+ return {{DW_LNAME_OpenCL_C, 0}};
+ case DW_LANG_Pascal83:
+ return {{DW_LNAME_Pascal, 1983}};
+ case DW_LANG_PLI:
+ return {{DW_LNAME_PLI, 0}};
+ case DW_LANG_Python:
+ return {{DW_LNAME_Python, 0}};
+ case DW_LANG_RenderScript:
+ case DW_LANG_GOOGLE_RenderScript:
+ return {{DW_LNAME_RenderScript, 0}};
+ case DW_LANG_Rust:
+ return {{DW_LNAME_Rust, 0}};
+ case DW_LANG_Swift:
+ return {{DW_LNAME_Swift, 0}};
+ case DW_LANG_UPC:
+ return {{DW_LNAME_UPC, 0}};
+ case DW_LANG_Zig:
+ return {{DW_LNAME_Zig, 0}};
+ case DW_LANG_Assembly:
+ case DW_LANG_Mips_Assembler:
+ return {{DW_LNAME_Assembly, 0}};
+ case DW_LANG_C_sharp:
+ return {{DW_LNAME_C_sharp, 0}};
+ case DW_LANG_Mojo:
+ return {{DW_LNAME_Mojo, 0}};
+ case DW_LANG_GLSL:
+ return {{DW_LNAME_GLSL, 0}};
+ case DW_LANG_GLSL_ES:
+ return {{DW_LNAME_GLSL_ES, 0}};
+ case DW_LANG_HLSL:
+ return {{DW_LNAME_HLSL, 0}};
+ case DW_LANG_OpenCL_CPP:
+ return {{DW_LNAME_OpenCL_CPP, 0}};
+ case DW_LANG_SYCL:
+ return {{DW_LNAME_SYCL, 0}};
+ case DW_LANG_Ruby:
+ return {{DW_LNAME_Ruby, 0}};
+ case DW_LANG_Move:
+ return {{DW_LNAME_Move, 0}};
+ case DW_LANG_Hylo:
+ return {{DW_LNAME_Hylo, 0}};
+ case DW_LANG_BORLAND_Delphi:
+ case DW_LANG_CPP_for_OpenCL:
+ case DW_LANG_lo_user:
+ case DW_LANG_hi_user:
+ return {};
+ }
+ return {};
+}
+
+llvm::StringRef LanguageDescription(SourceLanguageName name);
+
inline bool isCPlusPlus(SourceLanguage S) {
bool result = false;
// Deliberately enumerate all the language options so we get a warning when
@@ -268,7 +546,19 @@ inline bool isCPlusPlus(SourceLanguage S) {
case DW_LANG_Fortran18:
case DW_LANG_Ada2005:
case DW_LANG_Ada2012:
+ case DW_LANG_HIP:
+ case DW_LANG_Assembly:
+ case DW_LANG_C_sharp:
case DW_LANG_Mojo:
+ case DW_LANG_GLSL:
+ case DW_LANG_GLSL_ES:
+ case DW_LANG_HLSL:
+ case DW_LANG_OpenCL_CPP:
+ case DW_LANG_CPP_for_OpenCL:
+ case DW_LANG_SYCL:
+ case DW_LANG_Ruby:
+ case DW_LANG_Move:
+ case DW_LANG_Hylo:
result = false;
break;
}
@@ -335,7 +625,19 @@ inline bool isFortran(SourceLanguage S) {
case DW_LANG_C17:
case DW_LANG_Ada2005:
case DW_LANG_Ada2012:
+ case DW_LANG_HIP:
+ case DW_LANG_Assembly:
+ case DW_LANG_C_sharp:
case DW_LANG_Mojo:
+ case DW_LANG_GLSL:
+ case DW_LANG_GLSL_ES:
+ case DW_LANG_HLSL:
+ case DW_LANG_OpenCL_CPP:
+ case DW_LANG_CPP_for_OpenCL:
+ case DW_LANG_SYCL:
+ case DW_LANG_Ruby:
+ case DW_LANG_Move:
+ case DW_LANG_Hylo:
result = false;
break;
}
@@ -400,7 +702,19 @@ inline bool isC(SourceLanguage S) {
case DW_LANG_Fortran18:
case DW_LANG_Ada2005:
case DW_LANG_Ada2012:
+ case DW_LANG_HIP:
+ case DW_LANG_Assembly:
+ case DW_LANG_C_sharp:
case DW_LANG_Mojo:
+ case DW_LANG_GLSL:
+ case DW_LANG_GLSL_ES:
+ case DW_LANG_HLSL:
+ case DW_LANG_OpenCL_CPP:
+ case DW_LANG_CPP_for_OpenCL:
+ case DW_LANG_SYCL:
+ case DW_LANG_Ruby:
+ case DW_LANG_Move:
+ case DW_LANG_Hylo:
return false;
}
llvm_unreachable("Unknown language kind.");
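A round-trip sketch of the two conversion helpers added above, with values taken from the tables in ``Dwarf.def``:

.. code-block:: cpp

   #include "llvm/BinaryFormat/Dwarf.h"
   #include <cassert>

   int main() {
     using namespace llvm::dwarf;
     // DWARF 5 language -> tentative DWARF 6 (name, version) pair.
     auto NV = toDW_LNAME(DW_LANG_C_plus_plus_14);
     assert(NV && NV->first == DW_LNAME_C_plus_plus && NV->second == 201402);
     // DWARF 6 pair -> DWARF 5; versions round up to the next known standard.
     assert(toDW_LANG(DW_LNAME_C, 199000) == DW_LANG_C99);
     // Languages with no DWARF 5 equivalent yield std::nullopt.
     assert(!toDW_LANG(DW_LNAME_CPP_for_OpenCL, 0));
     return 0;
   }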
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 4a3a03dc5ad4..92b51438b4cb 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1662,12 +1662,12 @@ public:
TTI::SK_InsertSubvector, cast<VectorType>(Args[0]->getType()),
std::nullopt, CostKind, Index, cast<VectorType>(Args[1]->getType()));
}
- case Intrinsic::experimental_vector_reverse: {
+ case Intrinsic::vector_reverse: {
return thisT()->getShuffleCost(
TTI::SK_Reverse, cast<VectorType>(Args[0]->getType()), std::nullopt,
CostKind, 0, cast<VectorType>(RetTy));
}
- case Intrinsic::experimental_vector_splice: {
+ case Intrinsic::vector_splice: {
unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue();
return thisT()->getShuffleCost(
TTI::SK_Splice, cast<VectorType>(Args[0]->getType()), std::nullopt,
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index 4b8aec8e8a5d..76e8d1166ae0 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -853,6 +853,9 @@ public:
bool matchExtractVectorElementWithDifferentIndices(const MachineOperand &MO,
BuildFnTy &MatchInfo);
+ /// Combine insert vector element OOB.
+ bool matchInsertVectorElementOOB(MachineInstr &MI, BuildFnTy &MatchInfo);
+
private:
/// Checks for legality of an indexed variant of \p LdSt.
bool isIndexedLoadStoreLegal(GLoadStore &LdSt) const;
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h b/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h
index 5f28908e998a..deae2c55d26e 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h
@@ -247,8 +247,8 @@ private:
bool translateTrap(const CallInst &U, MachineIRBuilder &MIRBuilder,
unsigned Opcode);
- // Translate @llvm.experimental.vector.interleave2 and
- // @llvm.experimental.vector.deinterleave2 intrinsics for fixed-width vector
+ // Translate @llvm.vector.interleave2 and
+ // @llvm.vector.deinterleave2 intrinsics for fixed-width vector
// types into vector shuffles.
bool translateVectorInterleave2Intrinsic(const CallInst &CI,
MachineIRBuilder &MIRBuilder);
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/Utils.h b/llvm/include/llvm/CodeGen/GlobalISel/Utils.h
index c4174cee5e10..70421a518ab7 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/Utils.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/Utils.h
@@ -559,5 +559,31 @@ void salvageDebugInfo(const MachineRegisterInfo &MRI, MachineInstr &MI);
/// having only floating-point operands.
bool isPreISelGenericFloatingPointOpcode(unsigned Opc);
+/// Returns true if \p Reg can create undef or poison from non-undef &
+/// non-poison operands. \p ConsiderFlagsAndMetadata controls whether poison
+/// producing flags and metadata on the instruction are considered. This can be
+/// used to see if the instruction could still introduce undef or poison even
+/// without poison generating flags and metadata which might be on the
+/// instruction.
+bool canCreateUndefOrPoison(Register Reg, const MachineRegisterInfo &MRI,
+ bool ConsiderFlagsAndMetadata = true);
+
+/// Returns true if \p Reg can create poison from non-poison operands.
+bool canCreatePoison(Register Reg, const MachineRegisterInfo &MRI,
+ bool ConsiderFlagsAndMetadata = true);
+
+/// Returns true if \p Reg can be neither undef nor poison.
+bool isGuaranteedNotToBeUndefOrPoison(Register Reg,
+ const MachineRegisterInfo &MRI,
+ unsigned Depth = 0);
+
+/// Returns true if \p Reg cannot be poison, but may be undef.
+bool isGuaranteedNotToBePoison(Register Reg, const MachineRegisterInfo &MRI,
+ unsigned Depth = 0);
+
+/// Returns true if \p Reg cannot be undef, but may be poison.
+bool isGuaranteedNotToBeUndef(Register Reg, const MachineRegisterInfo &MRI,
+ unsigned Depth = 0);
+
} // End namespace llvm.
#endif
diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h
index 078a936b061a..6429947958ee 100644
--- a/llvm/include/llvm/CodeGen/ISDOpcodes.h
+++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h
@@ -205,6 +205,7 @@ enum NodeType {
/// CopyFromReg - This node indicates that the input value is a virtual or
/// physical register that is defined outside of the scope of this
/// SelectionDAG. The register is available from the RegisterSDNode object.
+ /// Note that CopyFromReg is considered as also freezing the value.
CopyFromReg,
/// UNDEF - An undefined node.
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 2dd978c7b584..661b2841c6ac 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3146,7 +3146,7 @@ public:
/// Lower a deinterleave intrinsic to a target specific load intrinsic.
/// Return true on success. Currently only supports
- /// llvm.experimental.vector.deinterleave2
+ /// llvm.vector.deinterleave2
///
/// \p DI is the deinterleave intrinsic.
/// \p LI is the accompanying load instruction
@@ -3157,7 +3157,7 @@ public:
/// Lower an interleave intrinsic to a target specific store intrinsic.
/// Return true on success. Currently only supports
- /// llvm.experimental.vector.interleave2
+ /// llvm.vector.interleave2
///
/// \p II is the interleave intrinsic.
/// \p SI is the accompanying store instruction
@@ -5238,6 +5238,9 @@ public:
/// Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const;
+ /// Expand fminimum/fmaximum into multiple comparison with selects.
+ SDValue expandFMINIMUM_FMAXIMUM(SDNode *N, SelectionDAG &DAG) const;
+
/// Expand FP_TO_[US]INT_SAT into FP_TO_[US]INT and selects or min/max.
/// \param N Node to expand
/// \returns The expansion result
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
index d22d2a8e948b..fe09bb8177c2 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
@@ -436,7 +436,6 @@ __OMP_RTL(__tgt_target_kernel, false, Int32, IdentPtr, Int64, Int32, Int32,
VoidPtr, KernelArgsPtr)
__OMP_RTL(__tgt_target_kernel_nowait, false, Int32, IdentPtr, Int64, Int32,
Int32, VoidPtr, KernelArgsPtr, Int32, VoidPtr, Int32, VoidPtr)
-__OMP_RTL(__tgt_register_requires, false, Void, Int64)
__OMP_RTL(__tgt_target_data_begin_mapper, false, Void, IdentPtr, Int64, Int32, VoidPtrPtr,
VoidPtrPtr, Int64Ptr, Int64Ptr, VoidPtrPtr, VoidPtrPtr)
__OMP_RTL(__tgt_target_data_begin_nowait_mapper, false, Void, IdentPtr, Int64, Int32,
@@ -1025,8 +1024,6 @@ __OMP_RTL_ATTRS(__tgt_target_kernel_nowait, ForkAttrs, SExt,
ParamAttrs(AttributeSet(), AttributeSet(), SExt, SExt,
AttributeSet(), AttributeSet(), SExt, AttributeSet(),
SExt))
-__OMP_RTL_ATTRS(__tgt_register_requires, ForkAttrs, AttributeSet(),
- ParamAttrs())
__OMP_RTL_ATTRS(__tgt_target_data_begin_mapper, ForkAttrs, AttributeSet(),
ParamAttrs(AttributeSet(), AttributeSet(), SExt))
__OMP_RTL_ATTRS(__tgt_target_data_begin_nowait_mapper, ForkAttrs, AttributeSet(),
diff --git a/llvm/include/llvm/IR/Function.h b/llvm/include/llvm/IR/Function.h
index 60f41b30e91c..cb514cde95b5 100644
--- a/llvm/include/llvm/IR/Function.h
+++ b/llvm/include/llvm/IR/Function.h
@@ -46,6 +46,7 @@ typedef unsigned ID;
class AssemblyAnnotationWriter;
class Constant;
+class ConstantRange;
struct DenormalMode;
class DISubprogram;
enum LibFunc : unsigned;
@@ -462,6 +463,9 @@ public:
/// attributes for the given arg.
void addDereferenceableOrNullParamAttr(unsigned ArgNo, uint64_t Bytes);
+ /// adds the range attribute to the list of attributes for the return value.
+ void addRangeRetAttr(const ConstantRange &CR);
+
MaybeAlign getParamAlign(unsigned ArgNo) const {
return AttributeSets.getParamAlignment(ArgNo);
}
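A hypothetical usage sketch for the new ``addRangeRetAttr`` helper, attaching a ``[0, 256)`` range to a function's return value; the module setup is illustrative only:

.. code-block:: cpp

   #include "llvm/IR/ConstantRange.h"
   #include "llvm/IR/Function.h"
   #include "llvm/IR/Module.h"

   using namespace llvm;

   int main() {
     LLVMContext Ctx;
     Module M("demo", Ctx);
     auto *FnTy = FunctionType::get(Type::getInt32Ty(Ctx), /*isVarArg=*/false);
     Function *F = Function::Create(FnTy, Function::ExternalLinkage, "f", M);
     // Declare that f() only returns values in [0, 256).
     F->addRangeRetAttr(ConstantRange(APInt(32, 0), APInt(32, 256)));
     return 0;
   }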
diff --git a/llvm/include/llvm/IR/GenericConvergenceVerifierImpl.h b/llvm/include/llvm/IR/GenericConvergenceVerifierImpl.h
index a3ebde709ae6..7525c9eb758b 100644
--- a/llvm/include/llvm/IR/GenericConvergenceVerifierImpl.h
+++ b/llvm/include/llvm/IR/GenericConvergenceVerifierImpl.h
@@ -76,7 +76,7 @@ void GenericConvergenceVerifier<ContextT>::visit(const InstructionT &I) {
"Entry intrinsic cannot be preceded by a convergent operation in the "
"same basic block.",
{Context.print(&I)});
- LLVM_FALLTHROUGH;
+ [[fallthrough]];
case CONV_ANCHOR:
Check(!TokenDef,
"Entry or anchor intrinsic cannot have a convergencectrl token "
diff --git a/llvm/include/llvm/IR/IntrinsicInst.h b/llvm/include/llvm/IR/IntrinsicInst.h
index 081e72cc82b2..2e99c9e2ee3e 100644
--- a/llvm/include/llvm/IR/IntrinsicInst.h
+++ b/llvm/include/llvm/IR/IntrinsicInst.h
@@ -311,7 +311,8 @@ public:
Value *getVariableLocationOp(unsigned OpIdx) const;
- void replaceVariableLocationOp(Value *OldValue, Value *NewValue);
+ void replaceVariableLocationOp(Value *OldValue, Value *NewValue,
+ bool AllowEmpty = false);
void replaceVariableLocationOp(unsigned OpIdx, Value *NewValue);
/// Adding a new location operand will always result in this intrinsic using
/// an ArgList, and must always be accompanied by a new expression that uses
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index a14e9dedef8c..a2678d69ce40 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -2577,15 +2577,15 @@ def int_preserve_static_offset : DefaultAttrsIntrinsic<[llvm_ptr_ty],
//===------------ Intrinsics to perform common vector shuffles ------------===//
-def int_experimental_vector_reverse : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
- [LLVMMatchType<0>],
- [IntrNoMem]>;
+def int_vector_reverse : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+ [LLVMMatchType<0>],
+ [IntrNoMem]>;
-def int_experimental_vector_splice : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
- [LLVMMatchType<0>,
- LLVMMatchType<0>,
- llvm_i32_ty],
- [IntrNoMem, ImmArg<ArgIndex<2>>]>;
+def int_vector_splice : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+ [LLVMMatchType<0>,
+ LLVMMatchType<0>,
+ llvm_i32_ty],
+ [IntrNoMem, ImmArg<ArgIndex<2>>]>;
//===---------- Intrinsics to query properties of scalable vectors --------===//
def int_vscale : DefaultAttrsIntrinsic<[llvm_anyint_ty], [], [IntrNoMem]>;
@@ -2600,15 +2600,15 @@ def int_vector_extract : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
[IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<1>>]>;
-def int_experimental_vector_interleave2 : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
- [LLVMHalfElementsVectorType<0>,
- LLVMHalfElementsVectorType<0>],
- [IntrNoMem]>;
+def int_vector_interleave2 : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+ [LLVMHalfElementsVectorType<0>,
+ LLVMHalfElementsVectorType<0>],
+ [IntrNoMem]>;
-def int_experimental_vector_deinterleave2 : DefaultAttrsIntrinsic<[LLVMHalfElementsVectorType<0>,
- LLVMHalfElementsVectorType<0>],
- [llvm_anyvector_ty],
- [IntrNoMem]>;
+def int_vector_deinterleave2 : DefaultAttrsIntrinsic<[LLVMHalfElementsVectorType<0>,
+ LLVMHalfElementsVectorType<0>],
+ [llvm_anyvector_ty],
+ [IntrNoMem]>;
//===----------------- Pointer Authentication Intrinsics ------------------===//
//
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index bcaa37de74b6..e31e00a9c76f 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -1762,6 +1762,7 @@ def int_aarch64_sve_uqsub_x : AdvSIMD_2VectorArg_Intrinsic;
def int_aarch64_sve_orqv : AdvSIMD_SVE_V128_Reduce_Intrinsic;
def int_aarch64_sve_eorqv : AdvSIMD_SVE_V128_Reduce_Intrinsic;
def int_aarch64_sve_andqv : AdvSIMD_SVE_V128_Reduce_Intrinsic;
+def int_aarch64_sve_addqv : AdvSIMD_SVE_V128_Reduce_Intrinsic;
def int_aarch64_sve_smaxqv : AdvSIMD_SVE_V128_Reduce_Intrinsic;
def int_aarch64_sve_umaxqv : AdvSIMD_SVE_V128_Reduce_Intrinsic;
def int_aarch64_sve_sminqv : AdvSIMD_SVE_V128_Reduce_Intrinsic;
@@ -2079,11 +2080,12 @@ def int_aarch64_sve_fmaxv : AdvSIMD_SVE_Reduce_Intrinsic;
def int_aarch64_sve_fmaxnmv : AdvSIMD_SVE_Reduce_Intrinsic;
def int_aarch64_sve_fminv : AdvSIMD_SVE_Reduce_Intrinsic;
def int_aarch64_sve_fminnmv : AdvSIMD_SVE_Reduce_Intrinsic;
-def int_aarch64_sve_addqv : AdvSIMD_SVE_V128_Reduce_Intrinsic;
+
+def int_aarch64_sve_faddqv : AdvSIMD_SVE_V128_Reduce_Intrinsic;
def int_aarch64_sve_fmaxnmqv : AdvSIMD_SVE_V128_Reduce_Intrinsic;
def int_aarch64_sve_fminnmqv : AdvSIMD_SVE_V128_Reduce_Intrinsic;
-def int_aarch64_sve_fmaxqv : AdvSIMD_SVE_V128_Reduce_Intrinsic;
-def int_aarch64_sve_fminqv : AdvSIMD_SVE_V128_Reduce_Intrinsic;
+def int_aarch64_sve_fmaxqv : AdvSIMD_SVE_V128_Reduce_Intrinsic;
+def int_aarch64_sve_fminqv : AdvSIMD_SVE_V128_Reduce_Intrinsic;
//
// Floating-point conversions
@@ -3646,4 +3648,4 @@ def int_aarch64_sve_pmov_to_pred_lane_zero : SVE2_1VectorArg_Pred_Intrinsic;
def int_aarch64_sve_pmov_to_vector_lane_merging : SVE2_Pred_1VectorArgIndexed_Intrinsic;
-def int_aarch64_sve_pmov_to_vector_lane_zeroing : SVE2_Pred_1VectorArg_Intrinsic;
\ No newline at end of file
+def int_aarch64_sve_pmov_to_vector_lane_zeroing : SVE2_Pred_1VectorArg_Intrinsic;
diff --git a/llvm/include/llvm/IR/PatternMatch.h b/llvm/include/llvm/IR/PatternMatch.h
index 0b13b4aad9c3..739208e74dcb 100644
--- a/llvm/include/llvm/IR/PatternMatch.h
+++ b/llvm/include/llvm/IR/PatternMatch.h
@@ -2513,7 +2513,7 @@ inline typename m_Intrinsic_Ty<Opnd0, Opnd1>::Ty m_CopySign(const Opnd0 &Op0,
template <typename Opnd0>
inline typename m_Intrinsic_Ty<Opnd0>::Ty m_VecReverse(const Opnd0 &Op0) {
- return m_Intrinsic<Intrinsic::experimental_vector_reverse>(Op0);
+ return m_Intrinsic<Intrinsic::vector_reverse>(Op0);
}
//===----------------------------------------------------------------------===//
diff --git a/llvm/include/llvm/Object/ELFObjectFile.h b/llvm/include/llvm/Object/ELFObjectFile.h
index 4494d9b96189..8cc09e7fd7d5 100644
--- a/llvm/include/llvm/Object/ELFObjectFile.h
+++ b/llvm/include/llvm/Object/ELFObjectFile.h
@@ -199,6 +199,14 @@ public:
}
};
+inline bool operator<(const ELFSymbolRef &A, const ELFSymbolRef &B) {
+ const DataRefImpl &DRIA = A.getRawDataRefImpl();
+ const DataRefImpl &DRIB = B.getRawDataRefImpl();
+ if (DRIA.d.a == DRIB.d.a)
+ return DRIA.d.b < DRIB.d.b;
+ return DRIA.d.a < DRIB.d.a;
+}
+
class elf_symbol_iterator : public symbol_iterator {
public:
elf_symbol_iterator(const basic_symbol_iterator &B)
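The new ``operator<`` gives ``ELFSymbolRef`` a strict weak ordering over its raw ``DataRefImpl`` words, so symbol refs can be sorted or used as set keys. A sketch, assuming an already-opened ELF object (``sortedSymbols`` is an illustrative helper, not part of the API):

.. code-block:: cpp

   #include "llvm/ADT/STLExtras.h"
   #include "llvm/ADT/SmallVector.h"
   #include "llvm/Object/ELFObjectFile.h"

   using namespace llvm;
   using namespace llvm::object;

   // Collect an object's symbols in a deterministic, sorted order.
   SmallVector<ELFSymbolRef> sortedSymbols(const ELFObjectFileBase &Obj) {
     SmallVector<ELFSymbolRef> Syms(Obj.symbols().begin(), Obj.symbols().end());
     llvm::sort(Syms); // uses the operator< defined above
     return Syms;
   }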
diff --git a/llvm/include/llvm/ProfileData/MemProf.h b/llvm/include/llvm/ProfileData/MemProf.h
index d378c3696f8d..4274f2a6849b 100644
--- a/llvm/include/llvm/ProfileData/MemProf.h
+++ b/llvm/include/llvm/ProfileData/MemProf.h
@@ -2,6 +2,7 @@
#define LLVM_PROFILEDATA_MEMPROF_H_
#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/STLForwardCompat.h"
#include "llvm/ADT/STLFunctionalExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/GlobalValue.h"
@@ -10,6 +11,7 @@
#include "llvm/Support/EndianStream.h"
#include "llvm/Support/raw_ostream.h"
+#include <bitset>
#include <cstdint>
#include <optional>
@@ -55,7 +57,10 @@ MemProfSchema getHotColdSchema();
// deserialize methods.
struct PortableMemInfoBlock {
PortableMemInfoBlock() = default;
- explicit PortableMemInfoBlock(const MemInfoBlock &Block) {
+ explicit PortableMemInfoBlock(const MemInfoBlock &Block,
+ const MemProfSchema &IncomingSchema) {
+ for (const Meta Id : IncomingSchema)
+ Schema.set(llvm::to_underlying(Id));
#define MIBEntryDef(NameTag, Name, Type) Name = Block.Name;
#include "llvm/ProfileData/MIBEntryDef.inc"
#undef MIBEntryDef
@@ -67,10 +72,12 @@ struct PortableMemInfoBlock {
// Read the contents of \p Ptr based on the \p Schema to populate the
// MemInfoBlock member.
- void deserialize(const MemProfSchema &Schema, const unsigned char *Ptr) {
+ void deserialize(const MemProfSchema &IncomingSchema,
+ const unsigned char *Ptr) {
using namespace support;
- for (const Meta Id : Schema) {
+ Schema.reset();
+ for (const Meta Id : IncomingSchema) {
switch (Id) {
#define MIBEntryDef(NameTag, Name, Type) \
case Meta::Name: { \
@@ -82,6 +89,8 @@ struct PortableMemInfoBlock {
llvm_unreachable("Unknown meta type id, is the profile collected from "
"a newer version of the runtime?");
}
+
+ Schema.set(llvm::to_underlying(Id));
}
}
@@ -114,17 +123,29 @@ struct PortableMemInfoBlock {
#undef MIBEntryDef
}
+  // Return the schema; exposed for unit tests only.
+ std::bitset<llvm::to_underlying(Meta::Size)> getSchema() const {
+ return Schema;
+ }
+
// Define getters for each type which can be called by analyses.
#define MIBEntryDef(NameTag, Name, Type) \
- Type get##Name() const { return Name; }
+ Type get##Name() const { \
+ assert(Schema[llvm::to_underlying(Meta::Name)]); \
+ return Name; \
+ }
#include "llvm/ProfileData/MIBEntryDef.inc"
#undef MIBEntryDef
void clear() { *this = PortableMemInfoBlock(); }
bool operator==(const PortableMemInfoBlock &Other) const {
+ if (Other.Schema != Schema)
+ return false;
+
#define MIBEntryDef(NameTag, Name, Type) \
- if (Other.get##Name() != get##Name()) \
+ if (Schema[llvm::to_underlying(Meta::Name)] && \
+ Other.get##Name() != get##Name()) \
return false;
#include "llvm/ProfileData/MIBEntryDef.inc"
#undef MIBEntryDef
@@ -155,6 +176,9 @@ struct PortableMemInfoBlock {
}
private:
+ // The set of available fields, indexed by Meta::Name.
+ std::bitset<llvm::to_underlying(Meta::Size)> Schema;
+
#define MIBEntryDef(NameTag, Name, Type) Type Name = Type();
#include "llvm/ProfileData/MIBEntryDef.inc"
#undef MIBEntryDef
@@ -296,8 +320,9 @@ struct IndexedAllocationInfo {
IndexedAllocationInfo() = default;
IndexedAllocationInfo(ArrayRef<FrameId> CS, CallStackId CSId,
- const MemInfoBlock &MB)
- : CallStack(CS.begin(), CS.end()), CSId(CSId), Info(MB) {}
+ const MemInfoBlock &MB,
+ const MemProfSchema &Schema = getFullSchema())
+ : CallStack(CS.begin(), CS.end()), CSId(CSId), Info(MB, Schema) {}
// Returns the size in bytes when this allocation info struct is serialized.
size_t serializedSize(const MemProfSchema &Schema,
@@ -737,6 +762,64 @@ public:
// Compute a CallStackId for a given call stack.
CallStackId hashCallStack(ArrayRef<FrameId> CS);
+namespace detail {
+// "Dereference" the iterator from DenseMap or OnDiskChainedHashTable. We have
+// to do so in one of two different ways depending on the type of the hash
+// table.
+template <typename value_type, typename IterTy>
+value_type DerefIterator(IterTy Iter) {
+ using deref_type = llvm::remove_cvref_t<decltype(*Iter)>;
+ if constexpr (std::is_same_v<deref_type, value_type>)
+ return *Iter;
+ else
+ return Iter->second;
+}
+} // namespace detail
+
+// A function object that returns a frame for a given FrameId.
+template <typename MapTy> struct FrameIdConverter {
+ std::optional<FrameId> LastUnmappedId;
+ MapTy &Map;
+
+ FrameIdConverter() = delete;
+ FrameIdConverter(MapTy &Map) : Map(Map) {}
+
+ Frame operator()(FrameId Id) {
+ auto Iter = Map.find(Id);
+ if (Iter == Map.end()) {
+ LastUnmappedId = Id;
+ return Frame(0, 0, 0, false);
+ }
+ return detail::DerefIterator<Frame>(Iter);
+ }
+};
+
+// A function object that returns a call stack for a given CallStackId.
+template <typename MapTy> struct CallStackIdConverter {
+ std::optional<CallStackId> LastUnmappedId;
+ MapTy &Map;
+ std::function<Frame(FrameId)> FrameIdToFrame;
+
+ CallStackIdConverter() = delete;
+ CallStackIdConverter(MapTy &Map, std::function<Frame(FrameId)> FrameIdToFrame)
+ : Map(Map), FrameIdToFrame(FrameIdToFrame) {}
+
+ llvm::SmallVector<Frame> operator()(CallStackId CSId) {
+ llvm::SmallVector<Frame> Frames;
+ auto CSIter = Map.find(CSId);
+ if (CSIter == Map.end()) {
+ LastUnmappedId = CSId;
+ } else {
+ llvm::SmallVector<FrameId> CS =
+ detail::DerefIterator<llvm::SmallVector<FrameId>>(CSIter);
+ Frames.reserve(CS.size());
+ for (FrameId Id : CS)
+ Frames.push_back(FrameIdToFrame(Id));
+ }
+ return Frames;
+ }
+};
+
// Verify that each CallStackId is computed with hashCallStack. This function
// is intended to help transition from CallStack to CSId in
// IndexedAllocationInfo.
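A self-contained approximation of these converters, with std::map standing in for the DenseMap and OnDiskChainedHashTable cases and Frame reduced to a stub (all names illustrative):

  #include <cstdint>
  #include <map>
  #include <optional>

  using FrameId = std::uint64_t;
  struct Frame { std::uint64_t Function = 0; };

  template <typename MapTy> struct SimpleFrameIdConverter {
    std::optional<FrameId> LastUnmappedId; // set on a lookup miss
    MapTy &Map;
    explicit SimpleFrameIdConverter(MapTy &M) : Map(M) {}

    Frame operator()(FrameId Id) {
      auto It = Map.find(Id);
      if (It == Map.end()) {
        LastUnmappedId = Id; // callers check this once, after the traversal
        return Frame{};
      }
      return It->second; // a map-style table dereferences to a pair
    }
  };

  int main() {
    std::map<FrameId, Frame> Table{{1, Frame{0x42}}};
    SimpleFrameIdConverter Conv(Table);
    Conv(1); // hit
    Conv(7); // miss: records LastUnmappedId = 7
    return Conv.LastUnmappedId ? 1 : 0;
  }

MemProfReader below uses the same shape: run the conversion eagerly, then surface a single hash_mismatch error if any id failed to resolve.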
diff --git a/llvm/include/llvm/ProfileData/MemProfReader.h b/llvm/include/llvm/ProfileData/MemProfReader.h
index 444c58e8bdc8..b42e4f597774 100644
--- a/llvm/include/llvm/ProfileData/MemProfReader.h
+++ b/llvm/include/llvm/ProfileData/MemProfReader.h
@@ -76,20 +76,16 @@ public:
Callback =
std::bind(&MemProfReader::idToFrame, this, std::placeholders::_1);
- auto CallStackCallback = [&](CallStackId CSId) {
- llvm::SmallVector<Frame> CallStack;
- auto Iter = CSIdToCallStack.find(CSId);
- assert(Iter != CSIdToCallStack.end());
- for (FrameId Id : Iter->second)
- CallStack.push_back(Callback(Id));
- return CallStack;
- };
+ memprof::CallStackIdConverter<decltype(CSIdToCallStack)> CSIdConv(
+ CSIdToCallStack, Callback);
const IndexedMemProfRecord &IndexedRecord = Iter->second;
GuidRecord = {
Iter->first,
- IndexedRecord.toMemProfRecord(CallStackCallback),
+ IndexedRecord.toMemProfRecord(CSIdConv),
};
+ if (CSIdConv.LastUnmappedId)
+ return make_error<InstrProfError>(instrprof_error::hash_mismatch);
Iter++;
return Error::success();
}
diff --git a/llvm/include/llvm/Support/RISCVISAUtils.h b/llvm/include/llvm/Support/RISCVISAUtils.h
index 94aedb75faa2..77f8c3e45f1a 100644
--- a/llvm/include/llvm/Support/RISCVISAUtils.h
+++ b/llvm/include/llvm/Support/RISCVISAUtils.h
@@ -14,6 +14,7 @@
#define LLVM_SUPPORT_RISCVISAUTILS_H
#include "llvm/ADT/StringRef.h"
+#include <map>
#include <string>
namespace llvm {
@@ -35,6 +36,12 @@ struct ExtensionComparator {
return compareExtension(LHS, RHS);
}
};
+
+/// OrderedExtensionMap is a std::map specialized to keep entries in the
+/// canonical order of extensions.
+typedef std::map<std::string, ExtensionVersion, ExtensionComparator>
+ OrderedExtensionMap;
+
} // namespace RISCVISAUtils
} // namespace llvm
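What the comparator-specialized map buys, namely iteration in canonical extension order rather than plain lexicographic order, can be demonstrated with a toy rule; the real compareExtension is more involved, so treat this as a sketch:

  #include <iostream>
  #include <map>
  #include <string>

  // Illustrative stand-in for RISCVISAUtils::ExtensionComparator.
  struct ToyComparator {
    bool operator()(const std::string &L, const std::string &R) const {
      if ((L.size() == 1) != (R.size() == 1))
        return L.size() == 1; // single-letter extensions sort first
      return L < R;
    }
  };

  using ToyOrderedMap = std::map<std::string, unsigned, ToyComparator>;

  int main() {
    ToyOrderedMap Exts{{"zba", 1}, {"m", 2}, {"c", 4}, {"zbb", 1}};
    for (const auto &[Name, Ver] : Exts)
      std::cout << Name << ' '; // prints: c m zba zbb
  }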
diff --git a/llvm/include/llvm/Support/YAMLTraits.h b/llvm/include/llvm/Support/YAMLTraits.h
index 3b1f4bad57fc..33aeb039320d 100644
--- a/llvm/include/llvm/Support/YAMLTraits.h
+++ b/llvm/include/llvm/Support/YAMLTraits.h
@@ -671,7 +671,11 @@ inline bool isBool(StringRef S) {
// (except for TAB #x9, LF #xA, and CR #xD which are allowed), DEL #x7F, the C1
// control block #x80-#x9F (except for NEL #x85 which is allowed), the surrogate
// block #xD800-#xDFFF, #xFFFE, and #xFFFF.
-inline QuotingType needsQuotes(StringRef S) {
+//
+// Some strings are valid YAML values even when unquoted, but without quotes
+// they are interpreted as a non-string type, e.g. null, boolean, or numeric
+// values. If ForcePreserveAsString is set, such strings are quoted.
+inline QuotingType needsQuotes(StringRef S, bool ForcePreserveAsString = true) {
if (S.empty())
return QuotingType::Single;
@@ -679,12 +683,14 @@ inline QuotingType needsQuotes(StringRef S) {
if (isSpace(static_cast<unsigned char>(S.front())) ||
isSpace(static_cast<unsigned char>(S.back())))
MaxQuotingNeeded = QuotingType::Single;
- if (isNull(S))
- MaxQuotingNeeded = QuotingType::Single;
- if (isBool(S))
- MaxQuotingNeeded = QuotingType::Single;
- if (isNumeric(S))
- MaxQuotingNeeded = QuotingType::Single;
+ if (ForcePreserveAsString) {
+ if (isNull(S))
+ MaxQuotingNeeded = QuotingType::Single;
+ if (isBool(S))
+ MaxQuotingNeeded = QuotingType::Single;
+ if (isNumeric(S))
+ MaxQuotingNeeded = QuotingType::Single;
+ }
// 7.3.3 Plain Style
// Plain scalars must not begin with most indicators, as this would cause
@@ -1636,6 +1642,7 @@ public:
private:
void output(StringRef s);
+ void output(StringRef, QuotingType);
void outputUpToEndOfLine(StringRef s);
void newLineCheck(bool EmptySequence = false);
void outputNewLine();
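To see why ForcePreserveAsString exists: some scalars round-trip as non-strings when emitted unquoted. A standalone sketch of the decision, with placeholder helpers rather than the real isNull/isBool/isNumeric:

  #include <iostream>
  #include <string>

  // Illustrative placeholders for the real classification helpers.
  bool isNullLike(const std::string &S) { return S == "null" || S == "~"; }
  bool isBoolLike(const std::string &S) { return S == "true" || S == "false"; }
  bool isNumberLike(const std::string &S) {
    return !S.empty() &&
           S.find_first_not_of("0123456789.-+eE") == std::string::npos;
  }

  // Sketch: decide whether a scalar must be quoted to stay a string
  // (the real needsQuotes returns a QuotingType, not a bool).
  bool needsQuotesSketch(const std::string &S, bool ForcePreserveAsString) {
    if (!ForcePreserveAsString)
      return false; // plain scalar is fine; the reader may retype it
    return isNullLike(S) || isBoolLike(S) || isNumberLike(S);
  }

  int main() {
    std::cout << needsQuotesSketch("null", true)   // 1: must be quoted
              << needsQuotesSketch("hello", true); // 0: already a string
  }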
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index 31b903e63d99..dbbb3abaa830 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -1525,11 +1525,39 @@ def combine_shuffle_concat : GICombineRule<
[{ return Helper.matchCombineShuffleConcat(*${root}, ${matchinfo}); }]),
(apply [{ Helper.applyCombineShuffleConcat(*${root}, ${matchinfo}); }])>;
-// match_extract_of_element must be the first!
+def insert_vector_element_idx_undef : GICombineRule<
+ (defs root:$root, build_fn_matchinfo:$matchinfo),
+ (match (G_IMPLICIT_DEF $idx),
+ (G_INSERT_VECTOR_ELT $root, $src, $elt, $idx)),
+ (apply (G_IMPLICIT_DEF $root))>;
+
+def insert_vector_element_elt_undef : GICombineRule<
+ (defs root:$root, build_fn_matchinfo:$matchinfo),
+ (match (G_IMPLICIT_DEF $elt),
+ (G_INSERT_VECTOR_ELT $root, $src, $elt, $idx),
+ [{ return isGuaranteedNotToBePoison(${src}.getReg(), MRI); }]),
+ (apply (GIReplaceReg $root, $src))>;
+
+def insert_vector_element_extract_vector_element : GICombineRule<
+ (defs root:$root, build_fn_matchinfo:$matchinfo),
+ (match (G_EXTRACT_VECTOR_ELT $elt, $src, $idx),
+ (G_INSERT_VECTOR_ELT $root, $src, $elt, $idx)),
+ (apply (GIReplaceReg $root, $src))>;
+
+def insert_vector_elt_oob : GICombineRule<
+ (defs root:$root, build_fn_matchinfo:$matchinfo),
+ (match (wip_match_opcode G_INSERT_VECTOR_ELT):$root,
+ [{ return Helper.matchInsertVectorElementOOB(*${root}, ${matchinfo}); }]),
+ (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>;
+
+// match_extract_of_element and insert_vector_elt_oob must come first!
def vector_ops_combines: GICombineGroup<[
match_extract_of_element_undef_vector,
match_extract_of_element_undef_index,
+insert_vector_element_idx_undef,
+insert_vector_element_elt_undef,
match_extract_of_element,
+insert_vector_elt_oob,
extract_vector_element_not_const,
extract_vector_element_different_indices,
extract_vector_element_build_vector2,
@@ -1553,7 +1581,8 @@ extract_vector_element_build_vector_trunc5,
extract_vector_element_build_vector_trunc6,
extract_vector_element_build_vector_trunc7,
extract_vector_element_build_vector_trunc8,
-extract_vector_element_freeze
+extract_vector_element_freeze,
+insert_vector_element_extract_vector_element
]>;
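The insert_vector_element_extract_vector_element rule rewrites insert(V, extract(V, i), i) into plain V; the scalar analogue of the identity:

  #include <array>
  #include <cassert>

  int main() {
    std::array<int, 4> V{1, 2, 3, 4};
    std::array<int, 4> W = V;
    unsigned I = 2;
    W[I] = V[I];    // insert(V, extract(V, I), I)
    assert(W == V); // ...is just V, so the combine replaces the insert
  }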
// FIXME: These should use the custom predicate feature once it lands.
diff --git a/llvm/include/llvm/TargetParser/AArch64TargetParser.h b/llvm/include/llvm/TargetParser/AArch64TargetParser.h
index 805b963a7a13..0d1cfd152151 100644
--- a/llvm/include/llvm/TargetParser/AArch64TargetParser.h
+++ b/llvm/include/llvm/TargetParser/AArch64TargetParser.h
@@ -268,7 +268,7 @@ inline constexpr ExtensionInfo Extensions[] = {
{"sha3", AArch64::AEK_SHA3, "+sha3", "-sha3", FEAT_SHA3, "+sha3,+sha2,+fp-armv8,+neon", 140},
{"simd", AArch64::AEK_SIMD, "+neon", "-neon", FEAT_SIMD, "+fp-armv8,+neon", 100},
{"sm4", AArch64::AEK_SM4, "+sm4", "-sm4", FEAT_SM4, "+sm4,+fp-armv8,+neon", 106},
- {"sme-f16f16", AArch64::AEK_SMEF16F16, "+sme-f16f16", "-sme-f16f16", FEAT_INIT, "", 0},
+ {"sme-f16f16", AArch64::AEK_SMEF16F16, "+sme-f16f16", "-sme-f16f16", FEAT_INIT, "+sme2,+sme-f16f16", 0},
{"sme-f64f64", AArch64::AEK_SMEF64F64, "+sme-f64f64", "-sme-f64f64", FEAT_SME_F64, "+sme,+sme-f64f64,+bf16", 560},
{"sme-i16i64", AArch64::AEK_SMEI16I64, "+sme-i16i64", "-sme-i16i64", FEAT_SME_I64, "+sme,+sme-i16i64,+bf16", 570},
{"sme", AArch64::AEK_SME, "+sme", "-sme", FEAT_SME, "+sme,+bf16", 430},
@@ -302,7 +302,7 @@ inline constexpr ExtensionInfo Extensions[] = {
{"ssve-fp8dot4", AArch64::AEK_SSVE_FP8DOT4, "+ssve-fp8dot4", "-ssve-fp8dot4", FEAT_INIT, "+sme2", 0},
{"lut", AArch64::AEK_LUT, "+lut", "-lut", FEAT_INIT, "", 0},
{"sme-lutv2", AArch64::AEK_SME_LUTv2, "+sme-lutv2", "-sme-lutv2", FEAT_INIT, "", 0},
- {"sme-f8f16", AArch64::AEK_SMEF8F16, "+sme-f8f16", "-sme-f8f16", FEAT_INIT, "+sme2,+fp8", 0},
+ {"sme-f8f16", AArch64::AEK_SMEF8F16, "+sme-f8f16", "-sme-f8f16", FEAT_INIT, "+fp8,+sme2", 0},
{"sme-f8f32", AArch64::AEK_SMEF8F32, "+sme-f8f32", "-sme-f8f32", FEAT_INIT, "+sme2,+fp8", 0},
{"sme-fa64", AArch64::AEK_SMEFA64, "+sme-fa64", "-sme-fa64", FEAT_INIT, "", 0},
{"cpa", AArch64::AEK_CPA, "+cpa", "-cpa", FEAT_INIT, "", 0},
@@ -677,6 +677,13 @@ inline constexpr CpuInfo CpuInfos[] = {
AArch64::AEK_FP16FML, AArch64::AEK_I8MM, AArch64::AEK_MTE,
AArch64::AEK_SB, AArch64::AEK_SSBS, AArch64::AEK_SVE,
AArch64::AEK_SVE2, AArch64::AEK_SVE2BITPERM})},
+ {"neoverse-n3", ARMV9_2A,
+ AArch64::ExtensionBitset({AArch64::AEK_MTE, AArch64::AEK_SSBS,
+ AArch64::AEK_SB, AArch64::AEK_PREDRES,
+ AArch64::AEK_FP16FML, AArch64::AEK_PAUTH,
+ AArch64::AEK_FLAGM, AArch64::AEK_PERFMON,
+ AArch64::AEK_RAND, AArch64::AEK_SVE2BITPERM,
+ AArch64::AEK_PROFILE, AArch64::AEK_PERFMON})},
{"neoverse-512tvb", ARMV8_4A,
AArch64::ExtensionBitset(
{AArch64::AEK_AES, AArch64::AEK_SHA2, AArch64::AEK_SHA3,
@@ -697,6 +704,20 @@ inline constexpr CpuInfo CpuInfos[] = {
AArch64::AEK_FP16, AArch64::AEK_BF16, AArch64::AEK_RAND,
AArch64::AEK_DOTPROD, AArch64::AEK_PROFILE, AArch64::AEK_SVE2BITPERM,
AArch64::AEK_FP16FML, AArch64::AEK_I8MM, AArch64::AEK_MTE})},
+ {"neoverse-v3", ARMV9_2A,
+ AArch64::ExtensionBitset(
+ {AArch64::AEK_PROFILE, AArch64::AEK_MTE, AArch64::AEK_SSBS,
+ AArch64::AEK_SB, AArch64::AEK_PREDRES, AArch64::AEK_LS64,
+ AArch64::AEK_BRBE, AArch64::AEK_PAUTH, AArch64::AEK_FLAGM,
+ AArch64::AEK_PERFMON, AArch64::AEK_RAND, AArch64::AEK_SVE2BITPERM,
+ AArch64::AEK_FP16FML})},
+ {"neoverse-v3ae", ARMV9_2A,
+ (AArch64::ExtensionBitset(
+ {AArch64::AEK_PROFILE, AArch64::AEK_MTE, AArch64::AEK_SSBS,
+ AArch64::AEK_SB, AArch64::AEK_PREDRES, AArch64::AEK_LS64,
+ AArch64::AEK_BRBE, AArch64::AEK_PAUTH, AArch64::AEK_FLAGM,
+ AArch64::AEK_PERFMON, AArch64::AEK_RAND, AArch64::AEK_SVE2BITPERM,
+ AArch64::AEK_FP16FML}))},
{"cyclone", ARMV8A,
AArch64::ExtensionBitset(
{AArch64::AEK_AES, AArch64::AEK_SHA2, AArch64::AEK_NONE})},
diff --git a/llvm/include/llvm/TargetParser/RISCVISAInfo.h b/llvm/include/llvm/TargetParser/RISCVISAInfo.h
index 83c4f1e620fc..0d5637155daa 100644
--- a/llvm/include/llvm/TargetParser/RISCVISAInfo.h
+++ b/llvm/include/llvm/TargetParser/RISCVISAInfo.h
@@ -26,13 +26,7 @@ public:
RISCVISAInfo(const RISCVISAInfo &) = delete;
RISCVISAInfo &operator=(const RISCVISAInfo &) = delete;
- /// OrderedExtensionMap is std::map, it's specialized to keep entries
- /// in canonical order of extension.
- typedef std::map<std::string, RISCVISAUtils::ExtensionVersion,
- RISCVISAUtils::ExtensionComparator>
- OrderedExtensionMap;
-
- RISCVISAInfo(unsigned XLen, OrderedExtensionMap &Exts)
+ RISCVISAInfo(unsigned XLen, RISCVISAUtils::OrderedExtensionMap &Exts)
: XLen(XLen), FLen(0), MinVLen(0), MaxELen(0), MaxELenFp(0), Exts(Exts) {}
/// Parse RISC-V ISA info from arch string.
@@ -59,7 +53,9 @@ public:
std::vector<std::string> toFeatures(bool AddAllExtensions = false,
bool IgnoreUnknown = true) const;
- const OrderedExtensionMap &getExtensions() const { return Exts; }
+ const RISCVISAUtils::OrderedExtensionMap &getExtensions() const {
+ return Exts;
+ }
unsigned getXLen() const { return XLen; }
unsigned getFLen() const { return FLen; }
@@ -90,7 +86,7 @@ private:
unsigned MinVLen;
unsigned MaxELen, MaxELenFp;
- OrderedExtensionMap Exts;
+ RISCVISAUtils::OrderedExtensionMap Exts;
void addExtension(StringRef ExtName, RISCVISAUtils::ExtensionVersion Version);
diff --git a/llvm/include/llvm/Transforms/InstCombine/InstCombiner.h b/llvm/include/llvm/Transforms/InstCombine/InstCombiner.h
index ea1f4fc3b85d..855d1aeddfae 100644
--- a/llvm/include/llvm/Transforms/InstCombine/InstCombiner.h
+++ b/llvm/include/llvm/Transforms/InstCombine/InstCombiner.h
@@ -461,9 +461,10 @@ public:
OverflowResult computeOverflowForUnsignedMul(const Value *LHS,
const Value *RHS,
- const Instruction *CxtI) const {
- return llvm::computeOverflowForUnsignedMul(LHS, RHS,
- SQ.getWithInstruction(CxtI));
+ const Instruction *CxtI,
+ bool IsNSW = false) const {
+ return llvm::computeOverflowForUnsignedMul(
+ LHS, RHS, SQ.getWithInstruction(CxtI), IsNSW);
}
OverflowResult computeOverflowForSignedMul(const Value *LHS, const Value *RHS,
diff --git a/llvm/include/llvm/Transforms/Scalar/GVN.h b/llvm/include/llvm/Transforms/Scalar/GVN.h
index 4ba9b74ccb00..debe2ee79917 100644
--- a/llvm/include/llvm/Transforms/Scalar/GVN.h
+++ b/llvm/include/llvm/Transforms/Scalar/GVN.h
@@ -232,13 +232,67 @@ private:
/// A mapping from value numbers to lists of Value*'s that
/// have that value number. Use findLeader to query it.
- struct LeaderTableEntry {
- Value *Val;
- const BasicBlock *BB;
- LeaderTableEntry *Next;
+ class LeaderMap {
+ public:
+ struct LeaderTableEntry {
+ Value *Val;
+ const BasicBlock *BB;
+ };
+
+ private:
+ struct LeaderListNode {
+ LeaderTableEntry Entry;
+ LeaderListNode *Next;
+ };
+ DenseMap<uint32_t, LeaderListNode> NumToLeaders;
+ BumpPtrAllocator TableAllocator;
+
+ public:
+ class leader_iterator {
+ const LeaderListNode *Current;
+
+ public:
+ using iterator_category = std::forward_iterator_tag;
+ using value_type = const LeaderTableEntry;
+ using difference_type = std::ptrdiff_t;
+ using pointer = value_type *;
+ using reference = value_type &;
+
+ leader_iterator(const LeaderListNode *C) : Current(C) {}
+ leader_iterator &operator++() {
+ assert(Current && "Dereferenced end of leader list!");
+ Current = Current->Next;
+ return *this;
+ }
+ bool operator==(const leader_iterator &Other) const {
+ return Current == Other.Current;
+ }
+ bool operator!=(const leader_iterator &Other) const {
+ return Current != Other.Current;
+ }
+ reference operator*() const { return Current->Entry; }
+ };
+
+ iterator_range<leader_iterator> getLeaders(uint32_t N) {
+ auto I = NumToLeaders.find(N);
+ if (I == NumToLeaders.end()) {
+ return iterator_range(leader_iterator(nullptr),
+ leader_iterator(nullptr));
+ }
+
+ return iterator_range(leader_iterator(&I->second),
+ leader_iterator(nullptr));
+ }
+
+ void insert(uint32_t N, Value *V, const BasicBlock *BB);
+ void erase(uint32_t N, Instruction *I, const BasicBlock *BB);
+ void verifyRemoved(const Value *Inst) const;
+ void clear() {
+ NumToLeaders.clear();
+ TableAllocator.Reset();
+ }
};
- DenseMap<uint32_t, LeaderTableEntry> LeaderTable;
- BumpPtrAllocator TableAllocator;
+ LeaderMap LeaderTable;
// Block-local map of equivalent values to their leader, does not
// propagate to any successors. Entries added mid-block are applied
@@ -264,51 +318,6 @@ private:
MemoryDependenceResults *RunMD, LoopInfo &LI,
OptimizationRemarkEmitter *ORE, MemorySSA *MSSA = nullptr);
- /// Push a new Value to the LeaderTable onto the list for its value number.
- void addToLeaderTable(uint32_t N, Value *V, const BasicBlock *BB) {
- LeaderTableEntry &Curr = LeaderTable[N];
- if (!Curr.Val) {
- Curr.Val = V;
- Curr.BB = BB;
- return;
- }
-
- LeaderTableEntry *Node = TableAllocator.Allocate<LeaderTableEntry>();
- Node->Val = V;
- Node->BB = BB;
- Node->Next = Curr.Next;
- Curr.Next = Node;
- }
-
- /// Scan the list of values corresponding to a given
- /// value number, and remove the given instruction if encountered.
- void removeFromLeaderTable(uint32_t N, Instruction *I, BasicBlock *BB) {
- LeaderTableEntry *Prev = nullptr;
- LeaderTableEntry *Curr = &LeaderTable[N];
-
- while (Curr && (Curr->Val != I || Curr->BB != BB)) {
- Prev = Curr;
- Curr = Curr->Next;
- }
-
- if (!Curr)
- return;
-
- if (Prev) {
- Prev->Next = Curr->Next;
- } else {
- if (!Curr->Next) {
- Curr->Val = nullptr;
- Curr->BB = nullptr;
- } else {
- LeaderTableEntry *Next = Curr->Next;
- Curr->Val = Next->Val;
- Curr->BB = Next->BB;
- Curr->Next = Next->Next;
- }
- }
- }
-
// List of critical edges to be split between iterations.
SmallVector<std::pair<Instruction *, unsigned>, 4> toSplit;
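The replacement keeps a map from value number to the head of an allocator-backed singly linked list and walks it with a small forward iterator; a standalone sketch of that shape, with std::unordered_map standing in for DenseMap and names illustrative:

  #include <cstdint>
  #include <iostream>
  #include <unordered_map>

  struct Node {
    int Value;           // stand-in for LeaderTableEntry
    Node *Next = nullptr;
  };

  struct LeaderIter {
    const Node *Cur;
    int operator*() const { return Cur->Value; }
    LeaderIter &operator++() { Cur = Cur->Next; return *this; }
    bool operator!=(const LeaderIter &O) const { return Cur != O.Cur; }
  };

  int main() {
    std::unordered_map<std::uint32_t, Node> NumToLeaders;
    Node Second{20};
    NumToLeaders[7] = Node{10, &Second};
    // getLeaders(7): iterate from the head node to the null sentinel.
    for (LeaderIter It{&NumToLeaders[7]}, End{nullptr}; It != End; ++It)
      std::cout << *It << ' '; // prints: 10 20
  }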
diff --git a/llvm/include/llvm/Transforms/Scalar/JumpThreading.h b/llvm/include/llvm/Transforms/Scalar/JumpThreading.h
index 3364d7eaee42..f7358ac9b1ee 100644
--- a/llvm/include/llvm/Transforms/Scalar/JumpThreading.h
+++ b/llvm/include/llvm/Transforms/Scalar/JumpThreading.h
@@ -22,6 +22,7 @@
#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/IR/ValueHandle.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
#include <optional>
#include <utility>
@@ -114,11 +115,10 @@ public:
bool processBlock(BasicBlock *BB);
bool maybeMergeBasicBlockIntoOnlyPred(BasicBlock *BB);
void updateSSA(BasicBlock *BB, BasicBlock *NewBB,
- DenseMap<Instruction *, Value *> &ValueMapping);
- DenseMap<Instruction *, Value *> cloneInstructions(BasicBlock::iterator BI,
- BasicBlock::iterator BE,
- BasicBlock *NewBB,
- BasicBlock *PredBB);
+ ValueToValueMapTy &ValueMapping);
+ void cloneInstructions(ValueToValueMapTy &ValueMapping,
+ BasicBlock::iterator BI, BasicBlock::iterator BE,
+ BasicBlock *NewBB, BasicBlock *PredBB);
bool tryThreadEdge(BasicBlock *BB,
const SmallVectorImpl<BasicBlock *> &PredBBs,
BasicBlock *SuccBB);
diff --git a/llvm/include/llvm/Transforms/Utils/GlobalStatus.h b/llvm/include/llvm/Transforms/Utils/GlobalStatus.h
index 60c91fc30174..c001e587313c 100644
--- a/llvm/include/llvm/Transforms/Utils/GlobalStatus.h
+++ b/llvm/include/llvm/Transforms/Utils/GlobalStatus.h
@@ -24,9 +24,9 @@ class Value;
///
bool isSafeToDestroyConstant(const Constant *C);
-/// As we analyze each global, keep track of some information about it. If we
-/// find out that the address of the global is taken, none of this info will be
-/// accurate.
+/// As we analyze each global or thread-local variable, keep track of some
+/// information about it. If we find out that the address of the global is
+/// taken, none of this info will be accurate.
struct GlobalStatus {
/// True if the global's address is used in a comparison.
bool IsCompared = false;
diff --git a/llvm/include/llvm/Transforms/Utils/Local.h b/llvm/include/llvm/Transforms/Utils/Local.h
index e2143b5bfbe2..6937ec8dfd21 100644
--- a/llvm/include/llvm/Transforms/Utils/Local.h
+++ b/llvm/include/llvm/Transforms/Utils/Local.h
@@ -18,6 +18,7 @@
#include "llvm/IR/Dominators.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Utils/SimplifyCFGOptions.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
#include <cstdint>
namespace llvm {
@@ -490,6 +491,10 @@ void hoistAllInstructionsInto(BasicBlock *DomBlock, Instruction *InsertPt,
DIExpression *getExpressionForConstant(DIBuilder &DIB, const Constant &C,
Type &Ty);
+/// Remap the operands of the debug records attached to \p Inst, and the
+/// operands of \p Inst itself if it's a debug intrinsic.
+void remapDebugVariable(ValueToValueMapTy &Mapping, Instruction *Inst);
+
//===----------------------------------------------------------------------===//
// Intrinsic pattern matching
//
diff --git a/llvm/include/llvm/Transforms/Utils/MemoryTaggingSupport.h b/llvm/include/llvm/Transforms/Utils/MemoryTaggingSupport.h
index fb3ab33a0629..16589a605e60 100644
--- a/llvm/include/llvm/Transforms/Utils/MemoryTaggingSupport.h
+++ b/llvm/include/llvm/Transforms/Utils/MemoryTaggingSupport.h
@@ -86,6 +86,8 @@ Value *getFP(IRBuilder<> &IRB);
Value *getPC(const Triple &TargetTriple, IRBuilder<> &IRB);
Value *getAndroidSlotPtr(IRBuilder<> &IRB, int Slot);
+void annotateDebugRecords(AllocaInfo &Info, unsigned int Tag);
+
} // namespace memtag
} // namespace llvm
diff --git a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
index 326006fbb880..4f99d171469e 100644
--- a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
+++ b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
@@ -153,10 +153,15 @@ private:
/// a vectorization chain.
bool vectorizeChainsInBlock(BasicBlock *BB, slpvectorizer::BoUpSLP &R);
- bool vectorizeStoreChain(ArrayRef<Value *> Chain, slpvectorizer::BoUpSLP &R,
- unsigned Idx, unsigned MinVF);
-
- bool vectorizeStores(ArrayRef<StoreInst *> Stores, slpvectorizer::BoUpSLP &R);
+ std::optional<bool> vectorizeStoreChain(ArrayRef<Value *> Chain,
+ slpvectorizer::BoUpSLP &R,
+ unsigned Idx, unsigned MinVF,
+ unsigned &Size);
+
+ bool vectorizeStores(
+ ArrayRef<StoreInst *> Stores, slpvectorizer::BoUpSLP &R,
+ DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
+ &Visited);
/// The store instructions in a basic block organized by base pointer.
StoreListMap Stores;
diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp
index c06984c0d494..4061dae83c10 100644
--- a/llvm/lib/Analysis/InstructionSimplify.cpp
+++ b/llvm/lib/Analysis/InstructionSimplify.cpp
@@ -6281,11 +6281,11 @@ static Value *simplifyUnaryIntrinsic(Function *F, Value *Op0,
m_Intrinsic<Intrinsic::pow>(m_SpecificFP(10.0), m_Value(X)))))
return X;
break;
- case Intrinsic::experimental_vector_reverse:
- // experimental.vector.reverse(experimental.vector.reverse(x)) -> x
+ case Intrinsic::vector_reverse:
+ // vector.reverse(vector.reverse(x)) -> x
if (match(Op0, m_VecReverse(m_Value(X))))
return X;
- // experimental.vector.reverse(splat(X)) -> splat(X)
+ // vector.reverse(splat(X)) -> splat(X)
if (isSplatValue(Op0))
return Op0;
break;
diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index de38eddaa98f..1b461e7cfd01 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -6686,9 +6686,15 @@ llvm::computeConstantRangeIncludingKnownBits(const WithCache<const Value *> &V,
OverflowResult llvm::computeOverflowForUnsignedMul(const Value *LHS,
const Value *RHS,
- const SimplifyQuery &SQ) {
+ const SimplifyQuery &SQ,
+ bool IsNSW) {
KnownBits LHSKnown = computeKnownBits(LHS, /*Depth=*/0, SQ);
KnownBits RHSKnown = computeKnownBits(RHS, /*Depth=*/0, SQ);
+
+ // mul nsw of two non-negative numbers is also nuw.
+ if (IsNSW && LHSKnown.isNonNegative() && RHSKnown.isNonNegative())
+ return OverflowResult::NeverOverflows;
+
ConstantRange LHSRange = ConstantRange::fromKnownBits(LHSKnown, false);
ConstantRange RHSRange = ConstantRange::fromKnownBits(RHSKnown, false);
return mapOverflowResult(LHSRange.unsignedMulMayOverflow(RHSRange));
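The early-out rests on a small range argument: for non-negative operands the signed and unsigned products coincide, so nsw implies nuw. A property check at 8 bits (a sketch, not LLVM code):

  #include <cassert>

  int main() {
    // Exhaustive at 8 bits: whenever a multiply of two non-negative values
    // does not wrap in the signed sense (nsw), it does not wrap in the
    // unsigned sense (nuw) either.
    for (int A = 0; A <= 127; ++A)
      for (int B = 0; B <= 127; ++B) {
        int Wide = A * B;
        bool NoSignedWrap = Wide >= -128 && Wide <= 127;
        bool NoUnsignedWrap = Wide >= 0 && Wide <= 255;
        if (NoSignedWrap)
          assert(NoUnsignedWrap);
      }
    return 0;
  }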
diff --git a/llvm/lib/BinaryFormat/Dwarf.cpp b/llvm/lib/BinaryFormat/Dwarf.cpp
index e4e5b5dd8c0e..732426617268 100644
--- a/llvm/lib/BinaryFormat/Dwarf.cpp
+++ b/llvm/lib/BinaryFormat/Dwarf.cpp
@@ -411,6 +411,16 @@ llvm::dwarf::LanguageLowerBound(dwarf::SourceLanguage Lang) {
}
}
+StringRef llvm::dwarf::LanguageDescription(dwarf::SourceLanguageName lname) {
+ switch (lname) {
+#define HANDLE_DW_LNAME(ID, NAME, DESC, LOWER_BOUND) \
+ case DW_LNAME_##NAME: \
+ return DESC;
+#include "llvm/BinaryFormat/Dwarf.def"
+ }
+ return "Unknown";
+}
+
StringRef llvm::dwarf::CaseString(unsigned Case) {
switch (Case) {
case DW_ID_case_sensitive:
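LanguageDescription follows the usual .def X-macro idiom; here is a compilable miniature with the list inlined (COLOR_LIST and HANDLE_COLOR are made-up stand-ins for Dwarf.def and HANDLE_DW_LNAME):

  #include <iostream>

  // In LLVM the list lives in a .def file that is #included repeatedly;
  // here it is inlined.
  #define COLOR_LIST(X) \
    X(1, RED, "red")    \
    X(2, BLUE, "blue")

  enum Color {
  #define HANDLE_COLOR(ID, NAME, DESC) COLOR_##NAME = ID,
  COLOR_LIST(HANDLE_COLOR)
  #undef HANDLE_COLOR
  };

  const char *colorDescription(Color C) {
    switch (C) {
  #define HANDLE_COLOR(ID, NAME, DESC) case COLOR_##NAME: return DESC;
    COLOR_LIST(HANDLE_COLOR)
  #undef HANDLE_COLOR
    }
    return "Unknown";
  }

  int main() { std::cout << colorDescription(COLOR_RED); } // prints: red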
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 8eaf78157550..339a1f1f2f00 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -8270,6 +8270,7 @@ static bool optimizeBranch(BranchInst *Branch, const TargetLowering &TLI,
IRBuilder<> Builder(Branch);
if (UI->getParent() != Branch->getParent())
UI->moveBefore(Branch);
+ UI->dropPoisonGeneratingFlags();
Value *NewCmp = Builder.CreateCmp(ICmpInst::ICMP_EQ, UI,
ConstantInt::get(UI->getType(), 0));
LLVM_DEBUG(dbgs() << "Converting " << *Cmp << "\n");
@@ -8283,6 +8284,7 @@ static bool optimizeBranch(BranchInst *Branch, const TargetLowering &TLI,
IRBuilder<> Builder(Branch);
if (UI->getParent() != Branch->getParent())
UI->moveBefore(Branch);
+ UI->dropPoisonGeneratingFlags();
Value *NewCmp = Builder.CreateCmp(Cmp->getPredicate(), UI,
ConstantInt::get(UI->getType(), 0));
LLVM_DEBUG(dbgs() << "Converting " << *Cmp << "\n");
diff --git a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
index 031a271de5bd..8573b016d1e5 100644
--- a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
+++ b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
@@ -1639,8 +1639,7 @@ bool ComplexDeinterleavingGraph::checkNodes() {
ComplexDeinterleavingGraph::NodePtr
ComplexDeinterleavingGraph::identifyRoot(Instruction *RootI) {
if (auto *Intrinsic = dyn_cast<IntrinsicInst>(RootI)) {
- if (Intrinsic->getIntrinsicID() !=
- Intrinsic::experimental_vector_interleave2)
+ if (Intrinsic->getIntrinsicID() != Intrinsic::vector_interleave2)
return nullptr;
auto *Real = dyn_cast<Instruction>(Intrinsic->getOperand(0));
@@ -1675,7 +1674,7 @@ ComplexDeinterleavingGraph::identifyDeinterleave(Instruction *Real,
Value *FinalValue = nullptr;
if (match(Real, m_ExtractValue<0>(m_Instruction(I))) &&
match(Imag, m_ExtractValue<1>(m_Specific(I))) &&
- match(I, m_Intrinsic<Intrinsic::experimental_vector_deinterleave2>(
+ match(I, m_Intrinsic<Intrinsic::vector_deinterleave2>(
m_Value(FinalValue)))) {
NodePtr PlaceholderNode = prepareCompositeNode(
llvm::ComplexDeinterleavingOperation::Deinterleave, Real, Imag);
@@ -1960,13 +1959,11 @@ Value *ComplexDeinterleavingGraph::replaceNode(IRBuilderBase &Builder,
// Splats that are not constant are interleaved where they are located
Instruction *InsertPoint = (I->comesBefore(R) ? R : I)->getNextNode();
IRBuilder<> IRB(InsertPoint);
- ReplacementNode =
- IRB.CreateIntrinsic(Intrinsic::experimental_vector_interleave2, NewTy,
- {Node->Real, Node->Imag});
+ ReplacementNode = IRB.CreateIntrinsic(Intrinsic::vector_interleave2,
+ NewTy, {Node->Real, Node->Imag});
} else {
- ReplacementNode =
- Builder.CreateIntrinsic(Intrinsic::experimental_vector_interleave2,
- NewTy, {Node->Real, Node->Imag});
+ ReplacementNode = Builder.CreateIntrinsic(
+ Intrinsic::vector_interleave2, NewTy, {Node->Real, Node->Imag});
}
break;
}
@@ -1991,9 +1988,8 @@ Value *ComplexDeinterleavingGraph::replaceNode(IRBuilderBase &Builder,
auto *B = replaceNode(Builder, Node->Operands[1]);
auto *NewMaskTy = VectorType::getDoubleElementsVectorType(
cast<VectorType>(MaskReal->getType()));
- auto *NewMask =
- Builder.CreateIntrinsic(Intrinsic::experimental_vector_interleave2,
- NewMaskTy, {MaskReal, MaskImag});
+ auto *NewMask = Builder.CreateIntrinsic(Intrinsic::vector_interleave2,
+ NewMaskTy, {MaskReal, MaskImag});
ReplacementNode = Builder.CreateSelect(NewMask, A, B);
break;
}
@@ -2021,8 +2017,8 @@ void ComplexDeinterleavingGraph::processReductionOperation(
Value *InitImag = OldPHIImag->getIncomingValueForBlock(Incoming);
IRBuilder<> Builder(Incoming->getTerminator());
- auto *NewInit = Builder.CreateIntrinsic(
- Intrinsic::experimental_vector_interleave2, NewVTy, {InitReal, InitImag});
+ auto *NewInit = Builder.CreateIntrinsic(Intrinsic::vector_interleave2, NewVTy,
+ {InitReal, InitImag});
NewPHI->addIncoming(NewInit, Incoming);
NewPHI->addIncoming(OperationReplacement, BackEdge);
@@ -2034,9 +2030,9 @@ void ComplexDeinterleavingGraph::processReductionOperation(
Builder.SetInsertPoint(
&*FinalReductionReal->getParent()->getFirstInsertionPt());
- auto *Deinterleave = Builder.CreateIntrinsic(
- Intrinsic::experimental_vector_deinterleave2,
- OperationReplacement->getType(), OperationReplacement);
+ auto *Deinterleave = Builder.CreateIntrinsic(Intrinsic::vector_deinterleave2,
+ OperationReplacement->getType(),
+ OperationReplacement);
auto *NewReal = Builder.CreateExtractValue(Deinterleave, (uint64_t)0);
FinalReductionReal->replaceUsesOfWith(Real, NewReal);
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelperVectorOps.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelperVectorOps.cpp
index 123bf21f657c..fb33801a3a33 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelperVectorOps.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelperVectorOps.cpp
@@ -77,7 +77,7 @@ bool CombinerHelper::matchExtractVectorElement(MachineInstr &MI,
// Fold extractVectorElement(Vector, TOOLARGE) -> undef
if (IndexC && VectorTy.isFixedVector() &&
- IndexC->getZExtValue() >= VectorTy.getNumElements() &&
+ IndexC->uge(VectorTy.getNumElements()) &&
isLegalOrBeforeLegalizer({TargetOpcode::G_IMPLICIT_DEF, {DstTy}})) {
// For fixed-length vectors, it's invalid to extract out-of-range elements.
MatchInfo = [=](MachineIRBuilder &B) { B.buildUndef(Dst); };
@@ -324,3 +324,26 @@ bool CombinerHelper::matchExtractVectorElementWithBuildVectorTrunc(
return true;
}
+
+bool CombinerHelper::matchInsertVectorElementOOB(MachineInstr &MI,
+ BuildFnTy &MatchInfo) {
+ GInsertVectorElement *Insert = cast<GInsertVectorElement>(&MI);
+
+ Register Dst = Insert->getReg(0);
+ LLT DstTy = MRI.getType(Dst);
+ Register Index = Insert->getIndexReg();
+
+ if (!DstTy.isFixedVector())
+ return false;
+
+ std::optional<ValueAndVReg> MaybeIndex =
+ getIConstantVRegValWithLookThrough(Index, MRI);
+
+ if (MaybeIndex && MaybeIndex->Value.uge(DstTy.getNumElements()) &&
+ isLegalOrBeforeLegalizer({TargetOpcode::G_IMPLICIT_DEF, {DstTy}})) {
+ MatchInfo = [=](MachineIRBuilder &B) { B.buildUndef(Dst); };
+ return true;
+ }
+
+ return false;
+}
diff --git a/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp b/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp
index 51ab7b6262c6..529e50c8ebe0 100644
--- a/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp
@@ -694,6 +694,20 @@ unsigned GISelKnownBits::computeNumSignBits(Register R,
const MachineMemOperand *MMO = *MI.memoperands_begin();
return TyBits - MMO->getSizeInBits().getValue();
}
+ case TargetOpcode::G_AND:
+ case TargetOpcode::G_OR:
+ case TargetOpcode::G_XOR: {
+ Register Src1 = MI.getOperand(1).getReg();
+ unsigned Src1NumSignBits =
+ computeNumSignBits(Src1, DemandedElts, Depth + 1);
+ if (Src1NumSignBits != 1) {
+ Register Src2 = MI.getOperand(2).getReg();
+ unsigned Src2NumSignBits =
+ computeNumSignBits(Src2, DemandedElts, Depth + 1);
+ FirstAnswer = std::min(Src1NumSignBits, Src2NumSignBits);
+ }
+ break;
+ }
case TargetOpcode::G_TRUNC: {
Register Src = MI.getOperand(1).getReg();
LLT SrcTy = MRI.getType(Src);
diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
index 8cf392ab0567..e26c6ca3d616 100644
--- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -1804,7 +1804,7 @@ bool IRTranslator::translateTrap(const CallInst &CI,
bool IRTranslator::translateVectorInterleave2Intrinsic(
const CallInst &CI, MachineIRBuilder &MIRBuilder) {
- assert(CI.getIntrinsicID() == Intrinsic::experimental_vector_interleave2 &&
+ assert(CI.getIntrinsicID() == Intrinsic::vector_interleave2 &&
"This function can only be called on the interleave2 intrinsic!");
// Canonicalize interleave2 to G_SHUFFLE_VECTOR (similar to SelectionDAG).
Register Op0 = getOrCreateVReg(*CI.getOperand(0));
@@ -1820,7 +1820,7 @@ bool IRTranslator::translateVectorInterleave2Intrinsic(
bool IRTranslator::translateVectorDeinterleave2Intrinsic(
const CallInst &CI, MachineIRBuilder &MIRBuilder) {
- assert(CI.getIntrinsicID() == Intrinsic::experimental_vector_deinterleave2 &&
+ assert(CI.getIntrinsicID() == Intrinsic::vector_deinterleave2 &&
"This function can only be called on the deinterleave2 intrinsic!");
// Canonicalize deinterleave2 to shuffles that extract sub-vectors (similar to
// SelectionDAG).
@@ -2223,7 +2223,7 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
// addresses. We can treat it like a normal dbg_value intrinsic here; to
// benefit from the full analysis of stack/SSA locations, GlobalISel would
// need to register for and use the AssignmentTrackingAnalysis pass.
- LLVM_FALLTHROUGH;
+ [[fallthrough]];
case Intrinsic::dbg_value: {
// This form of DBG_VALUE is target-independent.
const DbgValueInst &DI = cast<DbgValueInst>(CI);
@@ -2572,15 +2572,15 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
return true;
}
- case Intrinsic::experimental_vector_interleave2:
- case Intrinsic::experimental_vector_deinterleave2: {
+ case Intrinsic::vector_interleave2:
+ case Intrinsic::vector_deinterleave2: {
// Both intrinsics have at least one operand.
Value *Op0 = CI.getOperand(0);
LLT ResTy = getLLTForType(*Op0->getType(), MIRBuilder.getDataLayout());
if (!ResTy.isFixedVector())
return false;
- if (CI.getIntrinsicID() == Intrinsic::experimental_vector_interleave2)
+ if (CI.getIntrinsicID() == Intrinsic::vector_interleave2)
return translateVectorInterleave2Intrinsic(CI, MIRBuilder);
return translateVectorDeinterleave2Intrinsic(CI, MIRBuilder);
diff --git a/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/llvm/lib/CodeGen/GlobalISel/Utils.cpp
index ae43e9ccf611..4e3781cb4e9d 100644
--- a/llvm/lib/CodeGen/GlobalISel/Utils.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/Utils.cpp
@@ -12,6 +12,7 @@
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
+#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/CodeGenCommonISel.h"
#include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
@@ -28,6 +29,7 @@
#include "llvm/CodeGen/StackProtector.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/IR/Constants.h"
@@ -1709,3 +1711,84 @@ bool llvm::isPreISelGenericFloatingPointOpcode(unsigned Opc) {
return false;
}
}
+
+namespace {
+enum class UndefPoisonKind {
+ PoisonOnly = (1 << 0),
+ UndefOnly = (1 << 1),
+ UndefOrPoison = PoisonOnly | UndefOnly,
+};
+}
+
+[[maybe_unused]] static bool includesPoison(UndefPoisonKind Kind) {
+ return (unsigned(Kind) & unsigned(UndefPoisonKind::PoisonOnly)) != 0;
+}
+
+[[maybe_unused]] static bool includesUndef(UndefPoisonKind Kind) {
+ return (unsigned(Kind) & unsigned(UndefPoisonKind::UndefOnly)) != 0;
+}
+
+static bool canCreateUndefOrPoison(Register Reg, const MachineRegisterInfo &MRI,
+ bool ConsiderFlagsAndMetadata,
+ UndefPoisonKind Kind) {
+ MachineInstr *RegDef = MRI.getVRegDef(Reg);
+
+ switch (RegDef->getOpcode()) {
+ case TargetOpcode::G_FREEZE:
+ return false;
+ default:
+ return true;
+ }
+}
+
+static bool isGuaranteedNotToBeUndefOrPoison(Register Reg,
+ const MachineRegisterInfo &MRI,
+ unsigned Depth,
+ UndefPoisonKind Kind) {
+ if (Depth >= MaxAnalysisRecursionDepth)
+ return false;
+
+ MachineInstr *RegDef = MRI.getVRegDef(Reg);
+
+ switch (RegDef->getOpcode()) {
+ case TargetOpcode::G_FREEZE:
+ return true;
+ case TargetOpcode::G_IMPLICIT_DEF:
+ return !includesUndef(Kind);
+ default:
+ return false;
+ }
+}
+
+bool llvm::canCreateUndefOrPoison(Register Reg, const MachineRegisterInfo &MRI,
+ bool ConsiderFlagsAndMetadata) {
+ return ::canCreateUndefOrPoison(Reg, MRI, ConsiderFlagsAndMetadata,
+ UndefPoisonKind::UndefOrPoison);
+}
+
+bool canCreatePoison(Register Reg, const MachineRegisterInfo &MRI,
+ bool ConsiderFlagsAndMetadata = true) {
+ return ::canCreateUndefOrPoison(Reg, MRI, ConsiderFlagsAndMetadata,
+ UndefPoisonKind::PoisonOnly);
+}
+
+bool llvm::isGuaranteedNotToBeUndefOrPoison(Register Reg,
+ const MachineRegisterInfo &MRI,
+ unsigned Depth) {
+ return ::isGuaranteedNotToBeUndefOrPoison(Reg, MRI, Depth,
+ UndefPoisonKind::UndefOrPoison);
+}
+
+bool llvm::isGuaranteedNotToBePoison(Register Reg,
+ const MachineRegisterInfo &MRI,
+ unsigned Depth) {
+ return ::isGuaranteedNotToBeUndefOrPoison(Reg, MRI, Depth,
+ UndefPoisonKind::PoisonOnly);
+}
+
+bool llvm::isGuaranteedNotToBeUndef(Register Reg,
+ const MachineRegisterInfo &MRI,
+ unsigned Depth) {
+ return ::isGuaranteedNotToBeUndefOrPoison(Reg, MRI, Depth,
+ UndefPoisonKind::UndefOnly);
+}
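UndefPoisonKind is the common scoped-enum-as-flag-set idiom; a compilable miniature (Kind and includesPoison here are illustrative copies, not the LLVM symbols):

  enum class Kind : unsigned {
    PoisonOnly = 1 << 0,
    UndefOnly = 1 << 1,
    // Before the closing brace, enumerators still have the underlying
    // type, so the bitwise-or of the two flags is well-formed here.
    UndefOrPoison = PoisonOnly | UndefOnly,
  };

  // Scoped enums have no implicit integer conversions, hence the casts.
  constexpr bool includesPoison(Kind K) {
    return (static_cast<unsigned>(K) &
            static_cast<unsigned>(Kind::PoisonOnly)) != 0;
  }

  static_assert(includesPoison(Kind::UndefOrPoison), "");
  static_assert(!includesPoison(Kind::UndefOnly), "");

  int main() {}

Keeping the enum scoped means a bare Kind value cannot silently decay to an integer, which is why the helpers above do the masking explicitly.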
diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index 8989eabbe6df..8c9065aec7fa 100644
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -535,9 +535,9 @@ bool InterleavedAccessImpl::runOnFunction(Function &F) {
if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
// At present, we only have intrinsics to represent (de)interleaving
// with a factor of 2.
- if (II->getIntrinsicID() == Intrinsic::experimental_vector_deinterleave2)
+ if (II->getIntrinsicID() == Intrinsic::vector_deinterleave2)
Changed |= lowerDeinterleaveIntrinsic(II, DeadInsts);
- if (II->getIntrinsicID() == Intrinsic::experimental_vector_interleave2)
+ if (II->getIntrinsicID() == Intrinsic::vector_interleave2)
Changed |= lowerInterleaveIntrinsic(II, DeadInsts);
}
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index aa746f1c7b7b..4b81185c6e31 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -530,6 +530,7 @@ namespace {
bool refineExtractVectorEltIntoMultipleNarrowExtractVectorElts(SDNode *N);
SDValue visitSTORE(SDNode *N);
+ SDValue visitATOMIC_STORE(SDNode *N);
SDValue visitLIFETIME_END(SDNode *N);
SDValue visitINSERT_VECTOR_ELT(SDNode *N);
SDValue visitEXTRACT_VECTOR_ELT(SDNode *N);
@@ -1909,6 +1910,7 @@ SDValue DAGCombiner::visit(SDNode *N) {
case ISD::BR_CC: return visitBR_CC(N);
case ISD::LOAD: return visitLOAD(N);
case ISD::STORE: return visitSTORE(N);
+ case ISD::ATOMIC_STORE: return visitATOMIC_STORE(N);
case ISD::INSERT_VECTOR_ELT: return visitINSERT_VECTOR_ELT(N);
case ISD::EXTRACT_VECTOR_ELT: return visitEXTRACT_VECTOR_ELT(N);
case ISD::BUILD_VECTOR: return visitBUILD_VECTOR(N);
@@ -7620,6 +7622,7 @@ SDValue DAGCombiner::visitORLike(SDValue N0, SDValue N1, const SDLoc &DL) {
static SDValue visitORCommutative(SelectionDAG &DAG, SDValue N0, SDValue N1,
SDNode *N) {
EVT VT = N0.getValueType();
+ unsigned BW = VT.getScalarSizeInBits();
SDLoc DL(N);
auto peekThroughResize = [](SDValue V) {
@@ -7689,6 +7692,26 @@ static SDValue visitORCommutative(SelectionDAG &DAG, SDValue N0, SDValue N1,
peekThroughZext(N0.getOperand(2)) == peekThroughZext(N1.getOperand(1)))
return N0;
+ // Attempt to match a legalized build_pair-esque pattern:
+ // or(shl(aext(Hi),BW/2),zext(Lo))
+ SDValue Lo, Hi;
+ if (sd_match(N0,
+ m_OneUse(m_Shl(m_AnyExt(m_Value(Hi)), m_SpecificInt(BW / 2)))) &&
+ sd_match(N1, m_ZExt(m_Value(Lo))) &&
+ Lo.getScalarValueSizeInBits() == (BW / 2) &&
+ Lo.getValueType() == Hi.getValueType()) {
+ // Fold build_pair(not(Lo),not(Hi)) -> not(build_pair(Lo,Hi)).
+ SDValue NotLo, NotHi;
+ if (sd_match(Lo, m_OneUse(m_Not(m_Value(NotLo)))) &&
+ sd_match(Hi, m_OneUse(m_Not(m_Value(NotHi))))) {
+ Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, NotLo);
+ Hi = DAG.getNode(ISD::ANY_EXTEND, DL, VT, NotHi);
+ Hi = DAG.getNode(ISD::SHL, DL, VT, Hi,
+ DAG.getShiftAmountConstant(BW / 2, VT, DL));
+ return DAG.getNOT(DL, DAG.getNode(ISD::OR, DL, VT, Lo, Hi), VT);
+ }
+ }
+
return SDValue();
}
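The nested not-fold leans on a simple bit identity: pairing the complements equals complementing the pair. A quick hand-rolled check at BW = 32 with 16-bit halves:

  #include <cassert>

  int main() {
    for (unsigned Hi : {0x0000u, 0xBEEFu, 0xFFFFu})
      for (unsigned Lo : {0x0000u, 0x1234u, 0xFFFFu}) {
        // build_pair(not(Lo), not(Hi))...
        unsigned PairOfNots = ((~Hi & 0xFFFFu) << 16) | (~Lo & 0xFFFFu);
        // ...equals not(build_pair(Lo, Hi)).
        unsigned NotOfPair = ~(((Hi & 0xFFFFu) << 16) | (Lo & 0xFFFFu));
        assert(PairOfNots == NotOfPair);
      }
  }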
@@ -15436,6 +15459,12 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) {
if (DAG.isGuaranteedNotToBeUndefOrPoison(N0, /*PoisonOnly*/ false))
return N0;
+ // We currently avoid folding freeze over SRA/SRL, due to the problems seen
+ // with (freeze (assert ext)) blocking simplifications of SRA/SRL. See for
+ // example https://reviews.llvm.org/D136529#4120959.
+ if (N0.getOpcode() == ISD::SRA || N0.getOpcode() == ISD::SRL)
+ return SDValue();
+
// Fold freeze(op(x, ...)) -> op(freeze(x), ...).
// Try to push freeze through instructions that propagate but don't produce
// poison as far as possible. If an operand of freeze follows three
@@ -15452,6 +15481,26 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) {
N0.getOpcode() == ISD::BUILD_PAIR ||
N0.getOpcode() == ISD::CONCAT_VECTORS;
+ // Avoid turning a BUILD_VECTOR that can be recognized as "all zeros", "all
+ // ones" or "constant" into something that depends on FrozenUndef. We can
+ // instead pick undef values to keep those properties, while at the same time
+ // folding away the freeze.
+ // If we implement a more general solution for folding away freeze(undef) in
+ // the future, then this special handling can be removed.
+ if (N0.getOpcode() == ISD::BUILD_VECTOR) {
+ SDLoc DL(N0);
+ EVT VT = N0.getValueType();
+ if (llvm::ISD::isBuildVectorAllOnes(N0.getNode()))
+ return DAG.getAllOnesConstant(DL, VT);
+ if (llvm::ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
+ SmallVector<SDValue, 8> NewVecC;
+ for (const SDValue &Op : N0->op_values())
+ NewVecC.push_back(
+ Op.isUndef() ? DAG.getConstant(0, DL, Op.getValueType()) : Op);
+ return DAG.getBuildVector(VT, DL, NewVecC);
+ }
+ }
+
SmallSetVector<SDValue, 8> MaybePoisonOperands;
for (SDValue Op : N0->ops()) {
if (DAG.isGuaranteedNotToBeUndefOrPoison(Op, /*PoisonOnly*/ false,
@@ -21096,6 +21145,24 @@ SDValue DAGCombiner::replaceStoreOfInsertLoad(StoreSDNode *ST) {
ST->getMemOperand()->getFlags());
}
+SDValue DAGCombiner::visitATOMIC_STORE(SDNode *N) {
+ AtomicSDNode *ST = cast<AtomicSDNode>(N);
+ SDValue Val = ST->getVal();
+ EVT VT = Val.getValueType();
+ EVT MemVT = ST->getMemoryVT();
+
+ if (MemVT.bitsLT(VT)) { // Is truncating store
+ APInt TruncDemandedBits = APInt::getLowBitsSet(VT.getScalarSizeInBits(),
+ MemVT.getScalarSizeInBits());
+ // See if we can simplify the operation with SimplifyDemandedBits, which
+ // only works if the value has a single use.
+ if (SimplifyDemandedBits(Val, TruncDemandedBits))
+ return SDValue(N, 0);
+ }
+
+ return SDValue();
+}
+
SDValue DAGCombiner::visitSTORE(SDNode *N) {
StoreSDNode *ST = cast<StoreSDNode>(N);
SDValue Chain = ST->getChain();
diff --git a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
index aefedd060f89..ef9f78335519 100644
--- a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
@@ -1424,7 +1424,7 @@ bool FastISel::selectIntrinsicCall(const IntrinsicInst *II) {
// happened (such as an optimised function being always-inlined into an
// optnone function). We will not be using the extra information in the
// dbg.assign in that case, just use its dbg.value fields.
- LLVM_FALLTHROUGH;
+ [[fallthrough]];
case Intrinsic::dbg_value: {
// This form of DBG_VALUE is target-independent.
const DbgValueInst *DI = cast<DbgValueInst>(II);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 24f69ea1b742..46e54b5366d6 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -3195,7 +3195,7 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
break;
}
- LLVM_FALLTHROUGH;
+ [[fallthrough]];
}
case ISD::BITCAST:
if ((Tmp1 = EmitStackConvert(Node->getOperand(0), Node->getValueType(0),
@@ -3556,6 +3556,12 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
Results.push_back(Expanded);
break;
}
+ case ISD::FMINIMUM:
+ case ISD::FMAXIMUM: {
+ if (SDValue Expanded = TLI.expandFMINIMUM_FMAXIMUM(Node, DAG))
+ Results.push_back(Expanded);
+ break;
+ }
case ISD::FSIN:
case ISD::FCOS: {
EVT VT = Node->getValueType(0);
@@ -5575,6 +5581,21 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) {
Results.push_back(NewAtomic.getValue(1));
break;
}
+ case ISD::ATOMIC_LOAD: {
+ AtomicSDNode *AM = cast<AtomicSDNode>(Node);
+ SDLoc SL(Node);
+ assert(NVT.getSizeInBits() == OVT.getSizeInBits() &&
+ "unexpected promotion type");
+ assert(AM->getMemoryVT().getSizeInBits() == NVT.getSizeInBits() &&
+ "unexpected atomic_load with illegal type");
+
+ SDValue NewAtomic =
+ DAG.getAtomic(ISD::ATOMIC_LOAD, SL, NVT, DAG.getVTList(NVT, MVT::Other),
+ {AM->getChain(), AM->getBasePtr()}, AM->getMemOperand());
+ Results.push_back(DAG.getNode(ISD::BITCAST, SL, OVT, NewAtomic));
+ Results.push_back(NewAtomic.getValue(1));
+ break;
+ }
case ISD::SPLAT_VECTOR: {
SDValue Scalar = Node->getOperand(0);
MVT ScalarType = Scalar.getSimpleValueType();
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
index 7685bc73cf96..abe5be763825 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -2449,6 +2449,9 @@ void DAGTypeLegalizer::PromoteFloatResult(SDNode *N, unsigned ResNo) {
R = PromoteFloatRes_STRICT_FP_ROUND(N);
break;
case ISD::LOAD: R = PromoteFloatRes_LOAD(N); break;
+ case ISD::ATOMIC_LOAD:
+ R = PromoteFloatRes_ATOMIC_LOAD(N);
+ break;
case ISD::SELECT: R = PromoteFloatRes_SELECT(N); break;
case ISD::SELECT_CC: R = PromoteFloatRes_SELECT_CC(N); break;
@@ -2695,6 +2698,25 @@ SDValue DAGTypeLegalizer::PromoteFloatRes_LOAD(SDNode *N) {
return DAG.getNode(GetPromotionOpcode(VT, NVT), SDLoc(N), NVT, newL);
}
+SDValue DAGTypeLegalizer::PromoteFloatRes_ATOMIC_LOAD(SDNode *N) {
+ AtomicSDNode *AM = cast<AtomicSDNode>(N);
+ EVT VT = AM->getValueType(0);
+
+ // Load the value as an integer value with the same number of bits.
+ EVT IVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
+ SDValue newL = DAG.getAtomic(
+ ISD::ATOMIC_LOAD, SDLoc(N), IVT, DAG.getVTList(IVT, MVT::Other),
+ {AM->getChain(), AM->getBasePtr()}, AM->getMemOperand());
+
+ // Legalize the chain result by replacing uses of the old value chain with
+ // the new one.
+ ReplaceValueWith(SDValue(N, 1), newL.getValue(1));
+
+ // Convert the integer value to the desired FP type
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+ return DAG.getNode(GetPromotionOpcode(VT, IVT), SDLoc(N), NVT, newL);
+}
+
// Construct a new SELECT node with the promoted true- and false- values.
SDValue DAGTypeLegalizer::PromoteFloatRes_SELECT(SDNode *N) {
SDValue TrueVal = GetPromotedFloat(N->getOperand(1));
@@ -2855,6 +2877,9 @@ void DAGTypeLegalizer::SoftPromoteHalfResult(SDNode *N, unsigned ResNo) {
case ISD::FFREXP: R = SoftPromoteHalfRes_FFREXP(N); break;
case ISD::LOAD: R = SoftPromoteHalfRes_LOAD(N); break;
+ case ISD::ATOMIC_LOAD:
+ R = SoftPromoteHalfRes_ATOMIC_LOAD(N);
+ break;
case ISD::SELECT: R = SoftPromoteHalfRes_SELECT(N); break;
case ISD::SELECT_CC: R = SoftPromoteHalfRes_SELECT_CC(N); break;
case ISD::SINT_TO_FP:
@@ -3039,6 +3064,20 @@ SDValue DAGTypeLegalizer::SoftPromoteHalfRes_LOAD(SDNode *N) {
return NewL;
}
+SDValue DAGTypeLegalizer::SoftPromoteHalfRes_ATOMIC_LOAD(SDNode *N) {
+ AtomicSDNode *AM = cast<AtomicSDNode>(N);
+
+ // Load the value as an integer value with the same number of bits.
+ SDValue NewL = DAG.getAtomic(
+ ISD::ATOMIC_LOAD, SDLoc(N), MVT::i16, DAG.getVTList(MVT::i16, MVT::Other),
+ {AM->getChain(), AM->getBasePtr()}, AM->getMemOperand());
+
+ // Legalize the chain result by replacing uses of the old value chain with
+ // the new one.
+ ReplaceValueWith(SDValue(N, 1), NewL.getValue(1));
+ return NewL;
+}
+
SDValue DAGTypeLegalizer::SoftPromoteHalfRes_SELECT(SDNode *N) {
SDValue Op1 = GetSoftPromotedHalf(N->getOperand(1));
SDValue Op2 = GetSoftPromotedHalf(N->getOperand(2));
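Loading the value as a same-width integer and bitcasting is also how atomic floats are emulated in portable code; a C++20 analogue of the IVT-load-then-convert step:

  #include <atomic>
  #include <bit>
  #include <cstdint>
  #include <iostream>

  int main() {
    // Store the float's bits in an atomic integer of the same width...
    std::atomic<std::uint32_t> Cell{std::bit_cast<std::uint32_t>(3.5f)};
    // ...load atomically as an integer, then convert back to the FP type,
    // just as PromoteFloatRes_ATOMIC_LOAD loads IVT and bitcasts to NVT.
    float F = std::bit_cast<float>(Cell.load(std::memory_order_acquire));
    std::cout << F; // prints: 3.5
  }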
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index 9c855e558553..4a2c7b355eb5 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -691,6 +691,7 @@ private:
SDValue PromoteFloatRes_FP_ROUND(SDNode *N);
SDValue PromoteFloatRes_STRICT_FP_ROUND(SDNode *N);
SDValue PromoteFloatRes_LOAD(SDNode *N);
+ SDValue PromoteFloatRes_ATOMIC_LOAD(SDNode *N);
SDValue PromoteFloatRes_SELECT(SDNode *N);
SDValue PromoteFloatRes_SELECT_CC(SDNode *N);
SDValue PromoteFloatRes_UnaryOp(SDNode *N);
@@ -734,6 +735,7 @@ private:
SDValue SoftPromoteHalfRes_FFREXP(SDNode *N);
SDValue SoftPromoteHalfRes_FP_ROUND(SDNode *N);
SDValue SoftPromoteHalfRes_LOAD(SDNode *N);
+ SDValue SoftPromoteHalfRes_ATOMIC_LOAD(SDNode *N);
SDValue SoftPromoteHalfRes_SELECT(SDNode *N);
SDValue SoftPromoteHalfRes_SELECT_CC(SDNode *N);
SDValue SoftPromoteHalfRes_UnaryOp(SDNode *N);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index 1de43a4f60e3..8f87ee8e0939 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -1049,6 +1049,13 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl<SDValue> &Results) {
return;
}
break;
+ case ISD::FMINIMUM:
+ case ISD::FMAXIMUM:
+ if (SDValue Expanded = TLI.expandFMINIMUM_FMAXIMUM(Node, DAG)) {
+ Results.push_back(Expanded);
+ return;
+ }
+ break;
case ISD::SMIN:
case ISD::SMAX:
case ISD::UMIN:
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 224c0c5ee970..dfbfaa8c894f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -5063,6 +5063,7 @@ bool SelectionDAG::isGuaranteedNotToBeUndefOrPoison(SDValue Op,
case ISD::VALUETYPE:
case ISD::FrameIndex:
case ISD::TargetFrameIndex:
+ case ISD::CopyFromReg:
return true;
case ISD::UNDEF:
@@ -5136,6 +5137,16 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts,
case ISD::FREEZE:
case ISD::CONCAT_VECTORS:
case ISD::INSERT_SUBVECTOR:
+ case ISD::SADDSAT:
+ case ISD::UADDSAT:
+ case ISD::SSUBSAT:
+ case ISD::USUBSAT:
+ case ISD::MULHU:
+ case ISD::MULHS:
+ case ISD::SMIN:
+ case ISD::SMAX:
+ case ISD::UMIN:
+ case ISD::UMAX:
case ISD::AND:
case ISD::XOR:
case ISD::ROTL:
@@ -5156,6 +5167,7 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts,
case ISD::BUILD_PAIR:
return false;
+ case ISD::SELECT_CC:
case ISD::SETCC: {
// Integer setcc cannot create undef or poison.
if (Op.getOperand(0).getValueType().isInteger())
@@ -5165,7 +5177,8 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts,
// based on options and flags. The options and flags also cause special
// nonan condition codes to be used. Those condition codes may be preserved
// even if the nonan flag is dropped somewhere.
- ISD::CondCode CCCode = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+ unsigned CCOp = Opcode == ISD::SETCC ? 2 : 4;
+ ISD::CondCode CCCode = cast<CondCodeSDNode>(Op.getOperand(CCOp))->get();
if (((unsigned)CCCode & 0x10U))
return true;
@@ -5182,6 +5195,8 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts,
return false;
case ISD::SHL:
+ case ISD::SRL:
+ case ISD::SRA:
// If the max shift amount isn't in range, then the shift can create poison.
return !getValidMaximumShiftAmountConstant(Op, DemandedElts);
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 0db484a5e06b..5caf868c83a2 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -7930,19 +7930,19 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
DAG.getNode(ISD::EXTRACT_SUBVECTOR, sdl, ResultVT, Vec, Index));
return;
}
- case Intrinsic::experimental_vector_reverse:
+ case Intrinsic::vector_reverse:
visitVectorReverse(I);
return;
- case Intrinsic::experimental_vector_splice:
+ case Intrinsic::vector_splice:
visitVectorSplice(I);
return;
case Intrinsic::callbr_landingpad:
visitCallBrLandingPad(I);
return;
- case Intrinsic::experimental_vector_interleave2:
+ case Intrinsic::vector_interleave2:
visitVectorInterleave(I);
return;
- case Intrinsic::experimental_vector_deinterleave2:
+ case Intrinsic::vector_deinterleave2:
visitVectorDeinterleave(I);
return;
case Intrinsic::experimental_convergence_anchor:
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index c938b3996be3..cdc1227fd572 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -8381,6 +8381,64 @@ SDValue TargetLowering::expandFMINNUM_FMAXNUM(SDNode *Node,
return SDValue();
}
+SDValue TargetLowering::expandFMINIMUM_FMAXIMUM(SDNode *N,
+ SelectionDAG &DAG) const {
+ SDLoc DL(N);
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ unsigned Opc = N->getOpcode();
+ EVT VT = N->getValueType(0);
+ EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
+ bool IsMax = Opc == ISD::FMAXIMUM;
+
+ if (VT.isVector() &&
+ isOperationLegalOrCustomOrPromote(Opc, VT.getScalarType()))
+ return SDValue();
+
+ // First, implement a comparison that does not propagate NaN. If no native
+ // fmin or fmax is available, use a plain select with setcc instead.
+ SDValue MinMax;
+ unsigned CompOpcIeee = IsMax ? ISD::FMAXNUM_IEEE : ISD::FMINNUM_IEEE;
+ unsigned CompOpc = IsMax ? ISD::FMAXNUM : ISD::FMINNUM;
+ if (isOperationLegalOrCustom(CompOpcIeee, VT)) {
+ MinMax = DAG.getNode(CompOpcIeee, DL, VT, LHS, RHS);
+ } else if (isOperationLegalOrCustom(CompOpc, VT)) {
+ MinMax = DAG.getNode(CompOpc, DL, VT, LHS, RHS);
+ } else {
+ // NaN (if present) will be propagated later, so ordered vs. unordered
+ // doesn't matter here.
+ SDValue Compare =
+ DAG.getSetCC(DL, CCVT, LHS, RHS, IsMax ? ISD::SETGT : ISD::SETLT);
+ MinMax = DAG.getSelect(DL, VT, Compare, LHS, RHS);
+ }
+
+ // Propagate a NaN from either operand.
+ if (!N->getFlags().hasNoNaNs() &&
+ (!DAG.isKnownNeverNaN(RHS) || !DAG.isKnownNeverNaN(LHS))) {
+ ConstantFP *FPNaN = ConstantFP::get(
+ *DAG.getContext(), APFloat::getNaN(DAG.EVTToAPFloatSemantics(VT)));
+ MinMax = DAG.getSelect(DL, VT, DAG.getSetCC(DL, CCVT, LHS, RHS, ISD::SETUO),
+ DAG.getConstantFP(*FPNaN, DL, VT), MinMax);
+ }
+
+ // fminimum/fmaximum requires -0.0 to compare less than +0.0.
+ if (!N->getFlags().hasNoSignedZeros() && !DAG.isKnownNeverZeroFloat(RHS) &&
+ !DAG.isKnownNeverZeroFloat(LHS)) {
+ SDValue IsZero = DAG.getSetCC(DL, CCVT, MinMax,
+ DAG.getConstantFP(0.0, DL, VT), ISD::SETEQ);
+ SDValue TestZero =
+ DAG.getTargetConstant(IsMax ? fcPosZero : fcNegZero, DL, MVT::i32);
+ SDValue LCmp = DAG.getSelect(
+ DL, VT, DAG.getNode(ISD::IS_FPCLASS, DL, CCVT, LHS, TestZero), LHS,
+ MinMax);
+ SDValue RCmp = DAG.getSelect(
+ DL, VT, DAG.getNode(ISD::IS_FPCLASS, DL, CCVT, RHS, TestZero), RHS,
+ LCmp);
+ MinMax = DAG.getSelect(DL, VT, IsZero, RCmp, MinMax);
+ }
+
+ return MinMax;
+}
+
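Semantically, the expansion builds a NaN-propagating min/max in which -0.0 orders below +0.0. A scalar model of the fminimum case, covering the three steps in order (a sketch of the semantics, not the DAG code):

  #include <cassert>
  #include <cmath>
  #include <limits>

  double fminimumModel(double A, double B) {
    double MinMax = A < B ? A : B;      // step 1: plain setcc + select
    if (std::isnan(A) || std::isnan(B)) // step 2: propagate NaN from either side
      return std::numeric_limits<double>::quiet_NaN();
    if (MinMax == 0.0) {                // step 3: -0.0 must beat +0.0
      if (std::signbit(A)) return A;
      if (std::signbit(B)) return B;
    }
    return MinMax;
  }

  int main() {
    assert(std::signbit(fminimumModel(+0.0, -0.0))); // -0.0 wins
    assert(std::isnan(fminimumModel(std::nan(""), 1.0)));
  }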
/// Returns a true value if this FPClassTest can be performed with an ordered
/// fcmp to 0, and a false value if it's an unordered fcmp to 0. Returns
/// std::nullopt if it cannot be performed as a compare with 0.
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index 634b2dd5119e..5b02b0e94dda 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -846,6 +846,18 @@ static bool upgradeArmOrAarch64IntrinsicFunction(bool IsArm, Function *F,
return false; // No other 'aarch64.sve.bf*'.
}
+ if (Name.consume_front("addqv")) {
+ // 'aarch64.sve.addqv'.
+ if (!F->getReturnType()->isFPOrFPVectorTy())
+ return false;
+
+ auto Args = F->getFunctionType()->params();
+ Type *Tys[] = {F->getReturnType(), Args[1]};
+ NewFn = Intrinsic::getDeclaration(F->getParent(),
+ Intrinsic::aarch64_sve_faddqv, Tys);
+ return true;
+ }
+
if (Name.consume_front("ld")) {
// 'aarch64.sve.ld*'.
static const Regex LdRegex("^[234](.nxv[a-z0-9]+|$)");
@@ -1080,17 +1092,24 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn,
break;
case 'e':
if (Name.consume_front("experimental.vector.")) {
- Intrinsic::ID ID = StringSwitch<Intrinsic::ID>(Name)
- .StartsWith("extract.", Intrinsic::vector_extract)
- .StartsWith("insert.", Intrinsic::vector_insert)
- .Default(Intrinsic::not_intrinsic);
+ Intrinsic::ID ID =
+ StringSwitch<Intrinsic::ID>(Name)
+ .StartsWith("extract.", Intrinsic::vector_extract)
+ .StartsWith("insert.", Intrinsic::vector_insert)
+ .StartsWith("splice.", Intrinsic::vector_splice)
+ .StartsWith("reverse.", Intrinsic::vector_reverse)
+ .StartsWith("interleave2.", Intrinsic::vector_interleave2)
+ .StartsWith("deinterleave2.", Intrinsic::vector_deinterleave2)
+ .Default(Intrinsic::not_intrinsic);
if (ID != Intrinsic::not_intrinsic) {
const auto *FT = F->getFunctionType();
SmallVector<Type *, 2> Tys;
- if (ID == Intrinsic::vector_extract)
+ if (ID == Intrinsic::vector_extract ||
+ ID == Intrinsic::vector_interleave2)
// Extracting and interleaving overload the return type.
Tys.push_back(FT->getReturnType());
- Tys.push_back(FT->getParamType(0));
+ if (ID != Intrinsic::vector_interleave2)
+ Tys.push_back(FT->getParamType(0));
if (ID == Intrinsic::vector_insert)
// Inserting overloads the inserted type.
Tys.push_back(FT->getParamType(1));
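After the common "experimental.vector." prefix is consumed, the upgrade is a first-matching-prefix dispatch; an approximation of the StringSwitch chain in standalone C++ (illustrative names):

  #include <iostream>
  #include <string>
  #include <utility>
  #include <vector>

  enum class IntrinsicId { NotIntrinsic, VectorExtract, VectorReverse };

  // Tiny stand-in for StringSwitch<Intrinsic::ID>(Name).StartsWith(...).
  IntrinsicId upgradeName(const std::string &Name) {
    static const std::vector<std::pair<std::string, IntrinsicId>> Table = {
        {"extract.", IntrinsicId::VectorExtract},
        {"reverse.", IntrinsicId::VectorReverse},
    };
    for (const auto &[Prefix, Id] : Table)
      if (Name.rfind(Prefix, 0) == 0) // starts-with check
        return Id;
    return IntrinsicId::NotIntrinsic;
  }

  int main() {
    // The "experimental.vector." prefix has already been consumed here,
    // as in the patch.
    std::cout << (upgradeName("reverse.nxv4f32") ==
                  IntrinsicId::VectorReverse); // prints: 1
  }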
diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp
index e66fe73425e8..545940dd86f9 100644
--- a/llvm/lib/IR/Function.cpp
+++ b/llvm/lib/IR/Function.cpp
@@ -735,6 +735,10 @@ void Function::addDereferenceableOrNullParamAttr(unsigned ArgNo,
ArgNo, Bytes);
}
+void Function::addRangeRetAttr(const ConstantRange &CR) {
+ AttributeSets = AttributeSets.addRangeRetAttr(getContext(), CR);
+}
+
DenormalMode Function::getDenormalMode(const fltSemantics &FPType) const {
if (&FPType == &APFloat::IEEEsingle()) {
DenormalMode Mode = getDenormalModeF32Raw();
diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp
index d6746d1d4382..9ec5a7deeec6 100644
--- a/llvm/lib/IR/IRBuilder.cpp
+++ b/llvm/lib/IR/IRBuilder.cpp
@@ -1171,8 +1171,7 @@ Value *IRBuilderBase::CreateVectorReverse(Value *V, const Twine &Name) {
auto *Ty = cast<VectorType>(V->getType());
if (isa<ScalableVectorType>(Ty)) {
Module *M = BB->getParent()->getParent();
- Function *F = Intrinsic::getDeclaration(
- M, Intrinsic::experimental_vector_reverse, Ty);
+ Function *F = Intrinsic::getDeclaration(M, Intrinsic::vector_reverse, Ty);
return Insert(CallInst::Create(F, V), Name);
}
// Keep the original behaviour for fixed vector
@@ -1191,8 +1190,7 @@ Value *IRBuilderBase::CreateVectorSplice(Value *V1, Value *V2, int64_t Imm,
if (auto *VTy = dyn_cast<ScalableVectorType>(V1->getType())) {
Module *M = BB->getParent()->getParent();
- Function *F = Intrinsic::getDeclaration(
- M, Intrinsic::experimental_vector_splice, VTy);
+ Function *F = Intrinsic::getDeclaration(M, Intrinsic::vector_splice, VTy);
Value *Ops[] = {V1, V2, getInt32(Imm)};
return Insert(CallInst::Create(F, Ops), Name);
diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp
index d2babc748731..7ad1ad4cddb7 100644
--- a/llvm/lib/IR/Instructions.cpp
+++ b/llvm/lib/IR/Instructions.cpp
@@ -2889,7 +2889,7 @@ bool ShuffleVectorInst::isOneUseSingleSourceMask(int VF) const {
bool ShuffleVectorInst::isInterleave(unsigned Factor) {
FixedVectorType *OpTy = dyn_cast<FixedVectorType>(getOperand(0)->getType());
// shuffle_vector can only interleave fixed length vectors - for scalable
- // vectors, see the @llvm.experimental.vector.interleave2 intrinsic
+ // vectors, see the @llvm.vector.interleave2 intrinsic
if (!OpTy)
return false;
unsigned OpNumElts = OpTy->getNumElements();
diff --git a/llvm/lib/IR/IntrinsicInst.cpp b/llvm/lib/IR/IntrinsicInst.cpp
index 8faeb4e9951f..6b6420ae41c9 100644
--- a/llvm/lib/IR/IntrinsicInst.cpp
+++ b/llvm/lib/IR/IntrinsicInst.cpp
@@ -119,7 +119,8 @@ static ValueAsMetadata *getAsMetadata(Value *V) {
}
void DbgVariableIntrinsic::replaceVariableLocationOp(Value *OldValue,
- Value *NewValue) {
+ Value *NewValue,
+ bool AllowEmpty) {
// If OldValue is used as the address part of a dbg.assign intrinsic, replace
// it with NewValue and return true.
auto ReplaceDbgAssignAddress = [this, OldValue, NewValue]() -> bool {
@@ -136,6 +137,8 @@ void DbgVariableIntrinsic::replaceVariableLocationOp(Value *OldValue,
auto Locations = location_ops();
auto OldIt = find(Locations, OldValue);
if (OldIt == Locations.end()) {
+ if (AllowEmpty || DbgAssignAddrReplaced)
+ return;
assert(DbgAssignAddrReplaced &&
"OldValue must be dbg.assign addr if unused in DIArgList");
return;
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index e42cc7e260ef..430e2ce89f6a 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -6019,7 +6019,7 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
break;
}
- case Intrinsic::experimental_vector_splice: {
+ case Intrinsic::vector_splice: {
VectorType *VecTy = cast<VectorType>(Call.getType());
int64_t Idx = cast<ConstantInt>(Call.getArgOperand(2))->getSExtValue();
int64_t KnownMinNumElements = VecTy->getElementCount().getKnownMinValue();
diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp
index 53060df7f503..21cad1de0ced 100644
--- a/llvm/lib/LTO/LTO.cpp
+++ b/llvm/lib/LTO/LTO.cpp
@@ -114,12 +114,12 @@ void llvm::computeLTOCacheKey(
auto AddUnsigned = [&](unsigned I) {
uint8_t Data[4];
support::endian::write32le(Data, I);
- Hasher.update(ArrayRef<uint8_t>{Data, 4});
+ Hasher.update(Data);
};
auto AddUint64 = [&](uint64_t I) {
uint8_t Data[8];
support::endian::write64le(Data, I);
- Hasher.update(ArrayRef<uint8_t>{Data, 8});
+ Hasher.update(Data);
};
AddString(Conf.CPU);
// FIXME: Hash more of Options. For now all clients initialize Options from
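The simplification works because ArrayRef has a converting constructor from a C array with a known bound, so the element count no longer has to be spelled by hand; a minimal sketch:

    #include "llvm/ADT/ArrayRef.h"
    #include <cstdint>

    void demo() {
      uint8_t Data[4] = {0, 1, 2, 3};
      llvm::ArrayRef<uint8_t> Ref(Data); // size() == 4, deduced from the bound
      (void)Ref;
    }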
diff --git a/llvm/lib/MCA/InstrBuilder.cpp b/llvm/lib/MCA/InstrBuilder.cpp
index 1a82e45763a2..2e3ebe3d9073 100644
--- a/llvm/lib/MCA/InstrBuilder.cpp
+++ b/llvm/lib/MCA/InstrBuilder.cpp
@@ -542,8 +542,7 @@ InstrBuilder::createInstrDescImpl(const MCInst &MCI,
const MCSchedClassDesc &SCDesc = *SM.getSchedClassDesc(SchedClassID);
if (SCDesc.NumMicroOps == MCSchedClassDesc::InvalidNumMicroOps) {
return make_error<InstructionError<MCInst>>(
- "found an unsupported instruction in the input assembly sequence.",
- MCI);
+ "found an unsupported instruction in the input assembly sequence", MCI);
}
LLVM_DEBUG(dbgs() << "\n\t\tOpcode Name= " << MCII.getName(Opcode) << '\n');
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index 90ba3b541553..594549034cc8 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -963,7 +963,8 @@ PassBuilder::buildInlinerPipeline(OptimizationLevel Level,
MainCGPipeline.addPass(createCGSCCToFunctionPassAdaptor(
RequireAnalysisPass<ShouldNotRunFunctionPassesAnalysis, Function>()));
- MainCGPipeline.addPass(CoroSplitPass(Level != OptimizationLevel::O0));
+ if (!isLTOPreLink(Phase))
+ MainCGPipeline.addPass(CoroSplitPass(Level != OptimizationLevel::O0));
// Make sure we don't affect potential future NoRerun CGSCC adaptors.
MIWP.addLateModulePass(createModuleToFunctionPassAdaptor(
@@ -1005,8 +1006,9 @@ PassBuilder::buildModuleInlinerPipeline(OptimizationLevel Level,
buildFunctionSimplificationPipeline(Level, Phase),
PTO.EagerlyInvalidateAnalyses));
- MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(
- CoroSplitPass(Level != OptimizationLevel::O0)));
+ if (!isLTOPreLink(Phase))
+ MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(
+ CoroSplitPass(Level != OptimizationLevel::O0)));
return MPM;
}
@@ -1183,7 +1185,8 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level,
// and argument promotion.
MPM.addPass(DeadArgumentEliminationPass());
- MPM.addPass(CoroCleanupPass());
+ if (!isLTOPreLink(Phase))
+ MPM.addPass(CoroCleanupPass());
// Optimize globals now that functions are fully simplified.
MPM.addPass(GlobalOptPass());
diff --git a/llvm/lib/ProfileData/InstrProfReader.cpp b/llvm/lib/ProfileData/InstrProfReader.cpp
index cefb6af12d00..ba21e01abfba 100644
--- a/llvm/lib/ProfileData/InstrProfReader.cpp
+++ b/llvm/lib/ProfileData/InstrProfReader.cpp
@@ -1520,54 +1520,38 @@ IndexedMemProfReader::getMemProfRecord(const uint64_t FuncNameHash) const {
// Setup a callback to convert from frame ids to frames using the on-disk
// FrameData hash table.
- std::optional<memprof::FrameId> LastUnmappedFrameId;
- auto IdToFrameCallback = [&](const memprof::FrameId Id) {
- auto FrIter = MemProfFrameTable->find(Id);
- if (FrIter == MemProfFrameTable->end()) {
- LastUnmappedFrameId = Id;
- return memprof::Frame(0, 0, 0, false);
- }
- return *FrIter;
- };
-
- // Setup a callback to convert call stack ids to call stacks using the on-disk
- // hash table.
- std::optional<memprof::CallStackId> LastUnmappedCSId;
- auto CSIdToCallStackCallback = [&](memprof::CallStackId CSId) {
- llvm::SmallVector<memprof::Frame> Frames;
- auto CSIter = MemProfCallStackTable->find(CSId);
- if (CSIter == MemProfCallStackTable->end()) {
- LastUnmappedCSId = CSId;
- } else {
- const llvm::SmallVector<memprof::FrameId> &CS = *CSIter;
- Frames.reserve(CS.size());
- for (memprof::FrameId Id : CS)
- Frames.push_back(IdToFrameCallback(Id));
- }
- return Frames;
- };
+ memprof::FrameIdConverter<MemProfFrameHashTable> FrameIdConv(
+ *MemProfFrameTable.get());
const memprof::IndexedMemProfRecord IndexedRecord = *Iter;
memprof::MemProfRecord Record;
- if (MemProfCallStackTable)
- Record = IndexedRecord.toMemProfRecord(CSIdToCallStackCallback);
- else
- Record = memprof::MemProfRecord(IndexedRecord, IdToFrameCallback);
+ if (MemProfCallStackTable) {
+ // Setup a callback to convert call stack ids to call stacks using the
+ // on-disk hash table.
+ memprof::CallStackIdConverter<MemProfCallStackHashTable> CSIdConv(
+ *MemProfCallStackTable.get(), FrameIdConv);
- // Check that all frame ids were successfully converted to frames.
- if (LastUnmappedFrameId) {
- return make_error<InstrProfError>(instrprof_error::hash_mismatch,
- "memprof frame not found for frame id " +
- Twine(*LastUnmappedFrameId));
+ Record = IndexedRecord.toMemProfRecord(CSIdConv);
+
+ // Check that all call stack ids were successfully converted to call stacks.
+ if (CSIdConv.LastUnmappedId) {
+ return make_error<InstrProfError>(
+ instrprof_error::hash_mismatch,
+ "memprof call stack not found for call stack id " +
+ Twine(*CSIdConv.LastUnmappedId));
+ }
+ } else {
+ Record = memprof::MemProfRecord(IndexedRecord, FrameIdConv);
}
- // Check that all call stack ids were successfully converted to call stacks.
- if (LastUnmappedCSId) {
+ // Check that all frame ids were successfully converted to frames.
+ if (FrameIdConv.LastUnmappedId) {
return make_error<InstrProfError>(
instrprof_error::hash_mismatch,
- "memprof call stack not found for call stack id " +
- Twine(*LastUnmappedCSId));
+ "memprof frame not found for frame id " +
+ Twine(*FrameIdConv.LastUnmappedId));
}
+
return Record;
}
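A sketch of the converter idiom the patch adopts (memprof::FrameIdConverter / CallStackIdConverter), using simplified stand-in types: a callable wrapping a table lookup that records the last id it failed to map, so the caller can report one error after the whole conversion:

    #include <cstdint>
    #include <optional>
    #include <unordered_map>

    struct IdConverter {
      const std::unordered_map<uint64_t, int> &Table;
      std::optional<uint64_t> LastUnmappedId;

      int operator()(uint64_t Id) {
        auto It = Table.find(Id);
        if (It == Table.end()) {
          LastUnmappedId = Id;
          return 0; // placeholder; the caller checks LastUnmappedId afterwards
        }
        return It->second;
      }
    };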
diff --git a/llvm/lib/ProfileData/InstrProfWriter.cpp b/llvm/lib/ProfileData/InstrProfWriter.cpp
index e1846fcbffee..b61c59aacc0f 100644
--- a/llvm/lib/ProfileData/InstrProfWriter.cpp
+++ b/llvm/lib/ProfileData/InstrProfWriter.cpp
@@ -657,8 +657,8 @@ Error InstrProfWriter::writeImpl(ProfOStream &OS) {
: IndexedInstrProf::ProfVersion::CurrentVersion;
// The WritePrevVersion handling will either need to be removed or updated
// if the version is advanced beyond 12.
- assert(IndexedInstrProf::ProfVersion::CurrentVersion ==
- IndexedInstrProf::ProfVersion::Version12);
+ static_assert(IndexedInstrProf::ProfVersion::CurrentVersion ==
+ IndexedInstrProf::ProfVersion::Version12);
if (static_cast<bool>(ProfileKind & InstrProfKind::IRInstrumentation))
Header.Version |= VARIANT_MASK_IR_PROF;
if (static_cast<bool>(ProfileKind & InstrProfKind::ContextSensitive))
diff --git a/llvm/lib/Support/SuffixTree.cpp b/llvm/lib/Support/SuffixTree.cpp
index eaa653078e09..c00c7989d1a6 100644
--- a/llvm/lib/Support/SuffixTree.cpp
+++ b/llvm/lib/Support/SuffixTree.cpp
@@ -242,8 +242,8 @@ void SuffixTree::RepeatedSubstringIterator::advance() {
unsigned Length = Curr->getConcatLen();
// Iterate over each child, saving internal nodes for visiting, and
- // leaf nodes in LeafChildren. Internal nodes represent individual
- // strings, which may repeat.
+ // leaf nodes' SuffixIdx in RepeatedSubstringStarts. Internal nodes
+ // represent individual strings, which may repeat.
for (auto &ChildPair : Curr->Children) {
// Save all of this node's children for processing.
if (auto *InternalChild =
diff --git a/llvm/lib/Support/YAMLTraits.cpp b/llvm/lib/Support/YAMLTraits.cpp
index 4aaf59be2ce5..7bb60894b335 100644
--- a/llvm/lib/Support/YAMLTraits.cpp
+++ b/llvm/lib/Support/YAMLTraits.cpp
@@ -718,40 +718,8 @@ void Output::scalarString(StringRef &S, QuotingType MustQuote) {
outputUpToEndOfLine("''");
return;
}
- if (MustQuote == QuotingType::None) {
- // Only quote if we must.
- outputUpToEndOfLine(S);
- return;
- }
-
- const char *const Quote = MustQuote == QuotingType::Single ? "'" : "\"";
- output(Quote); // Starting quote.
-
- // When using double-quoted strings (and only in that case), non-printable characters may be
- // present, and will be escaped using a variety of unicode-scalar and special short-form
- // escapes. This is handled in yaml::escape.
- if (MustQuote == QuotingType::Double) {
- output(yaml::escape(S, /* EscapePrintable= */ false));
- outputUpToEndOfLine(Quote);
- return;
- }
-
- unsigned i = 0;
- unsigned j = 0;
- unsigned End = S.size();
- const char *Base = S.data();
-
- // When using single-quoted strings, any single quote ' must be doubled to be escaped.
- while (j < End) {
- if (S[j] == '\'') { // Escape quotes.
- output(StringRef(&Base[i], j - i)); // "flush".
- output(StringLiteral("''")); // Print it as ''
- i = j + 1;
- }
- ++j;
- }
- output(StringRef(&Base[i], j - i));
- outputUpToEndOfLine(Quote); // Ending quote.
+ output(S, MustQuote);
+ outputUpToEndOfLine("");
}
void Output::blockScalarString(StringRef &S) {
@@ -801,6 +769,46 @@ void Output::output(StringRef s) {
Out << s;
}
+void Output::output(StringRef S, QuotingType MustQuote) {
+ if (MustQuote == QuotingType::None) {
+ // Only quote if we must.
+ output(S);
+ return;
+ }
+
+ StringLiteral Quote = MustQuote == QuotingType::Single ? StringLiteral("'")
+ : StringLiteral("\"");
+ output(Quote); // Starting quote.
+
+ // When using double-quoted strings (and only in that case), non-printable
+ // characters may be present, and will be escaped using a variety of
+ // unicode-scalar and special short-form escapes. This is handled in
+ // yaml::escape.
+ if (MustQuote == QuotingType::Double) {
+ output(yaml::escape(S, /* EscapePrintable= */ false));
+ output(Quote);
+ return;
+ }
+
+ unsigned i = 0;
+ unsigned j = 0;
+ unsigned End = S.size();
+ const char *Base = S.data();
+
+ // When using single-quoted strings, any single quote ' must be doubled to be
+ // escaped.
+ while (j < End) {
+ if (S[j] == '\'') { // Escape quotes.
+ output(StringRef(&Base[i], j - i)); // "flush".
+ output(StringLiteral("''")); // Print it as ''
+ i = j + 1;
+ }
+ ++j;
+ }
+ output(StringRef(&Base[i], j - i));
+ output(Quote); // Ending quote.
+}
+
void Output::outputUpToEndOfLine(StringRef s) {
output(s);
if (StateStack.empty() || (!inFlowSeqAnyElement(StateStack.back()) &&
@@ -853,7 +861,7 @@ void Output::newLineCheck(bool EmptySequence) {
}
void Output::paddedKey(StringRef key) {
- output(key);
+ output(key, needsQuotes(key, false));
output(":");
const char *spaces = " ";
if (key.size() < strlen(spaces))
@@ -872,7 +880,7 @@ void Output::flowKey(StringRef Key) {
Column = ColumnAtMapFlowStart;
output(" ");
}
- output(Key);
+ output(Key, needsQuotes(Key, false));
output(": ");
}
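For reference, the single-quoted escaping rule the refactored output() preserves is that the only escape is doubling the quote itself; a standalone sketch:

    #include <string>

    // don't  ->  'don''t'
    std::string singleQuoteYAML(const std::string &S) {
      std::string Out = "'";
      for (char C : S) {
        Out += C;
        if (C == '\'')
          Out += '\''; // escape by doubling
      }
      Out += '\'';
      return Out;
    }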
diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td
index f2f1c93ea225..4b2ce0d73949 100644
--- a/llvm/lib/Target/AArch64/AArch64.td
+++ b/llvm/lib/Target/AArch64/AArch64.td
@@ -76,7 +76,7 @@ def SME2p1Unsupported : AArch64Unsupported;
def SME2Unsupported : AArch64Unsupported {
let F = !listconcat([HasSME2, HasSVE2orSME2, HasSVE2p1_or_HasSME2, HasSSVE_FP8FMA,
- HasSMEF8F16, HasSMEF8F32],
+ HasSMEF8F16, HasSMEF8F32, HasSMEF16F16orSMEF8F16],
SME2p1Unsupported.F);
}
diff --git a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp
index 3bf6283b79e9..dddc181b0314 100644
--- a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp
@@ -178,13 +178,14 @@ void AArch64Arm64ECCallLowering::getThunkArgTypes(
}
for (unsigned E = FT->getNumParams(); I != E; ++I) {
- Align ParamAlign = AttrList.getParamAlignment(I).valueOrOne();
#if 0
// FIXME: Need more information about argument size; see
// https://reviews.llvm.org/D132926
uint64_t ArgSizeBytes = AttrList.getParamArm64ECArgSizeBytes(I);
+ Align ParamAlign = AttrList.getParamAlignment(I).valueOrOne();
#else
uint64_t ArgSizeBytes = 0;
+ Align ParamAlign = Align();
#endif
Type *Arm64Ty, *X64Ty;
canonicalizeThunkType(FT->getParamType(I), ParamAlign,
@@ -294,7 +295,7 @@ void AArch64Arm64ECCallLowering::canonicalizeThunkType(
uint64_t TotalSizeBytes = ElementCnt * ElementSizePerBytes;
if (ElementTy->isFloatTy() || ElementTy->isDoubleTy()) {
Out << (ElementTy->isFloatTy() ? "F" : "D") << TotalSizeBytes;
- if (Alignment.value() >= 8 && !T->isPointerTy())
+ if (Alignment.value() >= 16 && !Ret)
Out << "a" << Alignment.value();
Arm64Ty = T;
if (TotalSizeBytes <= 8) {
@@ -325,7 +326,7 @@ void AArch64Arm64ECCallLowering::canonicalizeThunkType(
Out << "m";
if (TypeSize != 4)
Out << TypeSize;
- if (Alignment.value() >= 8 && !T->isPointerTy())
+ if (Alignment.value() >= 16 && !Ret)
Out << "a" << Alignment.value();
// FIXME: Try to canonicalize Arm64Ty more thoroughly?
Arm64Ty = T;
diff --git a/llvm/lib/Target/AArch64/AArch64Features.td b/llvm/lib/Target/AArch64/AArch64Features.td
index efda45a72ef4..3a3751a85afd 100644
--- a/llvm/lib/Target/AArch64/AArch64Features.td
+++ b/llvm/lib/Target/AArch64/AArch64Features.td
@@ -505,15 +505,15 @@ def FeatureSMEF64F64 : SubtargetFeature<"sme-f64f64", "HasSMEF64F64", "true",
def FeatureSMEI16I64 : SubtargetFeature<"sme-i16i64", "HasSMEI16I64", "true",
"Enable Scalable Matrix Extension (SME) I16I64 instructions (FEAT_SME_I16I64)", [FeatureSME]>;
-def FeatureSMEF16F16 : SubtargetFeature<"sme-f16f16", "HasSMEF16F16", "true",
- "Enable SME2.1 non-widening Float16 instructions (FEAT_SME_F16F16)", []>;
-
def FeatureSMEFA64 : SubtargetFeature<"sme-fa64", "HasSMEFA64", "true",
"Enable the full A64 instruction set in streaming SVE mode (FEAT_SME_FA64)", [FeatureSME, FeatureSVE2]>;
def FeatureSME2 : SubtargetFeature<"sme2", "HasSME2", "true",
"Enable Scalable Matrix Extension 2 (SME2) instructions", [FeatureSME]>;
+def FeatureSMEF16F16 : SubtargetFeature<"sme-f16f16", "HasSMEF16F16", "true",
+ "Enable SME non-widening Float16 instructions (FEAT_SME_F16F16)", [FeatureSME2]>;
+
def FeatureSME2p1 : SubtargetFeature<"sme2p1", "HasSME2p1", "true",
"Enable Scalable Matrix Extension 2.1 (FEAT_SME2p1) instructions", [FeatureSME2]>;
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index 419c141121c3..c86c98eed24f 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -1296,7 +1296,7 @@ static MachineBasicBlock::iterator InsertSEH(MachineBasicBlock::iterator MBBI,
}
case AArch64::LDPQpost:
Imm = -Imm;
- LLVM_FALLTHROUGH;
+ [[fallthrough]];
case AArch64::STPQpre: {
unsigned Reg0 = RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg());
unsigned Reg1 = RegInfo->getSEHRegNum(MBBI->getOperand(2).getReg());
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 8e9782c1930c..cb7930f0cdee 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -842,6 +842,9 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::GET_ROUNDING, MVT::i32, Custom);
setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom);
+ setOperationAction(ISD::GET_FPMODE, MVT::i32, Custom);
+ setOperationAction(ISD::SET_FPMODE, MVT::i32, Custom);
+ setOperationAction(ISD::RESET_FPMODE, MVT::Other, Custom);
setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom);
if (!Subtarget->hasLSE() && !Subtarget->outlineAtomics()) {
@@ -4870,6 +4873,65 @@ SDValue AArch64TargetLowering::LowerSET_ROUNDING(SDValue Op,
return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
}
+SDValue AArch64TargetLowering::LowerGET_FPMODE(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ SDValue Chain = Op->getOperand(0);
+
+ // Get current value of FPCR.
+ SDValue Ops[] = {
+ Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
+ SDValue FPCR =
+ DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
+ Chain = FPCR.getValue(1);
+ FPCR = FPCR.getValue(0);
+
+ // Truncate FPCR to 32 bits.
+ SDValue Result = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, FPCR);
+
+ return DAG.getMergeValues({Result, Chain}, DL);
+}
+
+SDValue AArch64TargetLowering::LowerSET_FPMODE(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ SDValue Chain = Op->getOperand(0);
+ SDValue Mode = Op->getOperand(1);
+
+ // Extend the specified value to 64 bits.
+ SDValue FPCR = DAG.getZExtOrTrunc(Mode, DL, MVT::i64);
+
+ // Set new value of FPCR.
+ SDValue Ops2[] = {
+ Chain, DAG.getConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64), FPCR};
+ return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
+}
+
+SDValue AArch64TargetLowering::LowerRESET_FPMODE(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ SDValue Chain = Op->getOperand(0);
+
+ // Get current value of FPCR.
+ SDValue Ops[] = {
+ Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
+ SDValue FPCR =
+ DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
+ Chain = FPCR.getValue(1);
+ FPCR = FPCR.getValue(0);
+
+ // Clear bits that are not reserved.
+ SDValue FPCRMasked = DAG.getNode(
+ ISD::AND, DL, MVT::i64, FPCR,
+ DAG.getConstant(AArch64::ReservedFPControlBits, DL, MVT::i64));
+
+ // Set new value of FPCR.
+ SDValue Ops2[] = {Chain,
+ DAG.getConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64),
+ FPCRMasked};
+ return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
+}
+
static unsigned selectUmullSmull(SDValue &N0, SDValue &N1, SelectionDAG &DAG,
SDLoc DL, bool &IsMLA) {
bool IsN0SExt = isSignExtended(N0, DAG);
@@ -6484,6 +6546,12 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
return LowerGET_ROUNDING(Op, DAG);
case ISD::SET_ROUNDING:
return LowerSET_ROUNDING(Op, DAG);
+ case ISD::GET_FPMODE:
+ return LowerGET_FPMODE(Op, DAG);
+ case ISD::SET_FPMODE:
+ return LowerSET_FPMODE(Op, DAG);
+ case ISD::RESET_FPMODE:
+ return LowerRESET_FPMODE(Op, DAG);
case ISD::MUL:
return LowerMUL(Op, DAG);
case ISD::MULHS:
@@ -16330,7 +16398,7 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
IntrinsicInst *DI, LoadInst *LI) const {
// Only deinterleave2 supported at present.
- if (DI->getIntrinsicID() != Intrinsic::experimental_vector_deinterleave2)
+ if (DI->getIntrinsicID() != Intrinsic::vector_deinterleave2)
return false;
// Only a factor of 2 supported at present.
@@ -16405,7 +16473,7 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
IntrinsicInst *II, StoreInst *SI) const {
// Only interleave2 supported at present.
- if (II->getIntrinsicID() != Intrinsic::experimental_vector_interleave2)
+ if (II->getIntrinsicID() != Intrinsic::vector_interleave2)
return false;
// Only a factor of 2 supported at present.
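The RESET_FPMODE lowering above keeps only the FPCR bits the mask marks as reserved and zeroes every other control bit, restoring the default floating-point environment; stripped of the DAG plumbing, the masking is simply:

    #include <cstdint>

    constexpr uint64_t ReservedFPControlBits = 0xfffffffff80040f8;

    uint64_t resetFPCR(uint64_t FPCR) {
      return FPCR & ReservedFPControlBits; // keep reserved bits, zero the rest
    }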
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 400368a5e130..fbdc4de5617f 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -523,6 +523,9 @@ enum Rounding {
// Bit position of rounding mode bits in FPCR.
const unsigned RoundingBitsPos = 22;
+// Reserved bits should be preserved when modifying FPCR.
+const uint64_t ReservedFPControlBits = 0xfffffffff80040f8;
+
// Registers used to pass function arguments.
ArrayRef<MCPhysReg> getGPRArgRegs();
ArrayRef<MCPhysReg> getFPRArgRegs();
@@ -1128,6 +1131,9 @@ private:
SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerGET_FPMODE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSET_FPMODE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerRESET_FPMODE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 7bf06e71a030..55fecc4b4845 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -6924,19 +6924,26 @@ genSubAdd2SubSub(MachineFunction &MF, MachineRegisterInfo &MRI,
assert((Opcode == AArch64::SUBWrr || Opcode == AArch64::SUBXrr) &&
"Unexpected instruction opcode.");
+ uint32_t Flags = Root.mergeFlagsWith(*AddMI);
+ Flags &= ~MachineInstr::NoSWrap;
+ Flags &= ~MachineInstr::NoUWrap;
+
MachineInstrBuilder MIB1 =
BuildMI(MF, MIMetadata(Root), TII->get(Opcode), NewVR)
.addReg(RegA, getKillRegState(RegAIsKill))
- .addReg(RegB, getKillRegState(RegBIsKill));
+ .addReg(RegB, getKillRegState(RegBIsKill))
+ .setMIFlags(Flags);
MachineInstrBuilder MIB2 =
BuildMI(MF, MIMetadata(Root), TII->get(Opcode), ResultReg)
.addReg(NewVR, getKillRegState(true))
- .addReg(RegC, getKillRegState(RegCIsKill));
+ .addReg(RegC, getKillRegState(RegCIsKill))
+ .setMIFlags(Flags);
InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
InsInstrs.push_back(MIB1);
InsInstrs.push_back(MIB2);
DelInstrs.push_back(AddMI);
+ DelInstrs.push_back(&Root);
}
/// When getMachineCombinerPatterns() finds potential patterns,
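The wrap-flag scrubbing above is necessary because reassociating A-(B+C) into (A-B)-C can introduce an intermediate overflow that the original expression never had; a small illustration in i8 arithmetic:

    #include <cstdint>

    // A=100, B=-100, C=100: A-(B+C) = 100 with no overflow, but the
    // reassociated intermediate A-B = 200 wraps i8, so keeping 'nsw' on
    // the new instructions would be unsound.
    bool subWouldWrapSigned(int8_t A, int8_t B) {
      int16_t Wide = int16_t(A) - int16_t(B);
      return Wide < INT8_MIN || Wide > INT8_MAX; // true for (100, -100)
    }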
@@ -6966,13 +6973,13 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
// ==> (A - B) - C
genSubAdd2SubSub(MF, MRI, TII, Root, InsInstrs, DelInstrs, 1,
InstrIdxForVirtReg);
- break;
+ return;
case AArch64MachineCombinerPattern::SUBADD_OP2:
// A - (B + C)
// ==> (A - C) - B
genSubAdd2SubSub(MF, MRI, TII, Root, InsInstrs, DelInstrs, 2,
InstrIdxForVirtReg);
- break;
+ return;
case AArch64MachineCombinerPattern::MULADDW_OP1:
case AArch64MachineCombinerPattern::MULADDX_OP1:
// MUL I=A,B,0
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index a7abb58064a5..2159116d1ab7 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -230,6 +230,12 @@ def HasSVE2p1_or_HasSME2
def HasSVE2p1_or_HasSME2p1
: Predicate<"Subtarget->hasSVE2p1() || Subtarget->hasSME2p1()">,
AssemblerPredicateWithAll<(any_of FeatureSME2p1, FeatureSVE2p1), "sme2p1 or sve2p1">;
+
+def HasSMEF16F16orSMEF8F16
+ : Predicate<"Subtarget->hasSMEF16F16() || Subtarget->hasSMEF8F16()">,
+ AssemblerPredicateWithAll<(any_of FeatureSMEF16F16, FeatureSMEF8F16),
+ "sme-f16f16 or sme-f8f16">;
+
// A subset of NEON instructions are legal in Streaming SVE execution mode,
// they should be enabled if either has been specified.
def HasNEONorSME
@@ -740,6 +746,8 @@ def AArch64vsli : SDNode<"AArch64ISD::VSLI", SDT_AArch64vshiftinsert>;
def AArch64vsri : SDNode<"AArch64ISD::VSRI", SDT_AArch64vshiftinsert>;
def AArch64bsp: SDNode<"AArch64ISD::BSP", SDT_AArch64trivec>;
+def AArch64nbsl: PatFrag<(ops node:$Op1, node:$Op2, node:$Op3),
+ (vnot (AArch64bsp node:$Op1, node:$Op2, node:$Op3))>;
def AArch64cmeq: SDNode<"AArch64ISD::CMEQ", SDT_AArch64binvec>;
def AArch64cmge: SDNode<"AArch64ISD::CMGE", SDT_AArch64binvec>;
diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td
index c50a8200dd89..8772e51bf0ab 100644
--- a/llvm/lib/Target/AArch64/AArch64Processors.td
+++ b/llvm/lib/Target/AArch64/AArch64Processors.td
@@ -447,6 +447,15 @@ def TuneNeoverseN2 : SubtargetFeature<"neoversen2", "ARMProcFamily", "NeoverseN2
FeatureEnableSelectOptimize,
FeaturePredictableSelectIsExpensive]>;
+def TuneNeoverseN3 : SubtargetFeature<"neoversen3", "ARMProcFamily", "NeoverseN3",
+ "Neoverse N3 ARM processors", [
+ FeatureFuseAES,
+ FeaturePostRAScheduler,
+ FeatureALULSLFast,
+ FeatureFuseAdrpAdd,
+ FeatureEnableSelectOptimize,
+ FeaturePredictableSelectIsExpensive]>;
+
def TuneNeoverse512TVB : SubtargetFeature<"neoverse512tvb", "ARMProcFamily", "Neoverse512TVB",
"Neoverse 512-TVB ARM processors", [
FeatureFuseAES,
@@ -476,6 +485,24 @@ def TuneNeoverseV2 : SubtargetFeature<"neoversev2", "ARMProcFamily", "NeoverseV2
FeatureEnableSelectOptimize,
FeaturePredictableSelectIsExpensive]>;
+def TuneNeoverseV3 : SubtargetFeature<"neoversev3", "ARMProcFamily", "NeoverseV3",
+ "Neoverse V3 ARM processors", [
+ FeatureFuseAES,
+ FeatureALULSLFast,
+ FeatureFuseAdrpAdd,
+ FeaturePostRAScheduler,
+ FeatureEnableSelectOptimize,
+ FeaturePredictableSelectIsExpensive]>;
+
+def TuneNeoverseV3AE : SubtargetFeature<"neoversev3AE", "ARMProcFamily", "NeoverseV3",
+ "Neoverse V3AE ARM processors", [
+ FeatureFuseAES,
+ FeatureALULSLFast,
+ FeatureFuseAdrpAdd,
+ FeaturePostRAScheduler,
+ FeatureEnableSelectOptimize,
+ FeaturePredictableSelectIsExpensive]>;
+
def TuneSaphira : SubtargetFeature<"saphira", "ARMProcFamily", "Saphira",
"Qualcomm Saphira processors", [
FeaturePostRAScheduler,
@@ -715,6 +742,10 @@ def ProcessorFeatures {
FeatureMatMulInt8, FeatureMTE, FeatureSVE2,
FeatureSVE2BitPerm, FeatureTRBE,
FeaturePerfMon];
+ list<SubtargetFeature> NeoverseN3 = [HasV9_2aOps, FeatureETE, FeatureFP16FML,
+ FeatureFullFP16, FeatureMTE, FeaturePerfMon,
+ FeatureRandGen, FeatureSPE, FeatureSPE_EEF,
+ FeatureSVE2BitPerm];
list<SubtargetFeature> Neoverse512TVB = [HasV8_4aOps, FeatureBF16, FeatureCacheDeepPersist,
FeatureCrypto, FeatureFPARMv8, FeatureFP16FML,
FeatureFullFP16, FeatureMatMulInt8, FeatureNEON,
@@ -729,6 +760,14 @@ def ProcessorFeatures {
FeaturePerfMon, FeatureETE, FeatureMatMulInt8,
FeatureNEON, FeatureSVE2BitPerm, FeatureFP16FML,
FeatureMTE, FeatureRandGen];
+ list<SubtargetFeature> NeoverseV3 = [HasV9_2aOps, FeatureETE, FeatureFP16FML,
+ FeatureFullFP16, FeatureLS64, FeatureMTE,
+ FeaturePerfMon, FeatureRandGen, FeatureSPE,
+ FeatureSPE_EEF, FeatureSVE2BitPerm, FeatureBRBE];
+ list<SubtargetFeature> NeoverseV3AE = [HasV9_2aOps, FeatureETE, FeatureFP16FML,
+ FeatureFullFP16, FeatureLS64, FeatureMTE,
+ FeaturePerfMon, FeatureRandGen, FeatureSPE,
+ FeatureSPE_EEF, FeatureSVE2BitPerm, FeatureBRBE];
list<SubtargetFeature> Saphira = [HasV8_4aOps, FeatureCrypto, FeatureFPARMv8,
FeatureNEON, FeatureSPE, FeaturePerfMon];
list<SubtargetFeature> ThunderX = [HasV8_0aOps, FeatureCRC, FeatureCrypto,
@@ -831,12 +870,18 @@ def : ProcessorModel<"neoverse-n1", NeoverseN1Model,
ProcessorFeatures.NeoverseN1, [TuneNeoverseN1]>;
def : ProcessorModel<"neoverse-n2", NeoverseN2Model,
ProcessorFeatures.NeoverseN2, [TuneNeoverseN2]>;
+def : ProcessorModel<"neoverse-n3", NeoverseN2Model,
+ ProcessorFeatures.NeoverseN3, [TuneNeoverseN3]>;
def : ProcessorModel<"neoverse-512tvb", NeoverseV1Model,
ProcessorFeatures.Neoverse512TVB, [TuneNeoverse512TVB]>;
def : ProcessorModel<"neoverse-v1", NeoverseV1Model,
ProcessorFeatures.NeoverseV1, [TuneNeoverseV1]>;
def : ProcessorModel<"neoverse-v2", NeoverseV2Model,
ProcessorFeatures.NeoverseV2, [TuneNeoverseV2]>;
+def : ProcessorModel<"neoverse-v3", NeoverseV2Model,
+ ProcessorFeatures.NeoverseV3, [TuneNeoverseV3]>;
+def : ProcessorModel<"neoverse-v3ae", NeoverseV2Model,
+ ProcessorFeatures.NeoverseV3AE, [TuneNeoverseV3AE]>;
def : ProcessorModel<"exynos-m3", ExynosM3Model, ProcessorFeatures.ExynosM3,
[TuneExynosM3]>;
def : ProcessorModel<"exynos-m4", ExynosM4Model, ProcessorFeatures.ExynosM4,
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
index 2db0fa253434..574178c8d524 100644
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -792,12 +792,14 @@ defm LUTI4_S_2ZTZI : sme2p1_luti4_vector_vg2_index<"luti4">;
defm LUTI4_S_4ZTZI : sme2p1_luti4_vector_vg4_index<"luti4">;
}
-let Predicates = [HasSME2p1, HasSMEF16F16] in {
+let Predicates = [HasSMEF16F16orSMEF8F16] in {
defm FADD_VG2_M2Z_H : sme2_multivec_accum_add_sub_vg2<"fadd", 0b0100, MatrixOp16, ZZ_h_mul_r, nxv8f16, null_frag>;
defm FADD_VG4_M4Z_H : sme2_multivec_accum_add_sub_vg4<"fadd", 0b0100, MatrixOp16, ZZZZ_h_mul_r, nxv8f16, null_frag>;
defm FSUB_VG2_M2Z_H : sme2_multivec_accum_add_sub_vg2<"fsub", 0b0101, MatrixOp16, ZZ_h_mul_r, nxv8f16, null_frag>;
defm FSUB_VG4_M4Z_H : sme2_multivec_accum_add_sub_vg4<"fsub", 0b0101, MatrixOp16, ZZZZ_h_mul_r, nxv8f16, null_frag>;
+}
+let Predicates = [HasSMEF16F16] in {
defm FMLA_VG2_M2ZZI_H : sme2p1_multi_vec_array_vg2_index_16b<"fmla", 0b00, 0b100, ZZ_h_mul_r, ZPR4b16>;
defm FMLA_VG4_M4ZZI_H : sme2p1_multi_vec_array_vg4_index_16b<"fmla", 0b000, ZZZZ_h_mul_r, ZPR4b16>;
defm FMLA_VG2_M2ZZ_H : sme2_dot_mla_add_sub_array_vg24_single<"fmla", 0b0011100, MatrixOp16, ZZ_h, ZPR4b16>;
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 525ae79da996..b90ac0ff1fe0 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -3760,7 +3760,7 @@ let Predicates = [HasSVE2orSME] in {
defm BSL_ZZZZ : sve2_int_bitwise_ternary_op<0b001, "bsl", int_aarch64_sve_bsl, AArch64bsp>;
defm BSL1N_ZZZZ : sve2_int_bitwise_ternary_op<0b011, "bsl1n", int_aarch64_sve_bsl1n>;
defm BSL2N_ZZZZ : sve2_int_bitwise_ternary_op<0b101, "bsl2n", int_aarch64_sve_bsl2n>;
- defm NBSL_ZZZZ : sve2_int_bitwise_ternary_op<0b111, "nbsl", int_aarch64_sve_nbsl>;
+ defm NBSL_ZZZZ : sve2_int_bitwise_ternary_op<0b111, "nbsl", int_aarch64_sve_nbsl, AArch64nbsl>;
// SVE2 bitwise xor and rotate right by immediate
defm XAR_ZZZI : sve2_int_rotate_right_imm<"xar", int_aarch64_sve_xar>;
@@ -4119,7 +4119,7 @@ defm BFCLAMP_ZZZ : sve2p1_bfclamp<"bfclamp", AArch64fclamp>;
// SME2.1 or SVE2.1 instructions
//===----------------------------------------------------------------------===//
let Predicates = [HasSVE2p1_or_HasSME2p1] in {
-defm FADDQV : sve2p1_fp_reduction_q<0b000, "faddqv", int_aarch64_sve_addqv>;
+defm FADDQV : sve2p1_fp_reduction_q<0b000, "faddqv", int_aarch64_sve_faddqv>;
defm FMAXNMQV : sve2p1_fp_reduction_q<0b100, "fmaxnmqv", int_aarch64_sve_fmaxnmqv>;
defm FMINNMQV : sve2p1_fp_reduction_q<0b101, "fminnmqv", int_aarch64_sve_fminnmqv>;
defm FMAXQV : sve2p1_fp_reduction_q<0b110, "fmaxqv", int_aarch64_sve_fmaxqv>;
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
index ef09a3cde495..7d34dd1c7768 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -234,7 +234,9 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) {
MaxBytesForLoopAlignment = 16;
break;
case NeoverseN2:
+ case NeoverseN3:
case NeoverseV2:
+ case NeoverseV3:
PrefFunctionAlignment = Align(16);
PrefLoopAlignment = Align(32);
MaxBytesForLoopAlignment = 16;
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index f42c415a9e44..243891249668 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -740,6 +740,9 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
.legalForCartesianProduct({s32, v2s16, v4s8})
.legalForCartesianProduct({s64, v8s8, v4s16, v2s32})
.legalForCartesianProduct({s128, v16s8, v8s16, v4s32, v2s64, v2p0})
+ .lowerIf([=](const LegalityQuery &Query) {
+ return Query.Types[0].isVector() != Query.Types[1].isVector();
+ })
.moreElementsToNextPow2(0)
.clampNumElements(0, v8s8, v16s8)
.clampNumElements(0, v4s16, v8s16)
@@ -1567,7 +1570,7 @@ bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
MI.eraseFromParent();
return true;
}
- case Intrinsic::experimental_vector_reverse:
+ case Intrinsic::vector_reverse:
// TODO: Add support for vector_reverse
return false;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index f4a747784d1f..7f4a2437f62e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -148,6 +148,19 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::LOAD, MVT::i128, Promote);
AddPromotedToType(ISD::LOAD, MVT::i128, MVT::v4i32);
+ // TODO: Would be better to consume as directly legal
+ setOperationAction(ISD::ATOMIC_LOAD, MVT::f32, Promote);
+ AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f32, MVT::i32);
+
+ setOperationAction(ISD::ATOMIC_LOAD, MVT::f64, Promote);
+ AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f64, MVT::i64);
+
+ setOperationAction(ISD::ATOMIC_LOAD, MVT::f16, Promote);
+ AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f16, MVT::i16);
+
+ setOperationAction(ISD::ATOMIC_LOAD, MVT::bf16, Promote);
+ AddPromotedToType(ISD::ATOMIC_LOAD, MVT::bf16, MVT::i16);
+
// There are no 64-bit extloads. These should be done as a 32-bit extload and
// an extension to 64-bit.
for (MVT VT : MVT::integer_valuetypes())
@@ -300,11 +313,16 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
setTruncStoreAction(MVT::f32, MVT::f16, Expand);
+ setTruncStoreAction(MVT::v2f32, MVT::v2bf16, Expand);
setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
setTruncStoreAction(MVT::v3f32, MVT::v3f16, Expand);
+ setTruncStoreAction(MVT::v4f32, MVT::v4bf16, Expand);
setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
+ setTruncStoreAction(MVT::v8f32, MVT::v8bf16, Expand);
setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);
+ setTruncStoreAction(MVT::v16f32, MVT::v16bf16, Expand);
setTruncStoreAction(MVT::v16f32, MVT::v16f16, Expand);
+ setTruncStoreAction(MVT::v32f32, MVT::v32bf16, Expand);
setTruncStoreAction(MVT::v32f32, MVT::v32f16, Expand);
setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
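Promoting ATOMIC_LOAD of the FP types to same-width integers means the atomic access itself happens in the integer domain and the payload is bitcast back; a C++20 analogue of the f32 case:

    #include <atomic>
    #include <bit>
    #include <cstdint>

    float atomicLoadFloat(const std::atomic<uint32_t> &Slot) {
      return std::bit_cast<float>(Slot.load()); // i32 atomic load + bitcast
    }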
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 72661a8d29f8..269c414521db 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -230,6 +230,12 @@ public:
bool isCheapToSpeculateCtlz(Type *Ty) const override;
bool isSDNodeAlwaysUniform(const SDNode *N) const override;
+
+ // FIXME: This hook should not exist
+ AtomicExpansionKind shouldCastAtomicLoadInIR(LoadInst *LI) const override {
+ return AtomicExpansionKind::None;
+ }
+
static CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg);
static CCAssignFn *CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index aa4ec785bf02..56345d14a331 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -2261,7 +2261,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
case AMDGPU::G_FCMP:
if (!Subtarget.hasSALUFloatInsts())
break;
- LLVM_FALLTHROUGH;
+ [[fallthrough]];
case AMDGPU::G_ICMP:
case AMDGPU::G_UADDO:
case AMDGPU::G_USUBO:
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 386672352114..510f5bbf2555 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -172,6 +172,7 @@ public:
ImmTyWaitEXP,
ImmTyWaitVAVDst,
ImmTyWaitVMVSrc,
+ ImmTyByteSel,
};
// Immediate operand kind.
@@ -384,8 +385,8 @@ public:
bool isIdxen() const { return isImmTy(ImmTyIdxen); }
bool isAddr64() const { return isImmTy(ImmTyAddr64); }
bool isOffset() const { return isImmTy(ImmTyOffset); }
- bool isOffset0() const { return isImmTy(ImmTyOffset0) && isUInt<8>(getImm()); }
- bool isOffset1() const { return isImmTy(ImmTyOffset1) && isUInt<8>(getImm()); }
+ bool isOffset0() const { return isImmTy(ImmTyOffset0); }
+ bool isOffset1() const { return isImmTy(ImmTyOffset1); }
bool isSMEMOffsetMod() const { return isImmTy(ImmTySMEMOffsetMod); }
bool isFlatOffset() const { return isImmTy(ImmTyOffset) || isImmTy(ImmTyInstOffset); }
bool isGDS() const { return isImmTy(ImmTyGDS); }
@@ -410,6 +411,7 @@ public:
bool isOpSelHi() const { return isImmTy(ImmTyOpSelHi); }
bool isNegLo() const { return isImmTy(ImmTyNegLo); }
bool isNegHi() const { return isImmTy(ImmTyNegHi); }
+ bool isByteSel() const { return isImmTy(ImmTyByteSel); }
bool isRegOrImm() const {
return isReg() || isImm();
@@ -1139,6 +1141,7 @@ public:
case ImmTyWaitEXP: OS << "WaitEXP"; break;
case ImmTyWaitVAVDst: OS << "WaitVAVDst"; break;
case ImmTyWaitVMVSrc: OS << "WaitVMVSrc"; break;
+ case ImmTyByteSel: OS << "ByteSel"; break;
}
// clang-format on
}
@@ -8644,6 +8647,13 @@ void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands,
}
}
+ if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::byte_sel)) {
+ assert(AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vdst_in));
+ Inst.addOperand(Inst.getOperand(0));
+ addOptionalImmOperand(Inst, Operands, OptionalIdx,
+ AMDGPUOperand::ImmTyByteSel);
+ }
+
if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::clamp))
addOptionalImmOperand(Inst, Operands, OptionalIdx,
AMDGPUOperand::ImmTyClampSI);
@@ -8680,8 +8690,8 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands,
if (Opc == AMDGPU::V_CVT_SR_BF8_F32_vi ||
Opc == AMDGPU::V_CVT_SR_FP8_F32_vi ||
- Opc == AMDGPU::V_CVT_SR_BF8_F32_e64_gfx12 ||
- Opc == AMDGPU::V_CVT_SR_FP8_F32_e64_gfx12) {
+ Opc == AMDGPU::V_CVT_SR_BF8_F32_gfx12_e64_gfx12 ||
+ Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx12_e64_gfx12) {
Inst.addOperand(MCOperand::createImm(0)); // Placeholder for src2_mods
Inst.addOperand(Inst.getOperand(0));
}
@@ -8692,7 +8702,11 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands,
!(Opc == AMDGPU::V_CVT_PK_BF8_F32_e64_dpp_gfx12 ||
Opc == AMDGPU::V_CVT_PK_FP8_F32_e64_dpp_gfx12 ||
Opc == AMDGPU::V_CVT_PK_BF8_F32_e64_dpp8_gfx12 ||
- Opc == AMDGPU::V_CVT_PK_FP8_F32_e64_dpp8_gfx12)) {
+ Opc == AMDGPU::V_CVT_PK_FP8_F32_e64_dpp8_gfx12 ||
+ Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx12_e64_dpp_gfx12 ||
+ Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx12_e64_dpp8_gfx12 ||
+ Opc == AMDGPU::V_CVT_SR_BF8_F32_gfx12_e64_dpp_gfx12 ||
+ Opc == AMDGPU::V_CVT_SR_BF8_F32_gfx12_e64_dpp8_gfx12)) {
assert(!IsPacked);
Inst.addOperand(Inst.getOperand(0));
}
@@ -8923,11 +8937,11 @@ bool AMDGPUOperand::isBLGP() const {
}
bool AMDGPUOperand::isCBSZ() const {
- return isImm() && getImmTy() == ImmTyCBSZ && isUInt<3>(getImm());
+ return isImm() && getImmTy() == ImmTyCBSZ;
}
bool AMDGPUOperand::isABID() const {
- return isImm() && getImmTy() == ImmTyABID && isUInt<4>(getImm());
+ return isImm() && getImmTy() == ImmTyABID;
}
bool AMDGPUOperand::isS16Imm() const {
@@ -9207,10 +9221,11 @@ void AMDGPUAsmParser::cvtVOP3DPP(MCInst &Inst, const OperandVector &Operands,
Inst.addOperand(Inst.getOperand(0));
}
- bool IsVOP3CvtSrDpp = Opc == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp8_gfx12 ||
- Opc == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp8_gfx12 ||
- Opc == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp_gfx12 ||
- Opc == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp_gfx12;
+ bool IsVOP3CvtSrDpp =
+ Opc == AMDGPU::V_CVT_SR_BF8_F32_gfx12_e64_dpp8_gfx12 ||
+ Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx12_e64_dpp8_gfx12 ||
+ Opc == AMDGPU::V_CVT_SR_BF8_F32_gfx12_e64_dpp_gfx12 ||
+ Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx12_e64_dpp_gfx12;
if (IsVOP3CvtSrDpp) {
if (Src2ModIdx == static_cast<int>(Inst.getNumOperands())) {
Inst.addOperand(MCOperand::createImm(0));
@@ -9243,6 +9258,11 @@ void AMDGPUAsmParser::cvtVOP3DPP(MCInst &Inst, const OperandVector &Operands,
llvm_unreachable("unhandled operand type");
}
}
+
+ if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::byte_sel))
+ addOptionalImmOperand(Inst, Operands, OptionalIdx,
+ AMDGPUOperand::ImmTyByteSel);
+
if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::clamp))
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI);
@@ -9648,25 +9668,17 @@ bool AMDGPUOperand::isEndpgm() const { return isImmTy(ImmTyEndpgm); }
// LDSDIR
//===----------------------------------------------------------------------===//
-bool AMDGPUOperand::isWaitVDST() const {
- return isImmTy(ImmTyWaitVDST) && isUInt<4>(getImm());
-}
+bool AMDGPUOperand::isWaitVDST() const { return isImmTy(ImmTyWaitVDST); }
-bool AMDGPUOperand::isWaitVAVDst() const {
- return isImmTy(ImmTyWaitVAVDst) && isUInt<4>(getImm());
-}
+bool AMDGPUOperand::isWaitVAVDst() const { return isImmTy(ImmTyWaitVAVDst); }
-bool AMDGPUOperand::isWaitVMVSrc() const {
- return isImmTy(ImmTyWaitVMVSrc) && isUInt<1>(getImm());
-}
+bool AMDGPUOperand::isWaitVMVSrc() const { return isImmTy(ImmTyWaitVMVSrc); }
//===----------------------------------------------------------------------===//
// VINTERP
//===----------------------------------------------------------------------===//
-bool AMDGPUOperand::isWaitEXP() const {
- return isImmTy(ImmTyWaitEXP) && isUInt<3>(getImm());
-}
+bool AMDGPUOperand::isWaitEXP() const { return isImmTy(ImmTyWaitEXP); }
//===----------------------------------------------------------------------===//
// Split Barrier
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index dc1bf92771b4..8fd36b84a00c 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -869,10 +869,6 @@ void AMDGPUDisassembler::convertDPP8Inst(MCInst &MI) const {
if (VDstInIdx != -1)
insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::vdst_in);
- if (MI.getOpcode() == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp8_gfx12 ||
- MI.getOpcode() == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp8_gfx12)
- insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::src2);
-
unsigned DescNumOps = MCII->get(Opc).getNumOperands();
if (MI.getNumOperands() < DescNumOps &&
AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel)) {
@@ -902,10 +898,6 @@ void AMDGPUDisassembler::convertVOP3DPPInst(MCInst &MI) const {
if (VDstInIdx != -1)
insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::vdst_in);
- if (MI.getOpcode() == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp_gfx12 ||
- MI.getOpcode() == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp_gfx12)
- insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::src2);
-
unsigned Opc = MI.getOpcode();
unsigned DescNumOps = MCII->get(Opc).getNumOperands();
if (MI.getNumOperands() < DescNumOps &&
diff --git a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
index 5090b0a07da4..91733c2933b4 100644
--- a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
@@ -409,6 +409,11 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
if (NegHiOpr && AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::neg_hi)) {
DPPInst.addImm(NegHiOpr->getImm());
}
+ auto *ByteSelOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::byte_sel);
+ if (ByteSelOpr &&
+ AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::byte_sel)) {
+ DPPInst.addImm(ByteSelOpr->getImm());
+ }
}
DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::dpp_ctrl));
DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask));
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index b6a95906bc45..883b6c4407fe 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -1806,4 +1806,14 @@ void AMDGPUInstPrinter::printEndpgm(const MCInst *MI, unsigned OpNo,
O << ' ' << formatDec(Imm);
}
+void AMDGPUInstPrinter::printByteSel(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ uint8_t Imm = MI->getOperand(OpNo).getImm();
+ if (!Imm)
+ return;
+
+ O << " byte_sel:" << formatDec(Imm);
+}
+
#include "AMDGPUGenAsmWriter.inc"
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
index c801eaf1111e..d6d7fd34b68c 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
@@ -186,6 +186,8 @@ private:
const MCSubtargetInfo &STI, raw_ostream &O);
void printExpTgt(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
+ void printByteSel(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
public:
static void printIfSet(const MCInst *MI, unsigned OpNo, raw_ostream &O,
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index e20fe1b716b6..76b90042d65f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -461,8 +461,10 @@ bool SIInstrInfo::getMemOperandsWithOffsetWidth(
return true;
}
- if (isMIMG(LdSt)) {
- int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
+ if (isImage(LdSt)) {
+ auto RsrcOpName =
+ isMIMG(LdSt) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
+ int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcOpName);
BaseOps.push_back(&LdSt.getOperand(SRsrcIdx));
int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
if (VAddr0Idx >= 0) {
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index f1afbcc060b2..7189e6e40506 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -1000,8 +1000,10 @@ def SDWAVopcDst : BoolRC {
}
class NamedIntOperand<ValueType Type, string Prefix, bit Optional = 1,
- string name = NAME, string ConvertMethod = "nullptr">
+ string name = NAME>
: CustomOperand<Type, Optional, name> {
+ string Validator = "[](int64_t V) { return true; }";
+ string ConvertMethod = "[](int64_t &V) { return "#Validator#"(V); }";
let ParserMethod =
"[this](OperandVector &Operands) -> ParseStatus { "#
"return parseIntWithPrefix(\""#Prefix#"\", Operands, "#
@@ -1045,8 +1047,10 @@ class ArrayOperand0<string Id, string Name = NAME>
let ImmTy = "ImmTyOffset" in
def flat_offset : CustomOperand<i32, 1, "FlatOffset">;
def Offset : NamedIntOperand<i32, "offset">;
+let Validator = "isUInt<8>" in {
def Offset0 : NamedIntOperand<i8, "offset0">;
def Offset1 : NamedIntOperand<i8, "offset1">;
+}
def gds : NamedBitOperand<"gds", "GDS">;
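The Validator string is spliced into ConvertMethod, so a declaration under `let Validator = "isUInt<8>"` yields a parser callback roughly like the sketch below (illustrative, not the verbatim generated code):

    #include "llvm/Support/MathExtras.h"
    #include <cstdint>

    bool convertOffset0(int64_t &V) {
      auto Convert = [](int64_t &X) { return llvm::isUInt<8>(X); };
      return Convert(V); // true iff 0 <= V <= 255
    }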
@@ -1103,25 +1107,41 @@ let DefaultValue = "0xf" in {
def DppRowMask : NamedIntOperand<i32, "row_mask">;
def DppBankMask : NamedIntOperand<i32, "bank_mask">;
}
-def DppBoundCtrl : NamedIntOperand<i1, "bound_ctrl", 1, "DppBoundCtrl",
- "[this] (int64_t &BC) -> bool { return convertDppBoundCtrl(BC); }">;
+def DppBoundCtrl : NamedIntOperand<i1, "bound_ctrl"> {
+ let ConvertMethod = "[this] (int64_t &BC) -> bool { return convertDppBoundCtrl(BC); }";
+}
let DecoderMethod = "decodeDpp8FI" in
def Dpp8FI : NamedIntOperand<i32, "fi", 1, "DppFI">;
def Dpp16FI : NamedIntOperand<i32, "fi", 1, "DppFI">;
def blgp : CustomOperand<i32, 1, "BLGP">;
-def CBSZ : NamedIntOperand<i32, "cbsz">;
-def ABID : NamedIntOperand<i32, "abid">;
-
+def CBSZ : NamedIntOperand<i32, "cbsz"> {
+ let Validator = "isUInt<3>";
+}
+def ABID : NamedIntOperand<i32, "abid"> {
+ let Validator = "isUInt<4>";
+}
def hwreg : CustomOperand<i32, 0, "Hwreg">;
def exp_tgt : CustomOperand<i32, 0, "ExpTgt">;
-def WaitVDST : NamedIntOperand<i8, "wait_vdst">;
-def WaitEXP : NamedIntOperand<i8, "wait_exp">;
-def WaitVAVDst : NamedIntOperand<i8, "wait_va_vdst">;
-def WaitVMVSrc : NamedIntOperand<i8, "wait_vm_vsrc">;
+def WaitVDST : NamedIntOperand<i8, "wait_vdst"> {
+ let Validator = "isUInt<4>";
+}
+def WaitEXP : NamedIntOperand<i8, "wait_exp"> {
+ let Validator = "isUInt<3>";
+}
+def WaitVAVDst : NamedIntOperand<i8, "wait_va_vdst"> {
+ let Validator = "isUInt<4>";
+}
+def WaitVMVSrc : NamedIntOperand<i8, "wait_vm_vsrc"> {
+ let Validator = "isUInt<1>";
+}
+
+def ByteSel : NamedIntOperand<i8, "byte_sel"> {
+ let Validator = "isUInt<2>";
+}
class KImmFPOperand<ValueType vt> : ImmOperand<vt> {
let OperandNamespace = "AMDGPU";
@@ -1700,9 +1720,9 @@ class getIns64 <RegisterOperand Src0RC, RegisterOperand Src1RC,
(ins Src0Mod:$src0_modifiers, Src0RC:$src0,
Src1Mod:$src1_modifiers, Src1RC:$src1,
clampmod0:$clamp, omod0:$omod),
- (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
- Src1Mod:$src1_modifiers, Src1RC:$src1,
- clampmod0:$clamp))
+ !con((ins Src0Mod:$src0_modifiers, Src0RC:$src0,
+ Src1Mod:$src1_modifiers, Src1RC:$src1),
+ !if(HasClamp, (ins clampmod0:$clamp), (ins))))
/* else */,
// VOP2 without modifiers
!if (HasClamp,
@@ -2036,7 +2056,8 @@ class getAsmDPP8 <bit HasDst, int NumSrcArgs, bit HasModifiers, ValueType DstVT
class getAsmVOP3Base <int NumSrcArgs, bit HasDst, bit HasClamp,
bit HasOpSel, bit HasOMod, bit IsVOP3P,
bit HasModifiers, bit Src0HasMods,
- bit Src1HasMods, bit Src2HasMods, ValueType DstVT = i32> {
+ bit Src1HasMods, bit Src2HasMods, ValueType DstVT = i32,
+ bit HasByteSel = 0> {
string dst = !if(HasDst,
!if(!eq(DstVT.Size, 1),
"$sdst",
@@ -2058,6 +2079,7 @@ class getAsmVOP3Base <int NumSrcArgs, bit HasDst, bit HasClamp,
string src1 = !if(Src1HasMods, src1mods, src1nomods);
string src2 = !if(Src2HasMods, src2mods, src2nomods);
string opsel = !if(HasOpSel, "$op_sel", "");
+ string bytesel = !if(HasByteSel, "$byte_sel", "");
string 3PMods = !if(IsVOP3P,
!if(HasOpSel, "$op_sel_hi", "")
#!if(HasModifiers, "$neg_lo$neg_hi", ""),
@@ -2065,7 +2087,7 @@ class getAsmVOP3Base <int NumSrcArgs, bit HasDst, bit HasClamp,
string clamp = !if(HasClamp, "$clamp", "");
string omod = !if(HasOMod, "$omod", "");
- string ret = dst#!if(!gt(NumSrcArgs,0),", "#src0#src1#src2#opsel#3PMods#clamp#omod, "");
+ string ret = dst#!if(!gt(NumSrcArgs,0),", "#src0#src1#src2#opsel#bytesel#3PMods#clamp#omod, "");
}
@@ -2282,6 +2304,7 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> {
field bit IsSWMMAC = 0;
field bit IsFP8 = 0;
+ field bit IsFP8DstByteSel = 0;
field bit HasDst = !ne(DstVT.Value, untyped.Value);
field bit HasDst32 = HasDst;
@@ -2401,7 +2424,7 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> {
field string AsmDPP8 = getAsmDPP8<HasDst, NumSrcArgs, 0 /*HasModifiers*/, DstVT>.ret;
field string AsmVOP3Base = getAsmVOP3Base<NumSrcArgs, HasDst, HasClamp,
HasOpSel, HasOMod, IsVOP3P, HasModifiers, HasModifiers, HasModifiers,
- HasModifiers, DstVT>.ret;
+ HasModifiers, DstVT, IsFP8DstByteSel>.ret;
field string Asm64 = AsmVOP3Base;
field string AsmVOP3P = getAsmVOP3P<NumSrcArgs, HasModifiers, HasClamp, HasOpSel>.ret;
field string AsmVOP3OpSel = getAsmVOP3OpSel<NumSrcArgs,
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index bf4a501cc315..072c5aedc220 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -110,7 +110,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
}
if (!AMDGPU::isGraphics(CC) ||
- ((CC == CallingConv::AMDGPU_CS || CC == CallingConv::AMDGPU_CS) &&
+ ((CC == CallingConv::AMDGPU_CS || CC == CallingConv::AMDGPU_Gfx) &&
ST.hasArchitectedSGPRs())) {
if (IsKernel || !F.hasFnAttribute("amdgpu-no-workgroup-id-x"))
WorkGroupIDX = true;
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 647595d9ccab..616bc7684753 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -580,6 +580,22 @@ def VOP3_CVT_SR_F8_F32_Profile : VOP3_Profile<VOPProfile<[i32, f32, i32, f32]>,
HasSrc2FloatMods>.ret>.ret);
}
+class VOP3_CVT_SR_F8_ByteSel_Profile<ValueType SrcVT> :
+ VOP3_Profile<VOPProfile<[i32, SrcVT, i32, untyped]>> {
+ let IsFP8DstByteSel = 1;
+ let HasClamp = 0;
+ defvar bytesel = (ins VGPR_32:$vdst_in, ByteSel:$byte_sel);
+ let Ins64 = !con(getIns64<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs,
+ HasClamp, HasModifiers, HasSrc2Mods,
+ HasOMod, Src0Mod, Src1Mod, Src2Mod>.ret,
+ bytesel);
+ let InsVOP3Base = !con(
+ getInsVOP3Base<Src0VOP3DPP, Src1VOP3DPP,
+ Src2VOP3DPP, NumSrcArgs, HasClamp, HasModifiers, HasSrc2Mods, HasOMod,
+ Src0ModVOP3DPP, Src1ModVOP3DPP, Src2ModVOP3DPP, HasOpSel>.ret,
+ bytesel);
+}
+
def IsPow2Plus1: PatLeaf<(i32 imm), [{
uint32_t V = N->getZExtValue();
return isPowerOf2_32(V - 1);
@@ -645,12 +661,17 @@ let OtherPredicates = [HasFP8ConversionInsts], mayRaiseFPException = 0,
let Constraints = "$vdst = $vdst_in", DisableEncoding = "$vdst_in" in {
defm V_CVT_PK_FP8_F32 : VOP3Inst<"v_cvt_pk_fp8_f32", VOP3_CVT_PK_F8_F32_Profile>;
defm V_CVT_PK_BF8_F32 : VOP3Inst<"v_cvt_pk_bf8_f32", VOP3_CVT_PK_F8_F32_Profile>;
+
+ let SubtargetPredicate = isGFX12Plus in {
+ defm V_CVT_SR_FP8_F32_gfx12 : VOP3Inst<"v_cvt_sr_fp8_f32_gfx12", VOP3_CVT_SR_F8_ByteSel_Profile<f32>>;
+ defm V_CVT_SR_BF8_F32_gfx12 : VOP3Inst<"v_cvt_sr_bf8_f32_gfx12", VOP3_CVT_SR_F8_ByteSel_Profile<f32>>;
+ }
}
// These instructions have non-standard use of op_sel. In particular they are
// using op_sel bits 2 and 3 while only having two sources. Therefore dummy
// src2 is used to hold the op_sel value.
- let Constraints = "$vdst = $src2", DisableEncoding = "$src2" in {
+ let Constraints = "$vdst = $src2", DisableEncoding = "$src2", SubtargetPredicate = isGFX940Plus in {
defm V_CVT_SR_FP8_F32 : VOP3Inst<"v_cvt_sr_fp8_f32", VOP3_CVT_SR_F8_F32_Profile>;
defm V_CVT_SR_BF8_F32 : VOP3Inst<"v_cvt_sr_bf8_f32", VOP3_CVT_SR_F8_F32_Profile>;
}
@@ -667,15 +688,28 @@ class Cvt_SR_F8_F32_Pat<SDPatternOperator node, bits<2> index, VOP3_Pseudo inst>
!if(index{0}, SRCMODS.OP_SEL_0, 0), $old, 0)
>;
+class Cvt_SR_F8_ByteSel_Pat<SDPatternOperator node, VOP3_Pseudo inst, ValueType SrcVT> : GCNPat<
+ (i32 (node (VOP3Mods SrcVT:$src0, i32:$src0_modifiers), (VOP3Mods i32:$src1, i32:$src1_modifiers),
+ i32:$old, timm:$byte_sel)),
+ (inst $src0_modifiers, $src0, $src1_modifiers, $src1, $old, (as_i32timm $byte_sel))
+>;
+
let OtherPredicates = [HasFP8ConversionInsts] in {
foreach Index = [0, -1] in {
def : Cvt_PK_F8_F32_Pat<int_amdgcn_cvt_pk_fp8_f32, Index, V_CVT_PK_FP8_F32_e64>;
def : Cvt_PK_F8_F32_Pat<int_amdgcn_cvt_pk_bf8_f32, Index, V_CVT_PK_BF8_F32_e64>;
}
-foreach Index = [0, 1, 2, 3] in {
- def : Cvt_SR_F8_F32_Pat<int_amdgcn_cvt_sr_fp8_f32, Index, V_CVT_SR_FP8_F32_e64>;
- def : Cvt_SR_F8_F32_Pat<int_amdgcn_cvt_sr_bf8_f32, Index, V_CVT_SR_BF8_F32_e64>;
+let SubtargetPredicate = isGFX940Plus in {
+ foreach Index = [0, 1, 2, 3] in {
+ def : Cvt_SR_F8_F32_Pat<int_amdgcn_cvt_sr_fp8_f32, Index, V_CVT_SR_FP8_F32_e64>;
+ def : Cvt_SR_F8_F32_Pat<int_amdgcn_cvt_sr_bf8_f32, Index, V_CVT_SR_BF8_F32_e64>;
+ }
+}
+
+let SubtargetPredicate = isGFX12Plus in {
+ def : Cvt_SR_F8_ByteSel_Pat<int_amdgcn_cvt_sr_fp8_f32, V_CVT_SR_FP8_F32_gfx12_e64, f32>;
+ def : Cvt_SR_F8_ByteSel_Pat<int_amdgcn_cvt_sr_bf8_f32, V_CVT_SR_BF8_F32_gfx12_e64, f32>;
}
}
@@ -1040,8 +1074,8 @@ defm V_PERMLANEX16_VAR_B32 : VOP3Only_Real_Base_gfx12<0x310>;
defm V_CVT_PK_FP8_F32 : VOP3Only_Realtriple_gfx12<0x369>;
defm V_CVT_PK_BF8_F32 : VOP3Only_Realtriple_gfx12<0x36a>;
-defm V_CVT_SR_FP8_F32 : VOP3Only_Realtriple_gfx12<0x36b>;
-defm V_CVT_SR_BF8_F32 : VOP3Only_Realtriple_gfx12<0x36c>;
+defm V_CVT_SR_FP8_F32_gfx12 : VOP3_Realtriple_with_name_gfx12<0x36b, "V_CVT_SR_FP8_F32_gfx12", "v_cvt_sr_fp8_f32" >;
+defm V_CVT_SR_BF8_F32_gfx12 : VOP3_Realtriple_with_name_gfx12<0x36c, "V_CVT_SR_BF8_F32_gfx12", "v_cvt_sr_bf8_f32">;
//===----------------------------------------------------------------------===//
// GFX11, GFX12
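
What the new byte_sel operand expresses, as a sketch (not code from this patch): the v_cvt_sr_fp8/bf8 conversions produce a single FP8 byte that is merged into the tied 32-bit destination, and byte_sel picks which of the four byte lanes of $vdst_in is replaced. In scalar C++ terms, with illustrative names:

#include <cstdint>

// Merge one FP8 result byte into the destination register value, selecting
// the lane with byte_sel (0-3), mirroring the tied $vdst_in operand above.
uint32_t mergeFp8Byte(uint32_t vdst_in, uint8_t fp8, unsigned byte_sel) {
  unsigned shift = (byte_sel & 3) * 8;
  return (vdst_in & ~(0xFFu << shift)) | (uint32_t(fp8) << shift);
}
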
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index da16178cb58b..7cdb5cbfe297 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -311,6 +311,14 @@ class VOP3FP8OpSel_gfx11_gfx12<bits<10> op, VOPProfile p> : VOP3e_gfx10<op, p> {
let Inst{12} = !if(p.HasSrc0, src0_modifiers{3}, 0);
}
+ class VOP3FP8OpSel_dst_bytesel_gfx11_gfx12<bits<10> op, VOPProfile p> : VOP3e_gfx10<op, p> {
+ bits<2> byte_sel;
+
+ let Inst{11} = 0; // op_sel0
+ let Inst{12} = 0; // op_sel1
+ let Inst{14-13} = byte_sel; // op_sel2/3
+ }
+
class VOP3DotOpSel_gfx11_gfx12<bits<10> op, VOPProfile p> : VOP3OpSel_gfx11_gfx12<op, p>{
let Inst{11} = ?;
let Inst{12} = ?;
@@ -741,6 +749,7 @@ class VOP3_DPPe_Common_Base<bits<10> op, VOPProfile P> : Enc96 {
bits<3> src2_modifiers;
bits<1> clamp;
bits<2> omod;
+ bits<2> byte_sel;
let Inst{8} = !if(P.HasSrc0Mods, src0_modifiers{1}, 0);
let Inst{9} = !if(P.HasSrc1Mods, src1_modifiers{1}, 0);
@@ -748,8 +757,8 @@ class VOP3_DPPe_Common_Base<bits<10> op, VOPProfile P> : Enc96 {
// OPSEL must be set such that the low result only uses low inputs, and the high result only uses high inputs.
let Inst{11} = !if(P.HasOpSel,!if(P.HasSrc0Mods, src0_modifiers{2}, 0),?);
let Inst{12} = !if(P.HasOpSel,!if(P.HasSrc1Mods, src1_modifiers{2}, !if((P.IsFP8), src0_modifiers{3}, 0)), ?);
- let Inst{13} = !if(P.HasOpSel,!if(P.HasSrc2Mods, src2_modifiers{2}, 0),?);
- let Inst{14} = !if(P.HasOpSel,!if(P.HasSrc0Mods, src0_modifiers{3}, 0),?);
+ let Inst{13} = !if(P.HasOpSel,!if(P.HasSrc2Mods, src2_modifiers{2}, 0),!if(P.IsFP8DstByteSel, byte_sel{0}, ?));
+ let Inst{14} = !if(P.HasOpSel,!if(P.HasSrc0Mods, src0_modifiers{3}, 0),!if(P.IsFP8DstByteSel, byte_sel{1}, ?));
let Inst{15} = !if(P.HasClamp, clamp, 0);
let Inst{25-16} = op;
let Inst{31-26} = 0x35;
@@ -1388,7 +1397,11 @@ multiclass VOP3_Real_Base<GFXGen Gen, bits<10> op, string opName = NAME,
bit isSingle = 0> {
defvar ps = !cast<VOP_Pseudo>(opName#"_e64");
let IsSingle = !or(isSingle, ps.Pfl.IsSingle) in {
- if ps.Pfl.HasOpSel then {
+ if ps.Pfl.IsFP8DstByteSel then {
+ def _e64#Gen.Suffix :
+ VOP3_Real_Gen<ps, Gen>,
+ VOP3FP8OpSel_dst_bytesel_gfx11_gfx12<op, ps.Pfl>;
+ } if ps.Pfl.HasOpSel then {
def _e64#Gen.Suffix :
VOP3_Real_Gen<ps, Gen>,
VOP3OpSel_gfx11_gfx12<op, ps.Pfl>;
@@ -1419,6 +1432,10 @@ multiclass VOP3_Real_with_name<GFXGen Gen, bits<10> op, string opName,
def _e64#Gen.Suffix :
VOP3_Real_Gen<ps, Gen>,
VOP3FP8OpSel_gfx11_gfx12<op, ps.Pfl>;
+ } else if ps.Pfl.IsFP8DstByteSel then {
+ def _e64#Gen.Suffix :
+ VOP3_Real_Gen<ps, Gen>,
+ VOP3FP8OpSel_dst_bytesel_gfx11_gfx12<op, ps.Pfl>;
} else if ps.Pfl.HasOpSel then {
def _e64#Gen.Suffix :
VOP3_Real_Gen<ps, Gen>,
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index d0e9f61c0bd1..f67a68acbf23 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -1555,15 +1555,11 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
if (Subtarget->hasNEON()) {
// vmin and vmax aren't available in a scalar form, so we can use
- // a NEON instruction with an undef lane instead. This has a performance
- // penalty on some cores, so we don't do this unless we have been
- // asked to by the core tuning model.
- if (Subtarget->useNEONForSinglePrecisionFP()) {
- setOperationAction(ISD::FMINIMUM, MVT::f32, Legal);
- setOperationAction(ISD::FMAXIMUM, MVT::f32, Legal);
- setOperationAction(ISD::FMINIMUM, MVT::f16, Legal);
- setOperationAction(ISD::FMAXIMUM, MVT::f16, Legal);
- }
+ // a NEON instruction with an undef lane instead.
+ setOperationAction(ISD::FMINIMUM, MVT::f32, Legal);
+ setOperationAction(ISD::FMAXIMUM, MVT::f32, Legal);
+ setOperationAction(ISD::FMINIMUM, MVT::f16, Legal);
+ setOperationAction(ISD::FMAXIMUM, MVT::f16, Legal);
setOperationAction(ISD::FMINIMUM, MVT::v2f32, Legal);
setOperationAction(ISD::FMAXIMUM, MVT::v2f32, Legal);
setOperationAction(ISD::FMINIMUM, MVT::v4f32, Legal);
diff --git a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index 028db9d17e30..e54314cc7d00 100644
--- a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -11104,7 +11104,7 @@ ARMAsmParser::checkEarlyTargetMatchPredicate(MCInst &Inst,
return Match_MnemonicFail;
}
}
- LLVM_FALLTHROUGH;
+ [[fallthrough]];
default:
return Match_Success;
}
diff --git a/llvm/lib/Target/BPF/BTFDebug.cpp b/llvm/lib/Target/BPF/BTFDebug.cpp
index ebd8447eba85..8c9f5c4dc554 100644
--- a/llvm/lib/Target/BPF/BTFDebug.cpp
+++ b/llvm/lib/Target/BPF/BTFDebug.cpp
@@ -973,8 +973,7 @@ void BTFDebug::visitMapDefType(const DIType *Ty, uint32_t &TypeId) {
}
/// Read file contents from the actual file or from the source
-std::string BTFDebug::populateFileContent(const DISubprogram *SP) {
- auto File = SP->getFile();
+std::string BTFDebug::populateFileContent(const DIFile *File) {
std::string FileName;
if (!File->getFilename().starts_with("/") && File->getDirectory().size())
@@ -1005,9 +1004,9 @@ std::string BTFDebug::populateFileContent(const DISubprogram *SP) {
return FileName;
}
-void BTFDebug::constructLineInfo(const DISubprogram *SP, MCSymbol *Label,
+void BTFDebug::constructLineInfo(MCSymbol *Label, const DIFile *File,
uint32_t Line, uint32_t Column) {
- std::string FileName = populateFileContent(SP);
+ std::string FileName = populateFileContent(File);
BTFLineInfo LineInfo;
LineInfo.Label = Label;
@@ -1366,10 +1365,10 @@ void BTFDebug::beginInstruction(const MachineInstr *MI) {
if (!CurMI) // no debug info
return;
- // Skip this instruction if no DebugLoc or the DebugLoc
- // is the same as the previous instruction.
+  // Skip this instruction if it has no DebugLoc, if the DebugLoc matches
+  // the previous instruction's, or if the line number is 0.
const DebugLoc &DL = MI->getDebugLoc();
- if (!DL || PrevInstLoc == DL) {
+ if (!DL || PrevInstLoc == DL || DL.getLine() == 0) {
// This instruction will be skipped, no LineInfo has
// been generated, construct one based on function signature.
if (LineInfoGenerated == false) {
@@ -1377,7 +1376,7 @@ void BTFDebug::beginInstruction(const MachineInstr *MI) {
if (!S)
return;
MCSymbol *FuncLabel = Asm->getFunctionBegin();
- constructLineInfo(S, FuncLabel, S->getLine(), 0);
+ constructLineInfo(FuncLabel, S->getFile(), S->getLine(), 0);
LineInfoGenerated = true;
}
@@ -1389,8 +1388,7 @@ void BTFDebug::beginInstruction(const MachineInstr *MI) {
OS.emitLabel(LineSym);
// Construct the lineinfo.
- auto SP = DL->getScope()->getSubprogram();
- constructLineInfo(SP, LineSym, DL.getLine(), DL.getCol());
+ constructLineInfo(LineSym, DL->getFile(), DL.getLine(), DL.getCol());
LineInfoGenerated = true;
PrevInstLoc = DL;
diff --git a/llvm/lib/Target/BPF/BTFDebug.h b/llvm/lib/Target/BPF/BTFDebug.h
index 7536006ed21c..11a0c59ba6c9 100644
--- a/llvm/lib/Target/BPF/BTFDebug.h
+++ b/llvm/lib/Target/BPF/BTFDebug.h
@@ -343,10 +343,10 @@ class BTFDebug : public DebugHandlerBase {
/// Get the file content for the subprogram. Certain lines of the file
/// later may be put into string table and referenced by line info.
- std::string populateFileContent(const DISubprogram *SP);
+ std::string populateFileContent(const DIFile *File);
/// Construct a line info.
- void constructLineInfo(const DISubprogram *SP, MCSymbol *Label, uint32_t Line,
+ void constructLineInfo(MCSymbol *Label, const DIFile *File, uint32_t Line,
uint32_t Column);
/// Generate types and variables for globals.
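
A compact model of the emission policy after this change (a sketch of the patch's stated rules, not code from the tree): a BTF line row is produced only for a fresh location with a nonzero line, since line 0 conventionally marks compiler-generated code with no source position.

// Hypothetical standalone model of the skip conditions in beginInstruction().
struct Loc {
  unsigned Line = 0, Col = 0;
  bool Valid = false;
};

bool shouldEmitLineRow(const Loc &DL, const Loc &Prev) {
  if (!DL.Valid)
    return false;                         // no debug info at all
  if (DL.Line == 0)
    return false;                         // compiler-generated, no source line
  return DL.Line != Prev.Line || DL.Col != Prev.Col; // location changed
}
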
diff --git a/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp
index 4d99bc006900..4b162a35365c 100644
--- a/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp
+++ b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp
@@ -77,8 +77,8 @@ static bool expandIntegerDot(CallInst *Orig, Intrinsic::ID DotIntrinsic) {
: Intrinsic::dx_umad;
Value *A = Orig->getOperand(0);
Value *B = Orig->getOperand(1);
- Type *ATy = A->getType();
- Type *BTy = B->getType();
+ [[maybe_unused]] Type *ATy = A->getType();
+ [[maybe_unused]] Type *BTy = B->getType();
assert(ATy->isVectorTy() && BTy->isVectorTy());
IRBuilder<> Builder(Orig->getParent());
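
The [[maybe_unused]] annotations above exist because these locals are referenced only inside assert(); in an NDEBUG build the assertion expands to nothing and the variables would otherwise trigger unused-variable warnings. A minimal reproduction of the pattern:

#include <cassert>

void checkOperand(int *p) {
  // Used only by the assertion; without [[maybe_unused]] this warns under
  // -Wunused-variable when NDEBUG disables assert().
  [[maybe_unused]] int *q = p;
  assert(q != nullptr);
}
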
diff --git a/llvm/lib/Target/Hexagon/CMakeLists.txt b/llvm/lib/Target/Hexagon/CMakeLists.txt
index cdc062eee72b..9e4ca08aea40 100644
--- a/llvm/lib/Target/Hexagon/CMakeLists.txt
+++ b/llvm/lib/Target/Hexagon/CMakeLists.txt
@@ -26,6 +26,7 @@ add_llvm_target(HexagonCodeGen
HexagonCommonGEP.cpp
HexagonConstExtenders.cpp
HexagonConstPropagation.cpp
+ HexagonCopyHoisting.cpp
HexagonCopyToCombine.cpp
HexagonEarlyIfConv.cpp
HexagonExpandCondsets.cpp
diff --git a/llvm/lib/Target/Hexagon/HexagonCopyHoisting.cpp b/llvm/lib/Target/Hexagon/HexagonCopyHoisting.cpp
new file mode 100644
index 000000000000..97917270601b
--- /dev/null
+++ b/llvm/lib/Target/Hexagon/HexagonCopyHoisting.cpp
@@ -0,0 +1,272 @@
+//===--------- HexagonCopyHoisting.cpp - Hexagon Copy Hoisting ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// The purpose of this pass is to move the copy instructions that are
+// present in all the successors of a basic block (BB) to the end of BB.
+//===----------------------------------------------------------------------===//
+
+#include "HexagonTargetMachine.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+
+#define DEBUG_TYPE "CopyHoist"
+
+using namespace llvm;
+
+static cl::opt<std::string>
+    CPHoistFn("cphoistfn", cl::Hidden, cl::init(""),
+              cl::desc("Restrict copy hoisting to the named function"));
+
+namespace llvm {
+void initializeHexagonCopyHoistingPass(PassRegistry &Registry);
+FunctionPass *createHexagonCopyHoisting();
+} // namespace llvm
+
+namespace {
+
+class HexagonCopyHoisting : public MachineFunctionPass {
+
+public:
+ static char ID;
+ HexagonCopyHoisting() : MachineFunctionPass(ID), MFN(nullptr), MRI(nullptr) {
+ initializeHexagonCopyHoistingPass(*PassRegistry::getPassRegistry());
+ }
+
+ StringRef getPassName() const override { return "Hexagon Copy Hoisting"; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<SlotIndexes>();
+ AU.addRequired<LiveIntervals>();
+ AU.addPreserved<SlotIndexes>();
+ AU.addPreserved<LiveIntervals>();
+ AU.addRequired<MachineDominatorTree>();
+ AU.addPreserved<MachineDominatorTree>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnMachineFunction(MachineFunction &Fn) override;
+ void collectCopyInst();
+ void addMItoCopyList(MachineInstr *MI);
+ bool analyzeCopy(MachineBasicBlock *BB);
+ bool isSafetoMove(MachineInstr *CandMI);
+ void moveCopyInstr(MachineBasicBlock *DestBB,
+ std::pair<Register, Register> Key, MachineInstr *MI);
+
+ MachineFunction *MFN;
+ MachineRegisterInfo *MRI;
+ std::vector<DenseMap<std::pair<Register, Register>, MachineInstr *>>
+ CopyMIList;
+};
+
+} // namespace
+
+char HexagonCopyHoisting::ID = 0;
+
+namespace llvm {
+char &HexagonCopyHoistingID = HexagonCopyHoisting::ID;
+} // namespace llvm
+
+bool HexagonCopyHoisting::runOnMachineFunction(MachineFunction &Fn) {
+
+ if ((CPHoistFn != "") && (CPHoistFn != Fn.getFunction().getName()))
+ return false;
+
+ MFN = &Fn;
+ MRI = &Fn.getRegInfo();
+
+ LLVM_DEBUG(dbgs() << "\nCopy Hoisting:" << "\'" << Fn.getName() << "\'\n");
+
+ CopyMIList.clear();
+ CopyMIList.resize(Fn.getNumBlockIDs());
+
+ // Traverse through all basic blocks and collect copy instructions.
+ collectCopyInst();
+
+ // Traverse through the basic blocks again and move the COPY instructions
+ // that are present in all the successors of BB to BB.
+ bool Changed = false;
+ for (MachineBasicBlock *BB : post_order(&Fn)) {
+ if (!BB->empty()) {
+ if (BB->pred_size() != 1)
+ continue;
+ auto &BBCopyInst = CopyMIList[BB->getNumber()];
+ if (BBCopyInst.size() > 0)
+ Changed |= analyzeCopy(*BB->pred_begin());
+ }
+ }
+ // Re-compute liveness
+ if (Changed) {
+ LiveIntervals &LIS = getAnalysis<LiveIntervals>();
+ SlotIndexes *SI = LIS.getSlotIndexes();
+ SI->releaseMemory();
+ SI->runOnMachineFunction(Fn);
+ LIS.releaseMemory();
+ LIS.runOnMachineFunction(Fn);
+ }
+ return Changed;
+}
+
+//===----------------------------------------------------------------------===//
+// Save all COPY instructions for each basic block in CopyMIList vector.
+//===----------------------------------------------------------------------===//
+void HexagonCopyHoisting::collectCopyInst() {
+ for (MachineBasicBlock &BB : *MFN) {
+#ifndef NDEBUG
+ auto &BBCopyInst = CopyMIList[BB.getNumber()];
+ LLVM_DEBUG(dbgs() << "Visiting BB#" << BB.getNumber() << ":\n");
+#endif
+
+ for (MachineInstr &MI : BB) {
+ if (MI.getOpcode() == TargetOpcode::COPY)
+ addMItoCopyList(&MI);
+ }
+ LLVM_DEBUG(dbgs() << "\tNumber of copies: " << BBCopyInst.size() << "\n");
+ }
+}
+
+void HexagonCopyHoisting::addMItoCopyList(MachineInstr *MI) {
+ unsigned BBNum = MI->getParent()->getNumber();
+ auto &BBCopyInst = CopyMIList[BBNum];
+ Register DstReg = MI->getOperand(0).getReg();
+ Register SrcReg = MI->getOperand(1).getReg();
+
+ if (!Register::isVirtualRegister(DstReg) ||
+ !Register::isVirtualRegister(SrcReg) ||
+ MRI->getRegClass(DstReg) != &Hexagon::IntRegsRegClass ||
+ MRI->getRegClass(SrcReg) != &Hexagon::IntRegsRegClass)
+ return;
+
+ BBCopyInst.insert(std::pair(std::pair(SrcReg, DstReg), MI));
+#ifndef NDEBUG
+ LLVM_DEBUG(dbgs() << "\tAdding Copy Instr to the list: " << MI << "\n");
+ for (auto II : BBCopyInst) {
+ MachineInstr *TempMI = II.getSecond();
+ LLVM_DEBUG(dbgs() << "\tIn the list: " << TempMI << "\n");
+ }
+#endif
+}
+
+//===----------------------------------------------------------------------===//
+// Look at the COPY instructions of all the successors of BB. If the same
+// instruction is present in every successor and can be safely moved,
+// pull it into BB.
+//===----------------------------------------------------------------------===//
+bool HexagonCopyHoisting::analyzeCopy(MachineBasicBlock *BB) {
+
+ bool Changed = false;
+ if (BB->succ_size() < 2)
+ return false;
+
+ for (MachineBasicBlock *SB : BB->successors()) {
+ if (SB->pred_size() != 1 || SB->isEHPad() || SB->hasAddressTaken())
+ return false;
+ }
+
+ MachineBasicBlock *SBB1 = *BB->succ_begin();
+ auto &BBCopyInst1 = CopyMIList[SBB1->getNumber()];
+
+ for (auto II : BBCopyInst1) {
+ std::pair<Register, Register> Key = II.getFirst();
+ MachineInstr *MI = II.getSecond();
+ bool IsSafetoMove = true;
+ for (MachineBasicBlock *SuccBB : BB->successors()) {
+ auto &SuccBBCopyInst = CopyMIList[SuccBB->getNumber()];
+ if (!SuccBBCopyInst.count(Key)) {
+ // Same copy not present in this successor
+ IsSafetoMove = false;
+ break;
+ }
+ // If present, make sure that it's safe to pull this copy instruction
+ // into the predecessor.
+ MachineInstr *SuccMI = SuccBBCopyInst[Key];
+ if (!isSafetoMove(SuccMI)) {
+ IsSafetoMove = false;
+ break;
+ }
+ }
+ // If we have come this far, this copy instruction can be safely
+ // moved to the predecessor basic block.
+ if (IsSafetoMove) {
+ LLVM_DEBUG(dbgs() << "\t\t Moving instr to BB#" << BB->getNumber() << ": "
+ << MI << "\n");
+ moveCopyInstr(BB, Key, MI);
+ // Add my into BB copyMI list.
+ Changed = true;
+ }
+ }
+
+#ifndef NDEBUG
+ auto &BBCopyInst = CopyMIList[BB->getNumber()];
+ for (auto II : BBCopyInst) {
+ MachineInstr *TempMI = II.getSecond();
+ LLVM_DEBUG(dbgs() << "\tIn the list: " << TempMI << "\n");
+ }
+#endif
+ return Changed;
+}
+
+bool HexagonCopyHoisting::isSafetoMove(MachineInstr *CandMI) {
+ // Make sure that it's safe to move this 'copy' instruction to the predecessor
+ // basic block.
+ assert(CandMI->getOperand(0).isReg() && CandMI->getOperand(1).isReg());
+ Register DefR = CandMI->getOperand(0).getReg();
+ Register UseR = CandMI->getOperand(1).getReg();
+
+ MachineBasicBlock *BB = CandMI->getParent();
+ // There should not be a def/use of DefR between the start of BB and CandMI.
+ MachineBasicBlock::iterator MII, MIE;
+ for (MII = BB->begin(), MIE = CandMI; MII != MIE; ++MII) {
+ MachineInstr *OtherMI = &*MII;
+ for (const MachineOperand &Mo : OtherMI->operands())
+ if (Mo.isReg() && Mo.getReg() == DefR)
+ return false;
+ }
+ // There should not be a def of UseR between the start of BB and CandMI.
+ for (MII = BB->begin(), MIE = CandMI; MII != MIE; ++MII) {
+ MachineInstr *OtherMI = &*MII;
+ for (const MachineOperand &Mo : OtherMI->operands())
+ if (Mo.isReg() && Mo.isDef() && Mo.getReg() == UseR)
+ return false;
+ }
+ return true;
+}
+
+void HexagonCopyHoisting::moveCopyInstr(MachineBasicBlock *DestBB,
+ std::pair<Register, Register> Key,
+ MachineInstr *MI) {
+ MachineBasicBlock::iterator FirstTI = DestBB->getFirstTerminator();
+ assert(FirstTI != DestBB->end());
+
+ DestBB->splice(FirstTI, MI->getParent(), MI);
+
+ addMItoCopyList(MI);
+ for (auto I = ++(DestBB->succ_begin()), E = DestBB->succ_end(); I != E; ++I) {
+ MachineBasicBlock *SuccBB = *I;
+ auto &BBCopyInst = CopyMIList[SuccBB->getNumber()];
+ MachineInstr *SuccMI = BBCopyInst[Key];
+ SuccMI->eraseFromParent();
+ BBCopyInst.erase(Key);
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// Public Constructor Functions
+//===----------------------------------------------------------------------===//
+
+INITIALIZE_PASS(HexagonCopyHoisting, "hexagon-move-phicopy",
+ "Hexagon move phi copy", false, false)
+
+FunctionPass *llvm::createHexagonCopyHoisting() {
+ return new HexagonCopyHoisting();
+}
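
To make the pass's core idea concrete, here is a toy model of the hoisting step. It is a sketch only: it keys copies by (src, dst) register pairs as the pass does, but omits the liveness and safety checks in isSafetoMove and the single-predecessor restriction on successors.

#include <set>
#include <utility>
#include <vector>

struct Block {
  std::vector<int> Succs;                    // successor block ids
  std::set<std::pair<int, int>> Copies;      // (src, dst) register copies
};

// Hoist every copy that appears in all successors of block B into B itself.
void hoistCommonCopies(std::vector<Block> &CFG, int B) {
  auto &Blk = CFG[B];
  if (Blk.Succs.size() < 2)
    return;
  std::set<std::pair<int, int>> Common = CFG[Blk.Succs[0]].Copies;
  for (size_t I = 1; I < Blk.Succs.size(); ++I) {
    std::set<std::pair<int, int>> Keep;
    for (const auto &C : Common)
      if (CFG[Blk.Succs[I]].Copies.count(C))
        Keep.insert(C);
    Common.swap(Keep);
  }
  for (const auto &C : Common) {
    Blk.Copies.insert(C);                    // move into the predecessor
    for (int S : Blk.Succs)
      CFG[S].Copies.erase(C);                // drop from every successor
  }
}
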
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
index e64d7e52a9aa..3a792ecfd03d 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
@@ -43,8 +43,9 @@ cl::opt<unsigned> RDFFuncBlockLimit(
"rdf-bb-limit", cl::Hidden, cl::init(1000),
cl::desc("Basic block limit for a function for RDF optimizations"));
-static cl::opt<bool> DisableHardwareLoops("disable-hexagon-hwloops",
- cl::Hidden, cl::desc("Disable Hardware Loops for Hexagon target"));
+static cl::opt<bool>
+ DisableHardwareLoops("disable-hexagon-hwloops", cl::Hidden,
+ cl::desc("Disable Hardware Loops for Hexagon target"));
static cl::opt<bool>
DisableAModeOpt("disable-hexagon-amodeopt", cl::Hidden,
@@ -58,8 +59,9 @@ static cl::opt<bool>
DisableHCP("disable-hcp", cl::Hidden,
cl::desc("Disable Hexagon constant propagation"));
-static cl::opt<bool> DisableStoreWidening("disable-store-widen",
- cl::Hidden, cl::init(false), cl::desc("Disable store widening"));
+static cl::opt<bool> DisableStoreWidening("disable-store-widen", cl::Hidden,
+ cl::init(false),
+ cl::desc("Disable store widening"));
static cl::opt<bool> EnableExpandCondsets("hexagon-expand-condsets",
cl::init(true), cl::Hidden,
@@ -72,42 +74,53 @@ static cl::opt<bool> EnableTfrCleanup("hexagon-tfr-cleanup", cl::init(true),
static cl::opt<bool> EnableEarlyIf("hexagon-eif", cl::init(true), cl::Hidden,
cl::desc("Enable early if-conversion"));
-static cl::opt<bool> EnableGenInsert("hexagon-insert", cl::init(true),
- cl::Hidden, cl::desc("Generate \"insert\" instructions"));
+static cl::opt<bool> EnableCopyHoist("hexagon-copy-hoist", cl::init(true),
+ cl::Hidden, cl::ZeroOrMore,
+ cl::desc("Enable Hexagon copy hoisting"));
+
+static cl::opt<bool>
+ EnableGenInsert("hexagon-insert", cl::init(true), cl::Hidden,
+ cl::desc("Generate \"insert\" instructions"));
static cl::opt<bool>
EnableCommGEP("hexagon-commgep", cl::init(true), cl::Hidden,
cl::desc("Enable commoning of GEP instructions"));
-static cl::opt<bool> EnableGenExtract("hexagon-extract", cl::init(true),
- cl::Hidden, cl::desc("Generate \"extract\" instructions"));
+static cl::opt<bool>
+ EnableGenExtract("hexagon-extract", cl::init(true), cl::Hidden,
+ cl::desc("Generate \"extract\" instructions"));
-static cl::opt<bool> EnableGenMux("hexagon-mux", cl::init(true), cl::Hidden,
- cl::desc("Enable converting conditional transfers into MUX instructions"));
+static cl::opt<bool> EnableGenMux(
+ "hexagon-mux", cl::init(true), cl::Hidden,
+ cl::desc("Enable converting conditional transfers into MUX instructions"));
-static cl::opt<bool> EnableGenPred("hexagon-gen-pred", cl::init(true),
- cl::Hidden, cl::desc("Enable conversion of arithmetic operations to "
- "predicate instructions"));
+static cl::opt<bool>
+ EnableGenPred("hexagon-gen-pred", cl::init(true), cl::Hidden,
+ cl::desc("Enable conversion of arithmetic operations to "
+ "predicate instructions"));
static cl::opt<bool>
EnableLoopPrefetch("hexagon-loop-prefetch", cl::Hidden,
cl::desc("Enable loop data prefetch on Hexagon"));
-static cl::opt<bool> DisableHSDR("disable-hsdr", cl::init(false), cl::Hidden,
- cl::desc("Disable splitting double registers"));
+static cl::opt<bool>
+ DisableHSDR("disable-hsdr", cl::init(false), cl::Hidden,
+ cl::desc("Disable splitting double registers"));
static cl::opt<bool>
EnableGenMemAbs("hexagon-mem-abs", cl::init(true), cl::Hidden,
cl::desc("Generate absolute set instructions"));
static cl::opt<bool> EnableBitSimplify("hexagon-bit", cl::init(true),
- cl::Hidden, cl::desc("Bit simplification"));
+ cl::Hidden,
+ cl::desc("Bit simplification"));
static cl::opt<bool> EnableLoopResched("hexagon-loop-resched", cl::init(true),
- cl::Hidden, cl::desc("Loop rescheduling"));
+ cl::Hidden,
+ cl::desc("Loop rescheduling"));
-static cl::opt<bool> HexagonNoOpt("hexagon-noopt", cl::init(false),
- cl::Hidden, cl::desc("Disable backend optimizations"));
+static cl::opt<bool> HexagonNoOpt("hexagon-noopt", cl::init(false), cl::Hidden,
+ cl::desc("Disable backend optimizations"));
static cl::opt<bool>
EnableVectorPrint("enable-hexagon-vector-print", cl::Hidden,
@@ -148,69 +161,72 @@ static ScheduleDAGInstrs *createVLIWMachineSched(MachineSchedContext *C) {
}
static MachineSchedRegistry
-SchedCustomRegistry("hexagon", "Run Hexagon's custom scheduler",
- createVLIWMachineSched);
+ SchedCustomRegistry("hexagon", "Run Hexagon's custom scheduler",
+ createVLIWMachineSched);
namespace llvm {
- extern char &HexagonExpandCondsetsID;
- extern char &HexagonTfrCleanupID;
- void initializeHexagonBitSimplifyPass(PassRegistry&);
- void initializeHexagonConstExtendersPass(PassRegistry&);
- void initializeHexagonConstPropagationPass(PassRegistry&);
- void initializeHexagonCopyToCombinePass(PassRegistry&);
- void initializeHexagonEarlyIfConversionPass(PassRegistry&);
- void initializeHexagonExpandCondsetsPass(PassRegistry&);
- void initializeHexagonGenMemAbsolutePass(PassRegistry &);
- void initializeHexagonGenMuxPass(PassRegistry&);
- void initializeHexagonHardwareLoopsPass(PassRegistry&);
- void initializeHexagonLoopIdiomRecognizeLegacyPassPass(PassRegistry &);
- void initializeHexagonLoopAlignPass(PassRegistry &);
- void initializeHexagonNewValueJumpPass(PassRegistry&);
- void initializeHexagonOptAddrModePass(PassRegistry&);
- void initializeHexagonPacketizerPass(PassRegistry&);
- void initializeHexagonRDFOptPass(PassRegistry&);
- void initializeHexagonSplitDoubleRegsPass(PassRegistry&);
- void initializeHexagonTfrCleanupPass(PassRegistry &);
- void initializeHexagonVExtractPass(PassRegistry &);
- void initializeHexagonVectorCombineLegacyPass(PassRegistry&);
- void initializeHexagonVectorLoopCarriedReuseLegacyPassPass(PassRegistry &);
- Pass *createHexagonLoopIdiomPass();
- Pass *createHexagonVectorLoopCarriedReuseLegacyPass();
-
- FunctionPass *createHexagonBitSimplify();
- FunctionPass *createHexagonBranchRelaxation();
- FunctionPass *createHexagonCallFrameInformation();
- FunctionPass *createHexagonCFGOptimizer();
- FunctionPass *createHexagonCommonGEP();
- FunctionPass *createHexagonConstExtenders();
- FunctionPass *createHexagonConstPropagationPass();
- FunctionPass *createHexagonCopyToCombine();
- FunctionPass *createHexagonEarlyIfConversion();
- FunctionPass *createHexagonFixupHwLoops();
- FunctionPass *createHexagonGenExtract();
- FunctionPass *createHexagonGenInsert();
- FunctionPass *createHexagonGenMemAbsolute();
- FunctionPass *createHexagonGenMux();
- FunctionPass *createHexagonGenPredicate();
- FunctionPass *createHexagonHardwareLoops();
- FunctionPass *createHexagonISelDag(HexagonTargetMachine &TM,
- CodeGenOptLevel OptLevel);
- FunctionPass *createHexagonLoopAlign();
- FunctionPass *createHexagonLoopRescheduling();
- FunctionPass *createHexagonNewValueJump();
- FunctionPass *createHexagonOptAddrMode();
- FunctionPass *createHexagonOptimizeSZextends();
- FunctionPass *createHexagonPacketizer(bool Minimal);
- FunctionPass *createHexagonPeephole();
- FunctionPass *createHexagonRDFOpt();
- FunctionPass *createHexagonSplitConst32AndConst64();
- FunctionPass *createHexagonSplitDoubleRegs();
- FunctionPass *createHexagonStoreWidening();
- FunctionPass *createHexagonTfrCleanup();
- FunctionPass *createHexagonVectorCombineLegacyPass();
- FunctionPass *createHexagonVectorPrint();
- FunctionPass *createHexagonVExtract();
-} // end namespace llvm;
+extern char &HexagonCopyHoistingID;
+extern char &HexagonExpandCondsetsID;
+extern char &HexagonTfrCleanupID;
+void initializeHexagonBitSimplifyPass(PassRegistry &);
+void initializeHexagonCopyHoistingPass(PassRegistry &);
+void initializeHexagonConstExtendersPass(PassRegistry &);
+void initializeHexagonConstPropagationPass(PassRegistry &);
+void initializeHexagonCopyToCombinePass(PassRegistry &);
+void initializeHexagonEarlyIfConversionPass(PassRegistry &);
+void initializeHexagonExpandCondsetsPass(PassRegistry &);
+void initializeHexagonGenMemAbsolutePass(PassRegistry &);
+void initializeHexagonGenMuxPass(PassRegistry &);
+void initializeHexagonHardwareLoopsPass(PassRegistry &);
+void initializeHexagonLoopIdiomRecognizeLegacyPassPass(PassRegistry &);
+void initializeHexagonLoopAlignPass(PassRegistry &);
+void initializeHexagonNewValueJumpPass(PassRegistry &);
+void initializeHexagonOptAddrModePass(PassRegistry &);
+void initializeHexagonPacketizerPass(PassRegistry &);
+void initializeHexagonRDFOptPass(PassRegistry &);
+void initializeHexagonSplitDoubleRegsPass(PassRegistry &);
+void initializeHexagonTfrCleanupPass(PassRegistry &);
+void initializeHexagonVExtractPass(PassRegistry &);
+void initializeHexagonVectorCombineLegacyPass(PassRegistry &);
+void initializeHexagonVectorLoopCarriedReuseLegacyPassPass(PassRegistry &);
+Pass *createHexagonLoopIdiomPass();
+Pass *createHexagonVectorLoopCarriedReuseLegacyPass();
+
+FunctionPass *createHexagonBitSimplify();
+FunctionPass *createHexagonBranchRelaxation();
+FunctionPass *createHexagonCallFrameInformation();
+FunctionPass *createHexagonCFGOptimizer();
+FunctionPass *createHexagonCommonGEP();
+FunctionPass *createHexagonConstExtenders();
+FunctionPass *createHexagonConstPropagationPass();
+FunctionPass *createHexagonCopyHoisting();
+FunctionPass *createHexagonCopyToCombine();
+FunctionPass *createHexagonEarlyIfConversion();
+FunctionPass *createHexagonFixupHwLoops();
+FunctionPass *createHexagonGenExtract();
+FunctionPass *createHexagonGenInsert();
+FunctionPass *createHexagonGenMemAbsolute();
+FunctionPass *createHexagonGenMux();
+FunctionPass *createHexagonGenPredicate();
+FunctionPass *createHexagonHardwareLoops();
+FunctionPass *createHexagonISelDag(HexagonTargetMachine &TM,
+ CodeGenOptLevel OptLevel);
+FunctionPass *createHexagonLoopAlign();
+FunctionPass *createHexagonLoopRescheduling();
+FunctionPass *createHexagonNewValueJump();
+FunctionPass *createHexagonOptAddrMode();
+FunctionPass *createHexagonOptimizeSZextends();
+FunctionPass *createHexagonPacketizer(bool Minimal);
+FunctionPass *createHexagonPeephole();
+FunctionPass *createHexagonRDFOpt();
+FunctionPass *createHexagonSplitConst32AndConst64();
+FunctionPass *createHexagonSplitDoubleRegs();
+FunctionPass *createHexagonStoreWidening();
+FunctionPass *createHexagonTfrCleanup();
+FunctionPass *createHexagonVectorCombineLegacyPass();
+FunctionPass *createHexagonVectorPrint();
+FunctionPass *createHexagonVExtract();
+} // namespace llvm
static Reloc::Model getEffectiveRelocModel(std::optional<Reloc::Model> RM) {
return RM.value_or(Reloc::Static);
@@ -260,6 +276,7 @@ HexagonTargetMachine::HexagonTargetMachine(const Target &T, const Triple &TT,
(HexagonNoOpt ? CodeGenOptLevel::None : OL)),
TLOF(std::make_unique<HexagonTargetObjectFile>()),
Subtarget(Triple(TT), CPU, FS, *this) {
+ initializeHexagonCopyHoistingPass(*PassRegistry::getPassRegistry());
initializeHexagonExpandCondsetsPass(*PassRegistry::getPassRegistry());
initializeHexagonLoopAlignPass(*PassRegistry::getPassRegistry());
initializeHexagonTfrCleanupPass(*PassRegistry::getPassRegistry());
@@ -269,10 +286,8 @@ HexagonTargetMachine::HexagonTargetMachine(const Target &T, const Triple &TT,
const HexagonSubtarget *
HexagonTargetMachine::getSubtargetImpl(const Function &F) const {
AttributeList FnAttrs = F.getAttributes();
- Attribute CPUAttr =
- FnAttrs.getFnAttr("target-cpu");
- Attribute FSAttr =
- FnAttrs.getFnAttr("target-features");
+ Attribute CPUAttr = FnAttrs.getFnAttr("target-cpu");
+ Attribute FSAttr = FnAttrs.getFnAttr("target-features");
std::string CPU =
CPUAttr.isValid() ? CPUAttr.getValueAsString().str() : TargetCPU;
@@ -331,7 +346,7 @@ namespace {
class HexagonPassConfig : public TargetPassConfig {
public:
HexagonPassConfig(HexagonTargetMachine &TM, PassManagerBase &PM)
- : TargetPassConfig(TM, PM) {}
+ : TargetPassConfig(TM, PM) {}
HexagonTargetMachine &getHexagonTargetMachine() const {
return getTM<HexagonTargetMachine>();
@@ -433,6 +448,8 @@ void HexagonPassConfig::addPreRegAlloc() {
addPass(createHexagonConstExtenders());
if (EnableExpandCondsets)
insertPass(&RegisterCoalescerID, &HexagonExpandCondsetsID);
+ if (EnableCopyHoist)
+ insertPass(&RegisterCoalescerID, &HexagonCopyHoistingID);
if (EnableTfrCleanup)
insertPass(&VirtRegRewriterID, &HexagonTfrCleanupID);
if (!DisableStoreWidening)
diff --git a/llvm/lib/Target/LoongArch/LoongArchSubtarget.h b/llvm/lib/Target/LoongArch/LoongArchSubtarget.h
index cecb4a50aa76..a6e40840517f 100644
--- a/llvm/lib/Target/LoongArch/LoongArchSubtarget.h
+++ b/llvm/lib/Target/LoongArch/LoongArchSubtarget.h
@@ -31,21 +31,11 @@ class StringRef;
class LoongArchSubtarget : public LoongArchGenSubtargetInfo {
virtual void anchor();
- bool HasLA32 = false;
- bool HasLA64 = false;
- bool HasBasicF = false;
- bool HasBasicD = false;
- bool HasExtLSX = false;
- bool HasExtLASX = false;
- bool HasExtLVZ = false;
- bool HasExtLBT = false;
- bool HasLaGlobalWithPcrel = false;
- bool HasLaGlobalWithAbs = false;
- bool HasLaLocalWithAbs = false;
- bool HasUAL = false;
- bool HasLinkerRelax = false;
- bool HasExpAutoVec = false;
- bool HasFrecipe = false;
+
+#define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER) \
+ bool ATTRIBUTE = DEFAULT;
+#include "LoongArchGenSubtargetInfo.inc"
+
unsigned GRLen = 32;
MVT GRLenVT = MVT::i32;
LoongArchABI::ABI TargetABI = LoongArchABI::ABI_Unknown;
@@ -92,20 +82,12 @@ public:
const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
return &TSInfo;
}
+
+#define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER) \
+ bool GETTER() const { return ATTRIBUTE; }
+#include "LoongArchGenSubtargetInfo.inc"
+
bool is64Bit() const { return HasLA64; }
- bool hasBasicF() const { return HasBasicF; }
- bool hasBasicD() const { return HasBasicD; }
- bool hasExtLSX() const { return HasExtLSX; }
- bool hasExtLASX() const { return HasExtLASX; }
- bool hasExtLVZ() const { return HasExtLVZ; }
- bool hasExtLBT() const { return HasExtLBT; }
- bool hasLaGlobalWithPcrel() const { return HasLaGlobalWithPcrel; }
- bool hasLaGlobalWithAbs() const { return HasLaGlobalWithAbs; }
- bool hasLaLocalWithAbs() const { return HasLaLocalWithAbs; }
- bool hasUAL() const { return HasUAL; }
- bool hasLinkerRelax() const { return HasLinkerRelax; }
- bool hasExpAutoVec() const { return HasExpAutoVec; }
- bool hasFrecipe() const { return HasFrecipe; }
MVT getGRLenVT() const { return GRLenVT; }
unsigned getGRLen() const { return GRLen; }
LoongArchABI::ABI getTargetABI() const { return TargetABI; }
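
The replacement above relies on TableGen emitting one GET_SUBTARGETINFO_MACRO invocation per SubtargetFeature into LoongArchGenSubtargetInfo.inc, so the same include declares the fields in one expansion and the getters in the other. A self-contained sketch of the pattern; the feature list here is illustrative, not the generated one:

// Stand-in for the TableGen-generated .inc: one entry per feature.
#define FAKE_SUBTARGET_FEATURES(X)                                            \
  X(HasLA64, false, is64BitFeature)                                           \
  X(HasBasicF, false, hasBasicF)

struct FakeSubtarget {
  // First expansion: declare the backing fields with their defaults.
#define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER)                   \
  bool ATTRIBUTE = DEFAULT;
  FAKE_SUBTARGET_FEATURES(GET_SUBTARGETINFO_MACRO)
#undef GET_SUBTARGETINFO_MACRO

  // Second expansion: declare one getter per feature.
#define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER)                   \
  bool GETTER() const { return ATTRIBUTE; }
  FAKE_SUBTARGET_FEATURES(GET_SUBTARGETINFO_MACRO)
#undef GET_SUBTARGETINFO_MACRO
};
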
diff --git a/llvm/lib/Target/M68k/M68kExpandPseudo.cpp b/llvm/lib/Target/M68k/M68kExpandPseudo.cpp
index 7fcc65beaa65..c7fdd7d7c350 100644
--- a/llvm/lib/Target/M68k/M68kExpandPseudo.cpp
+++ b/llvm/lib/Target/M68k/M68kExpandPseudo.cpp
@@ -80,6 +80,13 @@ bool M68kExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
default:
return false;
+ case M68k::MOVI8di:
+ return TII->ExpandMOVI(MIB, MVT::i8);
+ case M68k::MOVI16ri:
+ return TII->ExpandMOVI(MIB, MVT::i16);
+ case M68k::MOVI32ri:
+ return TII->ExpandMOVI(MIB, MVT::i32);
+
case M68k::MOVXd16d8:
return TII->ExpandMOVX_RR(MIB, MVT::i16, MVT::i8);
case M68k::MOVXd32d8:
diff --git a/llvm/lib/Target/M68k/M68kInstrData.td b/llvm/lib/Target/M68k/M68kInstrData.td
index fa7e7aa0ed46..dc777a933e27 100644
--- a/llvm/lib/Target/M68k/M68kInstrData.td
+++ b/llvm/lib/Target/M68k/M68kInstrData.td
@@ -19,7 +19,7 @@
///
/// Pseudo:
///
-/// MOVSX [x] MOVZX [x] MOVX [x]
+/// MOVI [x] MOVSX [x] MOVZX [x] MOVX [x]
///
/// Map:
///
@@ -165,11 +165,12 @@ foreach AM = MxMoveSupportedAMs in {
} // foreach AM
// R <- I
+// No pattern, as all immediate -> register moves are matched to the MOVI pseudo
class MxMove_RI<MxType TYPE, string DST_REG, MxMoveEncoding ENC,
MxImmOpBundle SRC = !cast<MxImmOpBundle>("MxOp"#TYPE.Size#"AddrMode_i"),
MxOpBundle DST = !cast<MxOpBundle>("MxOp"#TYPE.Size#"AddrMode_"#DST_REG)>
: MxMove<TYPE.Prefix, (outs DST.Op:$dst), (ins SRC.Op:$src),
- [(set TYPE.VT:$dst, SRC.ImmPat:$src)], ENC>;
+ [(null_frag)], ENC>;
foreach REG = ["r", "a", "d"] in {
foreach TYPE = !if(!eq(REG, "d"), [MxType8, MxType16, MxType32], [MxType16, MxType32]) in
@@ -243,6 +244,24 @@ def : Pat<(store MxType32.BPat :$src, MxType32.JPat :$dst),
(MOV32ji MxType32.JOp :$dst, MxType32.IOp :$src)>;
//===----------------------------------------------------------------------===//
+// MOVEQ
+//===----------------------------------------------------------------------===//
+
+/// ------------+---------+---+-----------------------
+/// F E D C | B A 9 | 8 | 7 6 5 4 3 2 1 0
+/// ------------+---------+---+-----------------------
+/// 0 1 1 1 | REG | 0 | DATA
+/// ------------+---------+---+-----------------------
+
+// No pattern, as all immediate -> register moves are matched to the MOVI pseudo
+let Defs = [CCR] in
+def MOVQ : MxInst<(outs MxDRD32:$dst), (ins Mxi8imm:$imm),
+ "moveq\t$imm, $dst",
+ [(null_frag)]> {
+ let Inst = (descend 0b0111, (operand "$dst", 3), 0b0, (operand "$imm", 8));
+}
+
+//===----------------------------------------------------------------------===//
// MOVEM
//
// The mask is already pre-processed by the save/restore spill hook
@@ -496,7 +515,23 @@ class MxPseudoMove_RR<MxType DST, MxType SRC, list<dag> PAT = []>
class MxPseudoMove_RM<MxType DST, MxOperand SRCOpd, list<dag> PAT = []>
: MxPseudo<(outs DST.ROp:$dst), (ins SRCOpd:$src), PAT>;
-}
+
+
+// These Pseudos handle loading immediates to registers.
+// They are expanded post-RA into either move or moveq instructions,
+// depending on size, destination register class, and immediate value.
+// This is done with pseudoinstructions in order to not constrain RA to
+// data registers if moveq matches.
+class MxPseudoMove_DI<MxType TYPE>
+ : MxPseudo<(outs TYPE.ROp:$dst), (ins TYPE.IOp:$src),
+ [(set TYPE.ROp:$dst, imm:$src)]>;
+
+// i8 imm -> reg can always be converted to moveq,
+// but we still emit a pseudo for consistency.
+def MOVI8di : MxPseudoMove_DI<MxType8d>;
+def MOVI16ri : MxPseudoMove_DI<MxType16r>;
+def MOVI32ri : MxPseudoMove_DI<MxType32r>;
+} // let Defs = [CCR]
/// This group of Pseudos is analogous to the real x86 extending moves, but
/// since M68k does not have those we need to emulate. These instructions
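
The MOVEQ encoding table above maps directly onto a 16-bit word: 0b0111 in bits 15-12, the data register in bits 11-9, a zero bit 8, and the immediate in bits 7-0. A sketch of that packing:

#include <cstdint>

// Encode moveq #data, Dn per the bit layout in the comment above.
uint16_t encodeMoveq(unsigned DReg, int8_t Data) {
  return uint16_t(0x7000                  // 0b0111 << 12
                  | ((DReg & 0x7) << 9)   // destination data register
                  | uint8_t(Data));       // 8-bit immediate (hardware
}                                         // sign-extends to 32 bits)
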
diff --git a/llvm/lib/Target/M68k/M68kInstrInfo.cpp b/llvm/lib/Target/M68k/M68kInstrInfo.cpp
index d56fef9e9029..338db45782c9 100644
--- a/llvm/lib/Target/M68k/M68kInstrInfo.cpp
+++ b/llvm/lib/Target/M68k/M68kInstrInfo.cpp
@@ -346,6 +346,40 @@ void M68kInstrInfo::AddZExt(MachineBasicBlock &MBB,
BuildMI(MBB, I, DL, get(And), Reg).addReg(Reg).addImm(Mask);
}
+// Convert MOVI to MOVQ if the target is a data register and the immediate
+// fits in a sign-extended i8, otherwise emit a plain MOV.
+bool M68kInstrInfo::ExpandMOVI(MachineInstrBuilder &MIB, MVT MVTSize) const {
+ Register Reg = MIB->getOperand(0).getReg();
+ int64_t Imm = MIB->getOperand(1).getImm();
+ bool IsAddressReg = false;
+
+ const auto *DR32 = RI.getRegClass(M68k::DR32RegClassID);
+ const auto *AR32 = RI.getRegClass(M68k::AR32RegClassID);
+ const auto *AR16 = RI.getRegClass(M68k::AR16RegClassID);
+
+ if (AR16->contains(Reg) || AR32->contains(Reg))
+ IsAddressReg = true;
+
+ LLVM_DEBUG(dbgs() << "Expand " << *MIB.getInstr() << " to ");
+
+ if (MVTSize == MVT::i8 || (!IsAddressReg && Imm >= -128 && Imm <= 127)) {
+ LLVM_DEBUG(dbgs() << "MOVEQ\n");
+
+ // We need to assign to the full register to make IV happy
+ Register SReg =
+ MVTSize == MVT::i32 ? Reg : Register(RI.getMatchingMegaReg(Reg, DR32));
+ assert(SReg && "No viable MEGA register available");
+
+ MIB->setDesc(get(M68k::MOVQ));
+ MIB->getOperand(0).setReg(SReg);
+ } else {
+ LLVM_DEBUG(dbgs() << "MOVE\n");
+ MIB->setDesc(get(MVTSize == MVT::i16 ? M68k::MOV16ri : M68k::MOV32ri));
+ }
+
+ return true;
+}
+
bool M68kInstrInfo::ExpandMOVX_RR(MachineInstrBuilder &MIB, MVT MVTDst,
MVT MVTSrc) const {
unsigned Move = MVTDst == MVT::i16 ? M68k::MOV16rr : M68k::MOV32rr;
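
Distilled, the opcode choice in ExpandMOVI is: moveq whenever the move is a byte move, or the destination is a data register and the immediate fits in a sign-extended byte; otherwise a plain move of the matching width. As a sketch with illustrative names:

#include <cstdint>

// Decision rule mirrored from ExpandMOVI above.
enum class M68kMoveOp { MOVEQ, MOV16ri, MOV32ri };

M68kMoveOp selectImmMove(bool IsAddressReg, bool IsByteMove, bool Is16Bit,
                         int64_t Imm) {
  if (IsByteMove || (!IsAddressReg && Imm >= -128 && Imm <= 127))
    return M68kMoveOp::MOVEQ;             // fits moveq's sign-extended i8
  return Is16Bit ? M68kMoveOp::MOV16ri : M68kMoveOp::MOV32ri;
}
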
diff --git a/llvm/lib/Target/M68k/M68kInstrInfo.h b/llvm/lib/Target/M68k/M68kInstrInfo.h
index 577967f2fdfc..d1e1e1cd9998 100644
--- a/llvm/lib/Target/M68k/M68kInstrInfo.h
+++ b/llvm/lib/Target/M68k/M68kInstrInfo.h
@@ -302,6 +302,9 @@ public:
void AddZExt(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
DebugLoc DL, unsigned Reg, MVT From, MVT To) const;
+ /// Move immediate to register
+ bool ExpandMOVI(MachineInstrBuilder &MIB, MVT MVTSize) const;
+
/// Move across register classes without extension
bool ExpandMOVX_RR(MachineInstrBuilder &MIB, MVT MVTDst, MVT MVTSrc) const;
diff --git a/llvm/lib/Target/Mips/Mips32r6InstrInfo.td b/llvm/lib/Target/Mips/Mips32r6InstrInfo.td
index bef7607118ce..f609305bfee4 100644
--- a/llvm/lib/Target/Mips/Mips32r6InstrInfo.td
+++ b/llvm/lib/Target/Mips/Mips32r6InstrInfo.td
@@ -1117,6 +1117,22 @@ def : MipsPat<(select i32:$cond, immz, i32:$f),
ISA_MIPS32R6;
}
+// llvm.fmin/fmax operations.
+let AdditionalPredicates = [NotInMicroMips] in {
+ def : MipsPat<(fmaxnum f32:$lhs, f32:$rhs),
+ (MAX_S f32:$lhs, f32:$rhs)>,
+ ISA_MIPS32R6;
+ def : MipsPat<(fmaxnum f64:$lhs, f64:$rhs),
+ (MAX_D f64:$lhs, f64:$rhs)>,
+ ISA_MIPS32R6;
+ def : MipsPat<(fminnum f32:$lhs, f32:$rhs),
+ (MIN_S f32:$lhs, f32:$rhs)>,
+ ISA_MIPS32R6;
+ def : MipsPat<(fminnum f64:$lhs, f64:$rhs),
+ (MIN_D f64:$lhs, f64:$rhs)>,
+ ISA_MIPS32R6;
+}
+
// Pseudo instructions
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, hasDelaySlot = 1,
hasExtraSrcRegAllocReq = 1, isCTI = 1, Defs = [AT], hasPostISelHook = 1 in {
diff --git a/llvm/lib/Target/Mips/MipsISelLowering.cpp b/llvm/lib/Target/Mips/MipsISelLowering.cpp
index 7bc66b2d9f4b..8f7c47370ee5 100644
--- a/llvm/lib/Target/Mips/MipsISelLowering.cpp
+++ b/llvm/lib/Target/Mips/MipsISelLowering.cpp
@@ -358,6 +358,15 @@ MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM,
setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
+  // Lower fmin and fmax operations for MIPS R6.
+  // The MIN_/MAX_ instructions are already defined but were never selected.
+ if (Subtarget.hasMips32r6() || Subtarget.hasMips64r6()) {
+ setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
+ setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
+ setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
+ setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
+ }
+
if (Subtarget.isGP64bit()) {
setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
setOperationAction(ISD::BlockAddress, MVT::i64, Custom);
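
Making these nodes legal hinges on semantics: ISD::FMINNUM/FMAXNUM follow IEEE-754 minNum/maxNum, where a quiet NaN operand is ignored in favor of the other operand, and the patch treats the MIPS R6 min/max instructions as implementing that behavior. libm's fmin models the same rule:

#include <cmath>

// Reference model of ISD::FMINNUM: if exactly one operand is NaN, the other
// is returned; std::fmin has the same quiet-NaN-ignoring behavior.
double fminnumModel(double A, double B) { return std::fmin(A, B); }
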
diff --git a/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp b/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp
index 494e4b52a5b5..c6db8a7bbeb8 100644
--- a/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp
+++ b/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp
@@ -45,6 +45,7 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/DebugCounter.h"
using namespace llvm;
@@ -95,6 +96,13 @@ static cl::opt<bool>
cl::desc("enable optimization of conditional traps"),
cl::init(false), cl::Hidden);
+DEBUG_COUNTER(
+ PeepholeXToICounter, "ppc-xtoi-peephole",
+ "Controls whether PPC reg+reg to reg+imm peephole is performed on a MI");
+
+DEBUG_COUNTER(PeepholePerOpCounter, "ppc-per-op-peephole",
+ "Controls whether PPC per opcode peephole is performed on a MI");
+
namespace {
struct PPCMIPeephole : public MachineFunctionPass {
@@ -469,6 +477,9 @@ bool PPCMIPeephole::simplifyCode() {
if (MI.isDebugInstr())
continue;
+ if (!DebugCounter::shouldExecute(PeepholeXToICounter))
+ continue;
+
SmallSet<Register, 4> RRToRIRegsToUpdate;
if (!TII->convertToImmediateForm(MI, RRToRIRegsToUpdate))
continue;
@@ -538,6 +549,9 @@ bool PPCMIPeephole::simplifyCode() {
if (MI.isDebugInstr())
continue;
+ if (!DebugCounter::shouldExecute(PeepholePerOpCounter))
+ continue;
+
// Per-opcode peepholes.
switch (MI.getOpcode()) {
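
The two DEBUG_COUNTERs make the peepholes bisectable: naming a counter on the command line (e.g. -debug-counter=ppc-xtoi-peephole-skip=N,ppc-xtoi-peephole-count=M, following DebugCounter's usual skip/count convention) lets only a chosen window of transformations fire. A rough standalone model of the semantics, not LLVM's code:

// Skip the first Skip hits, allow the next Count hits, suppress the rest.
struct CounterModel {
  long Skip = 0;      // transformations to skip first
  long Count = -1;    // transformations to allow afterwards (-1: unlimited)
  long Calls = 0;     // hits seen so far

  bool shouldExecute() {
    long N = Calls++;
    if (N < Skip)
      return false;
    return Count < 0 || N < Skip + Count;
  }
};
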
diff --git a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
index 998b9181efe6..b9e8e1f33d3a 100644
--- a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
+++ b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
@@ -44,6 +44,13 @@ public:
private:
void addSPOperands(MCInst &MI) const;
+
+ DecodeStatus getInstruction32(MCInst &Instr, uint64_t &Size,
+ ArrayRef<uint8_t> Bytes, uint64_t Address,
+ raw_ostream &CStream) const;
+ DecodeStatus getInstruction16(MCInst &Instr, uint64_t &Size,
+ ArrayRef<uint8_t> Bytes, uint64_t Address,
+ raw_ostream &CStream) const;
};
} // end anonymous namespace
@@ -182,7 +189,7 @@ static DecodeStatus DecodeGPRPairRegisterClass(MCInst &Inst, uint32_t RegNo,
return MCDisassembler::Success;
}
-static DecodeStatus DecodeSR07RegisterClass(MCInst &Inst, uint64_t RegNo,
+static DecodeStatus DecodeSR07RegisterClass(MCInst &Inst, uint32_t RegNo,
uint64_t Address,
const void *Decoder) {
if (RegNo >= 8)
@@ -255,12 +262,12 @@ static DecodeStatus DecodeVRM8RegisterClass(MCInst &Inst, uint32_t RegNo,
return MCDisassembler::Success;
}
-static DecodeStatus decodeVMaskReg(MCInst &Inst, uint64_t RegNo,
+static DecodeStatus decodeVMaskReg(MCInst &Inst, uint32_t RegNo,
uint64_t Address,
const MCDisassembler *Decoder) {
- if (RegNo > 2) {
+ if (RegNo >= 2)
return MCDisassembler::Fail;
- }
+
MCRegister Reg = (RegNo == 0) ? RISCV::V0 : RISCV::NoRegister;
Inst.addOperand(MCOperand::createReg(Reg));
@@ -361,13 +368,13 @@ static DecodeStatus decodeXTHeadMemPair(MCInst &Inst, uint32_t Insn,
uint64_t Address,
const MCDisassembler *Decoder);
-static DecodeStatus decodeZcmpRlist(MCInst &Inst, unsigned Imm,
+static DecodeStatus decodeZcmpRlist(MCInst &Inst, uint32_t Imm,
uint64_t Address, const void *Decoder);
static DecodeStatus decodeRegReg(MCInst &Inst, uint32_t Insn, uint64_t Address,
const MCDisassembler *Decoder);
-static DecodeStatus decodeZcmpSpimm(MCInst &Inst, unsigned Imm,
+static DecodeStatus decodeZcmpSpimm(MCInst &Inst, uint32_t Imm,
uint64_t Address, const void *Decoder);
static DecodeStatus decodeCSSPushPopchk(MCInst &Inst, uint32_t Insn,
@@ -470,7 +477,7 @@ static DecodeStatus decodeXTHeadMemPair(MCInst &Inst, uint32_t Insn,
return MCDisassembler::Success;
}
-static DecodeStatus decodeZcmpRlist(MCInst &Inst, unsigned Imm,
+static DecodeStatus decodeZcmpRlist(MCInst &Inst, uint32_t Imm,
uint64_t Address, const void *Decoder) {
if (Imm <= 3)
return MCDisassembler::Fail;
@@ -487,7 +494,7 @@ static DecodeStatus decodeRegReg(MCInst &Inst, uint32_t Insn, uint64_t Address,
return MCDisassembler::Success;
}
-static DecodeStatus decodeZcmpSpimm(MCInst &Inst, unsigned Imm,
+static DecodeStatus decodeZcmpSpimm(MCInst &Inst, uint32_t Imm,
uint64_t Address, const void *Decoder) {
Inst.addOperand(MCOperand::createImm(Imm));
return MCDisassembler::Success;
@@ -502,21 +509,13 @@ void RISCVDisassembler::addSPOperands(MCInst &MI) const {
MI.insert(MI.begin() + i, MCOperand::createReg(RISCV::X2));
}
-DecodeStatus RISCVDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
- ArrayRef<uint8_t> Bytes,
- uint64_t Address,
- raw_ostream &CS) const {
- // TODO: This will need modification when supporting instruction set
- // extensions with instructions > 32-bits (up to 176 bits wide).
- uint32_t Insn;
- DecodeStatus Result;
-
#define TRY_TO_DECODE_WITH_ADDITIONAL_OPERATION(FEATURE_CHECKS, DECODER_TABLE, \
DESC, ADDITIONAL_OPERATION) \
do { \
if (FEATURE_CHECKS) { \
LLVM_DEBUG(dbgs() << "Trying " DESC ":\n"); \
- Result = decodeInstruction(DECODER_TABLE, MI, Insn, Address, this, STI); \
+ DecodeStatus Result = \
+ decodeInstruction(DECODER_TABLE, MI, Insn, Address, this, STI); \
if (Result != MCDisassembler::Fail) { \
ADDITIONAL_OPERATION; \
return Result; \
@@ -532,104 +531,111 @@ DecodeStatus RISCVDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
#define TRY_TO_DECODE_FEATURE(FEATURE, DECODER_TABLE, DESC) \
TRY_TO_DECODE(STI.hasFeature(FEATURE), DECODER_TABLE, DESC)
- // It's a 32 bit instruction if bit 0 and 1 are 1.
- if ((Bytes[0] & 0x3) == 0x3) {
- if (Bytes.size() < 4) {
- Size = 0;
- return MCDisassembler::Fail;
- }
- Size = 4;
-
- Insn = support::endian::read32le(Bytes.data());
-
- TRY_TO_DECODE(STI.hasFeature(RISCV::FeatureStdExtZdinx) &&
- !STI.hasFeature(RISCV::Feature64Bit),
- DecoderTableRV32Zdinx32,
- "RV32Zdinx table (Double in Integer and rv32)");
- TRY_TO_DECODE(STI.hasFeature(RISCV::FeatureStdExtZacas) &&
- !STI.hasFeature(RISCV::Feature64Bit),
- DecoderTableRV32Zacas32,
- "RV32Zacas table (Compare-And-Swap and rv32)");
- TRY_TO_DECODE_FEATURE(RISCV::FeatureStdExtZfinx, DecoderTableRVZfinx32,
- "RVZfinx table (Float in Integer)");
- TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXVentanaCondOps,
- DecoderTableXVentana32, "Ventana custom opcode table");
- TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXTHeadBa, DecoderTableXTHeadBa32,
- "XTHeadBa custom opcode table");
- TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXTHeadBb, DecoderTableXTHeadBb32,
- "XTHeadBb custom opcode table");
- TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXTHeadBs, DecoderTableXTHeadBs32,
- "XTHeadBs custom opcode table");
- TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXTHeadCondMov,
- DecoderTableXTHeadCondMov32,
- "XTHeadCondMov custom opcode table");
- TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXTHeadCmo, DecoderTableXTHeadCmo32,
- "XTHeadCmo custom opcode table");
- TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXTHeadFMemIdx,
- DecoderTableXTHeadFMemIdx32,
- "XTHeadFMemIdx custom opcode table");
- TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXTHeadMac, DecoderTableXTHeadMac32,
- "XTHeadMac custom opcode table");
- TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXTHeadMemIdx,
- DecoderTableXTHeadMemIdx32,
- "XTHeadMemIdx custom opcode table");
- TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXTHeadMemPair,
- DecoderTableXTHeadMemPair32,
- "XTHeadMemPair custom opcode table");
- TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXTHeadSync,
- DecoderTableXTHeadSync32,
- "XTHeadSync custom opcode table");
- TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXTHeadVdot, DecoderTableXTHeadVdot32,
- "XTHeadVdot custom opcode table");
- TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXSfvcp, DecoderTableXSfvcp32,
- "SiFive VCIX custom opcode table");
- TRY_TO_DECODE_FEATURE(
- RISCV::FeatureVendorXSfvqmaccdod, DecoderTableXSfvqmaccdod32,
- "SiFive Matrix Multiplication (2x8 and 8x2) Instruction opcode table");
- TRY_TO_DECODE_FEATURE(
- RISCV::FeatureVendorXSfvqmaccqoq, DecoderTableXSfvqmaccqoq32,
- "SiFive Matrix Multiplication (4x8 and 8x4) Instruction opcode table");
- TRY_TO_DECODE_FEATURE(
- RISCV::FeatureVendorXSfvfwmaccqqq, DecoderTableXSfvfwmaccqqq32,
- "SiFive Matrix Multiplication Instruction opcode table");
- TRY_TO_DECODE_FEATURE(
- RISCV::FeatureVendorXSfvfnrclipxfqf, DecoderTableXSfvfnrclipxfqf32,
- "SiFive FP32-to-int8 Ranged Clip Instructions opcode table");
- TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXSiFivecdiscarddlone,
- DecoderTableXSiFivecdiscarddlone32,
- "SiFive sf.cdiscard.d.l1 custom opcode table");
- TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXSiFivecflushdlone,
- DecoderTableXSiFivecflushdlone32,
- "SiFive sf.cflush.d.l1 custom opcode table");
- TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXSfcease, DecoderTableXSfcease32,
- "SiFive sf.cease custom opcode table");
- TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXCVbitmanip,
- DecoderTableXCVbitmanip32,
- "CORE-V Bit Manipulation custom opcode table");
- TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXCVelw, DecoderTableXCVelw32,
- "CORE-V Event load custom opcode table");
- TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXCVmac, DecoderTableXCVmac32,
- "CORE-V MAC custom opcode table");
- TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXCVmem, DecoderTableXCVmem32,
- "CORE-V MEM custom opcode table");
- TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXCValu, DecoderTableXCValu32,
- "CORE-V ALU custom opcode table");
- TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXCVsimd, DecoderTableXCVsimd32,
- "CORE-V SIMD extensions custom opcode table");
- TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXCVbi, DecoderTableXCVbi32,
- "CORE-V Immediate Branching custom opcode table");
- TRY_TO_DECODE(true, DecoderTable32, "RISCV32 table");
-
+DecodeStatus RISCVDisassembler::getInstruction32(MCInst &MI, uint64_t &Size,
+ ArrayRef<uint8_t> Bytes,
+ uint64_t Address,
+ raw_ostream &CS) const {
+ if (Bytes.size() < 4) {
+ Size = 0;
return MCDisassembler::Fail;
}
+ Size = 4;
+
+ uint32_t Insn = support::endian::read32le(Bytes.data());
+
+ TRY_TO_DECODE(STI.hasFeature(RISCV::FeatureStdExtZdinx) &&
+ !STI.hasFeature(RISCV::Feature64Bit),
+ DecoderTableRV32Zdinx32,
+ "RV32Zdinx table (Double in Integer and rv32)");
+ TRY_TO_DECODE(STI.hasFeature(RISCV::FeatureStdExtZacas) &&
+ !STI.hasFeature(RISCV::Feature64Bit),
+ DecoderTableRV32Zacas32,
+ "RV32Zacas table (Compare-And-Swap and rv32)");
+ TRY_TO_DECODE_FEATURE(RISCV::FeatureStdExtZfinx, DecoderTableRVZfinx32,
+ "RVZfinx table (Float in Integer)");
+ TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXVentanaCondOps,
+ DecoderTableXVentana32, "Ventana custom opcode table");
+ TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXTHeadBa, DecoderTableXTHeadBa32,
+ "XTHeadBa custom opcode table");
+ TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXTHeadBb, DecoderTableXTHeadBb32,
+ "XTHeadBb custom opcode table");
+ TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXTHeadBs, DecoderTableXTHeadBs32,
+ "XTHeadBs custom opcode table");
+ TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXTHeadCondMov,
+ DecoderTableXTHeadCondMov32,
+ "XTHeadCondMov custom opcode table");
+ TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXTHeadCmo, DecoderTableXTHeadCmo32,
+ "XTHeadCmo custom opcode table");
+ TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXTHeadFMemIdx,
+ DecoderTableXTHeadFMemIdx32,
+ "XTHeadFMemIdx custom opcode table");
+ TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXTHeadMac, DecoderTableXTHeadMac32,
+ "XTHeadMac custom opcode table");
+ TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXTHeadMemIdx,
+ DecoderTableXTHeadMemIdx32,
+ "XTHeadMemIdx custom opcode table");
+ TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXTHeadMemPair,
+ DecoderTableXTHeadMemPair32,
+ "XTHeadMemPair custom opcode table");
+ TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXTHeadSync,
+ DecoderTableXTHeadSync32,
+ "XTHeadSync custom opcode table");
+ TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXTHeadVdot,
+ DecoderTableXTHeadVdot32,
+ "XTHeadVdot custom opcode table");
+ TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXSfvcp, DecoderTableXSfvcp32,
+ "SiFive VCIX custom opcode table");
+ TRY_TO_DECODE_FEATURE(
+ RISCV::FeatureVendorXSfvqmaccdod, DecoderTableXSfvqmaccdod32,
+ "SiFive Matrix Multiplication (2x8 and 8x2) Instruction opcode table");
+ TRY_TO_DECODE_FEATURE(
+ RISCV::FeatureVendorXSfvqmaccqoq, DecoderTableXSfvqmaccqoq32,
+ "SiFive Matrix Multiplication (4x8 and 8x4) Instruction opcode table");
+ TRY_TO_DECODE_FEATURE(
+ RISCV::FeatureVendorXSfvfwmaccqqq, DecoderTableXSfvfwmaccqqq32,
+ "SiFive Matrix Multiplication Instruction opcode table");
+ TRY_TO_DECODE_FEATURE(
+ RISCV::FeatureVendorXSfvfnrclipxfqf, DecoderTableXSfvfnrclipxfqf32,
+ "SiFive FP32-to-int8 Ranged Clip Instructions opcode table");
+ TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXSiFivecdiscarddlone,
+ DecoderTableXSiFivecdiscarddlone32,
+ "SiFive sf.cdiscard.d.l1 custom opcode table");
+ TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXSiFivecflushdlone,
+ DecoderTableXSiFivecflushdlone32,
+ "SiFive sf.cflush.d.l1 custom opcode table");
+ TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXSfcease, DecoderTableXSfcease32,
+ "SiFive sf.cease custom opcode table");
+ TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXCVbitmanip,
+ DecoderTableXCVbitmanip32,
+ "CORE-V Bit Manipulation custom opcode table");
+ TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXCVelw, DecoderTableXCVelw32,
+ "CORE-V Event load custom opcode table");
+ TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXCVmac, DecoderTableXCVmac32,
+ "CORE-V MAC custom opcode table");
+ TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXCVmem, DecoderTableXCVmem32,
+ "CORE-V MEM custom opcode table");
+ TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXCValu, DecoderTableXCValu32,
+ "CORE-V ALU custom opcode table");
+ TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXCVsimd, DecoderTableXCVsimd32,
+ "CORE-V SIMD extensions custom opcode table");
+ TRY_TO_DECODE_FEATURE(RISCV::FeatureVendorXCVbi, DecoderTableXCVbi32,
+ "CORE-V Immediate Branching custom opcode table");
+ TRY_TO_DECODE(true, DecoderTable32, "RISCV32 table");
+ return MCDisassembler::Fail;
+}
+
+DecodeStatus RISCVDisassembler::getInstruction16(MCInst &MI, uint64_t &Size,
+ ArrayRef<uint8_t> Bytes,
+ uint64_t Address,
+ raw_ostream &CS) const {
if (Bytes.size() < 2) {
Size = 0;
return MCDisassembler::Fail;
}
Size = 2;
- Insn = support::endian::read16le(Bytes.data());
+ uint32_t Insn = support::endian::read16le(Bytes.data());
TRY_TO_DECODE_AND_ADD_SP(!STI.hasFeature(RISCV::Feature64Bit),
DecoderTableRISCV32Only_16,
"RISCV32Only_16 table (16-bit Instruction)");
@@ -645,3 +651,49 @@ DecodeStatus RISCVDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
return MCDisassembler::Fail;
}
+
+DecodeStatus RISCVDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
+ ArrayRef<uint8_t> Bytes,
+ uint64_t Address,
+ raw_ostream &CS) const {
+  // It's a 16-bit instruction if bits 1:0 are not 0b11.
+ if ((Bytes[0] & 0b11) != 0b11)
+ return getInstruction16(MI, Size, Bytes, Address, CS);
+
+  // It's a 32-bit instruction if bits 1:0 are 0b11 (checked above) and bits
+  // 4:2 are not 0b111.
+ if ((Bytes[0] & 0b1'1100) != 0b1'1100)
+ return getInstruction32(MI, Size, Bytes, Address, CS);
+
+ // 48-bit instructions are encoded as 0bxx011111.
+ if ((Bytes[0] & 0b11'1111) == 0b01'1111) {
+ Size = Bytes.size() >= 6 ? 6 : 0;
+ return MCDisassembler::Fail;
+ }
+
+  // 64-bit instructions are encoded as 0bx0111111.
+ if ((Bytes[0] & 0b111'1111) == 0b011'1111) {
+ Size = Bytes.size() >= 8 ? 8 : 0;
+ return MCDisassembler::Fail;
+ }
+
+ // Remaining cases need to check a second byte.
+ if (Bytes.size() < 2) {
+ Size = 0;
+ return MCDisassembler::Fail;
+ }
+
+  // 80-bit through 176-bit instructions are encoded as 0bxnnnxxxx_x1111111,
+  // where the number of bits is (80 + nnn * 16) for nnn != 0b111.
+ unsigned nnn = (Bytes[1] >> 4) & 0b111;
+ if (nnn != 0b111) {
+ Size = 10 + (nnn * 2);
+ if (Bytes.size() < Size)
+ Size = 0;
+ return MCDisassembler::Fail;
+ }
+
+ // Remaining encodings are reserved for > 176-bit instructions.
+ Size = 0;
+ return MCDisassembler::Fail;
+}
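The dispatcher above follows the standard RISC-V variable-length encoding scheme, deriving the instruction length purely from the low bits of the first two bytes. As a self-contained sketch (a hypothetical helper, independent of the LLVM MC interfaces), the same rule can be written as:

#include <cstddef>
#include <cstdint>

// Maps the first bytes of an encoding to its length in bytes; returns 0 for
// truncated buffers and for encodings reserved for > 176-bit instructions.
static size_t riscvInsnLength(const uint8_t *Bytes, size_t Len) {
  if (Len < 1)
    return 0;
  if ((Bytes[0] & 0b11) != 0b11)
    return 2; // compressed (16-bit) instruction
  if ((Bytes[0] & 0b1'1100) != 0b1'1100)
    return 4; // standard 32-bit instruction
  if ((Bytes[0] & 0b11'1111) == 0b01'1111)
    return 6; // 48-bit instruction
  if ((Bytes[0] & 0b111'1111) == 0b011'1111)
    return 8; // 64-bit instruction
  if (Len < 2)
    return 0;
  unsigned nnn = (Bytes[1] >> 4) & 0b111; // bits 14:12 of the encoding
  if (nnn != 0b111)
    return 10 + 2 * nnn; // (80 + nnn * 16) bits
  return 0; // reserved
}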
diff --git a/llvm/lib/Target/RISCV/RISCV.td b/llvm/lib/Target/RISCV/RISCV.td
index 9fb84efd5b6f..09f496574d64 100644
--- a/llvm/lib/Target/RISCV/RISCV.td
+++ b/llvm/lib/Target/RISCV/RISCV.td
@@ -15,6 +15,12 @@ include "llvm/Target/Target.td"
include "RISCVFeatures.td"
//===----------------------------------------------------------------------===//
+// RISC-V profiles supported.
+//===----------------------------------------------------------------------===//
+
+include "RISCVProfiles.td"
+
+//===----------------------------------------------------------------------===//
// Named operands for CSR instructions.
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 539aa3525545..68f4ec5ef49f 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -524,8 +524,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
if (Subtarget.is64Bit())
setOperationAction(ISD::FPOWI, MVT::i32, Custom);
- if (!Subtarget.hasStdExtZfa())
- setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::f16, Custom);
+ setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::f16,
+ Subtarget.hasStdExtZfa() ? Legal : Custom);
}
if (Subtarget.hasStdExtFOrZfinx()) {
@@ -548,10 +548,12 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FP_TO_FP16, MVT::f32, Custom);
setOperationAction(ISD::FP16_TO_FP, MVT::f32, Custom);
- if (Subtarget.hasStdExtZfa())
+ if (Subtarget.hasStdExtZfa()) {
setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
- else
+ setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::f32, Legal);
+ } else {
setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::f32, Custom);
+ }
}
if (Subtarget.hasStdExtFOrZfinx() && Subtarget.is64Bit())
@@ -566,6 +568,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
if (Subtarget.hasStdExtZfa()) {
setOperationAction(FPRndMode, MVT::f64, Legal);
setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);
+ setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::f64, Legal);
} else {
if (Subtarget.is64Bit())
setOperationAction(FPRndMode, MVT::f64, Custom);
@@ -16164,23 +16167,39 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
case ISD::SELECT:
return performSELECTCombine(N, DAG, Subtarget);
case RISCVISD::CZERO_EQZ:
- case RISCVISD::CZERO_NEZ:
- // czero_eq X, (xor Y, 1) -> czero_ne X, Y if Y is 0 or 1.
- // czero_ne X, (xor Y, 1) -> czero_eq X, Y if Y is 0 or 1.
- if (N->getOperand(1).getOpcode() == ISD::XOR &&
- isOneConstant(N->getOperand(1).getOperand(1))) {
- SDValue Cond = N->getOperand(1).getOperand(0);
- APInt Mask = APInt::getBitsSetFrom(Cond.getValueSizeInBits(), 1);
- if (DAG.MaskedValueIsZero(Cond, Mask)) {
- unsigned NewOpc = N->getOpcode() == RISCVISD::CZERO_EQZ
- ? RISCVISD::CZERO_NEZ
- : RISCVISD::CZERO_EQZ;
- return DAG.getNode(NewOpc, SDLoc(N), N->getValueType(0),
- N->getOperand(0), Cond);
- }
+ case RISCVISD::CZERO_NEZ: {
+ SDValue Val = N->getOperand(0);
+ SDValue Cond = N->getOperand(1);
+
+ unsigned Opc = N->getOpcode();
+
+ // czero_eqz x, x -> x
+ if (Opc == RISCVISD::CZERO_EQZ && Val == Cond)
+ return Val;
+
+ unsigned InvOpc =
+ Opc == RISCVISD::CZERO_EQZ ? RISCVISD::CZERO_NEZ : RISCVISD::CZERO_EQZ;
+
+ // czero_eqz X, (xor Y, 1) -> czero_nez X, Y if Y is 0 or 1.
+ // czero_nez X, (xor Y, 1) -> czero_eqz X, Y if Y is 0 or 1.
+ if (Cond.getOpcode() == ISD::XOR && isOneConstant(Cond.getOperand(1))) {
+ SDValue NewCond = Cond.getOperand(0);
+ APInt Mask = APInt::getBitsSetFrom(NewCond.getValueSizeInBits(), 1);
+ if (DAG.MaskedValueIsZero(NewCond, Mask))
+ return DAG.getNode(InvOpc, SDLoc(N), N->getValueType(0), Val, NewCond);
+ }
+ // czero_eqz x, (setcc y, 0, ne) -> czero_eqz x, y
+ // czero_nez x, (setcc y, 0, ne) -> czero_nez x, y
+ // czero_eqz x, (setcc y, 0, eq) -> czero_nez x, y
+ // czero_nez x, (setcc y, 0, eq) -> czero_eqz x, y
+ if (Cond.getOpcode() == ISD::SETCC && isNullConstant(Cond.getOperand(1))) {
+ ISD::CondCode CCVal = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+ if (ISD::isIntEqualitySetCC(CCVal))
+ return DAG.getNode(CCVal == ISD::SETNE ? Opc : InvOpc, SDLoc(N),
+ N->getValueType(0), Val, Cond.getOperand(0));
}
return SDValue();
-
+ }
case RISCVISD::SELECT_CC: {
// Transform
SDValue LHS = N->getOperand(0);
@@ -21004,7 +21023,7 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
IRBuilder<> Builder(LI);
// Only deinterleave2 supported at present.
- if (DI->getIntrinsicID() != Intrinsic::experimental_vector_deinterleave2)
+ if (DI->getIntrinsicID() != Intrinsic::vector_deinterleave2)
return false;
unsigned Factor = 2;
@@ -21054,7 +21073,7 @@ bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
IRBuilder<> Builder(SI);
// Only interleave2 supported at present.
- if (II->getIntrinsicID() != Intrinsic::experimental_vector_interleave2)
+ if (II->getIntrinsicID() != Intrinsic::vector_interleave2)
return false;
unsigned Factor = 2;
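The rewritten czero combine leans on simple identities over the Zicond semantics: czero.eqz yields its value operand when the condition is non-zero and zero otherwise, and czero.nez is the complement. A minimal scalar model (illustration only, not the SelectionDAG API) can sanity-check the folds added above:

#include <cassert>
#include <cstdint>

static uint64_t czero_eqz(uint64_t Val, uint64_t Cond) {
  return Cond == 0 ? 0 : Val;
}
static uint64_t czero_nez(uint64_t Val, uint64_t Cond) {
  return Cond != 0 ? 0 : Val;
}

int main() {
  for (uint64_t X : {uint64_t(0), uint64_t(1), uint64_t(42)})
    for (uint64_t Y : {uint64_t(0), uint64_t(1), uint64_t(7)}) {
      // czero_eqz x, (setcc y, 0, ne) -> czero_eqz x, y
      assert(czero_eqz(X, Y != 0) == czero_eqz(X, Y));
      // czero_eqz x, (setcc y, 0, eq) -> czero_nez x, y
      assert(czero_eqz(X, Y == 0) == czero_nez(X, Y));
      // czero_eqz x, x -> x
      assert(czero_eqz(X, X) == X);
    }
  return 0;
}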
diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
index 3d598dd6f708..b27e1dd258eb 100644
--- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
@@ -156,7 +156,7 @@ static std::optional<unsigned> getEEWForLoadStore(const MachineInstr &MI) {
}
}
-static bool isNonZeroLoadImmediate(MachineInstr &MI) {
+static bool isNonZeroLoadImmediate(const MachineInstr &MI) {
return MI.getOpcode() == RISCV::ADDI &&
MI.getOperand(1).isReg() && MI.getOperand(2).isImm() &&
MI.getOperand(1).getReg() == RISCV::X0 &&
@@ -262,6 +262,17 @@ struct DemandedFields {
VLZeroness = true;
}
+ // Make this the result of demanding the fields in both this and B.
+ void doUnion(const DemandedFields &B) {
+ VLAny |= B.VLAny;
+ VLZeroness |= B.VLZeroness;
+ SEW = std::max(SEW, B.SEW);
+ LMUL |= B.LMUL;
+ SEWLMULRatio |= B.SEWLMULRatio;
+ TailPolicy |= B.TailPolicy;
+ MaskPolicy |= B.MaskPolicy;
+ }
+
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Support for debugging, callable in GDB: V->dump()
LLVM_DUMP_METHOD void dump() const {
@@ -443,8 +454,12 @@ DemandedFields getDemanded(const MachineInstr &MI,
/// Defines the abstract state with which the forward dataflow models the
/// values of the VL and VTYPE registers after insertion.
class VSETVLIInfo {
+ struct AVLDef {
+ const MachineInstr *DefMI;
+ Register DefReg;
+ };
union {
- Register AVLReg;
+ AVLDef AVLRegDef;
unsigned AVLImm;
};
@@ -479,9 +494,10 @@ public:
void setUnknown() { State = Unknown; }
bool isUnknown() const { return State == Unknown; }
- void setAVLReg(Register Reg) {
- assert(Reg.isVirtual());
- AVLReg = Reg;
+ void setAVLRegDef(const MachineInstr *DefMI, Register AVLReg) {
+ assert(DefMI && AVLReg.isVirtual());
+ AVLRegDef.DefMI = DefMI;
+ AVLRegDef.DefReg = AVLReg;
State = AVLIsReg;
}
@@ -499,20 +515,24 @@ public:
bool hasAVLVLMAX() const { return State == AVLIsVLMAX; }
bool hasAVLIgnored() const { return State == AVLIsIgnored; }
Register getAVLReg() const {
- assert(hasAVLReg());
- return AVLReg;
+ assert(hasAVLReg() && AVLRegDef.DefReg.isVirtual());
+ return AVLRegDef.DefReg;
}
unsigned getAVLImm() const {
assert(hasAVLImm());
return AVLImm;
}
+ const MachineInstr &getAVLDefMI() const {
+ assert(hasAVLReg() && AVLRegDef.DefMI);
+ return *AVLRegDef.DefMI;
+ }
void setAVL(VSETVLIInfo Info) {
assert(Info.isValid());
if (Info.isUnknown())
setUnknown();
else if (Info.hasAVLReg())
- setAVLReg(Info.getAVLReg());
+ setAVLRegDef(&Info.getAVLDefMI(), Info.getAVLReg());
else if (Info.hasAVLVLMAX())
setAVLVLMAX();
else if (Info.hasAVLIgnored())
@@ -528,14 +548,11 @@ public:
bool getTailAgnostic() const { return TailAgnostic; }
bool getMaskAgnostic() const { return MaskAgnostic; }
- bool hasNonZeroAVL(const MachineRegisterInfo &MRI) const {
+ bool hasNonZeroAVL() const {
if (hasAVLImm())
return getAVLImm() > 0;
- if (hasAVLReg()) {
- MachineInstr *MI = MRI.getUniqueVRegDef(getAVLReg());
- assert(MI);
- return isNonZeroLoadImmediate(*MI);
- }
+ if (hasAVLReg())
+ return isNonZeroLoadImmediate(getAVLDefMI());
if (hasAVLVLMAX())
return true;
if (hasAVLIgnored())
@@ -543,16 +560,16 @@ public:
return false;
}
- bool hasEquallyZeroAVL(const VSETVLIInfo &Other,
- const MachineRegisterInfo &MRI) const {
+ bool hasEquallyZeroAVL(const VSETVLIInfo &Other) const {
if (hasSameAVL(Other))
return true;
- return (hasNonZeroAVL(MRI) && Other.hasNonZeroAVL(MRI));
+ return (hasNonZeroAVL() && Other.hasNonZeroAVL());
}
bool hasSameAVL(const VSETVLIInfo &Other) const {
if (hasAVLReg() && Other.hasAVLReg())
- return getAVLReg() == Other.getAVLReg();
+ return AVLRegDef.DefMI == Other.AVLRegDef.DefMI &&
+ AVLRegDef.DefReg == Other.AVLRegDef.DefReg;
if (hasAVLImm() && Other.hasAVLImm())
return getAVLImm() == Other.getAVLImm();
@@ -648,7 +665,7 @@ public:
if (Used.VLAny && !(hasSameAVL(Require) && hasSameVLMAX(Require)))
return false;
- if (Used.VLZeroness && !hasEquallyZeroAVL(Require, MRI))
+ if (Used.VLZeroness && !hasEquallyZeroAVL(Require))
return false;
return hasCompatibleVTYPE(Used, Require);
@@ -733,7 +750,7 @@ public:
if (isUnknown())
OS << "unknown";
if (hasAVLReg())
- OS << "AVLReg=" << (unsigned)AVLReg;
+ OS << "AVLReg=" << (unsigned)getAVLReg();
if (hasAVLImm())
OS << "AVLImm=" << (unsigned)AVLImm;
if (hasAVLVLMAX())
@@ -859,7 +876,8 @@ INITIALIZE_PASS(RISCVCoalesceVSETVLI, "riscv-coalesce-vsetvli",
// Return a VSETVLIInfo representing the changes made by this VSETVLI or
// VSETIVLI instruction.
-static VSETVLIInfo getInfoForVSETVLI(const MachineInstr &MI) {
+static VSETVLIInfo getInfoForVSETVLI(const MachineInstr &MI,
+ const MachineRegisterInfo &MRI) {
VSETVLIInfo NewInfo;
if (MI.getOpcode() == RISCV::PseudoVSETIVLI) {
NewInfo.setAVLImm(MI.getOperand(1).getImm());
@@ -872,7 +890,7 @@ static VSETVLIInfo getInfoForVSETVLI(const MachineInstr &MI) {
if (AVLReg == RISCV::X0)
NewInfo.setAVLVLMAX();
else
- NewInfo.setAVLReg(AVLReg);
+ NewInfo.setAVLRegDef(MRI.getVRegDef(AVLReg), AVLReg);
}
NewInfo.setVTYPE(MI.getOperand(2).getImm());
@@ -944,7 +962,7 @@ static VSETVLIInfo computeInfoForInstr(const MachineInstr &MI, uint64_t TSFlags,
else
InstrInfo.setAVLImm(Imm);
} else {
- InstrInfo.setAVLReg(VLOp.getReg());
+ InstrInfo.setAVLRegDef(MRI->getVRegDef(VLOp.getReg()), VLOp.getReg());
}
} else {
assert(isScalarExtractInstr(MI));
@@ -965,10 +983,9 @@ static VSETVLIInfo computeInfoForInstr(const MachineInstr &MI, uint64_t TSFlags,
// register AVLs to avoid extending live ranges without being sure we can
// kill the original source reg entirely.
if (InstrInfo.hasAVLReg()) {
- MachineInstr *DefMI = MRI->getUniqueVRegDef(InstrInfo.getAVLReg());
- assert(DefMI);
- if (isVectorConfigInstr(*DefMI)) {
- VSETVLIInfo DefInstrInfo = getInfoForVSETVLI(*DefMI);
+ const MachineInstr &DefMI = InstrInfo.getAVLDefMI();
+ if (isVectorConfigInstr(DefMI)) {
+ VSETVLIInfo DefInstrInfo = getInfoForVSETVLI(DefMI, *MRI);
if (DefInstrInfo.hasSameVLMAX(InstrInfo) &&
(DefInstrInfo.hasAVLImm() || DefInstrInfo.hasAVLVLMAX()))
InstrInfo.setAVL(DefInstrInfo);
@@ -1006,10 +1023,9 @@ void RISCVInsertVSETVLI::insertVSETVLI(MachineBasicBlock &MBB,
// it has the same VLMAX we want and the last VL/VTYPE we observed is the
// same, we can use the X0, X0 form.
if (Info.hasSameVLMAX(PrevInfo) && Info.hasAVLReg()) {
- MachineInstr *DefMI = MRI->getUniqueVRegDef(Info.getAVLReg());
- assert(DefMI);
- if (isVectorConfigInstr(*DefMI)) {
- VSETVLIInfo DefInfo = getInfoForVSETVLI(*DefMI);
+ const MachineInstr &DefMI = Info.getAVLDefMI();
+ if (isVectorConfigInstr(DefMI)) {
+ VSETVLIInfo DefInfo = getInfoForVSETVLI(DefMI, *MRI);
if (DefInfo.hasSameAVL(PrevInfo) && DefInfo.hasSameVLMAX(PrevInfo)) {
BuildMI(MBB, InsertPt, DL, TII->get(RISCV::PseudoVSETVLIX0))
.addReg(RISCV::X0, RegState::Define | RegState::Dead)
@@ -1125,10 +1141,9 @@ bool RISCVInsertVSETVLI::needVSETVLI(const MachineInstr &MI,
// and the last VL/VTYPE we observed is the same, we don't need a
// VSETVLI here.
if (Require.hasAVLReg() && CurInfo.hasCompatibleVTYPE(Used, Require)) {
- MachineInstr *DefMI = MRI->getUniqueVRegDef(Require.getAVLReg());
- assert(DefMI);
- if (isVectorConfigInstr(*DefMI)) {
- VSETVLIInfo DefInfo = getInfoForVSETVLI(*DefMI);
+ const MachineInstr &DefMI = Require.getAVLDefMI();
+ if (isVectorConfigInstr(DefMI)) {
+ VSETVLIInfo DefInfo = getInfoForVSETVLI(DefMI, *MRI);
if (DefInfo.hasSameAVL(CurInfo) && DefInfo.hasSameVLMAX(CurInfo))
return false;
}
@@ -1183,7 +1198,7 @@ void RISCVInsertVSETVLI::transferBefore(VSETVLIInfo &Info,
// variant, so we avoid the transform to prevent extending live range of an
// avl register operand.
// TODO: We can probably relax this for immediates.
- bool EquallyZero = IncomingInfo.hasEquallyZeroAVL(PrevInfo, *MRI) &&
+ bool EquallyZero = IncomingInfo.hasEquallyZeroAVL(PrevInfo) &&
IncomingInfo.hasSameVLMAX(PrevInfo);
if (Demanded.VLAny || (Demanded.VLZeroness && !EquallyZero))
Info.setAVL(IncomingInfo);
@@ -1214,13 +1229,14 @@ void RISCVInsertVSETVLI::transferBefore(VSETVLIInfo &Info,
void RISCVInsertVSETVLI::transferAfter(VSETVLIInfo &Info,
const MachineInstr &MI) const {
if (isVectorConfigInstr(MI)) {
- Info = getInfoForVSETVLI(MI);
+ Info = getInfoForVSETVLI(MI, *MRI);
return;
}
if (RISCV::isFaultFirstLoad(MI)) {
// Update AVL to vl-output of the fault first load.
- Info.setAVLReg(MI.getOperand(1).getReg());
+ Info.setAVLRegDef(MRI->getVRegDef(MI.getOperand(1).getReg()),
+ MI.getOperand(1).getReg());
return;
}
@@ -1314,11 +1330,8 @@ bool RISCVInsertVSETVLI::needVSETVLIPHI(const VSETVLIInfo &Require,
if (!Require.hasAVLReg())
return true;
- Register AVLReg = Require.getAVLReg();
-
// We need the AVL to be produced by a PHI node in this basic block.
- MachineInstr *PHI = MRI->getUniqueVRegDef(AVLReg);
- assert(PHI);
+ const MachineInstr *PHI = &Require.getAVLDefMI();
if (PHI->getOpcode() != RISCV::PHI || PHI->getParent() != &MBB)
return true;
@@ -1326,11 +1339,7 @@ bool RISCVInsertVSETVLI::needVSETVLIPHI(const VSETVLIInfo &Require,
PHIOp += 2) {
Register InReg = PHI->getOperand(PHIOp).getReg();
MachineBasicBlock *PBB = PHI->getOperand(PHIOp + 1).getMBB();
- const BlockData &PBBInfo = BlockInfo[PBB->getNumber()];
- // If the exit from the predecessor has the VTYPE we are looking for
- // we might be able to avoid a VSETVLI.
- if (PBBInfo.Exit.isUnknown() || !PBBInfo.Exit.hasSameVTYPE(Require))
- return true;
+ const VSETVLIInfo &PBBExit = BlockInfo[PBB->getNumber()].Exit;
// We need the PHI input to be the output of a VSET(I)VLI.
MachineInstr *DefMI = MRI->getVRegDef(InReg);
@@ -1339,9 +1348,14 @@ bool RISCVInsertVSETVLI::needVSETVLIPHI(const VSETVLIInfo &Require,
// We found a VSET(I)VLI make sure it matches the output of the
// predecessor block.
- VSETVLIInfo DefInfo = getInfoForVSETVLI(*DefMI);
- if (!DefInfo.hasSameAVL(PBBInfo.Exit) ||
- !DefInfo.hasSameVTYPE(PBBInfo.Exit))
+ VSETVLIInfo DefInfo = getInfoForVSETVLI(*DefMI, *MRI);
+ if (DefInfo != PBBExit)
+ return true;
+
+ // Require has the same VL as PBBExit, so if the exit from the
+ // predecessor has the VTYPE we are looking for we might be able
+ // to avoid a VSETVLI.
+ if (PBBExit.isUnknown() || !PBBExit.hasSameVTYPE(Require))
return true;
}
@@ -1488,8 +1502,7 @@ void RISCVInsertVSETVLI::doPRE(MachineBasicBlock &MBB) {
// we need to prove the value is available at the point we're going
// to insert the vsetvli at.
if (AvailableInfo.hasAVLReg()) {
- MachineInstr *AVLDefMI = MRI->getUniqueVRegDef(AvailableInfo.getAVLReg());
- assert(AVLDefMI);
+ const MachineInstr *AVLDefMI = &AvailableInfo.getAVLDefMI();
// This is an inline dominance check which covers the case of
// UnavailablePred being the preheader of a loop.
if (AVLDefMI->getParent() != UnavailablePred)
@@ -1499,6 +1512,11 @@ void RISCVInsertVSETVLI::doPRE(MachineBasicBlock &MBB) {
return;
}
+ // If the AVL isn't used in its predecessors then bail, since we have no AVL
+ // to insert a vsetvli with.
+ if (AvailableInfo.hasAVLIgnored())
+ return;
+
// Model the effect of changing the input state of the block MBB to
// AvailableInfo. We're looking for two issues here; one legality,
// one profitability.
@@ -1547,16 +1565,6 @@ void RISCVInsertVSETVLI::doPRE(MachineBasicBlock &MBB) {
AvailableInfo, OldExit);
}
-static void doUnion(DemandedFields &A, DemandedFields B) {
- A.VLAny |= B.VLAny;
- A.VLZeroness |= B.VLZeroness;
- A.SEW = std::max(A.SEW, B.SEW);
- A.LMUL |= B.LMUL;
- A.SEWLMULRatio |= B.SEWLMULRatio;
- A.TailPolicy |= B.TailPolicy;
- A.MaskPolicy |= B.MaskPolicy;
-}
-
// Return true if we can mutate PrevMI to match MI without changing any of the
// fields which would be observed.
static bool canMutatePriorConfig(const MachineInstr &PrevMI,
@@ -1573,8 +1581,8 @@ static bool canMutatePriorConfig(const MachineInstr &PrevMI,
if (Used.VLZeroness) {
if (isVLPreservingConfig(PrevMI))
return false;
- if (!getInfoForVSETVLI(PrevMI).hasEquallyZeroAVL(getInfoForVSETVLI(MI),
- MRI))
+ if (!getInfoForVSETVLI(PrevMI, MRI)
+ .hasEquallyZeroAVL(getInfoForVSETVLI(MI, MRI)))
return false;
}
@@ -1606,7 +1614,7 @@ bool RISCVCoalesceVSETVLI::coalesceVSETVLIs(MachineBasicBlock &MBB) {
for (MachineInstr &MI : make_range(MBB.rbegin(), MBB.rend())) {
if (!isVectorConfigInstr(MI)) {
- doUnion(Used, getDemanded(MI, MRI, ST));
+ Used.doUnion(getDemanded(MI, MRI, ST));
if (MI.isCall() || MI.isInlineAsm() ||
MI.modifiesRegister(RISCV::VL, /*TRI=*/nullptr) ||
MI.modifiesRegister(RISCV::VTYPE, /*TRI=*/nullptr))
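A consequence of storing the defining instruction alongside the AVL register is that hasSameAVL now compares (DefMI, DefReg) pairs rather than registers alone, so two states agree on the AVL only when the register and its definition both match. A minimal sketch of that identity (stand-in types, not the real MachineInstr/Register classes):

#include <cassert>

struct AVLDef {
  const void *DefMI; // stand-in for const MachineInstr *
  unsigned DefReg;   // stand-in for Register
};

static bool sameAVL(const AVLDef &A, const AVLDef &B) {
  return A.DefMI == B.DefMI && A.DefReg == B.DefReg;
}

int main() {
  int Def1, Def2; // two distinct defining instructions
  AVLDef A{&Def1, 5}, B{&Def1, 5}, C{&Def2, 5};
  assert(sameAVL(A, B));  // same definition and register: AVLs match
  assert(!sameAVL(A, C)); // same register, different definition: no match
  return 0;
}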
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index 5c1f154efa99..8cb9a40a98bc 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -1633,8 +1633,230 @@ static bool isFMUL(unsigned Opc) {
}
}
+bool RISCVInstrInfo::isVectorAssociativeAndCommutative(const MachineInstr &Inst,
+ bool Invert) const {
+#define OPCODE_LMUL_CASE(OPC) \
+ case RISCV::OPC##_M1: \
+ case RISCV::OPC##_M2: \
+ case RISCV::OPC##_M4: \
+ case RISCV::OPC##_M8: \
+ case RISCV::OPC##_MF2: \
+ case RISCV::OPC##_MF4: \
+ case RISCV::OPC##_MF8
+
+#define OPCODE_LMUL_MASK_CASE(OPC) \
+ case RISCV::OPC##_M1_MASK: \
+ case RISCV::OPC##_M2_MASK: \
+ case RISCV::OPC##_M4_MASK: \
+ case RISCV::OPC##_M8_MASK: \
+ case RISCV::OPC##_MF2_MASK: \
+ case RISCV::OPC##_MF4_MASK: \
+ case RISCV::OPC##_MF8_MASK
+
+ unsigned Opcode = Inst.getOpcode();
+ if (Invert) {
+ if (auto InvOpcode = getInverseOpcode(Opcode))
+ Opcode = *InvOpcode;
+ else
+ return false;
+ }
+
+ // clang-format off
+ switch (Opcode) {
+ default:
+ return false;
+ OPCODE_LMUL_CASE(PseudoVADD_VV):
+ OPCODE_LMUL_MASK_CASE(PseudoVADD_VV):
+ OPCODE_LMUL_CASE(PseudoVMUL_VV):
+ OPCODE_LMUL_MASK_CASE(PseudoVMUL_VV):
+ return true;
+ }
+ // clang-format on
+
+#undef OPCODE_LMUL_MASK_CASE
+#undef OPCODE_LMUL_CASE
+}
+
+bool RISCVInstrInfo::areRVVInstsReassociable(const MachineInstr &Root,
+ const MachineInstr &Prev) const {
+ if (!areOpcodesEqualOrInverse(Root.getOpcode(), Prev.getOpcode()))
+ return false;
+
+ assert(Root.getMF() == Prev.getMF());
+ const MachineRegisterInfo *MRI = &Root.getMF()->getRegInfo();
+ const TargetRegisterInfo *TRI = MRI->getTargetRegisterInfo();
+
+ // Make sure vtype operands are also the same.
+ const MCInstrDesc &Desc = get(Root.getOpcode());
+ const uint64_t TSFlags = Desc.TSFlags;
+
+ auto checkImmOperand = [&](unsigned OpIdx) {
+ return Root.getOperand(OpIdx).getImm() == Prev.getOperand(OpIdx).getImm();
+ };
+
+ auto checkRegOperand = [&](unsigned OpIdx) {
+ return Root.getOperand(OpIdx).getReg() == Prev.getOperand(OpIdx).getReg();
+ };
+
+ // PassThru
+ // TODO: Potentially we can loosen the condition to consider Root to be
+ // reassociable with Prev if Root has NoReg as passthru, in which case we
+ // would also need to loosen the condition on the vector policies between
+ // these.
+ if (!checkRegOperand(1))
+ return false;
+
+ // SEW
+ if (RISCVII::hasSEWOp(TSFlags) &&
+ !checkImmOperand(RISCVII::getSEWOpNum(Desc)))
+ return false;
+
+ // Mask
+ if (RISCVII::usesMaskPolicy(TSFlags)) {
+ const MachineBasicBlock *MBB = Root.getParent();
+ const MachineBasicBlock::const_reverse_iterator It1(&Root);
+ const MachineBasicBlock::const_reverse_iterator It2(&Prev);
+ Register MI1VReg;
+
+ bool SeenMI2 = false;
+ for (auto End = MBB->rend(), It = It1; It != End; ++It) {
+ if (It == It2) {
+ SeenMI2 = true;
+ if (!MI1VReg.isValid())
+ // There is no V0 def between Root and Prev; they're sharing the
+ // same V0.
+ break;
+ }
+
+ if (It->modifiesRegister(RISCV::V0, TRI)) {
+ Register SrcReg = It->getOperand(1).getReg();
+ // If it's not a virtual register it'll be more difficult to track its
+ // defs, so bail out here just to be safe.
+ if (!SrcReg.isVirtual())
+ return false;
+
+ if (!MI1VReg.isValid()) {
+ // This is the V0 def for Root.
+ MI1VReg = SrcReg;
+ continue;
+ }
+
+ // An intermediate mask update between Prev and Root; keep scanning.
+ if (!SeenMI2)
+ continue;
+
+ // This is the V0 def for Prev; check if it's the same as that of
+ // Root.
+ if (MI1VReg != SrcReg)
+ return false;
+ else
+ break;
+ }
+ }
+
+ // If we haven't encountered Prev, it's likely that this function was
+ // called incorrectly (e.g. Root is before Prev).
+ assert(SeenMI2 && "Prev is expected to appear before Root");
+ }
+
+ // Tail / Mask policies
+ if (RISCVII::hasVecPolicyOp(TSFlags) &&
+ !checkImmOperand(RISCVII::getVecPolicyOpNum(Desc)))
+ return false;
+
+ // VL
+ if (RISCVII::hasVLOp(TSFlags)) {
+ unsigned OpIdx = RISCVII::getVLOpNum(Desc);
+ const MachineOperand &Op1 = Root.getOperand(OpIdx);
+ const MachineOperand &Op2 = Prev.getOperand(OpIdx);
+ if (Op1.getType() != Op2.getType())
+ return false;
+ switch (Op1.getType()) {
+ case MachineOperand::MO_Register:
+ if (Op1.getReg() != Op2.getReg())
+ return false;
+ break;
+ case MachineOperand::MO_Immediate:
+ if (Op1.getImm() != Op2.getImm())
+ return false;
+ break;
+ default:
+ llvm_unreachable("Unrecognized VL operand type");
+ }
+ }
+
+ // Rounding modes
+ if (RISCVII::hasRoundModeOp(TSFlags) &&
+ !checkImmOperand(RISCVII::getVLOpNum(Desc) - 1))
+ return false;
+
+ return true;
+}
+
+// Most of our RVV pseudos have a passthru operand, so the real operands
+// start at index = 2.
+bool RISCVInstrInfo::hasReassociableVectorSibling(const MachineInstr &Inst,
+ bool &Commuted) const {
+ const MachineBasicBlock *MBB = Inst.getParent();
+ const MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+ assert(RISCVII::isFirstDefTiedToFirstUse(get(Inst.getOpcode())) &&
+ "Expect the present of passthrough operand.");
+ MachineInstr *MI1 = MRI.getUniqueVRegDef(Inst.getOperand(2).getReg());
+ MachineInstr *MI2 = MRI.getUniqueVRegDef(Inst.getOperand(3).getReg());
+
+ // If only one operand has the same or inverse opcode and it's the second
+ // source operand, the operands must be commuted.
+ Commuted = !areRVVInstsReassociable(Inst, *MI1) &&
+ areRVVInstsReassociable(Inst, *MI2);
+ if (Commuted)
+ std::swap(MI1, MI2);
+
+ return areRVVInstsReassociable(Inst, *MI1) &&
+ (isVectorAssociativeAndCommutative(*MI1) ||
+ isVectorAssociativeAndCommutative(*MI1, /* Invert */ true)) &&
+ hasReassociableOperands(*MI1, MBB) &&
+ MRI.hasOneNonDBGUse(MI1->getOperand(0).getReg());
+}
+
+bool RISCVInstrInfo::hasReassociableOperands(
+ const MachineInstr &Inst, const MachineBasicBlock *MBB) const {
+ if (!isVectorAssociativeAndCommutative(Inst) &&
+ !isVectorAssociativeAndCommutative(Inst, /*Invert=*/true))
+ return TargetInstrInfo::hasReassociableOperands(Inst, MBB);
+
+ const MachineOperand &Op1 = Inst.getOperand(2);
+ const MachineOperand &Op2 = Inst.getOperand(3);
+ const MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+
+ // We need virtual register definitions for the operands that we will
+ // reassociate.
+ MachineInstr *MI1 = nullptr;
+ MachineInstr *MI2 = nullptr;
+ if (Op1.isReg() && Op1.getReg().isVirtual())
+ MI1 = MRI.getUniqueVRegDef(Op1.getReg());
+ if (Op2.isReg() && Op2.getReg().isVirtual())
+ MI2 = MRI.getUniqueVRegDef(Op2.getReg());
+
+ // And at least one operand must be defined in MBB.
+ return MI1 && MI2 && (MI1->getParent() == MBB || MI2->getParent() == MBB);
+}
+
+void RISCVInstrInfo::getReassociateOperandIndices(
+ const MachineInstr &Root, unsigned Pattern,
+ std::array<unsigned, 5> &OperandIndices) const {
+ TargetInstrInfo::getReassociateOperandIndices(Root, Pattern, OperandIndices);
+ if (RISCV::getRVVMCOpcode(Root.getOpcode())) {
+ // Skip the passthrough operand, so increment all indices by one.
+ for (unsigned I = 0; I < 5; ++I)
+ ++OperandIndices[I];
+ }
+}
+
bool RISCVInstrInfo::hasReassociableSibling(const MachineInstr &Inst,
bool &Commuted) const {
+ if (isVectorAssociativeAndCommutative(Inst) ||
+ isVectorAssociativeAndCommutative(Inst, /*Invert=*/true))
+ return hasReassociableVectorSibling(Inst, Commuted);
+
if (!TargetInstrInfo::hasReassociableSibling(Inst, Commuted))
return false;
@@ -1654,6 +1876,9 @@ bool RISCVInstrInfo::hasReassociableSibling(const MachineInstr &Inst,
bool RISCVInstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst,
bool Invert) const {
+ if (isVectorAssociativeAndCommutative(Inst, Invert))
+ return true;
+
unsigned Opc = Inst.getOpcode();
if (Invert) {
auto InverseOpcode = getInverseOpcode(Opc);
@@ -1706,6 +1931,38 @@ bool RISCVInstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst,
std::optional<unsigned>
RISCVInstrInfo::getInverseOpcode(unsigned Opcode) const {
+#define RVV_OPC_LMUL_CASE(OPC, INV) \
+ case RISCV::OPC##_M1: \
+ return RISCV::INV##_M1; \
+ case RISCV::OPC##_M2: \
+ return RISCV::INV##_M2; \
+ case RISCV::OPC##_M4: \
+ return RISCV::INV##_M4; \
+ case RISCV::OPC##_M8: \
+ return RISCV::INV##_M8; \
+ case RISCV::OPC##_MF2: \
+ return RISCV::INV##_MF2; \
+ case RISCV::OPC##_MF4: \
+ return RISCV::INV##_MF4; \
+ case RISCV::OPC##_MF8: \
+ return RISCV::INV##_MF8
+
+#define RVV_OPC_LMUL_MASK_CASE(OPC, INV) \
+ case RISCV::OPC##_M1_MASK: \
+ return RISCV::INV##_M1_MASK; \
+ case RISCV::OPC##_M2_MASK: \
+ return RISCV::INV##_M2_MASK; \
+ case RISCV::OPC##_M4_MASK: \
+ return RISCV::INV##_M4_MASK; \
+ case RISCV::OPC##_M8_MASK: \
+ return RISCV::INV##_M8_MASK; \
+ case RISCV::OPC##_MF2_MASK: \
+ return RISCV::INV##_MF2_MASK; \
+ case RISCV::OPC##_MF4_MASK: \
+ return RISCV::INV##_MF4_MASK; \
+ case RISCV::OPC##_MF8_MASK: \
+ return RISCV::INV##_MF8_MASK
+
switch (Opcode) {
default:
return std::nullopt;
@@ -1729,7 +1986,16 @@ RISCVInstrInfo::getInverseOpcode(unsigned Opcode) const {
return RISCV::SUBW;
case RISCV::SUBW:
return RISCV::ADDW;
+ // clang-format off
+ RVV_OPC_LMUL_CASE(PseudoVADD_VV, PseudoVSUB_VV);
+ RVV_OPC_LMUL_MASK_CASE(PseudoVADD_VV, PseudoVSUB_VV);
+ RVV_OPC_LMUL_CASE(PseudoVSUB_VV, PseudoVADD_VV);
+ RVV_OPC_LMUL_MASK_CASE(PseudoVSUB_VV, PseudoVADD_VV);
+ // clang-format on
}
+
+#undef RVV_OPC_LMUL_MASK_CASE
+#undef RVV_OPC_LMUL_CASE
}
static bool canCombineFPFusedMultiply(const MachineInstr &Root,
@@ -2866,6 +3132,11 @@ bool RISCVInstrInfo::findCommutedOpIndices(const MachineInstr &MI,
case CASE_RVV_OPCODE_WIDEN(VWMACC_VV):
case CASE_RVV_OPCODE_WIDEN(VWMACCU_VV):
case CASE_RVV_OPCODE_UNMASK(VADC_VVM):
+ case CASE_RVV_OPCODE(VSADD_VV):
+ case CASE_RVV_OPCODE(VSADDU_VV):
+ case CASE_RVV_OPCODE(VAADD_VV):
+ case CASE_RVV_OPCODE(VAADDU_VV):
+ case CASE_RVV_OPCODE(VSMUL_VV):
// Operands 2 and 3 are commutable.
return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 2, 3);
case CASE_VFMA_SPLATS(FMADD):
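With these hooks in place, the generic machine combiner can reassociate chains of the associative, commutative RVV pseudos recognized above (vadd.vv and vmul.vv across all LMULs, masked or unmasked). A scalar analogue (illustration only) of the payoff: rebalancing a serial chain preserves the value for associative operations while shortening the dependent path:

#include <cassert>

int main() {
  int a = 1, b = 2, c = 3, d = 4;
  // Three serially dependent adds: each waits on the previous result.
  int serial = ((a + b) + c) + d;
  // After reassociation, the two inner adds are independent and can issue in
  // parallel, leaving a dependent chain of only two adds.
  int balanced = (a + b) + (c + d);
  assert(serial == balanced);
  return 0;
}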
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.h b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
index 3b03d5efde6e..170f813eb10d 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
@@ -266,6 +266,9 @@ public:
SmallVectorImpl<MachineInstr *> &DelInstrs,
DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const override;
+ bool hasReassociableOperands(const MachineInstr &Inst,
+ const MachineBasicBlock *MBB) const override;
+
bool hasReassociableSibling(const MachineInstr &Inst,
bool &Commuted) const override;
@@ -274,6 +277,10 @@ public:
std::optional<unsigned> getInverseOpcode(unsigned Opcode) const override;
+ void getReassociateOperandIndices(
+ const MachineInstr &Root, unsigned Pattern,
+ std::array<unsigned, 5> &OperandIndices) const override;
+
ArrayRef<std::pair<MachineMemOperand::Flags, const char *>>
getSerializableMachineMemOperandTargetFlags() const override;
@@ -297,6 +304,13 @@ protected:
private:
unsigned getInstBundleLength(const MachineInstr &MI) const;
+
+ bool isVectorAssociativeAndCommutative(const MachineInstr &MI,
+ bool Invert = false) const;
+ bool areRVVInstsReassociable(const MachineInstr &MI1,
+ const MachineInstr &MI2) const;
+ bool hasReassociableVectorSibling(const MachineInstr &Inst,
+ bool &Commuted) const;
};
namespace RISCV {
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoC.td b/llvm/lib/Target/RISCV/RISCVInstrInfoC.td
index 18d38348f721..f4e50d7aa45c 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoC.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoC.td
@@ -841,7 +841,7 @@ def : InstAlias<".insn_cj $opcode, $funct3, $imm11",
//===----------------------------------------------------------------------===//
// Patterns are defined in the same order the compressed instructions appear
-// on page 82 of the ISA manual.
+// under the "RVC Instruction Set Listings" section of the ISA manual.
// Quadrant 0
let Predicates = [HasStdExtCOrZca] in {
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
index e9715b40adc0..fc60a9cc7cd3 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
@@ -2146,8 +2146,9 @@ multiclass VPseudoBinaryRoundingMode<VReg RetClass,
string Constraint = "",
int sew = 0,
int UsesVXRM = 1,
- int TargetConstraintType = 1> {
- let VLMul = MInfo.value, SEW=sew in {
+ int TargetConstraintType = 1,
+ bit Commutable = 0> {
+ let VLMul = MInfo.value, SEW=sew, isCommutable = Commutable in {
defvar suffix = !if(sew, "_" # MInfo.MX # "_E" # sew, "_" # MInfo.MX);
def suffix : VPseudoBinaryNoMaskRoundingMode<RetClass, Op1Class, Op2Class,
Constraint, UsesVXRM,
@@ -2232,8 +2233,9 @@ multiclass VPseudoBinaryV_VV<LMULInfo m, string Constraint = "", int sew = 0, bi
defm _VV : VPseudoBinary<m.vrclass, m.vrclass, m.vrclass, m, Constraint, sew, Commutable=Commutable>;
}
-multiclass VPseudoBinaryV_VV_RM<LMULInfo m, string Constraint = ""> {
- defm _VV : VPseudoBinaryRoundingMode<m.vrclass, m.vrclass, m.vrclass, m, Constraint>;
+multiclass VPseudoBinaryV_VV_RM<LMULInfo m, string Constraint = "", bit Commutable = 0> {
+ defm _VV : VPseudoBinaryRoundingMode<m.vrclass, m.vrclass, m.vrclass, m, Constraint,
+ Commutable=Commutable>;
}
// Similar to VPseudoBinaryV_VV, but uses MxListF.
@@ -2715,10 +2717,11 @@ multiclass VPseudoVGTR_VV_VX_VI<Operand ImmType = simm5, string Constraint = "">
}
}
-multiclass VPseudoVSALU_VV_VX_VI<Operand ImmType = simm5, string Constraint = ""> {
+multiclass VPseudoVSALU_VV_VX_VI<Operand ImmType = simm5, string Constraint = "",
+ bit Commutable = 0> {
foreach m = MxList in {
defvar mx = m.MX;
- defm "" : VPseudoBinaryV_VV<m, Constraint>,
+ defm "" : VPseudoBinaryV_VV<m, Constraint, Commutable=Commutable>,
SchedBinary<"WriteVSALUV", "ReadVSALUV", "ReadVSALUX", mx,
forceMergeOpRead=true>;
defm "" : VPseudoBinaryV_VX<m, Constraint>,
@@ -2788,7 +2791,7 @@ multiclass VPseudoVSALU_VV_VX {
multiclass VPseudoVSMUL_VV_VX_RM {
foreach m = MxList in {
defvar mx = m.MX;
- defm "" : VPseudoBinaryV_VV_RM<m>,
+ defm "" : VPseudoBinaryV_VV_RM<m, Commutable=1>,
SchedBinary<"WriteVSMulV", "ReadVSMulV", "ReadVSMulV", mx,
forceMergeOpRead=true>;
defm "" : VPseudoBinaryV_VX_RM<m>,
@@ -2797,10 +2800,10 @@ multiclass VPseudoVSMUL_VV_VX_RM {
}
}
-multiclass VPseudoVAALU_VV_VX_RM {
+multiclass VPseudoVAALU_VV_VX_RM<bit Commutable = 0> {
foreach m = MxList in {
defvar mx = m.MX;
- defm "" : VPseudoBinaryV_VV_RM<m>,
+ defm "" : VPseudoBinaryV_VV_RM<m, Commutable=Commutable>,
SchedBinary<"WriteVAALUV", "ReadVAALUV", "ReadVAALUV", mx,
forceMergeOpRead=true>;
defm "" : VPseudoBinaryV_VX_RM<m>,
@@ -6448,8 +6451,8 @@ defm PseudoVMV_V : VPseudoUnaryVMV_V_X_I;
// 12.1. Vector Single-Width Saturating Add and Subtract
//===----------------------------------------------------------------------===//
let Defs = [VXSAT], hasSideEffects = 1 in {
- defm PseudoVSADDU : VPseudoVSALU_VV_VX_VI;
- defm PseudoVSADD : VPseudoVSALU_VV_VX_VI;
+ defm PseudoVSADDU : VPseudoVSALU_VV_VX_VI<Commutable=1>;
+ defm PseudoVSADD : VPseudoVSALU_VV_VX_VI<Commutable=1>;
defm PseudoVSSUBU : VPseudoVSALU_VV_VX;
defm PseudoVSSUB : VPseudoVSALU_VV_VX;
}
@@ -6457,8 +6460,8 @@ let Defs = [VXSAT], hasSideEffects = 1 in {
//===----------------------------------------------------------------------===//
// 12.2. Vector Single-Width Averaging Add and Subtract
//===----------------------------------------------------------------------===//
-defm PseudoVAADDU : VPseudoVAALU_VV_VX_RM;
-defm PseudoVAADD : VPseudoVAALU_VV_VX_RM;
+defm PseudoVAADDU : VPseudoVAALU_VV_VX_RM<Commutable=1>;
+defm PseudoVAADD : VPseudoVAALU_VV_VX_RM<Commutable=1>;
defm PseudoVASUBU : VPseudoVAALU_VV_VX_RM;
defm PseudoVASUB : VPseudoVAALU_VV_VX_RM;
diff --git a/llvm/lib/Target/RISCV/RISCVProfiles.td b/llvm/lib/Target/RISCV/RISCVProfiles.td
new file mode 100644
index 000000000000..5c13710faf65
--- /dev/null
+++ b/llvm/lib/Target/RISCV/RISCVProfiles.td
@@ -0,0 +1,204 @@
+//===------ RISCVProfiles.td - RISC-V Profiles -------------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+class RISCVProfile<string name, list<SubtargetFeature> features>
+ : SubtargetFeature<name, "Is" # NAME, "true",
+ "RISC-V " # name # " profile", features>;
+
+defvar RVI20U32Features = [Feature32Bit, FeatureStdExtI];
+defvar RVI20U64Features = [Feature64Bit, FeatureStdExtI];
+
+defvar RVA20U64Features = [Feature64Bit,
+ FeatureStdExtI,
+ FeatureStdExtM,
+ FeatureStdExtA,
+ FeatureStdExtF,
+ FeatureStdExtD,
+ FeatureStdExtC,
+ FeatureStdExtZicntr,
+ FeatureStdExtZiccif,
+ FeatureStdExtZiccrse,
+ FeatureStdExtZiccamoa,
+ FeatureStdExtZa128rs,
+ FeatureStdExtZicclsm];
+
+defvar RVA20S64Features = !listconcat(RVA20U64Features,
+ [FeatureStdExtZifencei,
+ FeatureStdExtSvbare,
+ FeatureStdExtSvade,
+ FeatureStdExtSsccptr,
+ FeatureStdExtSstvecd,
+ FeatureStdExtSstvala]);
+
+defvar RVA22U64Features = [Feature64Bit,
+ FeatureStdExtI,
+ FeatureStdExtM,
+ FeatureStdExtA,
+ FeatureStdExtF,
+ FeatureStdExtD,
+ FeatureStdExtC,
+ FeatureStdExtZicntr,
+ FeatureStdExtZiccif,
+ FeatureStdExtZiccrse,
+ FeatureStdExtZiccamoa,
+ FeatureStdExtZicclsm,
+ FeatureStdExtZa64rs,
+ FeatureStdExtZihpm,
+ FeatureStdExtZihintpause,
+ FeatureStdExtZba,
+ FeatureStdExtZbb,
+ FeatureStdExtZbs,
+ FeatureStdExtZic64b,
+ FeatureStdExtZicbom,
+ FeatureStdExtZicbop,
+ FeatureStdExtZicboz,
+ FeatureStdExtZfhmin,
+ FeatureStdExtZkt];
+
+defvar RVA22S64Features = !listconcat(RVA22U64Features,
+ [FeatureStdExtZifencei,
+ FeatureStdExtSvbare,
+ FeatureStdExtSvade,
+ FeatureStdExtSsccptr,
+ FeatureStdExtSstvecd,
+ FeatureStdExtSstvala,
+ FeatureStdExtSscounterenw,
+ FeatureStdExtSvpbmt,
+ FeatureStdExtSvinval]);
+
+defvar RVA23U64Features = [Feature64Bit,
+ FeatureStdExtI,
+ FeatureStdExtM,
+ FeatureStdExtA,
+ FeatureStdExtF,
+ FeatureStdExtD,
+ FeatureStdExtC,
+ FeatureStdExtZicntr,
+ FeatureStdExtZihpm,
+ FeatureStdExtZiccif,
+ FeatureStdExtZiccrse,
+ FeatureStdExtZiccamoa,
+ FeatureStdExtZicclsm,
+ FeatureStdExtZa64rs,
+ FeatureStdExtZihintpause,
+ FeatureStdExtZba,
+ FeatureStdExtZbb,
+ FeatureStdExtZbs,
+ FeatureStdExtZic64b,
+ FeatureStdExtZicbom,
+ FeatureStdExtZicbop,
+ FeatureStdExtZicboz,
+ FeatureStdExtZfhmin,
+ FeatureStdExtZkt,
+ FeatureStdExtV,
+ FeatureStdExtZvfhmin,
+ FeatureStdExtZvbb,
+ FeatureStdExtZvkt,
+ FeatureStdExtZihintntl,
+ FeatureStdExtZicond,
+ FeatureStdExtZimop,
+ FeatureStdExtZcmop,
+ FeatureStdExtZcb,
+ FeatureStdExtZfa,
+ FeatureStdExtZawrs];
+
+defvar RVA23S64Features = !listconcat(RVA23U64Features,
+ [FeatureStdExtZifencei,
+ FeatureStdExtSvbare,
+ FeatureStdExtSvade,
+ FeatureStdExtSsccptr,
+ FeatureStdExtSstvecd,
+ FeatureStdExtSstvala,
+ FeatureStdExtSscounterenw,
+ FeatureStdExtSvpbmt,
+ FeatureStdExtSvinval,
+ FeatureStdExtSvnapot,
+ FeatureStdExtSstc,
+ FeatureStdExtSscofpmf,
+ FeatureStdExtSsnpm,
+ FeatureStdExtSsu64xl,
+ FeatureStdExtH,
+ FeatureStdExtSsstateen,
+ FeatureStdExtShcounterenw,
+ FeatureStdExtShvstvala,
+ FeatureStdExtShtvala,
+ FeatureStdExtShvstvecd,
+ FeatureStdExtShvsatpa,
+ FeatureStdExtShgatpa]);
+
+defvar RVB23U64Features = [Feature64Bit,
+ FeatureStdExtI,
+ FeatureStdExtM,
+ FeatureStdExtA,
+ FeatureStdExtF,
+ FeatureStdExtD,
+ FeatureStdExtC,
+ FeatureStdExtZicntr,
+ FeatureStdExtZihpm,
+ FeatureStdExtZiccif,
+ FeatureStdExtZiccrse,
+ FeatureStdExtZiccamoa,
+ FeatureStdExtZicclsm,
+ FeatureStdExtZa64rs,
+ FeatureStdExtZihintpause,
+ FeatureStdExtZba,
+ FeatureStdExtZbb,
+ FeatureStdExtZbs,
+ FeatureStdExtZic64b,
+ FeatureStdExtZicbom,
+ FeatureStdExtZicbop,
+ FeatureStdExtZicboz,
+ FeatureStdExtZkt,
+ FeatureStdExtZihintntl,
+ FeatureStdExtZicond,
+ FeatureStdExtZimop,
+ FeatureStdExtZcmop,
+ FeatureStdExtZcb,
+ FeatureStdExtZfa,
+ FeatureStdExtZawrs];
+
+defvar RVB23S64Features = !listconcat(RVB23U64Features,
+ [FeatureStdExtZifencei,
+ FeatureStdExtSvnapot,
+ FeatureStdExtSvbare,
+ FeatureStdExtSvade,
+ FeatureStdExtSsccptr,
+ FeatureStdExtSstvecd,
+ FeatureStdExtSstvala,
+ FeatureStdExtSscounterenw,
+ FeatureStdExtSvpbmt,
+ FeatureStdExtSvinval,
+ FeatureStdExtSstc,
+ FeatureStdExtSscofpmf,
+ FeatureStdExtSsu64xl]);
+
+defvar RVM23U32Features = [Feature32Bit,
+ FeatureStdExtI,
+ FeatureStdExtM,
+ FeatureStdExtZba,
+ FeatureStdExtZbb,
+ FeatureStdExtZbs,
+ FeatureStdExtZicond,
+ FeatureStdExtZihintpause,
+ FeatureStdExtZihintntl,
+ FeatureStdExtZce,
+ FeatureStdExtZicbop,
+ FeatureStdExtZimop,
+ FeatureStdExtZcmop];
+
+def RVI20U32 : RISCVProfile<"rvi20u32", RVI20U32Features>;
+def RVI20U64 : RISCVProfile<"rvi20u64", RVI20U64Features>;
+def RVA20U64 : RISCVProfile<"rva20u64", RVA20U64Features>;
+def RVA20S64 : RISCVProfile<"rva20s64", RVA20S64Features>;
+def RVA22U64 : RISCVProfile<"rva22u64", RVA22U64Features>;
+def RVA22S64 : RISCVProfile<"rva22s64", RVA22S64Features>;
+def RVA23U64 : RISCVProfile<"rva23u64", RVA23U64Features>;
+def RVA23S64 : RISCVProfile<"rva23s64", RVA23S64Features>;
+def RVB23U64 : RISCVProfile<"rvb23u64", RVB23U64Features>;
+def RVB23S64 : RISCVProfile<"rvb23s64", RVB23S64Features>;
+def RVM23U32 : RISCVProfile<"rvm23u32", RVM23U32Features>;
diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp
index 4b07d7e61fa1..7439d0fefa98 100644
--- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp
@@ -189,6 +189,10 @@ lookupBuiltin(StringRef DemangledCall,
std::string BuiltinName =
DemangledCall.substr(0, DemangledCall.find('(')).str();
+ // Account for the possible "__spirv_ocl_" prefix in SPIR-V friendly LLVM IR.
+ if (BuiltinName.rfind("__spirv_ocl_", 0) == 0)
+ BuiltinName = BuiltinName.substr(12);
+
// Check if the extracted name contains type information between angle
// brackets. If so, the builtin is an instantiated template - needs to have
// the information after angle brackets and return type removed.
@@ -2008,6 +2012,13 @@ static bool generateAsyncCopy(const SPIRV::IncomingCall *Call,
const SPIRV::DemangledBuiltin *Builtin = Call->Builtin;
unsigned Opcode =
SPIRV::lookupNativeBuiltin(Builtin->Name, Builtin->Set)->Opcode;
+
+ bool IsSet = Opcode == SPIRV::OpGroupAsyncCopy;
+ Register TypeReg = GR->getSPIRVTypeID(Call->ReturnType);
+ if (Call->isSpirvOp())
+ return buildOpFromWrapper(MIRBuilder, Opcode, Call,
+ IsSet ? TypeReg : Register(0));
+
auto Scope = buildConstantIntReg(SPIRV::Scope::Workgroup, MIRBuilder, GR);
switch (Opcode) {
@@ -2306,7 +2317,7 @@ Type *parseBuiltinCallArgumentBaseType(const StringRef DemangledCall,
// parseBuiltinCallArgumentBaseType(...) as this function only retrieves the
// base types.
if (TypeStr.ends_with("*"))
- TypeStr = TypeStr.slice(0, TypeStr.find_first_of(" "));
+ TypeStr = TypeStr.slice(0, TypeStr.find_first_of(" *"));
return parseBuiltinTypeNameToTargetExtType("opencl." + TypeStr.str() + "_t",
Ctx);
diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td
index 660000fb548d..564028547821 100644
--- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td
+++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td
@@ -585,9 +585,9 @@ defm : DemangledNativeBuiltin<"__spirv_SpecConstantComposite", OpenCL_std, SpecC
// Async Copy and Prefetch builtin records:
defm : DemangledNativeBuiltin<"async_work_group_copy", OpenCL_std, AsyncCopy, 4, 4, OpGroupAsyncCopy>;
-defm : DemangledNativeBuiltin<"__spirv_GroupAsyncCopy", OpenCL_std, AsyncCopy, 4, 4, OpGroupAsyncCopy>;
+defm : DemangledNativeBuiltin<"__spirv_GroupAsyncCopy", OpenCL_std, AsyncCopy, 6, 6, OpGroupAsyncCopy>;
defm : DemangledNativeBuiltin<"wait_group_events", OpenCL_std, AsyncCopy, 2, 2, OpGroupWaitEvents>;
-defm : DemangledNativeBuiltin<"__spirv_GroupWaitEvents", OpenCL_std, AsyncCopy, 2, 2, OpGroupWaitEvents>;
+defm : DemangledNativeBuiltin<"__spirv_GroupWaitEvents", OpenCL_std, AsyncCopy, 3, 3, OpGroupWaitEvents>;
// Load and store builtin records:
defm : DemangledNativeBuiltin<"__spirv_Load", OpenCL_std, LoadStore, 1, 3, OpLoad>;
diff --git a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
index 472bc8638c9a..0d539b1ed9a8 100644
--- a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
@@ -98,6 +98,8 @@ class SPIRVEmitIntrinsics
return B.CreateIntrinsic(IntrID, {Types}, Args);
}
+ void buildAssignPtr(IRBuilder<> &B, Type *ElemTy, Value *Arg);
+
void replaceMemInstrUses(Instruction *Old, Instruction *New, IRBuilder<> &B);
void processInstrAfterVisit(Instruction *I, IRBuilder<> &B);
void insertAssignPtrTypeIntrs(Instruction *I, IRBuilder<> &B);
@@ -111,6 +113,7 @@ class SPIRVEmitIntrinsics
void insertPtrCastOrAssignTypeInstr(Instruction *I, IRBuilder<> &B);
void processGlobalValue(GlobalVariable &GV, IRBuilder<> &B);
void processParamTypes(Function *F, IRBuilder<> &B);
+ void processParamTypesByFunHeader(Function *F, IRBuilder<> &B);
Type *deduceFunParamElementType(Function *F, unsigned OpIdx);
Type *deduceFunParamElementType(Function *F, unsigned OpIdx,
std::unordered_set<Function *> &FVisited);
@@ -194,6 +197,17 @@ static inline void reportFatalOnTokenType(const Instruction *I) {
false);
}
+void SPIRVEmitIntrinsics::buildAssignPtr(IRBuilder<> &B, Type *ElemTy,
+ Value *Arg) {
+ CallInst *AssignPtrTyCI =
+ buildIntrWithMD(Intrinsic::spv_assign_ptr_type, {Arg->getType()},
+ Constant::getNullValue(ElemTy), Arg,
+ {B.getInt32(getPointerAddressSpace(Arg->getType()))}, B);
+ GR->addDeducedElementType(AssignPtrTyCI, ElemTy);
+ GR->addDeducedElementType(Arg, ElemTy);
+ AssignPtrTypeInstr[Arg] = AssignPtrTyCI;
+}
+
// Sets the element pointer type to the given value of ValueTy and tries to
// specify this type further (recursively) by the Operand value, if needed.
Type *SPIRVEmitIntrinsics::deduceElementTypeByValueDeep(
@@ -232,6 +246,19 @@ Type *SPIRVEmitIntrinsics::deduceElementTypeByUsersDeep(
return nullptr;
}
+// Implements what we know in advance about intrinsics and builtin calls.
+// TODO: consider generalizing this particular case by encoding knowledge
+// about intrinsics and builtin calls via corresponding specification rules.
+static Type *getPointeeTypeByCallInst(StringRef DemangledName,
+ Function *CalledF, unsigned OpIdx) {
+ if ((DemangledName.starts_with("__spirv_ocl_printf(") ||
+ DemangledName.starts_with("printf(")) &&
+ OpIdx == 0)
+ return IntegerType::getInt8Ty(CalledF->getContext());
+ return nullptr;
+}
+
// Deduce and return a successfully deduced Type of the Instruction,
// or nullptr otherwise.
Type *SPIRVEmitIntrinsics::deduceElementTypeHelper(Value *I) {
@@ -795,6 +822,8 @@ void SPIRVEmitIntrinsics::insertPtrCastOrAssignTypeInstr(Instruction *I,
return;
// collect information about formal parameter types
+ std::string DemangledName =
+ getOclOrSpirvBuiltinDemangledName(CI->getCalledFunction()->getName());
Function *CalledF = CI->getCalledFunction();
SmallVector<Type *, 4> CalledArgTys;
bool HaveTypes = false;
@@ -811,10 +840,15 @@ void SPIRVEmitIntrinsics::insertPtrCastOrAssignTypeInstr(Instruction *I,
if (!ElemTy && hasPointeeTypeAttr(CalledArg))
ElemTy = getPointeeTypeByAttr(CalledArg);
if (!ElemTy) {
- for (User *U : CalledArg->users()) {
- if (Instruction *Inst = dyn_cast<Instruction>(U)) {
- if ((ElemTy = deduceElementTypeHelper(Inst)) != nullptr)
- break;
+ ElemTy = getPointeeTypeByCallInst(DemangledName, CalledF, OpIdx);
+ if (ElemTy) {
+ GR->addDeducedElementType(CalledArg, ElemTy);
+ } else {
+ for (User *U : CalledArg->users()) {
+ if (Instruction *Inst = dyn_cast<Instruction>(U)) {
+ if ((ElemTy = deduceElementTypeHelper(Inst)) != nullptr)
+ break;
+ }
}
}
}
@@ -823,8 +857,6 @@ void SPIRVEmitIntrinsics::insertPtrCastOrAssignTypeInstr(Instruction *I,
}
}
- std::string DemangledName =
- getOclOrSpirvBuiltinDemangledName(CI->getCalledFunction()->getName());
if (DemangledName.empty() && !HaveTypes)
return;
@@ -835,8 +867,14 @@ void SPIRVEmitIntrinsics::insertPtrCastOrAssignTypeInstr(Instruction *I,
continue;
// Constants (nulls/undefs) are handled in insertAssignPtrTypeIntrs()
- if (!isa<Instruction>(ArgOperand) && !isa<Argument>(ArgOperand))
- continue;
+ if (!isa<Instruction>(ArgOperand) && !isa<Argument>(ArgOperand)) {
+ // However, we may have assumptions about the formal argument's type and
+ // may need to insert a ptr cast for the actual parameter of this call.
+ Argument *CalledArg = CalledF->getArg(OpIdx);
+ if (!GR->findDeducedElementType(CalledArg))
+ continue;
+ }
Type *ExpectedType =
OpIdx < CalledArgTys.size() ? CalledArgTys[OpIdx] : nullptr;
@@ -1102,9 +1140,13 @@ void SPIRVEmitIntrinsics::processInstrAfterVisit(Instruction *I,
(II->paramHasAttr(OpNo, Attribute::ImmArg))))
continue;
B.SetInsertPoint(I);
- auto *NewOp =
- buildIntrWithMD(Intrinsic::spv_track_constant,
- {Op->getType(), Op->getType()}, Op, Op, {}, B);
+ Value *OpTyVal = Op;
+ if (Op->getType()->isTargetExtTy())
+ OpTyVal = Constant::getNullValue(
+ IntegerType::get(I->getContext(), GR->getPointerSize()));
+ auto *NewOp = buildIntrWithMD(Intrinsic::spv_track_constant,
+ {Op->getType(), OpTyVal->getType()}, Op,
+ OpTyVal, {}, B);
I->setOperand(OpNo, NewOp);
}
}
@@ -1179,28 +1221,29 @@ Type *SPIRVEmitIntrinsics::deduceFunParamElementType(
return nullptr;
}
-void SPIRVEmitIntrinsics::processParamTypes(Function *F, IRBuilder<> &B) {
+void SPIRVEmitIntrinsics::processParamTypesByFunHeader(Function *F,
+ IRBuilder<> &B) {
B.SetInsertPointPastAllocas(F);
for (unsigned OpIdx = 0; OpIdx < F->arg_size(); ++OpIdx) {
Argument *Arg = F->getArg(OpIdx);
if (!isUntypedPointerTy(Arg->getType()))
continue;
+ Type *ElemTy = GR->findDeducedElementType(Arg);
+ if (!ElemTy && hasPointeeTypeAttr(Arg) &&
+ (ElemTy = getPointeeTypeByAttr(Arg)) != nullptr)
+ buildAssignPtr(B, ElemTy, Arg);
+ }
+}
+void SPIRVEmitIntrinsics::processParamTypes(Function *F, IRBuilder<> &B) {
+ B.SetInsertPointPastAllocas(F);
+ for (unsigned OpIdx = 0; OpIdx < F->arg_size(); ++OpIdx) {
+ Argument *Arg = F->getArg(OpIdx);
+ if (!isUntypedPointerTy(Arg->getType()))
+ continue;
Type *ElemTy = GR->findDeducedElementType(Arg);
- if (!ElemTy) {
- if (hasPointeeTypeAttr(Arg) &&
- (ElemTy = getPointeeTypeByAttr(Arg)) != nullptr) {
- GR->addDeducedElementType(Arg, ElemTy);
- } else if ((ElemTy = deduceFunParamElementType(F, OpIdx)) != nullptr) {
- CallInst *AssignPtrTyCI = buildIntrWithMD(
- Intrinsic::spv_assign_ptr_type, {Arg->getType()},
- Constant::getNullValue(ElemTy), Arg,
- {B.getInt32(getPointerAddressSpace(Arg->getType()))}, B);
- GR->addDeducedElementType(AssignPtrTyCI, ElemTy);
- GR->addDeducedElementType(Arg, ElemTy);
- AssignPtrTypeInstr[Arg] = AssignPtrTyCI;
- }
- }
+ if (!ElemTy && (ElemTy = deduceFunParamElementType(F, OpIdx)) != nullptr)
+ buildAssignPtr(B, ElemTy, Arg);
}
}
@@ -1217,6 +1260,8 @@ bool SPIRVEmitIntrinsics::runOnFunction(Function &Func) {
AggrConstTypes.clear();
AggrStores.clear();
+ processParamTypesByFunHeader(F, B);
+
// StoreInst's operand type can be changed during the next transformations,
// so we need to store it in the set. Also store already transformed types.
for (auto &I : instructions(Func)) {
diff --git a/llvm/lib/Target/SPIRV/SPIRVISelLowering.cpp b/llvm/lib/Target/SPIRV/SPIRVISelLowering.cpp
index b8296c3f6eea..96b4a570a26b 100644
--- a/llvm/lib/Target/SPIRV/SPIRVISelLowering.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVISelLowering.cpp
@@ -314,6 +314,16 @@ void SPIRVTargetLowering::finalizeLowering(MachineFunction &MF) const {
SPIRV::OpTypeBool))
MI.setDesc(STI.getInstrInfo()->get(SPIRV::OpLogicalNotEqual));
break;
+ case SPIRV::OpConstantI: {
+ SPIRVType *Type = GR.getSPIRVTypeForVReg(MI.getOperand(1).getReg());
+ if (Type->getOpcode() != SPIRV::OpTypeInt && MI.getOperand(2).isImm() &&
+ MI.getOperand(2).getImm() == 0) {
+ // Validate the null constant of a target extension type
+ MI.setDesc(STI.getInstrInfo()->get(SPIRV::OpConstantNull));
+ for (unsigned i = MI.getNumOperands() - 1; i > 1; --i)
+ MI.removeOperand(i);
+ }
+ } break;
}
}
}
diff --git a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp
index 9ee0b38d2233..84508fb5fe09 100644
--- a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp
@@ -38,7 +38,9 @@ public:
};
} // namespace
-static void addConstantsToTrack(MachineFunction &MF, SPIRVGlobalRegistry *GR) {
+static void
+addConstantsToTrack(MachineFunction &MF, SPIRVGlobalRegistry *GR,
+ DenseMap<MachineInstr *, Type *> &TargetExtConstTypes) {
MachineRegisterInfo &MRI = MF.getRegInfo();
DenseMap<MachineInstr *, Register> RegsAlreadyAddedToDT;
SmallVector<MachineInstr *, 10> ToErase, ToEraseComposites;
@@ -47,6 +49,7 @@ static void addConstantsToTrack(MachineFunction &MF, SPIRVGlobalRegistry *GR) {
if (!isSpvIntrinsic(MI, Intrinsic::spv_track_constant))
continue;
ToErase.push_back(&MI);
+ Register SrcReg = MI.getOperand(2).getReg();
auto *Const =
cast<Constant>(cast<ConstantAsMetadata>(
MI.getOperand(3).getMetadata()->getOperand(0))
@@ -54,14 +57,14 @@ static void addConstantsToTrack(MachineFunction &MF, SPIRVGlobalRegistry *GR) {
if (auto *GV = dyn_cast<GlobalValue>(Const)) {
Register Reg = GR->find(GV, &MF);
if (!Reg.isValid())
- GR->add(GV, &MF, MI.getOperand(2).getReg());
+ GR->add(GV, &MF, SrcReg);
else
RegsAlreadyAddedToDT[&MI] = Reg;
} else {
Register Reg = GR->find(Const, &MF);
if (!Reg.isValid()) {
if (auto *ConstVec = dyn_cast<ConstantDataVector>(Const)) {
- auto *BuildVec = MRI.getVRegDef(MI.getOperand(2).getReg());
+ auto *BuildVec = MRI.getVRegDef(SrcReg);
assert(BuildVec &&
BuildVec->getOpcode() == TargetOpcode::G_BUILD_VECTOR);
for (unsigned i = 0; i < ConstVec->getNumElements(); ++i) {
@@ -75,7 +78,13 @@ static void addConstantsToTrack(MachineFunction &MF, SPIRVGlobalRegistry *GR) {
BuildVec->getOperand(1 + i).setReg(ElemReg);
}
}
- GR->add(Const, &MF, MI.getOperand(2).getReg());
+ GR->add(Const, &MF, SrcReg);
+ if (Const->getType()->isTargetExtTy()) {
+ // Remember the association so that we can restore it when assigning types.
+ MachineInstr *SrcMI = MRI.getVRegDef(SrcReg);
+ if (SrcMI && SrcMI->getOpcode() == TargetOpcode::G_CONSTANT)
+ TargetExtConstTypes[SrcMI] = Const->getType();
+ }
} else {
RegsAlreadyAddedToDT[&MI] = Reg;
// This MI is unused and will be removed. If the MI uses
@@ -364,8 +373,10 @@ void processInstr(MachineInstr &MI, MachineIRBuilder &MIB,
}
} // namespace llvm
-static void generateAssignInstrs(MachineFunction &MF, SPIRVGlobalRegistry *GR,
- MachineIRBuilder MIB) {
+static void
+generateAssignInstrs(MachineFunction &MF, SPIRVGlobalRegistry *GR,
+ MachineIRBuilder MIB,
+ DenseMap<MachineInstr *, Type *> &TargetExtConstTypes) {
// Get access to information about available extensions
const SPIRVSubtarget *ST =
static_cast<const SPIRVSubtarget *>(&MIB.getMF().getSubtarget());
@@ -422,11 +433,14 @@ static void generateAssignInstrs(MachineFunction &MF, SPIRVGlobalRegistry *GR,
continue;
}
Type *Ty = nullptr;
- if (MI.getOpcode() == TargetOpcode::G_CONSTANT)
- Ty = MI.getOperand(1).getCImm()->getType();
- else if (MI.getOpcode() == TargetOpcode::G_FCONSTANT)
+ if (MI.getOpcode() == TargetOpcode::G_CONSTANT) {
+ auto TargetExtIt = TargetExtConstTypes.find(&MI);
+ Ty = TargetExtIt == TargetExtConstTypes.end()
+ ? MI.getOperand(1).getCImm()->getType()
+ : TargetExtIt->second;
+ } else if (MI.getOpcode() == TargetOpcode::G_FCONSTANT) {
Ty = MI.getOperand(1).getFPImm()->getType();
- else {
+ } else {
assert(MI.getOpcode() == TargetOpcode::G_BUILD_VECTOR);
Type *ElemTy = nullptr;
MachineInstr *ElemMI = MRI.getVRegDef(MI.getOperand(1).getReg());
@@ -616,10 +630,12 @@ bool SPIRVPreLegalizer::runOnMachineFunction(MachineFunction &MF) {
SPIRVGlobalRegistry *GR = ST.getSPIRVGlobalRegistry();
GR->setCurrentFunc(MF);
MachineIRBuilder MIB(MF);
- addConstantsToTrack(MF, GR);
+ // A registry of target extension constants.
+ DenseMap<MachineInstr *, Type *> TargetExtConstTypes;
+ addConstantsToTrack(MF, GR, TargetExtConstTypes);
foldConstantsIntoIntrinsics(MF);
insertBitcasts(MF, GR, MIB);
- generateAssignInstrs(MF, GR, MIB);
+ generateAssignInstrs(MF, GR, MIB, TargetExtConstTypes);
processSwitches(MF, GR, MIB);
processInstrsWithTypeFolding(MF, GR, MIB);
removeImplicitFallthroughs(MF, MIB);
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index 115f34fa7751..2da4431cf077 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -9631,7 +9631,7 @@ SDValue SystemZTargetLowering::lowerVECREDUCE_ADD(SDValue Op,
case 8:
case 16:
Op = DAG.getNode(SystemZISD::VSUM, DL, MVT::v4i32, Op, Zero);
- LLVM_FALLTHROUGH;
+ [[fallthrough]];
case 32:
case 64:
Op = DAG.getNode(SystemZISD::VSUM, DL, MVT::i128, Op,
diff --git a/llvm/lib/Target/WebAssembly/CMakeLists.txt b/llvm/lib/Target/WebAssembly/CMakeLists.txt
index f430be2653b4..1e83cbeac50d 100644
--- a/llvm/lib/Target/WebAssembly/CMakeLists.txt
+++ b/llvm/lib/Target/WebAssembly/CMakeLists.txt
@@ -19,6 +19,7 @@ add_llvm_target(WebAssemblyCodeGen
WebAssemblyArgumentMove.cpp
WebAssemblyAsmPrinter.cpp
WebAssemblyCFGStackify.cpp
+ WebAssemblyCleanCodeAfterTrap.cpp
WebAssemblyCFGSort.cpp
WebAssemblyDebugFixup.cpp
WebAssemblyDebugValueManager.cpp
diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTypeUtilities.cpp b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTypeUtilities.cpp
index b7b5b2a97c59..8ea02bd2ad1f 100644
--- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTypeUtilities.cpp
+++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTypeUtilities.cpp
@@ -18,24 +18,16 @@
using namespace llvm;
std::optional<wasm::ValType> WebAssembly::parseType(StringRef Type) {
- // FIXME: can't use StringSwitch because wasm::ValType doesn't have a
- // "invalid" value.
- if (Type == "i32")
- return wasm::ValType::I32;
- if (Type == "i64")
- return wasm::ValType::I64;
- if (Type == "f32")
- return wasm::ValType::F32;
- if (Type == "f64")
- return wasm::ValType::F64;
- if (Type == "v128" || Type == "i8x16" || Type == "i16x8" || Type == "i32x4" ||
- Type == "i64x2" || Type == "f32x4" || Type == "f64x2")
- return wasm::ValType::V128;
- if (Type == "funcref")
- return wasm::ValType::FUNCREF;
- if (Type == "externref")
- return wasm::ValType::EXTERNREF;
- return std::nullopt;
+ return llvm::StringSwitch<std::optional<wasm::ValType>>{Type}
+ .Case("i32", wasm::ValType::I32)
+ .Case("i64", wasm::ValType::I64)
+ .Case("f32", wasm::ValType::F32)
+ .Case("f64", wasm::ValType::F64)
+ .Cases("v128", "i8x16", "i16x8", "i32x4", "i64x2", "f32x4", "f64x2",
+ wasm::ValType::V128)
+ .Case("funcref", wasm::ValType::FUNCREF)
+ .Case("externref", wasm::ValType::EXTERNREF)
+ .Default(std::nullopt);
}
WebAssembly::BlockType WebAssembly::parseBlockType(StringRef Type) {
diff --git a/llvm/lib/Target/WebAssembly/WebAssembly.h b/llvm/lib/Target/WebAssembly/WebAssembly.h
index 1c40addb6d6f..7fc8546248f1 100644
--- a/llvm/lib/Target/WebAssembly/WebAssembly.h
+++ b/llvm/lib/Target/WebAssembly/WebAssembly.h
@@ -37,6 +37,7 @@ FunctionPass *createWebAssemblyISelDag(WebAssemblyTargetMachine &TM,
CodeGenOptLevel OptLevel);
FunctionPass *createWebAssemblyArgumentMove();
FunctionPass *createWebAssemblySetP2AlignOperands();
+FunctionPass *createWebAssemblyCleanCodeAfterTrap();
// Late passes.
FunctionPass *createWebAssemblyReplacePhysRegs();
@@ -63,6 +64,7 @@ void initializeOptimizeReturnedPass(PassRegistry &);
void initializeWebAssemblyRefTypeMem2LocalPass(PassRegistry &);
void initializeWebAssemblyAddMissingPrototypesPass(PassRegistry &);
void initializeWebAssemblyArgumentMovePass(PassRegistry &);
+void initializeWebAssemblyCleanCodeAfterTrapPass(PassRegistry &);
void initializeWebAssemblyCFGSortPass(PassRegistry &);
void initializeWebAssemblyCFGStackifyPass(PassRegistry &);
void initializeWebAssemblyDAGToDAGISelPass(PassRegistry &);
diff --git a/llvm/lib/Target/WebAssembly/WebAssembly.td b/llvm/lib/Target/WebAssembly/WebAssembly.td
index d538197450b6..f00974531209 100644
--- a/llvm/lib/Target/WebAssembly/WebAssembly.td
+++ b/llvm/lib/Target/WebAssembly/WebAssembly.td
@@ -28,6 +28,9 @@ def FeatureSIMD128 : SubtargetFeature<"simd128", "SIMDLevel", "SIMD128",
def FeatureRelaxedSIMD : SubtargetFeature<"relaxed-simd", "SIMDLevel", "RelaxedSIMD",
"Enable relaxed-simd instructions">;
+def FeatureHalfPrecision : SubtargetFeature<"half-precision", "HasHalfPrecision", "true",
+ "Enable half precision instructions">;
+
def FeatureAtomics : SubtargetFeature<"atomics", "HasAtomics", "true",
"Enable Atomics">;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyCleanCodeAfterTrap.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyCleanCodeAfterTrap.cpp
new file mode 100644
index 000000000000..e5cba3c48547
--- /dev/null
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyCleanCodeAfterTrap.cpp
@@ -0,0 +1,80 @@
+//===-- WebAssemblyCleanCodeAfterTrap.cpp - Clean Code After Trap ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file implements a pass that removes instructions after a trap.
+/// ``llvm.trap`` is lowered to wasm's ``unreachable``, which is a terminator.
+/// Instructions following a terminator would fail wasm validation.
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssembly.h"
+#include "WebAssemblyUtilities.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-clean-code-after-trap"
+
+namespace {
+class WebAssemblyCleanCodeAfterTrap final : public MachineFunctionPass {
+public:
+ static char ID; // Pass identification, replacement for typeid
+ WebAssemblyCleanCodeAfterTrap() : MachineFunctionPass(ID) {}
+
+ StringRef getPassName() const override {
+ return "WebAssembly Clean Code After Trap";
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+};
+} // end anonymous namespace
+
+char WebAssemblyCleanCodeAfterTrap::ID = 0;
+INITIALIZE_PASS(WebAssemblyCleanCodeAfterTrap, DEBUG_TYPE,
+ "WebAssembly Clean Code After Trap", false, false)
+
+FunctionPass *llvm::createWebAssemblyCleanCodeAfterTrap() {
+ return new WebAssemblyCleanCodeAfterTrap();
+}
+
+bool WebAssemblyCleanCodeAfterTrap::runOnMachineFunction(MachineFunction &MF) {
+ LLVM_DEBUG({
+ dbgs() << "********** CleanCodeAfterTrap **********\n"
+ << "********** Function: " << MF.getName() << '\n';
+ });
+
+ bool Changed = false;
+
+ for (MachineBasicBlock &BB : MF) {
+ bool HasTerminator = false;
+ llvm::SmallVector<MachineInstr *> RemoveMI{};
+ for (MachineInstr &MI : BB) {
+ if (HasTerminator)
+ RemoveMI.push_back(&MI);
+ if (MI.hasProperty(MCID::Trap) && MI.isTerminator())
+ HasTerminator = true;
+ }
+ if (!RemoveMI.empty()) {
+ Changed = true;
+ LLVM_DEBUG({
+ for (MachineInstr *MI : RemoveMI) {
+ llvm::dbgs() << "* remove ";
+ MI->print(llvm::dbgs());
+ }
+ });
+ for (MachineInstr *MI : RemoveMI)
+ MI->eraseFromParent();
+ }
+ }
+ return Changed;
+}
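The pass records doomed instructions first and erases them afterwards, because erasing a MachineInstr while range-iterating its block would invalidate the iterator. A standalone sketch of the same collect-then-erase idiom under that assumption (container and predicate are illustrative stand-ins):

```cpp
#include <list>
#include <vector>

// Drop everything after the first element matching IsTrapTerminator,
// keeping the terminator itself.
template <typename T, typename Pred>
void eraseAfterFirst(std::list<T> &Block, Pred IsTrapTerminator) {
  bool Seen = false;
  std::vector<typename std::list<T>::iterator> ToErase;
  for (auto It = Block.begin(); It != Block.end(); ++It) {
    if (Seen)
      ToErase.push_back(It); // record now, erase after the walk
    else if (IsTrapTerminator(*It))
      Seen = true;
  }
  for (auto It : ToErase)
    Block.erase(It);
}
```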
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
index 59ea9247bd86..7b57f8ce90e0 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
@@ -30,6 +30,10 @@ def HasRelaxedSIMD :
Predicate<"Subtarget->hasRelaxedSIMD()">,
AssemblerPredicate<(all_of FeatureRelaxedSIMD), "relaxed-simd">;
+def HasHalfPrecision :
+ Predicate<"Subtarget->hasHalfPrecision()">,
+ AssemblerPredicate<(all_of FeatureHalfPrecision), "half-precision">;
+
def HasAtomics :
Predicate<"Subtarget->hasAtomics()">,
AssemblerPredicate<(all_of FeatureAtomics), "atomics">;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h b/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h
index 85d02b087c78..8b0d9fa12de8 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h
+++ b/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h
@@ -50,6 +50,7 @@ class WebAssemblySubtarget final : public WebAssemblyGenSubtargetInfo {
bool HasReferenceTypes = false;
bool HasExtendedConst = false;
bool HasMultiMemory = false;
+ bool HasHalfPrecision = false;
/// What processor and OS we're targeting.
Triple TargetTriple;
@@ -93,6 +94,7 @@ public:
bool hasAddr64() const { return TargetTriple.isArch64Bit(); }
bool hasSIMD128() const { return SIMDLevel >= SIMD128; }
bool hasRelaxedSIMD() const { return SIMDLevel >= RelaxedSIMD; }
+ bool hasHalfPrecision() const { return HasHalfPrecision; }
bool hasAtomics() const { return HasAtomics; }
bool hasNontrappingFPToInt() const { return HasNontrappingFPToInt; }
bool hasSignExt() const { return HasSignExt; }
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
index cdd39eeb6bbb..de342e896573 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
@@ -512,6 +512,10 @@ bool WebAssemblyPassConfig::addInstSelector() {
// Eliminate range checks and add default targets to br_table instructions.
addPass(createWebAssemblyFixBrTableDefaults());
+ // `unreachable` is a terminator; non-terminator instructions after it are
+ // not allowed and would fail validation.
+ addPass(createWebAssemblyCleanCodeAfterTrap());
+
return false;
}
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
index c0a75e215a40..8e4015783641 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
@@ -1168,33 +1168,34 @@ inline int getMemoryOperandNo(uint64_t TSFlags) {
/// \returns true if the register is a XMM.
inline bool isXMMReg(unsigned RegNo) {
- assert(X86::XMM15 - X86::XMM0 == 15 &&
- "XMM0-15 registers are not continuous");
- assert(X86::XMM31 - X86::XMM16 == 15 &&
- "XMM16-31 registers are not continuous");
+ static_assert(X86::XMM15 - X86::XMM0 == 15,
+ "XMM0-15 registers are not continuous");
+ static_assert(X86::XMM31 - X86::XMM16 == 15,
+ "XMM16-31 registers are not continuous");
return (RegNo >= X86::XMM0 && RegNo <= X86::XMM15) ||
(RegNo >= X86::XMM16 && RegNo <= X86::XMM31);
}
/// \returns true if the register is a YMM.
inline bool isYMMReg(unsigned RegNo) {
- assert(X86::YMM15 - X86::YMM0 == 15 &&
- "YMM0-15 registers are not continuous");
- assert(X86::YMM31 - X86::YMM16 == 15 &&
- "YMM16-31 registers are not continuous");
+ static_assert(X86::YMM15 - X86::YMM0 == 15,
+ "YMM0-15 registers are not continuous");
+ static_assert(X86::YMM31 - X86::YMM16 == 15,
+ "YMM16-31 registers are not continuous");
return (RegNo >= X86::YMM0 && RegNo <= X86::YMM15) ||
(RegNo >= X86::YMM16 && RegNo <= X86::YMM31);
}
/// \returns true if the register is a ZMM.
inline bool isZMMReg(unsigned RegNo) {
- assert(X86::ZMM31 - X86::ZMM0 == 31 && "ZMM registers are not continuous");
+ static_assert(X86::ZMM31 - X86::ZMM0 == 31,
+ "ZMM registers are not continuous");
return RegNo >= X86::ZMM0 && RegNo <= X86::ZMM31;
}
/// \returns true if \p RegNo is an apx extended register.
inline bool isApxExtendedReg(unsigned RegNo) {
- assert(X86::R31WH - X86::R16 == 95 && "EGPRs are not continuous");
+ static_assert(X86::R31WH - X86::R16 == 95, "EGPRs are not continuous");
return RegNo >= X86::R16 && RegNo <= X86::R31WH;
}
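Since the register numbers compared here are compile-time enum constants, the checks can move to build time: a violated static_assert fails compilation instead of aborting at run time, and it still fires in release builds where assert compiles away. A minimal illustration (the enum is hypothetical):

```cpp
enum Reg { R0 = 0, R15 = 15 };

// Checked by the compiler; no code is emitted for it.
static_assert(R15 - R0 == 15, "register numbering changed");
```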
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index bb43cbe15f52..a811ce43422e 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -4441,10 +4441,8 @@ static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG,
static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
"Expected a 128/256/512-bit vector type");
-
- APInt Ones = APInt::getAllOnes(32);
unsigned NumElts = VT.getSizeInBits() / 32;
- SDValue Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
+ SDValue Vec = DAG.getAllOnesConstant(dl, MVT::getVectorVT(MVT::i32, NumElts));
return DAG.getBitcast(VT, Vec);
}
@@ -20394,14 +20392,16 @@ static SDValue matchTruncateWithPACK(unsigned &PackOpcode, EVT DstVT,
EVT SrcVT = In.getValueType();
EVT DstSVT = DstVT.getVectorElementType();
EVT SrcSVT = SrcVT.getVectorElementType();
+ unsigned NumDstEltBits = DstSVT.getSizeInBits();
+ unsigned NumSrcEltBits = SrcSVT.getSizeInBits();
// Check we have a truncation suited for PACKSS/PACKUS.
if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
(DstSVT == MVT::i8 || DstSVT == MVT::i16 || DstSVT == MVT::i32)))
return SDValue();
- assert(SrcSVT.getSizeInBits() > DstSVT.getSizeInBits() && "Bad truncation");
- unsigned NumStages = Log2_32(SrcSVT.getSizeInBits() / DstSVT.getSizeInBits());
+ assert(NumSrcEltBits > NumDstEltBits && "Bad truncation");
+ unsigned NumStages = Log2_32(NumSrcEltBits / NumDstEltBits);
// Truncation from 128-bit to vXi32 can be better handled with PSHUFD.
// Truncation to sub-64-bit vXi16 can be better handled with PSHUFD/PSHUFLW.
@@ -20422,8 +20422,7 @@ static SDValue matchTruncateWithPACK(unsigned &PackOpcode, EVT DstVT,
if (Subtarget.hasAVX512() && NumStages > 1)
return SDValue();
- unsigned NumSrcEltBits = SrcVT.getScalarSizeInBits();
- unsigned NumPackedSignBits = std::min<unsigned>(DstSVT.getSizeInBits(), 16);
+ unsigned NumPackedSignBits = std::min<unsigned>(NumDstEltBits, 16);
unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
// Truncate with PACKUS if we are truncating a vector with leading zero
@@ -20445,7 +20444,7 @@ static SDValue matchTruncateWithPACK(unsigned &PackOpcode, EVT DstVT,
// a sign splat (or AVX512 VPSRAQ support). ComputeNumSignBits struggles to
// see through BITCASTs later on and combines/simplifications can't then use
// it.
- if (DstSVT == MVT::i32 && NumSignBits != SrcSVT.getSizeInBits() &&
+ if (DstSVT == MVT::i32 && NumSignBits != NumSrcEltBits &&
!Subtarget.hasAVX512())
return SDValue();
@@ -24140,8 +24139,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
DAG.getConstant(1, DL, VT));
else
Neg = CmpOp0;
- SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
- Neg); // -(and (x, 0x1))
+ SDValue Mask = DAG.getNegative(Neg, DL, VT); // -(and (x, 0x1))
SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z
return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2); // And Op y
}
@@ -27896,7 +27894,7 @@ static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);
// Begin by bitcasting the input to byte vector, then split those bytes
- // into lo/hi nibbles and use the PSHUFB LUT to perform CLTZ on each of them.
+ // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
// If the hi input nibble is zero then we add both results together, otherwise
// we just take the hi result (by masking the lo result to zero before the
// add).
@@ -28150,9 +28148,8 @@ static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget,
// ABS(vXi64 X) --> VPBLENDVPD(X, 0-X, X).
if ((VT == MVT::v2i64 || VT == MVT::v4i64) && Subtarget.hasSSE41()) {
SDValue Src = Op.getOperand(0);
- SDValue Sub =
- DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Src);
- return DAG.getNode(X86ISD::BLENDV, DL, VT, Src, Sub, Src);
+ SDValue Neg = DAG.getNegative(Src, DL, VT);
+ return DAG.getNode(X86ISD::BLENDV, DL, VT, Src, Neg, Src);
}
if (VT.is256BitVector() && !Subtarget.hasInt256()) {
@@ -29373,10 +29370,8 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
// +ve/-ve Amt = shift left/right.
if (Subtarget.hasXOP() && (VT == MVT::v2i64 || VT == MVT::v4i32 ||
VT == MVT::v8i16 || VT == MVT::v16i8)) {
- if (Opc == ISD::SRL || Opc == ISD::SRA) {
- SDValue Zero = DAG.getConstant(0, dl, VT);
- Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
- }
+ if (Opc == ISD::SRL || Opc == ISD::SRA)
+ Amt = DAG.getNegative(Amt, dl, VT);
if (Opc == ISD::SHL || Opc == ISD::SRL)
return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
if (Opc == ISD::SRA)
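These hunks swap the open-coded `ISD::SUB` from zero for `SelectionDAG::getNegative`. Judging from the replaced lines, the helper is a drop-in for building `0 - V`; a sketch of the equivalence, not the in-tree implementation:

```cpp
// Before (open-coded):
//   SDValue Zero = DAG.getConstant(0, DL, VT);
//   SDValue Neg  = DAG.getNode(ISD::SUB, DL, VT, Zero, V);
// After (helper):
//   SDValue Neg  = DAG.getNegative(V, DL, VT);
```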
@@ -45272,7 +45267,7 @@ static SDValue commuteSelect(SDNode *N, SelectionDAG &DAG,
ISD::getSetCCInverse(cast<CondCodeSDNode>(Cond.getOperand(2))->get(),
Cond.getOperand(0).getValueType());
Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(), Cond.getOperand(0),
- Cond.getOperand(1), NewCC);
+ Cond.getOperand(1), NewCC);
return DAG.getSelect(DL, LHS.getValueType(), Cond, RHS, LHS);
}
diff --git a/llvm/lib/Target/X86/X86LowerTileCopy.cpp b/llvm/lib/Target/X86/X86LowerTileCopy.cpp
index e7afc49240e5..fd05e16ac1ce 100644
--- a/llvm/lib/Target/X86/X86LowerTileCopy.cpp
+++ b/llvm/lib/Target/X86/X86LowerTileCopy.cpp
@@ -20,6 +20,7 @@
#include "X86InstrBuilder.h"
#include "X86InstrInfo.h"
#include "X86Subtarget.h"
+#include "llvm/CodeGen/LiveRegUnits.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -72,10 +73,28 @@ FunctionPass *llvm::createX86LowerTileCopyPass() {
bool X86LowerTileCopy::runOnMachineFunction(MachineFunction &MF) {
const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
const X86InstrInfo *TII = ST.getInstrInfo();
+ const TargetRegisterInfo *TRI = ST.getRegisterInfo();
+ BitVector GR64Regs =
+ TRI->getAllocatableSet(MF, TRI->getRegClass(X86::GR64RegClassID));
+ BitVector TILERegs =
+ TRI->getAllocatableSet(MF, TRI->getRegClass(X86::TILERegClassID));
bool Changed = false;
for (MachineBasicBlock &MBB : MF) {
- for (MachineInstr &MI : llvm::make_early_inc_range(MBB)) {
+ // There won't be a tile copy if no tile register live in.
+ bool HasTileCopy = false;
+ for (const auto &LI : MBB.liveins()) {
+ if (TILERegs.test(LI.PhysReg)) {
+ HasTileCopy = true;
+ break;
+ }
+ }
+ if (!HasTileCopy)
+ continue;
+ LiveRegUnits UsedRegs(*TRI);
+ UsedRegs.addLiveOuts(MBB);
+ for (MachineInstr &MI : llvm::make_early_inc_range(reverse(MBB))) {
+ UsedRegs.stepBackward(MI);
if (!MI.isCopy())
continue;
MachineOperand &DstMO = MI.getOperand(0);
@@ -85,27 +104,41 @@ bool X86LowerTileCopy::runOnMachineFunction(MachineFunction &MF) {
if (!X86::TILERegClass.contains(DstReg, SrcReg))
continue;
- const TargetRegisterInfo *TRI = ST.getRegisterInfo();
// Allocate stack slot for tile register
unsigned Size = TRI->getSpillSize(X86::TILERegClass);
Align Alignment = TRI->getSpillAlign(X86::TILERegClass);
int TileSS = MF.getFrameInfo().CreateSpillStackObject(Size, Alignment);
- // Allocate stack slot for stride register
- Size = TRI->getSpillSize(X86::GR64RegClass);
- Alignment = TRI->getSpillAlign(X86::GR64RegClass);
- int StrideSS = MF.getFrameInfo().CreateSpillStackObject(Size, Alignment);
- // TODO: Pick a killed regiter to avoid save/reload. There is problem
- // to get live interval in this stage.
- Register GR64Cand = X86::RAX;
+ int StrideSS = 0;
+
+ // Pick a killed register to avoid a save/reload.
+ Register GR64Cand = X86::NoRegister;
+ for (auto RegT : GR64Regs.set_bits()) {
+ if (UsedRegs.available(RegT)) {
+ GR64Cand = RegT;
+ break;
+ }
+ }
const DebugLoc &DL = MI.getDebugLoc();
- // mov %rax (%sp)
- BuildMI(MBB, MI, DL, TII->get(X86::IMPLICIT_DEF), GR64Cand);
- addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::MOV64mr)), StrideSS)
- .addReg(GR64Cand);
- // mov 64 %rax
- BuildMI(MBB, MI, DL, TII->get(X86::MOV64ri), GR64Cand).addImm(64);
+ if (GR64Cand) {
+ // mov 64 %reg
+ BuildMI(MBB, MI, DL, TII->get(X86::MOV64ri), GR64Cand).addImm(64);
+ } else {
+ // No available register? Save RAX and reload it after use.
+
+ // Allocate stack slot for stride register
+ Size = TRI->getSpillSize(X86::GR64RegClass);
+ Alignment = TRI->getSpillAlign(X86::GR64RegClass);
+ StrideSS = MF.getFrameInfo().CreateSpillStackObject(Size, Alignment);
+
+ // mov %rax (%sp)
+ addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::MOV64mr)),
+ StrideSS)
+ .addReg(X86::RAX);
+ // mov 64 %rax
+ BuildMI(MBB, MI, DL, TII->get(X86::MOV64ri), X86::RAX).addImm(64);
+ }
// tilestored %tmm, (%sp, %idx)
#define GET_EGPR_IF_ENABLED(OPC) (ST.hasEGPR() ? OPC##_EVEX : OPC)
unsigned Opc = GET_EGPR_IF_ENABLED(X86::TILESTORED);
@@ -120,10 +153,12 @@ bool X86LowerTileCopy::runOnMachineFunction(MachineFunction &MF) {
#undef GET_EGPR_IF_ENABLED
NewMI = addFrameReference(BuildMI(MBB, MI, DL, TII->get(Opc), DstReg),
TileSS);
- // restore %rax
- // mov (%sp) %rax
- addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::MOV64rm), GR64Cand),
- StrideSS);
+ if (!GR64Cand) {
+ // restore %rax
+ // mov (%sp) %rax
+ addFrameReference(
+ BuildMI(MBB, MI, DL, TII->get(X86::MOV64rm), X86::RAX), StrideSS);
+ }
MI.eraseFromParent();
Changed = true;
}
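The scan works because LiveRegUnits is seeded with the block's live-outs and stepped backward, so at each instruction `available(Reg)` answers "is Reg free to clobber right here?". A self-contained sketch of the pattern using the same calls as the hunk (the query function itself is illustrative):

```cpp
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/LiveRegUnits.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"

using namespace llvm;

// Returns true if PhysReg is free immediately before Where.
static bool isFreeBefore(MachineBasicBlock &MBB, const MachineInstr &Where,
                         MCPhysReg PhysReg, const TargetRegisterInfo &TRI) {
  LiveRegUnits Used(TRI);
  Used.addLiveOuts(MBB);
  for (MachineInstr &MI : reverse(MBB)) {
    Used.stepBackward(MI); // liveness now reflects the point before MI
    if (&MI == &Where)
      return Used.available(PhysReg);
  }
  return false; // Where not found in MBB
}
```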
diff --git a/llvm/lib/Target/X86/X86RegisterInfo.cpp b/llvm/lib/Target/X86/X86RegisterInfo.cpp
index be0cf1596d0d..555ede9e9540 100644
--- a/llvm/lib/Target/X86/X86RegisterInfo.cpp
+++ b/llvm/lib/Target/X86/X86RegisterInfo.cpp
@@ -649,10 +649,11 @@ unsigned X86RegisterInfo::getNumSupportedRegs(const MachineFunction &MF) const {
// APX registers (R16-R31)
//
// and try to return the minimum number of registers supported by the target.
- assert((X86::R15WH + 1 == X86 ::YMM0) && (X86::YMM15 + 1 == X86::K0) &&
- (X86::K6_K7 + 1 == X86::TMMCFG) && (X86::TMM7 + 1 == X86::R16) &&
- (X86::R31WH + 1 == X86::NUM_TARGET_REGS) &&
- "Register number may be incorrect");
+ static_assert((X86::R15WH + 1 == X86::YMM0) && (X86::YMM15 + 1 == X86::K0) &&
+ (X86::K6_K7 + 1 == X86::TMMCFG) &&
+ (X86::TMM7 + 1 == X86::R16) &&
+ (X86::R31WH + 1 == X86::NUM_TARGET_REGS),
+ "Register number may be incorrect");
const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
if (ST.hasEGPR())
diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h
index a458b5f9ec8f..4d55a084b730 100644
--- a/llvm/lib/Target/X86/X86Subtarget.h
+++ b/llvm/lib/Target/X86/X86Subtarget.h
@@ -244,7 +244,8 @@ public:
// TODO: Currently we're always allowing widening on CPUs without VLX,
// because for many cases we don't have a better option.
bool canExtendTo512DQ() const {
- return hasAVX512() && (!hasVLX() || getPreferVectorWidth() >= 512);
+ return hasAVX512() && hasEVEX512() &&
+ (!hasVLX() || getPreferVectorWidth() >= 512);
}
bool canExtendTo512BW() const {
return hasBWI() && canExtendTo512DQ();
diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp
index d2c9bae97364..0a93b06f40c2 100644
--- a/llvm/lib/TargetParser/Host.cpp
+++ b/llvm/lib/TargetParser/Host.cpp
@@ -245,8 +245,11 @@ StringRef sys::detail::getHostCPUNameForARM(StringRef ProcCpuinfoContent) {
.Case("0xd4a", "neoverse-e1")
.Case("0xd0c", "neoverse-n1")
.Case("0xd49", "neoverse-n2")
+ .Case("0xd8e", "neoverse-n3")
.Case("0xd40", "neoverse-v1")
.Case("0xd4f", "neoverse-v2")
+ .Case("0xd84", "neoverse-v3")
+ .Case("0xd83", "neoverse-v3ae")
.Default("generic");
}
diff --git a/llvm/lib/TargetParser/RISCVISAInfo.cpp b/llvm/lib/TargetParser/RISCVISAInfo.cpp
index ea0b56b9a133..20182fb06037 100644
--- a/llvm/lib/TargetParser/RISCVISAInfo.cpp
+++ b/llvm/lib/TargetParser/RISCVISAInfo.cpp
@@ -50,41 +50,8 @@ static const char *RISCVGImplications[] = {
#define GET_SUPPORTED_EXTENSIONS
#include "llvm/TargetParser/RISCVTargetParserDef.inc"
-static constexpr RISCVProfile SupportedProfiles[] = {
- {"rvi20u32", "rv32i"},
- {"rvi20u64", "rv64i"},
- {"rva20u64", "rv64imafdc_ziccamoa_ziccif_zicclsm_ziccrse_zicntr_za128rs"},
- {"rva20s64", "rv64imafdc_ziccamoa_ziccif_zicclsm_ziccrse_zicntr_zifencei_"
- "za128rs_ssccptr_sstvala_sstvecd_svade_svbare"},
- {"rva22u64",
- "rv64imafdc_zic64b_zicbom_zicbop_zicboz_ziccamoa_ziccif_zicclsm_ziccrse_"
- "zicntr_zihintpause_zihpm_za64rs_zfhmin_zba_zbb_zbs_zkt"},
- {"rva22s64",
- "rv64imafdc_zic64b_zicbom_zicbop_zicboz_ziccamoa_ziccif_zicclsm_ziccrse_"
- "zicntr_zifencei_zihintpause_zihpm_za64rs_zfhmin_zba_zbb_zbs_zkt_ssccptr_"
- "sscounterenw_sstvala_sstvecd_svade_svbare_svinval_svpbmt"},
- {"rva23u64",
- "rv64imafdcv_zic64b_zicbom_zicbop_zicboz_ziccamoa_ziccif_zicclsm_ziccrse_"
- "zicntr_zicond_zihintntl_zihintpause_zihpm_zimop_za64rs_zawrs_zfa_zfhmin_"
- "zcb_zcmop_zba_zbb_zbs_zkt_zvbb_zvfhmin_zvkt"},
- {"rva23s64",
- "rv64imafdcvh_zic64b_zicbom_zicbop_zicboz_ziccamoa_ziccif_zicclsm_ziccrse_"
- "zicntr_zicond_zifencei_zihintntl_zihintpause_zihpm_zimop_za64rs_zawrs_"
- "zfa_zfhmin_zcb_zcmop_zba_zbb_zbs_zkt_zvbb_zvfhmin_zvkt_shcounterenw_"
- "shgatpa_shtvala_shvsatpa_shvstvala_shvstvecd_ssccptr_sscofpmf_"
- "sscounterenw_ssnpm0p8_ssstateen_sstc_sstvala_sstvecd_ssu64xl_svade_"
- "svbare_svinval_svnapot_svpbmt"},
- {"rvb23u64", "rv64imafdc_zic64b_zicbom_zicbop_zicboz_ziccamoa_ziccif_"
- "zicclsm_ziccrse_zicntr_zicond_zihintntl_zihintpause_zihpm_"
- "zimop_za64rs_zawrs_zfa_zcb_zcmop_zba_zbb_zbs_zkt"},
- {"rvb23s64",
- "rv64imafdc_zic64b_zicbom_zicbop_zicboz_ziccamoa_ziccif_zicclsm_ziccrse_"
- "zicntr_zicond_zifencei_zihintntl_zihintpause_zihpm_zimop_za64rs_zawrs_"
- "zfa_zcb_zcmop_zba_zbb_zbs_zkt_ssccptr_sscofpmf_sscounterenw_sstc_sstvala_"
- "sstvecd_ssu64xl_svade_svbare_svinval_svnapot_svpbmt"},
- {"rvm23u32", "rv32im_zicbop_zicond_zicsr_zihintntl_zihintpause_zimop_zca_"
- "zcb_zce_zcmop_zcmp_zcmt_zba_zbb_zbs"},
-};
+#define GET_SUPPORTED_PROFILES
+#include "llvm/TargetParser/RISCVTargetParserDef.inc"
static void verifyTables() {
#ifndef NDEBUG
@@ -112,7 +79,7 @@ void llvm::riscvExtensionsHelp(StringMap<StringRef> DescMap) {
outs() << "All available -march extensions for RISC-V\n\n";
PrintExtension("Name", "Version", (DescMap.empty() ? "" : "Description"));
- RISCVISAInfo::OrderedExtensionMap ExtMap;
+ RISCVISAUtils::OrderedExtensionMap ExtMap;
for (const auto &E : SupportedExtensions)
ExtMap[E.Name] = {E.Version.Major, E.Version.Minor};
for (const auto &E : ExtMap) {
@@ -672,7 +639,8 @@ RISCVISAInfo::parseArchString(StringRef Arch, bool EnableExperimentalExtension,
switch (Baseline) {
default:
return createStringError(errc::invalid_argument,
- "first letter should be 'e', 'i' or 'g'");
+ "first letter after \'" + Arch.slice(0, 4) +
+ "\' should be 'e', 'i' or 'g'");
case 'e':
case 'i':
break;
@@ -847,15 +815,21 @@ Error RISCVISAInfo::checkDependency() {
struct ImpliedExtsEntry {
StringLiteral Name;
- ArrayRef<const char *> Exts;
+ const char *ImpliedExt;
bool operator<(const ImpliedExtsEntry &Other) const {
return Name < Other.Name;
}
-
- bool operator<(StringRef Other) const { return Name < Other; }
};
+static bool operator<(const ImpliedExtsEntry &LHS, StringRef RHS) {
+ return LHS.Name < RHS;
+}
+
+static bool operator<(StringRef LHS, const ImpliedExtsEntry &RHS) {
+ return LHS < RHS.Name;
+}
+
#define GET_IMPLIED_EXTENSIONS
#include "llvm/TargetParser/RISCVTargetParserDef.inc"
@@ -880,18 +854,19 @@ void RISCVISAInfo::updateImplication() {
while (!WorkList.empty()) {
StringRef ExtName = WorkList.pop_back_val();
- auto I = llvm::lower_bound(ImpliedExts, ExtName);
- if (I != std::end(ImpliedExts) && I->Name == ExtName) {
- for (const char *ImpliedExt : I->Exts) {
- if (WorkList.count(ImpliedExt))
- continue;
- if (Exts.count(ImpliedExt))
- continue;
- auto Version = findDefaultVersion(ImpliedExt);
- addExtension(ImpliedExt, Version.value());
- WorkList.insert(ImpliedExt);
- }
- }
+ auto Range = std::equal_range(std::begin(ImpliedExts),
+ std::end(ImpliedExts), ExtName);
+ std::for_each(Range.first, Range.second,
+ [&](const ImpliedExtsEntry &Implied) {
+ const char *ImpliedExt = Implied.ImpliedExt;
+ if (WorkList.count(ImpliedExt))
+ return;
+ if (Exts.count(ImpliedExt))
+ return;
+ auto Version = findDefaultVersion(ImpliedExt);
+ addExtension(ImpliedExt, Version.value());
+ WorkList.insert(ImpliedExt);
+ });
}
// Add Zcf if Zce and F are enabled on RV32.
@@ -902,42 +877,34 @@ void RISCVISAInfo::updateImplication() {
}
}
-struct CombinedExtsEntry {
- StringLiteral CombineExt;
- ArrayRef<const char *> RequiredExts;
-};
-
-static constexpr CombinedExtsEntry CombineIntoExts[] = {
- {{"zk"}, {ImpliedExtsZk}},
- {{"zkn"}, {ImpliedExtsZkn}},
- {{"zks"}, {ImpliedExtsZks}},
- {{"zvkn"}, {ImpliedExtsZvkn}},
- {{"zvknc"}, {ImpliedExtsZvknc}},
- {{"zvkng"}, {ImpliedExtsZvkng}},
- {{"zvks"}, {ImpliedExtsZvks}},
- {{"zvksc"}, {ImpliedExtsZvksc}},
- {{"zvksg"}, {ImpliedExtsZvksg}},
+static constexpr StringLiteral CombineIntoExts[] = {
+ {"zk"}, {"zkn"}, {"zks"}, {"zvkn"}, {"zvknc"},
+ {"zvkng"}, {"zvks"}, {"zvksc"}, {"zvksg"},
};
void RISCVISAInfo::updateCombination() {
- bool IsNewCombine = false;
+ bool MadeChange = false;
do {
- IsNewCombine = false;
- for (CombinedExtsEntry CombineIntoExt : CombineIntoExts) {
- auto CombineExt = CombineIntoExt.CombineExt;
- auto RequiredExts = CombineIntoExt.RequiredExts;
+ MadeChange = false;
+ for (StringRef CombineExt : CombineIntoExts) {
if (hasExtension(CombineExt))
continue;
- bool IsAllRequiredFeatureExist = true;
- for (const char *Ext : RequiredExts)
- IsAllRequiredFeatureExist &= hasExtension(Ext);
- if (IsAllRequiredFeatureExist) {
+
+ // Look up the extension in the ImpliedExts table to find everything it
+ // depends on.
+ auto Range = std::equal_range(std::begin(ImpliedExts),
+ std::end(ImpliedExts), CombineExt);
+ bool HasAllRequiredFeatures = std::all_of(
+ Range.first, Range.second, [&](const ImpliedExtsEntry &Implied) {
+ return hasExtension(Implied.ImpliedExt);
+ });
+ if (HasAllRequiredFeatures) {
auto Version = findDefaultVersion(CombineExt);
addExtension(CombineExt, Version.value());
- IsNewCombine = true;
+ MadeChange = true;
}
}
- } while (IsNewCombine);
+ } while (MadeChange);
}
void RISCVISAInfo::updateFLen() {
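Both loops above rely on ImpliedExts being sorted by extension name and on the mixed-type `operator<` overloads, so `std::equal_range` can probe with a bare name and get back the contiguous run of rows for it. A minimal stand-in for the tablegen-emitted table (types and data are illustrative):

```cpp
#include <algorithm>
#include <cstring>

struct Row {
  const char *Name;
  const char *Implied;
};

static bool operator<(const Row &L, const char *R) {
  return std::strcmp(L.Name, R) < 0;
}
static bool operator<(const char *L, const Row &R) {
  return std::strcmp(L, R.Name) < 0;
}

// One row per implication; must stay sorted by Name.
static const Row Table[] = {{"zk", "zkn"}, {"zk", "zkr"}, {"zk", "zkt"}};

// All implications of "zk" come back as one contiguous range:
//   auto [First, Last] =
//       std::equal_range(std::begin(Table), std::end(Table), "zk");
```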
diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
index da714c9a7570..fbb83e787f63 100644
--- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
@@ -306,6 +306,10 @@ static bool CleanupConstantGlobalUsers(GlobalVariable *GV,
APInt Offset(DL.getIndexTypeSizeInBits(PtrOp->getType()), 0);
PtrOp = PtrOp->stripAndAccumulateConstantOffsets(
DL, Offset, /* AllowNonInbounds */ true);
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(PtrOp)) {
+ if (II->getIntrinsicID() == Intrinsic::threadlocal_address)
+ PtrOp = II->getArgOperand(0);
+ }
if (PtrOp == GV) {
if (auto *Value = ConstantFoldLoadFromConst(Init, Ty, Offset, DL)) {
LI->replaceAllUsesWith(Value);
@@ -318,6 +322,9 @@ static bool CleanupConstantGlobalUsers(GlobalVariable *GV,
} else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U)) { // memset/cpy/mv
if (getUnderlyingObject(MI->getRawDest()) == GV)
EraseFromParent(MI);
+ } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U)) {
+ if (II->getIntrinsicID() == Intrinsic::threadlocal_address)
+ append_range(WorkList, II->users());
}
}
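With these two additions, a constant thread-local no longer hides behind the intrinsic: the first hunk peels `llvm.threadlocal.address` off a load's pointer operand, and the second chases the intrinsic's users so stores and mem-intrinsics through it are cleaned up as well. A sketch of the IR shape that now folds (values illustrative):

```cpp
// @tls = thread_local constant i32 7
// %p = call ptr @llvm.threadlocal.address.p0(ptr @tls)
// %v = load i32, ptr %p    ; now constant-folds to 7
```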
diff --git a/llvm/lib/Transforms/IPO/SCCP.cpp b/llvm/lib/Transforms/IPO/SCCP.cpp
index f8920541e6fd..e591a8e73b1c 100644
--- a/llvm/lib/Transforms/IPO/SCCP.cpp
+++ b/llvm/lib/Transforms/IPO/SCCP.cpp
@@ -281,32 +281,21 @@ static bool runIPSCCP(
Function *F = I.first;
const ValueLatticeElement &ReturnValue = I.second;
- // If there is a known constant range for the return value, add !range
- // metadata to the function's call sites.
+ // If there is a known constant range for the return value, add range
+ // attribute to the return value.
if (ReturnValue.isConstantRange() &&
!ReturnValue.getConstantRange().isSingleElement()) {
// Do not add range metadata if the return value may include undef.
if (ReturnValue.isConstantRangeIncludingUndef())
continue;
+ // Do not touch the existing attribute for now.
+ // TODO: We should be able to take the intersection of the existing
+ // attribute and the inferred range.
+ if (F->hasRetAttribute(Attribute::Range))
+ continue;
auto &CR = ReturnValue.getConstantRange();
- for (User *User : F->users()) {
- auto *CB = dyn_cast<CallBase>(User);
- if (!CB || CB->getCalledFunction() != F)
- continue;
-
- // Do not touch existing metadata for now.
- // TODO: We should be able to take the intersection of the existing
- // metadata and the inferred range.
- if (CB->getMetadata(LLVMContext::MD_range))
- continue;
-
- LLVMContext &Context = CB->getParent()->getContext();
- Metadata *RangeMD[] = {
- ConstantAsMetadata::get(ConstantInt::get(Context, CR.getLower())),
- ConstantAsMetadata::get(ConstantInt::get(Context, CR.getUpper()))};
- CB->setMetadata(LLVMContext::MD_range, MDNode::get(Context, RangeMD));
- }
+ F->addRangeRetAttr(CR);
continue;
}
if (F->getReturnType()->isVoidTy())
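The interprocedural win is that one attribute on the function covers every call site, where the old scheme annotated each `CallBase` separately and skipped indirect uses. A minimal sketch of attaching such a range by hand, using the same `addRangeRetAttr` entry point as the hunk (function and bounds are illustrative):

```cpp
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Function.h"

void tagReturnRange(llvm::Function &F) {
  // Claims the return value always lies in the half-open range [0, 42).
  llvm::ConstantRange CR(llvm::APInt(32, 0), llvm::APInt(32, 42));
  F.addRangeRetAttr(CR);
}
```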
diff --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp
index 0b3a6931e779..6cbd138842c8 100644
--- a/llvm/lib/Transforms/IPO/SampleProfile.cpp
+++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp
@@ -252,20 +252,21 @@ static cl::opt<unsigned> PrecentMismatchForStalenessError(
static cl::opt<bool> CallsitePrioritizedInline(
"sample-profile-prioritized-inline", cl::Hidden,
-
cl::desc("Use call site prioritized inlining for sample profile loader."
"Currently only CSSPGO is supported."));
static cl::opt<bool> UsePreInlinerDecision(
"sample-profile-use-preinliner", cl::Hidden,
-
cl::desc("Use the preinliner decisions stored in profile context."));
static cl::opt<bool> AllowRecursiveInline(
"sample-profile-recursive-inline", cl::Hidden,
-
cl::desc("Allow sample loader inliner to inline recursive calls."));
+static cl::opt<bool> RemoveProbeAfterProfileAnnotation(
+ "sample-profile-remove-probe", cl::Hidden, cl::init(false),
+ cl::desc("Remove pseudo-probe after sample profile annotation."));
+
static cl::opt<std::string> ProfileInlineReplayFile(
"sample-profile-inline-replay", cl::init(""), cl::value_desc("filename"),
cl::desc(
@@ -518,6 +519,7 @@ protected:
void generateMDProfMetadata(Function &F);
bool rejectHighStalenessProfile(Module &M, ProfileSummaryInfo *PSI,
const SampleProfileMap &Profiles);
+ void removePseudoProbeInsts(Module &M);
/// Map from function name to Function *. Used to find the function from
/// the function name. If the function name contains suffix, additional
@@ -2127,6 +2129,20 @@ bool SampleProfileLoader::rejectHighStalenessProfile(
return false;
}
+void SampleProfileLoader::removePseudoProbeInsts(Module &M) {
+ for (auto &F : M) {
+ std::vector<Instruction *> InstsToDel;
+ for (auto &BB : F) {
+ for (auto &I : BB) {
+ if (isa<PseudoProbeInst>(&I))
+ InstsToDel.push_back(&I);
+ }
+ }
+ for (auto *I : InstsToDel)
+ I->eraseFromParent();
+ }
+}
+
bool SampleProfileLoader::runOnModule(Module &M, ModuleAnalysisManager *AM,
ProfileSummaryInfo *_PSI,
LazyCallGraph &CG) {
@@ -2196,6 +2212,9 @@ bool SampleProfileLoader::runOnModule(Module &M, ModuleAnalysisManager *AM,
notInlinedCallInfo)
updateProfileCallee(pair.first, pair.second.entryCount);
+ if (RemoveProbeAfterProfileAnnotation && FunctionSamples::ProfileIsProbeBased)
+ removePseudoProbeInsts(M);
+
return retval;
}
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index ea2cdadc84f1..51ac77348ed9 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -2001,43 +2001,30 @@ Value *InstCombinerImpl::OptimizePointerDifference(Value *LHS, Value *RHS,
if (!GEP1)
return nullptr;
- if (GEP2) {
- // (gep X, ...) - (gep X, ...)
- //
- // Avoid duplicating the arithmetic if there are more than one non-constant
- // indices between the two GEPs and either GEP has a non-constant index and
- // multiple users. If zero non-constant index, the result is a constant and
- // there is no duplication. If one non-constant index, the result is an add
- // or sub with a constant, which is no larger than the original code, and
- // there's no duplicated arithmetic, even if either GEP has multiple
- // users. If more than one non-constant indices combined, as long as the GEP
- // with at least one non-constant index doesn't have multiple users, there
- // is no duplication.
- unsigned NumNonConstantIndices1 = GEP1->countNonConstantIndices();
- unsigned NumNonConstantIndices2 = GEP2->countNonConstantIndices();
- if (NumNonConstantIndices1 + NumNonConstantIndices2 > 1 &&
- ((NumNonConstantIndices1 > 0 && !GEP1->hasOneUse()) ||
- (NumNonConstantIndices2 > 0 && !GEP2->hasOneUse()))) {
- return nullptr;
- }
- }
+ // To avoid duplicating the offset arithmetic, rewrite the GEP to use the
+ // computed offset. This may erase the original GEP, so be sure to cache the
+ // inbounds flag before emitting the offset.
+ // TODO: We should probably do this even if there is only one GEP.
+ bool RewriteGEPs = GEP2 != nullptr;
// Emit the offset of the GEP and an intptr_t.
- Value *Result = EmitGEPOffset(GEP1);
+ bool GEP1IsInBounds = GEP1->isInBounds();
+ Value *Result = EmitGEPOffset(GEP1, RewriteGEPs);
// If this is a single inbounds GEP and the original sub was nuw,
// then the final multiplication is also nuw.
if (auto *I = dyn_cast<Instruction>(Result))
- if (IsNUW && !GEP2 && !Swapped && GEP1->isInBounds() &&
+ if (IsNUW && !GEP2 && !Swapped && GEP1IsInBounds &&
I->getOpcode() == Instruction::Mul)
I->setHasNoUnsignedWrap();
// If we have a 2nd GEP of the same base pointer, subtract the offsets.
// If both GEPs are inbounds, then the subtract does not have signed overflow.
if (GEP2) {
- Value *Offset = EmitGEPOffset(GEP2);
+ bool GEP2IsInBounds = GEP2->isInBounds();
+ Value *Offset = EmitGEPOffset(GEP2, RewriteGEPs);
Result = Builder.CreateSub(Result, Offset, "gepdiff", /* NUW */ false,
- GEP1->isInBounds() && GEP2->isInBounds());
+ GEP1IsInBounds && GEP2IsInBounds);
}
// If we have p - gep(p, ...) then we have to negate the result.
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index 8ec1ed7529c1..ed9a89b14efc 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -3141,20 +3141,20 @@ Value *InstCombinerImpl::getSelectCondition(Value *A, Value *B,
return nullptr;
}
-/// We have an expression of the form (A & C) | (B & D). Try to simplify this
-/// to "A' ? C : D", where A' is a boolean or vector of booleans.
+/// We have an expression of the form (A & B) | (C & D). Try to simplify this
+/// to "A' ? B : D", where A' is a boolean or vector of booleans.
/// When InvertFalseVal is set to true, we try to match the pattern
-/// where we have peeked through a 'not' op and A and B are the same:
-/// (A & C) | ~(A | D) --> (A & C) | (~A & ~D) --> A' ? C : ~D
-Value *InstCombinerImpl::matchSelectFromAndOr(Value *A, Value *C, Value *B,
+/// where we have peeked through a 'not' op and A and C are the same:
+/// (A & B) | ~(A | D) --> (A & B) | (~A & ~D) --> A' ? B : ~D
+Value *InstCombinerImpl::matchSelectFromAndOr(Value *A, Value *B, Value *C,
Value *D, bool InvertFalseVal) {
// The potential condition of the select may be bitcasted. In that case, look
// through its bitcast and the corresponding bitcast of the 'not' condition.
Type *OrigType = A->getType();
A = peekThroughBitcast(A, true);
- B = peekThroughBitcast(B, true);
- if (Value *Cond = getSelectCondition(A, B, InvertFalseVal)) {
- // ((bc Cond) & C) | ((bc ~Cond) & D) --> bc (select Cond, (bc C), (bc D))
+ C = peekThroughBitcast(C, true);
+ if (Value *Cond = getSelectCondition(A, C, InvertFalseVal)) {
+ // ((bc Cond) & B) | ((bc ~Cond) & D) --> bc (select Cond, (bc B), (bc D))
// If this is a vector, we may need to cast to match the condition's length.
// The bitcasts will either all exist or all not exist. The builder will
// not create unnecessary casts if the types already match.
@@ -3168,11 +3168,11 @@ Value *InstCombinerImpl::matchSelectFromAndOr(Value *A, Value *C, Value *B,
Type *EltTy = Builder.getIntNTy(SelEltSize / Elts);
SelTy = VectorType::get(EltTy, VecTy->getElementCount());
}
- Value *BitcastC = Builder.CreateBitCast(C, SelTy);
+ Value *BitcastB = Builder.CreateBitCast(B, SelTy);
if (InvertFalseVal)
D = Builder.CreateNot(D);
Value *BitcastD = Builder.CreateBitCast(D, SelTy);
- Value *Select = Builder.CreateSelect(Cond, BitcastC, BitcastD);
+ Value *Select = Builder.CreateSelect(Cond, BitcastB, BitcastD);
return Builder.CreateBitCast(Select, OrigType);
}
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index e5652458f150..1913ef92c16c 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -3168,7 +3168,7 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
}
break;
}
- case Intrinsic::experimental_vector_reverse: {
+ case Intrinsic::vector_reverse: {
Value *BO0, *BO1, *X, *Y;
Value *Vec = II->getArgOperand(0);
if (match(Vec, m_OneUse(m_BinOp(m_Value(BO0), m_Value(BO1))))) {
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
index 1064340cb536..f66883de8dd5 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -6889,8 +6889,8 @@ static Instruction *foldVectorCmp(CmpInst &Cmp,
if (auto *I = dyn_cast<Instruction>(V))
I->copyIRFlags(&Cmp);
Module *M = Cmp.getModule();
- Function *F = Intrinsic::getDeclaration(
- M, Intrinsic::experimental_vector_reverse, V->getType());
+ Function *F =
+ Intrinsic::getDeclaration(M, Intrinsic::vector_reverse, V->getType());
return CallInst::Create(F, V);
};
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index aafb4cf6ca6a..db7838bbe3c2 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -354,8 +354,9 @@ private:
}
bool willNotOverflowUnsignedMul(const Value *LHS, const Value *RHS,
- const Instruction &CxtI) const {
- return computeOverflowForUnsignedMul(LHS, RHS, &CxtI) ==
+ const Instruction &CxtI,
+ bool IsNSW = false) const {
+ return computeOverflowForUnsignedMul(LHS, RHS, &CxtI, IsNSW) ==
OverflowResult::NeverOverflows;
}
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
index 4ed4c36e21e0..ca1b1921404d 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
@@ -530,7 +530,7 @@ Instruction *InstCombinerImpl::visitMul(BinaryOperator &I) {
I.setHasNoSignedWrap(true);
}
- if (!HasNUW && willNotOverflowUnsignedMul(Op0, Op1, I)) {
+ if (!HasNUW && willNotOverflowUnsignedMul(Op0, Op1, I, I.hasNoSignedWrap())) {
Changed = true;
I.setHasNoUnsignedWrap(true);
}
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
index 117eb7a1dcc9..8818369e7945 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -2537,8 +2537,8 @@ Instruction *InstCombinerImpl::foldVectorSelect(SelectInst &Sel) {
if (auto *I = dyn_cast<Instruction>(V))
I->copyIRFlags(&Sel);
Module *M = Sel.getModule();
- Function *F = Intrinsic::getDeclaration(
- M, Intrinsic::experimental_vector_reverse, V->getType());
+ Function *F =
+ Intrinsic::getDeclaration(M, Intrinsic::vector_reverse, V->getType());
return CallInst::Create(F, V);
};
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 58b2d8e9dec1..7356941be645 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -2043,8 +2043,8 @@ Instruction *InstCombinerImpl::foldVectorBinop(BinaryOperator &Inst) {
if (auto *BO = dyn_cast<BinaryOperator>(V))
BO->copyIRFlags(&Inst);
Module *M = Inst.getModule();
- Function *F = Intrinsic::getDeclaration(
- M, Intrinsic::experimental_vector_reverse, V->getType());
+ Function *F =
+ Intrinsic::getDeclaration(M, Intrinsic::vector_reverse, V->getType());
return CallInst::Create(F, V);
};
@@ -2948,6 +2948,14 @@ Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) {
return nullptr;
if (GEP.getNumIndices() == 1) {
+ // We can only preserve inbounds if the original gep is inbounds, the add
+ // is nsw, and the add operands are non-negative.
+ auto CanPreserveInBounds = [&](bool AddIsNSW, Value *Idx1, Value *Idx2) {
+ SimplifyQuery Q = SQ.getWithInstruction(&GEP);
+ return GEP.isInBounds() && AddIsNSW && isKnownNonNegative(Idx1, Q) &&
+ isKnownNonNegative(Idx2, Q);
+ };
+
// Try to replace ADD + GEP with GEP + GEP.
Value *Idx1, *Idx2;
if (match(GEP.getOperand(1),
@@ -2957,10 +2965,15 @@ Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) {
// as:
// %newptr = getelementptr i32, ptr %ptr, i64 %idx1
// %newgep = getelementptr i32, ptr %newptr, i64 %idx2
- auto *NewPtr = Builder.CreateGEP(GEP.getResultElementType(),
- GEP.getPointerOperand(), Idx1);
- return GetElementPtrInst::Create(GEP.getResultElementType(), NewPtr,
- Idx2);
+ bool IsInBounds = CanPreserveInBounds(
+ cast<OverflowingBinaryOperator>(GEP.getOperand(1))->hasNoSignedWrap(),
+ Idx1, Idx2);
+ auto *NewPtr =
+ Builder.CreateGEP(GEP.getResultElementType(), GEP.getPointerOperand(),
+ Idx1, "", IsInBounds);
+ return replaceInstUsesWith(
+ GEP, Builder.CreateGEP(GEP.getResultElementType(), NewPtr, Idx2, "",
+ IsInBounds));
}
ConstantInt *C;
if (match(GEP.getOperand(1), m_OneUse(m_SExtLike(m_OneUse(m_NSWAdd(
@@ -2971,12 +2984,17 @@ Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) {
// as:
// %newptr = getelementptr i32, ptr %ptr, i32 %idx1
// %newgep = getelementptr i32, ptr %newptr, i32 idx2
+ bool IsInBounds = CanPreserveInBounds(
+ /*AddIsNSW=*/true, Idx1, C);
auto *NewPtr = Builder.CreateGEP(
GEP.getResultElementType(), GEP.getPointerOperand(),
- Builder.CreateSExt(Idx1, GEP.getOperand(1)->getType()));
- return GetElementPtrInst::Create(
- GEP.getResultElementType(), NewPtr,
- Builder.CreateSExt(C, GEP.getOperand(1)->getType()));
+ Builder.CreateSExt(Idx1, GEP.getOperand(1)->getType()), "",
+ IsInBounds);
+ return replaceInstUsesWith(
+ GEP,
+ Builder.CreateGEP(GEP.getResultElementType(), NewPtr,
+ Builder.CreateSExt(C, GEP.getOperand(1)->getType()),
+ "", IsInBounds));
}
}
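Why all three conditions are needed (a worked example; the IR is illustrative): if `%i = add nsw i64 %a, %b` feeds a `getelementptr inbounds` and both `%a` and `%b` are known non-negative, then each partial offset is a prefix of the full in-bounds offset, so the intermediate pointer stays inside the object and both new GEPs may keep `inbounds`. If the add could wrap or an index could be negative, the intermediate pointer might leave the object even though the sum lands inside it.

```cpp
//   %i  = add nsw i64 %a, %b               ; %a, %b known non-negative
//   %p1 = getelementptr inbounds i32, ptr %p, i64 %i
// becomes
//   %q  = getelementptr inbounds i32, ptr %p, i64 %a
//   %p1 = getelementptr inbounds i32, ptr %q, i64 %b
```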
diff --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
index 88b852340340..fa661b17c13a 100644
--- a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
@@ -1385,14 +1385,6 @@ bool HWAddressSanitizer::instrumentLandingPads(
return true;
}
-static DbgAssignIntrinsic *DynCastToDbgAssign(DbgVariableIntrinsic *DVI) {
- return dyn_cast<DbgAssignIntrinsic>(DVI);
-}
-
-static DbgVariableRecord *DynCastToDbgAssign(DbgVariableRecord *DVR) {
- return DVR->isDbgAssign() ? DVR : nullptr;
-}
-
bool HWAddressSanitizer::instrumentStack(memtag::StackInfo &SInfo,
Value *StackTag, Value *UARTag,
const DominatorTree &DT,
@@ -1448,28 +1440,7 @@ bool HWAddressSanitizer::instrumentStack(memtag::StackInfo &SInfo,
!memtag::isLifetimeIntrinsic(User);
});
- // Helper utility for adding DW_OP_LLVM_tag_offset to debug-info records,
- // abstracted over whether they're intrinsic-stored or DbgVariableRecord
- // stored.
- auto AnnotateDbgRecord = [&](auto *DPtr) {
- // Prepend "tag_offset, N" to the dwarf expression.
- // Tag offset logically applies to the alloca pointer, and it makes sense
- // to put it at the beginning of the expression.
- SmallVector<uint64_t, 8> NewOps = {dwarf::DW_OP_LLVM_tag_offset,
- retagMask(N)};
- for (size_t LocNo = 0; LocNo < DPtr->getNumVariableLocationOps(); ++LocNo)
- if (DPtr->getVariableLocationOp(LocNo) == AI)
- DPtr->setExpression(DIExpression::appendOpsToArg(
- DPtr->getExpression(), NewOps, LocNo));
- if (auto *DAI = DynCastToDbgAssign(DPtr)) {
- if (DAI->getAddress() == AI)
- DAI->setAddressExpression(DIExpression::prependOpcodes(
- DAI->getAddressExpression(), NewOps));
- }
- };
-
- llvm::for_each(Info.DbgVariableIntrinsics, AnnotateDbgRecord);
- llvm::for_each(Info.DbgVariableRecords, AnnotateDbgRecord);
+ memtag::annotateDebugRecords(Info, retagMask(N));
auto TagEnd = [&](Instruction *Node) {
IRB.SetInsertPoint(Node);
diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index e5ef0333696d..cc2295c44023 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -1955,8 +1955,15 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
unsigned ArgOffset = 0;
const DataLayout &DL = F->getParent()->getDataLayout();
for (auto &FArg : F->args()) {
- if (!FArg.getType()->isSized()) {
- LLVM_DEBUG(dbgs() << "Arg is not sized\n");
+ if (!FArg.getType()->isSized() || FArg.getType()->isScalableTy()) {
+ LLVM_DEBUG(dbgs() << (FArg.getType()->isScalableTy()
+ ? "vscale not fully supported\n"
+ : "Arg is not sized\n"));
+ if (A == &FArg) {
+ ShadowPtr = getCleanShadow(V);
+ setOrigin(A, getCleanOrigin());
+ break;
+ }
continue;
}
@@ -2506,6 +2513,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
Value *CreateShadowCast(IRBuilder<> &IRB, Value *V, Type *dstTy,
bool Signed = false) {
Type *srcTy = V->getType();
+ if (srcTy == dstTy)
+ return V;
size_t srcSizeInBits = VectorOrPrimitiveTypeSizeInBits(srcTy);
size_t dstSizeInBits = VectorOrPrimitiveTypeSizeInBits(dstTy);
if (srcSizeInBits > 1 && dstSizeInBits == 1)
@@ -4196,6 +4205,14 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
LLVM_DEBUG(dbgs() << "Arg " << i << " is not sized: " << CB << "\n");
continue;
}
+
+ if (A->getType()->isScalableTy()) {
+ LLVM_DEBUG(dbgs() << "Arg " << i << " is vscale: " << CB << "\n");
+ // Handle as noundef, but don't reserve TLS slots.
+ insertShadowCheck(A, &CB);
+ continue;
+ }
+
unsigned Size = 0;
const DataLayout &DL = F.getParent()->getDataLayout();
diff --git a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
index 1caed93b1b66..ba2546b8db0e 100644
--- a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
+++ b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
@@ -131,7 +131,7 @@ public:
explicit operator bool() const { return SI && SIUse; }
};
-void unfold(DomTreeUpdater *DTU, SelectInstToUnfold SIToUnfold,
+void unfold(DomTreeUpdater *DTU, LoopInfo *LI, SelectInstToUnfold SIToUnfold,
std::vector<SelectInstToUnfold> *NewSIsToUnfold,
std::vector<BasicBlock *> *NewBBs);
@@ -142,6 +142,7 @@ public:
: AC(AC), DT(DT), LI(LI), TTI(TTI), ORE(ORE) {}
bool run(Function &F);
+ bool LoopInfoBroken;
private:
void
@@ -157,7 +158,7 @@ private:
std::vector<SelectInstToUnfold> NewSIsToUnfold;
std::vector<BasicBlock *> NewBBs;
- unfold(&DTU, SIToUnfold, &NewSIsToUnfold, &NewBBs);
+ unfold(&DTU, LI, SIToUnfold, &NewSIsToUnfold, &NewBBs);
// Put newly discovered select instructions into the work list.
for (const SelectInstToUnfold &NewSIToUnfold : NewSIsToUnfold)
@@ -201,7 +202,7 @@ void createBasicBlockAndSinkSelectInst(
/// created basic blocks into \p NewBBs.
///
/// TODO: merge it with CodeGenPrepare::optimizeSelectInst() if possible.
-void unfold(DomTreeUpdater *DTU, SelectInstToUnfold SIToUnfold,
+void unfold(DomTreeUpdater *DTU, LoopInfo *LI, SelectInstToUnfold SIToUnfold,
std::vector<SelectInstToUnfold> *NewSIsToUnfold,
std::vector<BasicBlock *> *NewBBs) {
SelectInst *SI = SIToUnfold.getInst();
@@ -307,6 +308,12 @@ void unfold(DomTreeUpdater *DTU, SelectInstToUnfold SIToUnfold,
DTU->applyUpdates({{DominatorTree::Insert, StartBlock, TT},
{DominatorTree::Insert, StartBlock, FT}});
+ // Preserve loop info
+ if (Loop *L = LI->getLoopFor(SI->getParent())) {
+ for (BasicBlock *NewBB : *NewBBs)
+ L->addBasicBlockToLoop(NewBB, *LI);
+ }
+
// The select is now dead.
assert(SI->use_empty() && "Select must be dead now");
SI->eraseFromParent();
@@ -522,9 +529,10 @@ private:
};
struct AllSwitchPaths {
- AllSwitchPaths(const MainSwitch *MSwitch, OptimizationRemarkEmitter *ORE)
- : Switch(MSwitch->getInstr()), SwitchBlock(Switch->getParent()),
- ORE(ORE) {}
+ AllSwitchPaths(const MainSwitch *MSwitch, OptimizationRemarkEmitter *ORE,
+ LoopInfo *LI)
+ : Switch(MSwitch->getInstr()), SwitchBlock(Switch->getParent()), ORE(ORE),
+ LI(LI) {}
std::vector<ThreadingPath> &getThreadingPaths() { return TPaths; }
unsigned getNumThreadingPaths() { return TPaths.size(); }
@@ -545,7 +553,7 @@ struct AllSwitchPaths {
return;
}
- for (PathType Path : LoopPaths) {
+ for (const PathType &Path : LoopPaths) {
ThreadingPath TPath;
const BasicBlock *PrevBB = Path.back();
@@ -596,6 +604,12 @@ private:
Visited.insert(BB);
+ // Stop if we have reached a BB outside the loop, since its successors have
+ // no impact on the DFA.
+ // TODO: Do we need to stop exploring if BB is in an outer loop of the
+ // switch?
+ if (!LI->getLoopFor(BB))
+ return Res;
+
// Some blocks have multiple edges to the same successor, and this set
// is used to prevent a duplicate path from being generated
SmallSet<BasicBlock *, 4> Successors;
@@ -737,6 +751,7 @@ private:
BasicBlock *SwitchBlock;
OptimizationRemarkEmitter *ORE;
std::vector<ThreadingPath> TPaths;
+ LoopInfo *LI;
};
struct TransformDFA {
@@ -1283,6 +1298,7 @@ bool DFAJumpThreading::run(Function &F) {
SmallVector<AllSwitchPaths, 2> ThreadableLoops;
bool MadeChanges = false;
+ LoopInfoBroken = false;
for (BasicBlock &BB : F) {
auto *SI = dyn_cast<SwitchInst>(BB.getTerminator());
@@ -1304,7 +1320,7 @@ bool DFAJumpThreading::run(Function &F) {
if (!Switch.getSelectInsts().empty())
MadeChanges = true;
- AllSwitchPaths SwitchPaths(&Switch, ORE);
+ AllSwitchPaths SwitchPaths(&Switch, ORE, LI);
SwitchPaths.run();
if (SwitchPaths.getNumThreadingPaths() > 0) {
@@ -1315,10 +1331,15 @@ bool DFAJumpThreading::run(Function &F) {
// strict requirement but it can cause buggy behavior if there is an
// overlap of blocks in different opportunities. There is a lot of room to
// experiment with catching more opportunities here.
+ // NOTE: To relax this constraint, we must handle LoopInfo invalidation.
break;
}
}
+#ifndef NDEBUG
+ LI->verify(*DT);
+#endif
+
SmallPtrSet<const Value *, 32> EphValues;
if (ThreadableLoops.size() > 0)
CodeMetrics::collectEphemeralValues(&F, AC, EphValues);
@@ -1327,6 +1348,7 @@ bool DFAJumpThreading::run(Function &F) {
TransformDFA Transform(&SwitchPaths, DT, AC, TTI, ORE, EphValues);
Transform.run();
MadeChanges = true;
+ LoopInfoBroken = true;
}
#ifdef EXPENSIVE_CHECKS
@@ -1347,11 +1369,13 @@ PreservedAnalyses DFAJumpThreadingPass::run(Function &F,
LoopInfo &LI = AM.getResult<LoopAnalysis>(F);
TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
OptimizationRemarkEmitter ORE(&F);
-
- if (!DFAJumpThreading(&AC, &DT, &LI, &TTI, &ORE).run(F))
+ DFAJumpThreading ThreadImpl(&AC, &DT, &LI, &TTI, &ORE);
+ if (!ThreadImpl.run(F))
return PreservedAnalyses::all();
PreservedAnalyses PA;
PA.preserve<DominatorTreeAnalysis>();
+ if (!ThreadImpl.LoopInfoBroken)
+ PA.preserve<LoopAnalysis>();
return PA;
}
diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp
index d829e92b2444..b5be8ac24941 100644
--- a/llvm/lib/Transforms/Scalar/GVN.cpp
+++ b/llvm/lib/Transforms/Scalar/GVN.cpp
@@ -726,6 +726,69 @@ void GVNPass::ValueTable::verifyRemoved(const Value *V) const {
}
//===----------------------------------------------------------------------===//
+// LeaderMap External Functions
+//===----------------------------------------------------------------------===//
+
+/// Push a new Value to the LeaderTable onto the list for its value number.
+void GVNPass::LeaderMap::insert(uint32_t N, Value *V, const BasicBlock *BB) {
+ LeaderListNode &Curr = NumToLeaders[N];
+ if (!Curr.Entry.Val) {
+ Curr.Entry.Val = V;
+ Curr.Entry.BB = BB;
+ return;
+ }
+
+ LeaderListNode *Node = TableAllocator.Allocate<LeaderListNode>();
+ Node->Entry.Val = V;
+ Node->Entry.BB = BB;
+ Node->Next = Curr.Next;
+ Curr.Next = Node;
+}
+
+/// Scan the list of values corresponding to a given
+/// value number, and remove the given instruction if encountered.
+void GVNPass::LeaderMap::erase(uint32_t N, Instruction *I,
+ const BasicBlock *BB) {
+ LeaderListNode *Prev = nullptr;
+ LeaderListNode *Curr = &NumToLeaders[N];
+
+ while (Curr && (Curr->Entry.Val != I || Curr->Entry.BB != BB)) {
+ Prev = Curr;
+ Curr = Curr->Next;
+ }
+
+ if (!Curr)
+ return;
+
+ if (Prev) {
+ Prev->Next = Curr->Next;
+ } else {
+ if (!Curr->Next) {
+ Curr->Entry.Val = nullptr;
+ Curr->Entry.BB = nullptr;
+ } else {
+ LeaderListNode *Next = Curr->Next;
+ Curr->Entry.Val = Next->Entry.Val;
+ Curr->Entry.BB = Next->Entry.BB;
+ Curr->Next = Next->Next;
+ }
+ }
+}
+
+void GVNPass::LeaderMap::verifyRemoved(const Value *V) const {
+ // Walk through the value number scope to make sure the instruction isn't
+ // ferreted away in it.
+ for (const auto &I : NumToLeaders) {
+ (void)I;
+ assert(I.second.Entry.Val != V && "Inst still in value numbering scope!");
+ assert(
+ std::none_of(leader_iterator(&I.second), leader_iterator(nullptr),
+ [=](const LeaderTableEntry &E) { return E.Val == V; }) &&
+ "Inst still in value numbering scope!");
+ }
+}
+
+//===----------------------------------------------------------------------===//
// GVN Pass
//===----------------------------------------------------------------------===//
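The new LeaderMap keeps the first leader inline in the DenseMap value and only bump-allocates overflow nodes, which is why `erase` copies the successor's payload into the head rather than unlinking it; nodes from a BumpPtrAllocator are never individually freed. The shape, reduced to its essentials (a sketch mirroring the names above):

```cpp
// struct LeaderTableEntry { Value *Val; const BasicBlock *BB; };
// struct LeaderListNode  { LeaderTableEntry Entry; LeaderListNode *Next; };
// DenseMap<uint32_t, LeaderListNode> NumToLeaders; // head stored inline
// BumpPtrAllocator TableAllocator;                 // overflow nodes only
```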
@@ -1467,7 +1530,7 @@ void GVNPass::eliminatePartiallyRedundantLoad(
OldLoad->replaceAllUsesWith(NewLoad);
replaceValuesPerBlockEntry(ValuesPerBlock, OldLoad, NewLoad);
if (uint32_t ValNo = VN.lookup(OldLoad, false))
- removeFromLeaderTable(ValNo, OldLoad, OldLoad->getParent());
+ LeaderTable.erase(ValNo, OldLoad, OldLoad->getParent());
VN.erase(OldLoad);
removeInstruction(OldLoad);
}
@@ -2204,10 +2267,9 @@ GVNPass::ValueTable::assignExpNewValueNum(Expression &Exp) {
/// defined in \p BB.
bool GVNPass::ValueTable::areAllValsInBB(uint32_t Num, const BasicBlock *BB,
GVNPass &Gvn) {
- LeaderTableEntry *Vals = &Gvn.LeaderTable[Num];
- while (Vals && Vals->BB == BB)
- Vals = Vals->Next;
- return !Vals;
+ return all_of(
+ Gvn.LeaderTable.getLeaders(Num),
+ [=](const LeaderMap::LeaderTableEntry &L) { return L.BB == BB; });
}
/// Wrap phiTranslateImpl to provide caching functionality.
@@ -2229,12 +2291,11 @@ bool GVNPass::ValueTable::areCallValsEqual(uint32_t Num, uint32_t NewNum,
const BasicBlock *PhiBlock,
GVNPass &Gvn) {
CallInst *Call = nullptr;
- LeaderTableEntry *Vals = &Gvn.LeaderTable[Num];
- while (Vals) {
- Call = dyn_cast<CallInst>(Vals->Val);
+ auto Leaders = Gvn.LeaderTable.getLeaders(Num);
+ for (const auto &Entry : Leaders) {
+ Call = dyn_cast<CallInst>(Entry.Val);
if (Call && Call->getParent() == PhiBlock)
break;
- Vals = Vals->Next;
}
if (AA->doesNotAccessMemory(Call))
@@ -2327,23 +2388,17 @@ void GVNPass::ValueTable::eraseTranslateCacheEntry(
// question. This is fast because dominator tree queries consist of only
// a few comparisons of DFS numbers.
Value *GVNPass::findLeader(const BasicBlock *BB, uint32_t num) {
- LeaderTableEntry Vals = LeaderTable[num];
- if (!Vals.Val) return nullptr;
+ auto Leaders = LeaderTable.getLeaders(num);
+ if (Leaders.empty())
+ return nullptr;
Value *Val = nullptr;
- if (DT->dominates(Vals.BB, BB)) {
- Val = Vals.Val;
- if (isa<Constant>(Val)) return Val;
- }
-
- LeaderTableEntry* Next = Vals.Next;
- while (Next) {
- if (DT->dominates(Next->BB, BB)) {
- if (isa<Constant>(Next->Val)) return Next->Val;
- if (!Val) Val = Next->Val;
+ for (const auto &Entry : Leaders) {
+ if (DT->dominates(Entry.BB, BB)) {
+ Val = Entry.Val;
+ if (isa<Constant>(Val))
+ return Val;
}
-
- Next = Next->Next;
}
return Val;
@@ -2452,7 +2507,7 @@ bool GVNPass::propagateEquality(Value *LHS, Value *RHS,
// have the simple case where the edge dominates the end.
if (RootDominatesEnd && !isa<Instruction>(RHS) &&
canReplacePointersIfEqual(LHS, RHS, DL))
- addToLeaderTable(LVN, RHS, Root.getEnd());
+ LeaderTable.insert(LVN, RHS, Root.getEnd());
// Replace all occurrences of 'LHS' with 'RHS' everywhere in the scope. As
// LHS always has at least one use that is not dominated by Root, this will
@@ -2546,7 +2601,7 @@ bool GVNPass::propagateEquality(Value *LHS, Value *RHS,
// The leader table only tracks basic blocks, not edges. Only add to it if we
// have the simple case where the edge dominates the end.
if (RootDominatesEnd)
- addToLeaderTable(Num, NotVal, Root.getEnd());
+ LeaderTable.insert(Num, NotVal, Root.getEnd());
continue;
}
@@ -2596,7 +2651,7 @@ bool GVNPass::processInstruction(Instruction *I) {
return true;
unsigned Num = VN.lookupOrAdd(Load);
- addToLeaderTable(Num, Load, Load->getParent());
+ LeaderTable.insert(Num, Load, Load->getParent());
return false;
}
@@ -2664,7 +2719,7 @@ bool GVNPass::processInstruction(Instruction *I) {
// Allocations are always uniquely numbered, so we can save time and memory
// by fast failing them.
if (isa<AllocaInst>(I) || I->isTerminator() || isa<PHINode>(I)) {
- addToLeaderTable(Num, I, I->getParent());
+ LeaderTable.insert(Num, I, I->getParent());
return false;
}
@@ -2672,7 +2727,7 @@ bool GVNPass::processInstruction(Instruction *I) {
// need to do a lookup to see if the number already exists
// somewhere in the domtree: it can't!
if (Num >= NextNum) {
- addToLeaderTable(Num, I, I->getParent());
+ LeaderTable.insert(Num, I, I->getParent());
return false;
}
@@ -2681,7 +2736,7 @@ bool GVNPass::processInstruction(Instruction *I) {
Value *Repl = findLeader(I->getParent(), Num);
if (!Repl) {
// Failure, just remember this instance for future use.
- addToLeaderTable(Num, I, I->getParent());
+ LeaderTable.insert(Num, I, I->getParent());
return false;
}
@@ -2876,7 +2931,7 @@ bool GVNPass::performScalarPREInsertion(Instruction *Instr, BasicBlock *Pred,
VN.add(Instr, Num);
// Update the availability map to include the new instruction.
- addToLeaderTable(Num, Instr, Pred);
+ LeaderTable.insert(Num, Instr, Pred);
return true;
}
@@ -3027,13 +3082,13 @@ bool GVNPass::performScalarPRE(Instruction *CurInst) {
// After creating a new PHI for ValNo, the phi translate result for ValNo will
// be changed, so erase the related stale entries in phi translate cache.
VN.eraseTranslateCacheEntry(ValNo, *CurrentBlock);
- addToLeaderTable(ValNo, Phi, CurrentBlock);
+ LeaderTable.insert(ValNo, Phi, CurrentBlock);
Phi->setDebugLoc(CurInst->getDebugLoc());
CurInst->replaceAllUsesWith(Phi);
if (MD && Phi->getType()->isPtrOrPtrVectorTy())
MD->invalidateCachedPointerInfo(Phi);
VN.erase(CurInst);
- removeFromLeaderTable(ValNo, CurInst, CurrentBlock);
+ LeaderTable.erase(ValNo, CurInst, CurrentBlock);
LLVM_DEBUG(dbgs() << "GVN PRE removed: " << *CurInst << '\n');
removeInstruction(CurInst);
@@ -3127,7 +3182,6 @@ void GVNPass::cleanupGlobalSets() {
VN.clear();
LeaderTable.clear();
BlockRPONumber.clear();
- TableAllocator.Reset();
ICF->clear();
InvalidBlockRPONumbers = true;
}
@@ -3147,18 +3201,7 @@ void GVNPass::removeInstruction(Instruction *I) {
/// internal data structures.
void GVNPass::verifyRemoved(const Instruction *Inst) const {
VN.verifyRemoved(Inst);
-
- // Walk through the value number scope to make sure the instruction isn't
- // ferreted away in it.
- for (const auto &I : LeaderTable) {
- const LeaderTableEntry *Node = &I.second;
- assert(Node->Val != Inst && "Inst still in value numbering scope!");
-
- while (Node->Next) {
- Node = Node->Next;
- assert(Node->Val != Inst && "Inst still in value numbering scope!");
- }
- }
+ LeaderTable.verifyRemoved(Inst);
}
/// BB is declared dead, which implied other blocks become dead as well. This
@@ -3285,7 +3328,7 @@ void GVNPass::assignValNumForDeadCode() {
for (BasicBlock *BB : DeadBlocks) {
for (Instruction &Inst : *BB) {
unsigned ValNum = VN.lookupOrAdd(&Inst);
- addToLeaderTable(ValNum, &Inst, BB);
+ LeaderTable.insert(ValNum, &Inst, BB);
}
}
}
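The GVN changes above fold the old addToLeaderTable/removeFromLeaderTable helpers into a LeaderMap class whose per-number head node is stored inline in the map, so the common single-leader case needs no pointer chase; the dropped TableAllocator.Reset() in cleanupGlobalSets suggests the bump allocator now lives inside LeaderMap and is released by LeaderTable.clear(). A minimal standalone sketch of the same erase discipline, with std::unordered_map and opaque pointers standing in for DenseMap, Value, and BasicBlock (all names here are illustrative, not LLVM's):

#include <unordered_map>

// Simplified stand-ins for GVN's (Value *, BasicBlock *) leader entry.
struct Entry { const void *Val = nullptr; const void *BB = nullptr; };
struct Node  { Entry E; Node *Next = nullptr; };

// Erase the (Val, BB) leader recorded for value number N. An interior
// match is unlinked; a head match is either cleared (sole element) or
// overwritten with the second node's payload. Dropped nodes are not
// freed, mirroring a bump-allocated list released in bulk.
void eraseLeader(std::unordered_map<unsigned, Node> &Map, unsigned N,
                 const void *Val, const void *BB) {
  Node *Prev = nullptr;
  Node *Curr = &Map[N];
  while (Curr && (Curr->E.Val != Val || Curr->E.BB != BB)) {
    Prev = Curr;
    Curr = Curr->Next;
  }
  if (!Curr)
    return;                    // (Val, BB) was never recorded for N.
  if (Prev)
    Prev->Next = Curr->Next;   // Interior node: plain unlink.
  else if (!Curr->Next)
    Curr->E = Entry{};         // Head and only node: mark it empty.
  else {
    Node *Second = Curr->Next; // Head with successors: pull the second
    Curr->E = Second->E;       // payload into the inline head and skip
    Curr->Next = Second->Next; // the now-dead node.
  }
}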
diff --git a/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp b/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
index 9df28747570c..104e8ceb7967 100644
--- a/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
@@ -279,6 +279,9 @@ bool InductiveRangeCheck::parseRangeCheckICmp(Loop *L, ICmpInst *ICI,
Value *LHS = ICI->getOperand(0);
Value *RHS = ICI->getOperand(1);
+ if (!LHS->getType()->isIntegerTy())
+ return false;
+
// Canonicalize to the `Index Pred Invariant` comparison
if (IsLoopInvariant(LHS)) {
std::swap(LHS, RHS);
diff --git a/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/llvm/lib/Transforms/Scalar/JumpThreading.cpp
index ffcb511e6a83..08d82fa66da3 100644
--- a/llvm/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/llvm/lib/Transforms/Scalar/JumpThreading.cpp
@@ -1876,7 +1876,7 @@ bool JumpThreadingPass::processBranchOnXOR(BinaryOperator *BO) {
static void addPHINodeEntriesForMappedBlock(BasicBlock *PHIBB,
BasicBlock *OldPred,
BasicBlock *NewPred,
- DenseMap<Instruction*, Value*> &ValueMap) {
+ ValueToValueMapTy &ValueMap) {
for (PHINode &PN : PHIBB->phis()) {
// Ok, we have a PHI node. Figure out what the incoming value was for the
// DestBlock.
@@ -1884,7 +1884,7 @@ static void addPHINodeEntriesForMappedBlock(BasicBlock *PHIBB,
// Remap the value if necessary.
if (Instruction *Inst = dyn_cast<Instruction>(IV)) {
- DenseMap<Instruction*, Value*>::iterator I = ValueMap.find(Inst);
+ ValueToValueMapTy::iterator I = ValueMap.find(Inst);
if (I != ValueMap.end())
IV = I->second;
}
@@ -1945,9 +1945,8 @@ bool JumpThreadingPass::maybeMergeBasicBlockIntoOnlyPred(BasicBlock *BB) {
/// Update the SSA form. NewBB contains instructions that are copied from BB.
/// ValueMapping maps old values in BB to new ones in NewBB.
-void JumpThreadingPass::updateSSA(
- BasicBlock *BB, BasicBlock *NewBB,
- DenseMap<Instruction *, Value *> &ValueMapping) {
+void JumpThreadingPass::updateSSA(BasicBlock *BB, BasicBlock *NewBB,
+ ValueToValueMapTy &ValueMapping) {
// If there were values defined in BB that are used outside the block, then we
// now have to update all uses of the value to use either the original value,
// the cloned value, or some PHI derived value. This can require arbitrary
@@ -2008,14 +2007,15 @@ void JumpThreadingPass::updateSSA(
/// Clone instructions in range [BI, BE) to NewBB. For PHI nodes, we only clone
/// arguments that come from PredBB. Fill \p ValueMapping with the map from
/// the variables in the source basic block to the variables in the newly
/// created basic block.
-DenseMap<Instruction *, Value *>
-JumpThreadingPass::cloneInstructions(BasicBlock::iterator BI,
- BasicBlock::iterator BE, BasicBlock *NewBB,
- BasicBlock *PredBB) {
+void JumpThreadingPass::cloneInstructions(ValueToValueMapTy &ValueMapping,
+ BasicBlock::iterator BI,
+ BasicBlock::iterator BE,
+ BasicBlock *NewBB,
+ BasicBlock *PredBB) {
// We are going to have to map operands from the source basic block to the new
// copy of the block 'NewBB'. If there are PHI nodes in the source basic
// block, evaluate them to account for entry from PredBB.
- DenseMap<Instruction *, Value *> ValueMapping;
// Retargets llvm.dbg.value to any renamed variables.
auto RetargetDbgValueIfPossible = [&](Instruction *NewInst) -> bool {
@@ -2103,7 +2103,7 @@ JumpThreadingPass::cloneInstructions(BasicBlock::iterator BI,
// Remap operands to patch up intra-block references.
for (unsigned i = 0, e = New->getNumOperands(); i != e; ++i)
if (Instruction *Inst = dyn_cast<Instruction>(New->getOperand(i))) {
- DenseMap<Instruction *, Value *>::iterator I = ValueMapping.find(Inst);
+ ValueToValueMapTy::iterator I = ValueMapping.find(Inst);
if (I != ValueMapping.end())
New->setOperand(i, I->second);
}
@@ -2120,7 +2120,7 @@ JumpThreadingPass::cloneInstructions(BasicBlock::iterator BI,
RetargetDbgVariableRecordIfPossible(&DVR);
}
- return ValueMapping;
+ return;
}
/// Attempt to thread through two successive basic blocks.
@@ -2295,8 +2295,9 @@ void JumpThreadingPass::threadThroughTwoBasicBlocks(BasicBlock *PredPredBB,
// We are going to have to map operands from the original BB block to the new
// copy of the block 'NewBB'. If there are PHI nodes in PredBB, evaluate them
// to account for entry from PredPredBB.
- DenseMap<Instruction *, Value *> ValueMapping =
- cloneInstructions(PredBB->begin(), PredBB->end(), NewBB, PredPredBB);
+ ValueToValueMapTy ValueMapping;
+ cloneInstructions(ValueMapping, PredBB->begin(), PredBB->end(), NewBB,
+ PredPredBB);
// Copy the edge probabilities from PredBB to NewBB.
if (BPI)
@@ -2419,8 +2420,9 @@ void JumpThreadingPass::threadEdge(BasicBlock *BB,
}
// Copy all the instructions from BB to NewBB except the terminator.
- DenseMap<Instruction *, Value *> ValueMapping =
- cloneInstructions(BB->begin(), std::prev(BB->end()), NewBB, PredBB);
+ ValueToValueMapTy ValueMapping;
+ cloneInstructions(ValueMapping, BB->begin(), std::prev(BB->end()), NewBB,
+ PredBB);
// We didn't copy the terminator from BB over to NewBB, because there is now
// an unconditional jump to SuccBB. Insert the unconditional jump.
@@ -2675,7 +2677,7 @@ bool JumpThreadingPass::duplicateCondBranchOnPHIIntoPred(
// We are going to have to map operands from the original BB block into the
// PredBB block. Evaluate PHI nodes in BB.
- DenseMap<Instruction*, Value*> ValueMapping;
+ ValueToValueMapTy ValueMapping;
BasicBlock::iterator BI = BB->begin();
for (; PHINode *PN = dyn_cast<PHINode>(BI); ++BI)
@@ -2689,11 +2691,14 @@ bool JumpThreadingPass::duplicateCondBranchOnPHIIntoPred(
// Remap operands to patch up intra-block references.
for (unsigned i = 0, e = New->getNumOperands(); i != e; ++i)
if (Instruction *Inst = dyn_cast<Instruction>(New->getOperand(i))) {
- DenseMap<Instruction*, Value*>::iterator I = ValueMapping.find(Inst);
+ ValueToValueMapTy::iterator I = ValueMapping.find(Inst);
if (I != ValueMapping.end())
New->setOperand(i, I->second);
}
+ // Remap debug variable operands.
+ remapDebugVariable(ValueMapping, New);
+
// If this instruction can be simplified after the operands are updated,
// just use the simplified value instead. This frequently happens due to
// phi translation.
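The JumpThreading hunks above switch the clone maps from DenseMap<Instruction *, Value *> to ValueToValueMapTy, which maps values to WeakTrackingVH handles, and that is also the type remapDebugVariable expects. A hedged sketch of the lookup pattern, assuming the usual header location:

#include "llvm/IR/Instruction.h"
#include "llvm/Transforms/Utils/ValueMapper.h"

using namespace llvm;

// Return the clone recorded for Orig, or null if none. Because the
// mapped values are WeakTrackingVH rather than raw Value *, an entry
// follows RAUW on the clone and nulls out if the clone is deleted,
// instead of dangling as a plain DenseMap entry would.
static Value *lookupClone(ValueToValueMapTy &VMap, Value *Orig) {
  ValueToValueMapTy::iterator It = VMap.find(Orig);
  if (It == VMap.end())
    return nullptr;
  return It->second;
}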
diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index 1036b8ae963a..7ef5dceffec0 100644
--- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -99,7 +99,7 @@ struct MemsetRange {
MaybeAlign Alignment;
/// TheStores - The actual stores that make up this range.
- SmallVector<Instruction*, 16> TheStores;
+ SmallVector<Instruction *, 16> TheStores;
bool isProfitableToUseMemset(const DataLayout &DL) const;
};
@@ -108,10 +108,12 @@ struct MemsetRange {
bool MemsetRange::isProfitableToUseMemset(const DataLayout &DL) const {
// If we found more than 4 stores to merge or 16 bytes, use memset.
- if (TheStores.size() >= 4 || End-Start >= 16) return true;
+ if (TheStores.size() >= 4 || End - Start >= 16)
+ return true;
// If there is nothing to merge, don't do anything.
- if (TheStores.size() < 2) return false;
+ if (TheStores.size() < 2)
+ return false;
// If any of the stores are a memset, then it is always good to extend the
// memset.
@@ -121,7 +123,8 @@ bool MemsetRange::isProfitableToUseMemset(const DataLayout &DL) const {
// Assume that the code generator is capable of merging pairs of stores
// together if it wants to.
- if (TheStores.size() == 2) return false;
+ if (TheStores.size() == 2)
+ return false;
// If we have fewer than 8 stores, it can still be worthwhile to do this.
// For example, merging 4 i8 stores into an i32 store is useful almost always.
@@ -133,7 +136,7 @@ bool MemsetRange::isProfitableToUseMemset(const DataLayout &DL) const {
// the maximum GPR width is the same size as the largest legal integer
// size. If so, check to see whether we will end up actually reducing the
// number of stores used.
- unsigned Bytes = unsigned(End-Start);
+ unsigned Bytes = unsigned(End - Start);
unsigned MaxIntSize = DL.getLargestLegalIntTypeSizeInBits() / 8;
if (MaxIntSize == 0)
MaxIntSize = 1;
@@ -145,7 +148,7 @@ bool MemsetRange::isProfitableToUseMemset(const DataLayout &DL) const {
// If we will reduce the # stores (according to this heuristic), do the
// transformation. This encourages merging 4 x i8 -> i32 and 2 x i16 -> i32
// etc.
- return TheStores.size() > NumPointerStores+NumByteStores;
+ return TheStores.size() > NumPointerStores + NumByteStores;
}
namespace {
@@ -197,7 +200,7 @@ public:
/// existing ranges as appropriate.
void MemsetRanges::addRange(int64_t Start, int64_t Size, Value *Ptr,
MaybeAlign Alignment, Instruction *Inst) {
- int64_t End = Start+Size;
+ int64_t End = Start + Size;
range_iterator I = partition_point(
Ranges, [=](const MemsetRange &O) { return O.End < Start; });
@@ -207,10 +210,10 @@ void MemsetRanges::addRange(int64_t Start, int64_t Size, Value *Ptr,
// to insert a new range. Handle this now.
if (I == Ranges.end() || End < I->Start) {
MemsetRange &R = *Ranges.insert(I, MemsetRange());
- R.Start = Start;
- R.End = End;
- R.StartPtr = Ptr;
- R.Alignment = Alignment;
+ R.Start = Start;
+ R.End = End;
+ R.StartPtr = Ptr;
+ R.Alignment = Alignment;
R.TheStores.push_back(Inst);
return;
}
@@ -397,7 +400,8 @@ Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst,
if (auto *NextStore = dyn_cast<StoreInst>(BI)) {
// If this is a store, see if we can merge it in.
- if (!NextStore->isSimple()) break;
+ if (!NextStore->isSimple())
+ break;
Value *StoredVal = NextStore->getValueOperand();
@@ -460,7 +464,8 @@ Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst,
// emit memset's for anything big enough to be worthwhile.
Instruction *AMemSet = nullptr;
for (const MemsetRange &Range : Ranges) {
- if (Range.TheStores.size() == 1) continue;
+ if (Range.TheStores.size() == 1)
+ continue;
// If it is profitable to lower this range to memset, do so now.
if (!Range.isProfitableToUseMemset(DL))
@@ -481,12 +486,10 @@ Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst,
if (!Range.TheStores.empty())
AMemSet->setDebugLoc(Range.TheStores[0]->getDebugLoc());
- auto *NewDef =
- cast<MemoryDef>(MemInsertPoint->getMemoryInst() == &*BI
- ? MSSAU->createMemoryAccessBefore(
- AMemSet, nullptr, MemInsertPoint)
- : MSSAU->createMemoryAccessAfter(
- AMemSet, nullptr, MemInsertPoint));
+ auto *NewDef = cast<MemoryDef>(
+ MemInsertPoint->getMemoryInst() == &*BI
+ ? MSSAU->createMemoryAccessBefore(AMemSet, nullptr, MemInsertPoint)
+ : MSSAU->createMemoryAccessAfter(AMemSet, nullptr, MemInsertPoint));
MSSAU->insertDef(NewDef, /*RenameUses=*/true);
MemInsertPoint = NewDef;
@@ -512,12 +515,13 @@ bool MemCpyOptPass::moveUp(StoreInst *SI, Instruction *P, const LoadInst *LI) {
// Keep track of the arguments of all instructions we plan to lift
// so we can make sure to lift them as well if appropriate.
- DenseSet<Instruction*> Args;
+ DenseSet<Instruction *> Args;
auto AddArg = [&](Value *Arg) {
auto *I = dyn_cast<Instruction>(Arg);
if (I && I->getParent() == SI->getParent()) {
// Cannot hoist user of P above P
- if (I == P) return false;
+ if (I == P)
+ return false;
Args.insert(I);
}
return true;
@@ -630,8 +634,7 @@ bool MemCpyOptPass::moveUp(StoreInst *SI, Instruction *P, const LoadInst *LI) {
bool MemCpyOptPass::processStoreOfLoad(StoreInst *SI, LoadInst *LI,
const DataLayout &DL,
BasicBlock::iterator &BBI) {
- if (!LI->isSimple() || !LI->hasOneUse() ||
- LI->getParent() != SI->getParent())
+ if (!LI->isSimple() || !LI->hasOneUse() || LI->getParent() != SI->getParent())
return false;
auto *T = LI->getType();
@@ -678,21 +681,20 @@ bool MemCpyOptPass::processStoreOfLoad(StoreInst *SI, LoadInst *LI,
UseMemMove = true;
IRBuilder<> Builder(P);
- Value *Size = Builder.CreateTypeSize(Builder.getInt64Ty(),
- DL.getTypeStoreSize(T));
+ Value *Size =
+ Builder.CreateTypeSize(Builder.getInt64Ty(), DL.getTypeStoreSize(T));
Instruction *M;
if (UseMemMove)
- M = Builder.CreateMemMove(
- SI->getPointerOperand(), SI->getAlign(),
- LI->getPointerOperand(), LI->getAlign(), Size);
+ M = Builder.CreateMemMove(SI->getPointerOperand(), SI->getAlign(),
+ LI->getPointerOperand(), LI->getAlign(),
+ Size);
else
- M = Builder.CreateMemCpy(
- SI->getPointerOperand(), SI->getAlign(),
- LI->getPointerOperand(), LI->getAlign(), Size);
+ M = Builder.CreateMemCpy(SI->getPointerOperand(), SI->getAlign(),
+ LI->getPointerOperand(), LI->getAlign(), Size);
M->copyMetadata(*SI, LLVMContext::MD_DIAssignID);
- LLVM_DEBUG(dbgs() << "Promoting " << *LI << " to " << *SI << " => "
- << *M << "\n");
+ LLVM_DEBUG(dbgs() << "Promoting " << *LI << " to " << *SI << " => " << *M
+ << "\n");
auto *LastDef =
cast<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(SI));
@@ -755,7 +757,8 @@ bool MemCpyOptPass::processStoreOfLoad(StoreInst *SI, LoadInst *LI,
}
bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
- if (!SI->isSimple()) return false;
+ if (!SI->isSimple())
+ return false;
// Avoid merging nontemporal stores since the resulting
// memcpy/memset would not be able to preserve the nontemporal hint.
@@ -794,8 +797,8 @@ bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
// 0xA0A0A0A0 and 0.0.
auto *V = SI->getOperand(0);
if (Value *ByteVal = isBytewiseValue(V, DL)) {
- if (Instruction *I = tryMergingIntoMemset(SI, SI->getPointerOperand(),
- ByteVal)) {
+ if (Instruction *I =
+ tryMergingIntoMemset(SI, SI->getPointerOperand(), ByteVal)) {
BBI = I->getIterator(); // Don't invalidate iterator.
return true;
}
@@ -816,8 +819,7 @@ bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
// The newly inserted memset is immediately overwritten by the original
// store, so we do not need to rename uses.
auto *StoreDef = cast<MemoryDef>(MSSA->getMemoryAccess(SI));
- auto *NewAccess = MSSAU->createMemoryAccessBefore(
- M, nullptr, StoreDef);
+ auto *NewAccess = MSSAU->createMemoryAccessBefore(M, nullptr, StoreDef);
MSSAU->insertDef(cast<MemoryDef>(NewAccess), /*RenameUses=*/false);
eraseInstruction(SI);
@@ -836,8 +838,8 @@ bool MemCpyOptPass::processMemSet(MemSetInst *MSI, BasicBlock::iterator &BBI) {
// See if there is another memset or store neighboring this memset which
// allows us to widen out the memset to do a single larger store.
if (isa<ConstantInt>(MSI->getLength()) && !MSI->isVolatile())
- if (Instruction *I = tryMergingIntoMemset(MSI, MSI->getDest(),
- MSI->getValue())) {
+ if (Instruction *I =
+ tryMergingIntoMemset(MSI, MSI->getDest(), MSI->getValue())) {
BBI = I->getIterator(); // Don't invalidate iterator.
return true;
}
@@ -850,7 +852,8 @@ bool MemCpyOptPass::processMemSet(MemSetInst *MSI, BasicBlock::iterator &BBI) {
bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad,
Instruction *cpyStore, Value *cpyDest,
Value *cpySrc, TypeSize cpySize,
- Align cpyDestAlign, BatchAAResults &BAA,
+ Align cpyDestAlign,
+ BatchAAResults &BAA,
std::function<CallInst *()> GetC) {
// The general transformation to keep in mind is
//
@@ -898,15 +901,15 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad,
if (F->isIntrinsic() && F->getIntrinsicID() == Intrinsic::lifetime_start)
return false;
-
if (C->getParent() != cpyStore->getParent()) {
LLVM_DEBUG(dbgs() << "Call Slot: block local restriction\n");
return false;
}
- MemoryLocation DestLoc = isa<StoreInst>(cpyStore) ?
- MemoryLocation::get(cpyStore) :
- MemoryLocation::getForDest(cast<MemCpyInst>(cpyStore));
+ MemoryLocation DestLoc =
+ isa<StoreInst>(cpyStore)
+ ? MemoryLocation::get(cpyStore)
+ : MemoryLocation::getForDest(cast<MemCpyInst>(cpyStore));
// Check that nothing touches the dest of the copy between
// the call and the store/memcpy.
@@ -1175,7 +1178,8 @@ bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M,
// If all checks passed, then we can transform M.
LLVM_DEBUG(dbgs() << "MemCpyOptPass: Forwarding memcpy->memcpy src:\n"
- << *MDep << '\n' << *M << '\n');
+ << *MDep << '\n'
+ << *M << '\n');
// TODO: Is this worth it if we're creating a less aligned memcpy? For
// example we could be moving from movaps -> movq on x86.
@@ -1307,8 +1311,8 @@ bool MemCpyOptPass::processMemSetMemCpyDependence(MemCpyInst *MemCpy,
// memcpy's defining access is the memset about to be removed.
auto *LastDef =
cast<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(MemCpy));
- auto *NewAccess = MSSAU->createMemoryAccessBefore(
- NewMemSet, nullptr, LastDef);
+ auto *NewAccess =
+ MSSAU->createMemoryAccessBefore(NewMemSet, nullptr, LastDef);
MSSAU->insertDef(cast<MemoryDef>(NewAccess), /*RenameUses=*/true);
eraseInstruction(MemSet);
@@ -1384,7 +1388,7 @@ bool MemCpyOptPass::performMemCpyToMemSetOptzn(MemCpyInst *MemCpy,
return false;
// A known memcpy size is also required.
- auto *CCopySize = dyn_cast<ConstantInt>(CopySize);
+ auto *CCopySize = dyn_cast<ConstantInt>(CopySize);
if (!CCopySize)
return false;
if (CCopySize->getZExtValue() > CMemSetSize->getZExtValue()) {
@@ -1655,7 +1659,8 @@ static bool isZeroSize(Value *Size) {
/// altogether.
bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) {
// We can only optimize non-volatile memcpy's.
- if (M->isVolatile()) return false;
+ if (M->isVolatile())
+ return false;
// If the source and destination of the memcpy are the same, then zap it.
if (M->getSource() == M->getDest()) {
@@ -1796,11 +1801,10 @@ bool MemCpyOptPass::processMemMove(MemMoveInst *M) {
<< "\n");
// If not, then we know we can transform this.
- Type *ArgTys[3] = { M->getRawDest()->getType(),
- M->getRawSource()->getType(),
- M->getLength()->getType() };
- M->setCalledFunction(Intrinsic::getDeclaration(M->getModule(),
- Intrinsic::memcpy, ArgTys));
+ Type *ArgTys[3] = {M->getRawDest()->getType(), M->getRawSource()->getType(),
+ M->getLength()->getType()};
+ M->setCalledFunction(
+ Intrinsic::getDeclaration(M->getModule(), Intrinsic::memcpy, ArgTys));
// For MemorySSA nothing really changes (except that memcpy may imply stricter
// aliasing guarantees).
@@ -1843,7 +1847,8 @@ bool MemCpyOptPass::processByValArgument(CallBase &CB, unsigned ArgNo) {
// Get the alignment of the byval. If the call doesn't specify the alignment,
// then it is some target specific value that we can't know.
MaybeAlign ByValAlign = CB.getParamAlign(ArgNo);
- if (!ByValAlign) return false;
+ if (!ByValAlign)
+ return false;
// If it is greater than the memcpy, then we check to see if we can force the
// source of the memcpy to the alignment we need. If we fail, we bail out.
@@ -1987,7 +1992,7 @@ bool MemCpyOptPass::iterateOnFunction(Function &F) {
continue;
for (BasicBlock::iterator BI = BB.begin(), BE = BB.end(); BI != BE;) {
- // Avoid invalidating the iterator.
+ // Avoid invalidating the iterator.
Instruction *I = &*BI++;
bool RepeatInstruction = false;
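For reference, the isProfitableToUseMemset heuristic reformatted above only reaches its final comparison for exactly three stores spanning fewer than 16 bytes: four or more stores (or a 16-byte span) return true earlier, and one- or two-store ranges return false. A worked sketch of that tail, with an assumed 4-byte widest legal integer and illustrative names:

#include <cstdint>

static bool profitableTail(unsigned NumStores, int64_t Start, int64_t End,
                           unsigned MaxIntSize /* bytes */) {
  unsigned Bytes = unsigned(End - Start);
  unsigned NumPointerStores = Bytes / MaxIntSize; // full-width stores
  unsigned NumByteStores = Bytes % MaxIntSize;    // leftover byte stores
  // Profitable only if memset lowering needs fewer stores than we have.
  return NumStores > NumPointerStores + NumByteStores;
}

// E.g. one i32 store plus two i16 stores covering 8 contiguous bytes,
// with MaxIntSize = 4: lowering needs two word stores, so replacing
// the three originals pays off: profitableTail(3, 0, 8, 4) == true.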
diff --git a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
index 5396038d8b92..51fc28ef90ef 100644
--- a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
+++ b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
@@ -333,6 +333,10 @@ bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, DomTreeUpdater *DTU,
// Finally, erase the old block and update dominator info.
DeleteDeadBlock(BB, DTU);
+ // Remove redundant "llvm.dbg" instructions after blocks have been merged.
+ if (PredBB->getParent()->getSubprogram())
+ RemoveRedundantDbgInstrs(PredBB);
+
return true;
}
@@ -1401,13 +1405,13 @@ SplitBlockPredecessorsImpl(BasicBlock *BB, ArrayRef<BasicBlock *> Preds,
if (OldLatch) {
BasicBlock *NewLatch = L->getLoopLatch();
if (NewLatch != OldLatch) {
- MDNode *MD = OldLatch->getTerminator()->getMetadata("llvm.loop");
- NewLatch->getTerminator()->setMetadata("llvm.loop", MD);
+ MDNode *MD = OldLatch->getTerminator()->getMetadata(LLVMContext::MD_loop);
+ NewLatch->getTerminator()->setMetadata(LLVMContext::MD_loop, MD);
// It's still possible that OldLatch is the latch of another inner loop,
// in which case we do not remove the metadata.
Loop *IL = LI->getLoopFor(OldLatch);
if (IL && IL->getLoopLatch() != OldLatch)
- OldLatch->getTerminator()->setMetadata("llvm.loop", nullptr);
+ OldLatch->getTerminator()->setMetadata(LLVMContext::MD_loop, nullptr);
}
}
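The BasicBlockUtils hunk above swaps the string-based metadata lookup for the fixed kind ID. A small sketch of the difference; both forms exist on Instruction, but the ID avoids hashing "llvm.loop" into the context's kind table on every query:

#include "llvm/IR/Instruction.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"

using namespace llvm;

// Fetch loop metadata from a latch terminator via the fixed kind ID;
// equivalent to Term->getMetadata("llvm.loop") minus the per-call
// string lookup.
static MDNode *getLoopMD(Instruction *Term) {
  return Term->getMetadata(LLVMContext::MD_loop);
}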
diff --git a/llvm/lib/Transforms/Utils/CloneFunction.cpp b/llvm/lib/Transforms/Utils/CloneFunction.cpp
index 3eac726994ae..303a09805a9d 100644
--- a/llvm/lib/Transforms/Utils/CloneFunction.cpp
+++ b/llvm/lib/Transforms/Utils/CloneFunction.cpp
@@ -1131,6 +1131,9 @@ BasicBlock *llvm::DuplicateInstructionsInSplitBetween(
if (I != ValueMapping.end())
New->setOperand(i, I->second);
}
+
+ // Remap debug variable operands.
+ remapDebugVariable(ValueMapping, New);
}
return NewBB;
diff --git a/llvm/lib/Transforms/Utils/GlobalStatus.cpp b/llvm/lib/Transforms/Utils/GlobalStatus.cpp
index c5aded3c45f4..b177e048faae 100644
--- a/llvm/lib/Transforms/Utils/GlobalStatus.cpp
+++ b/llvm/lib/Transforms/Utils/GlobalStatus.cpp
@@ -172,9 +172,14 @@ static bool analyzeGlobalAux(const Value *V, GlobalStatus &GS,
return true;
GS.StoredType = GlobalStatus::Stored;
} else if (const auto *CB = dyn_cast<CallBase>(I)) {
- if (!CB->isCallee(&U))
- return true;
- GS.IsLoaded = true;
+ if (CB->getIntrinsicID() == Intrinsic::threadlocal_address) {
+ if (analyzeGlobalAux(I, GS, VisitedUsers))
+ return true;
+ } else {
+ if (!CB->isCallee(&U))
+ return true;
+ GS.IsLoaded = true;
+ }
} else {
return true; // Any other non-load instruction might take address!
}
diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp
index 5f456092bf4e..f3cd3104c312 100644
--- a/llvm/lib/Transforms/Utils/Local.cpp
+++ b/llvm/lib/Transforms/Utils/Local.cpp
@@ -3685,6 +3685,30 @@ DIExpression *llvm::getExpressionForConstant(DIBuilder &DIB, const Constant &C,
return nullptr;
}
+void llvm::remapDebugVariable(ValueToValueMapTy &Mapping, Instruction *Inst) {
+ auto RemapDebugOperands = [&Mapping](auto *DV, auto Set) {
+ for (auto *Op : Set) {
+ auto I = Mapping.find(Op);
+ if (I != Mapping.end())
+ DV->replaceVariableLocationOp(Op, I->second, /*AllowEmpty=*/true);
+ }
+ };
+ auto RemapAssignAddress = [&Mapping](auto *DA) {
+ auto I = Mapping.find(DA->getAddress());
+ if (I != Mapping.end())
+ DA->setAddress(I->second);
+ };
+ if (auto DVI = dyn_cast<DbgVariableIntrinsic>(Inst))
+ RemapDebugOperands(DVI, DVI->location_ops());
+ if (auto DAI = dyn_cast<DbgAssignIntrinsic>(Inst))
+ RemapAssignAddress(DAI);
+ for (DbgVariableRecord &DVR : filterDbgVars(Inst->getDbgRecordRange())) {
+ RemapDebugOperands(&DVR, DVR.location_ops());
+ if (DVR.isDbgAssign())
+ RemapAssignAddress(&DVR);
+ }
+}
+
namespace {
/// A potential constituent of a bitreverse or bswap expression. See
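remapDebugVariable above relies on a generic lambda so one body serves both DbgVariableIntrinsic and the intrinsic-free DbgVariableRecord, which share member names but no common base class. A standalone sketch of that compile-time duck typing, with invented types for illustration:

#include <cstdio>

// Two unrelated record flavors that happen to share a member name,
// standing in for the two debug-info representations.
struct IntrinsicDbg { void replaceOp(int V) { std::printf("intrinsic %d\n", V); } };
struct RecordDbg    { void replaceOp(int V) { std::printf("record %d\n", V); } };

int main() {
  // The generic lambda is instantiated once per concrete type, so a
  // single body rewrites both representations with no shared interface.
  auto Remap = [](auto *DV, int NewV) { DV->replaceOp(NewV); };
  IntrinsicDbg A;
  RecordDbg B;
  Remap(&A, 1);
  Remap(&B, 2);
  return 0;
}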
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
index 73c5d6367822..e3e09d11ba8c 100644
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -1930,10 +1930,12 @@ llvm::hasPartialIVCondition(const Loop &L, unsigned MSSAThreshold,
if (!TI || !TI->isConditional())
return {};
- auto *CondI = dyn_cast<CmpInst>(TI->getCondition());
+ auto *CondI = dyn_cast<Instruction>(TI->getCondition());
// The case with the condition outside the loop should already be handled
// earlier.
- if (!CondI || !L.contains(CondI))
+ // Allow CmpInst and TruncInst, as they may be users of load instructions
+ // and have potential for partial unswitching.
+ if (!CondI || !isa<CmpInst, TruncInst>(CondI) || !L.contains(CondI))
return {};
SmallVector<Instruction *> InstToDuplicate;
diff --git a/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp b/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp
index 7b1eb70168d8..0464ba5e1811 100644
--- a/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp
+++ b/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp
@@ -17,6 +17,7 @@
#include "llvm/Analysis/PostDominators.h"
#include "llvm/Analysis/StackSafetyAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicInst.h"
@@ -283,5 +284,37 @@ Value *getAndroidSlotPtr(IRBuilder<> &IRB, int Slot) {
IRB.CreateCall(ThreadPointerFunc), 8 * Slot);
}
+static DbgAssignIntrinsic *DynCastToDbgAssign(DbgVariableIntrinsic *DVI) {
+ return dyn_cast<DbgAssignIntrinsic>(DVI);
+}
+
+static DbgVariableRecord *DynCastToDbgAssign(DbgVariableRecord *DVR) {
+ return DVR->isDbgAssign() ? DVR : nullptr;
+}
+
+void annotateDebugRecords(AllocaInfo &Info, unsigned int Tag) {
+ // Helper utility for adding DW_OP_LLVM_tag_offset to debug-info records,
+ // abstracted over whether they're intrinsic-stored or DbgVariableRecord
+ // stored.
+ auto AnnotateDbgRecord = [&](auto *DPtr) {
+ // Prepend "tag_offset, N" to the dwarf expression.
+ // Tag offset logically applies to the alloca pointer, and it makes sense
+ // to put it at the beginning of the expression.
+ SmallVector<uint64_t, 8> NewOps = {dwarf::DW_OP_LLVM_tag_offset, Tag};
+ for (size_t LocNo = 0; LocNo < DPtr->getNumVariableLocationOps(); ++LocNo)
+ if (DPtr->getVariableLocationOp(LocNo) == Info.AI)
+ DPtr->setExpression(
+ DIExpression::appendOpsToArg(DPtr->getExpression(), NewOps, LocNo));
+ if (auto *DAI = DynCastToDbgAssign(DPtr)) {
+ if (DAI->getAddress() == Info.AI)
+ DAI->setAddressExpression(
+ DIExpression::prependOpcodes(DAI->getAddressExpression(), NewOps));
+ }
+ };
+
+ llvm::for_each(Info.DbgVariableIntrinsics, AnnotateDbgRecord);
+ llvm::for_each(Info.DbgVariableRecords, AnnotateDbgRecord);
+}
+
} // namespace memtag
} // namespace llvm
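The DynCastToDbgAssign overloads above are the complementary half of the same pattern: the generic lambda calls an overloaded free function, and overload resolution supplies the per-type "is this a dbg.assign" test. A standalone sketch with invented types:

// Each overload encodes its type's own test, mirroring
// dyn_cast<DbgAssignIntrinsic>(DVI) and DVR->isDbgAssign().
struct IntrinsicDbg { bool IsAssign = false; };
struct RecordDbg    { bool IsAssign = false; };

static IntrinsicDbg *asAssign(IntrinsicDbg *D) { return D->IsAssign ? D : nullptr; }
static RecordDbg    *asAssign(RecordDbg *D)    { return D->IsAssign ? D : nullptr; }

template <typename RecT> static bool isAssignRecord(RecT *D) {
  // Overload resolution picks the matching asAssign per instantiation.
  return asAssign(D) != nullptr;
}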
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 33c4decd58a6..c44d90f0998e 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -460,9 +460,9 @@ static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
// must use intrinsics to interleave.
if (VecTy->isScalableTy()) {
VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy);
- return Builder.CreateIntrinsic(
- WideVecTy, Intrinsic::experimental_vector_interleave2, Vals,
- /*FMFSource=*/nullptr, Name);
+ return Builder.CreateIntrinsic(WideVecTy, Intrinsic::vector_interleave2,
+ Vals,
+ /*FMFSource=*/nullptr, Name);
}
// Fixed length. Start by concatenating all vectors into a wide vector.
@@ -2517,9 +2517,8 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
SmallVector<Value *, 2> Ops = {BlockInMaskPart, BlockInMaskPart};
auto *MaskTy =
VectorType::get(Builder.getInt1Ty(), VF.getKnownMinValue() * 2, true);
- return Builder.CreateIntrinsic(
- MaskTy, Intrinsic::experimental_vector_interleave2, Ops,
- /*FMFSource=*/nullptr, "interleaved.mask");
+ return Builder.CreateIntrinsic(MaskTy, Intrinsic::vector_interleave2, Ops,
+ /*FMFSource=*/nullptr, "interleaved.mask");
}
if (!BlockInMask)
@@ -2571,7 +2570,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
// Scalable vectors cannot use arbitrary shufflevectors (only splats),
// so must use intrinsics to deinterleave.
Value *DI = Builder.CreateIntrinsic(
- Intrinsic::experimental_vector_deinterleave2, VecTy, NewLoads[Part],
+ Intrinsic::vector_deinterleave2, VecTy, NewLoads[Part],
/*FMFSource=*/nullptr, "strided.vec");
unsigned J = 0;
for (unsigned I = 0; I < InterleaveFactor; ++I) {
@@ -4167,7 +4166,6 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
// Worklist containing uniform instructions demanding lane 0.
SetVector<Instruction *> Worklist;
- BasicBlock *Latch = TheLoop->getLoopLatch();
// Add uniform instructions demanding lane 0 to the worklist. Instructions
// that are scalar with predication must not be considered uniform after
@@ -4189,12 +4187,16 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
Worklist.insert(I);
};
- // Start with the conditional branch. If the branch condition is an
- // instruction contained in the loop that is only used by the branch, it is
- // uniform.
- auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
- if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
- addToWorklistIfAllowed(Cmp);
+ // Start with the conditional branches exiting the loop. If the branch
+ // condition is an instruction contained in the loop that is only used by the
+ // branch, it is uniform.
+ SmallVector<BasicBlock *> Exiting;
+ TheLoop->getExitingBlocks(Exiting);
+ for (BasicBlock *E : Exiting) {
+ auto *Cmp = dyn_cast<Instruction>(E->getTerminator()->getOperand(0));
+ if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
+ addToWorklistIfAllowed(Cmp);
+ }
auto PrevVF = VF.divideCoefficientBy(2);
// Return true if all lanes perform the same memory operation, and we can
@@ -4335,6 +4337,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
// nodes separately. An induction variable will remain uniform if all users
// of the induction variable and induction variable update remain uniform.
// The code below handles both pointer and non-pointer induction variables.
+ BasicBlock *Latch = TheLoop->getLoopLatch();
for (const auto &Induction : Legal->getInductionVars()) {
auto *Ind = Induction.first;
auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
@@ -6873,11 +6876,15 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
// In cases of scalarized and predicated instructions, there will be VF
// predicated blocks in the vectorized loop. Each branch around these
// blocks requires also an extract of its vector compare i1 element.
+ // Note that the conditional branch from the loop latch will be replaced by
+ // a single branch controlling the loop, so there is no extra overhead from
+ // scalarization.
bool ScalarPredicatedBB = false;
BranchInst *BI = cast<BranchInst>(I);
if (VF.isVector() && BI->isConditional() &&
(PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(0)) ||
- PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1))))
+ PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1))) &&
+ BI->getParent() != TheLoop->getLoopLatch())
ScalarPredicatedBB = true;
if (ScalarPredicatedBB) {
@@ -8817,12 +8824,24 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
// Only handle constant strides for now.
if (!ScevStride)
continue;
- Constant *CI = ConstantInt::get(Stride->getType(), ScevStride->getAPInt());
- auto *ConstVPV = Plan->getOrAddLiveIn(CI);
- // The versioned value may not be used in the loop directly, so just add a
- // new live-in in those cases.
- Plan->getOrAddLiveIn(StrideV)->replaceAllUsesWith(ConstVPV);
+ auto *CI = Plan->getOrAddLiveIn(
+ ConstantInt::get(Stride->getType(), ScevStride->getAPInt()));
+ if (VPValue *StrideVPV = Plan->getLiveIn(StrideV))
+ StrideVPV->replaceAllUsesWith(CI);
+
+ // The versioned value may not be used in the loop directly but through a
+ // sext/zext. Add new live-ins in those cases.
+ for (Value *U : StrideV->users()) {
+ if (!isa<SExtInst, ZExtInst>(U))
+ continue;
+ VPValue *StrideVPV = Plan->getLiveIn(U);
+ if (!StrideVPV)
+ continue;
+ VPValue *CI = Plan->getOrAddLiveIn(ConstantInt::get(
+ U->getType(), ScevStride->getAPInt().getSExtValue()));
+ StrideVPV->replaceAllUsesWith(CI);
+ }
}
VPlanTransforms::dropPoisonGeneratingRecipes(*Plan, [this](BasicBlock *BB) {
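The collectLoopUniforms change above seeds uniformity from every exiting block instead of only the latch, so multi-exit loops also get their exit compares considered. A hedged sketch of that scan using the same Loop and IR APIs the hunk relies on; MarkUniform stands in for addToWorklistIfAllowed:

#include "llvm/ADT/STLFunctionalExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

static void seedUniformExitConditions(
    Loop *L, function_ref<void(Instruction *)> MarkUniform) {
  SmallVector<BasicBlock *> Exiting;
  L->getExitingBlocks(Exiting);
  for (BasicBlock *E : Exiting)
    // An exiting terminator has a successor, so operand 0 exists; it is
    // the condition only for conditional branches, and the dyn_cast
    // filters everything else out.
    if (auto *Cmp = dyn_cast<Instruction>(E->getTerminator()->getOperand(0)))
      if (L->contains(Cmp) && Cmp->hasOneUse())
        MarkUniform(Cmp);
}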
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index a1a28076881c..e3a1b0d39a4d 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1135,6 +1135,7 @@ public:
ScalarToTreeEntry.clear();
MultiNodeScalars.clear();
MustGather.clear();
+ NonScheduledFirst.clear();
EntryToLastInstruction.clear();
ExternalUses.clear();
ExternalUsesAsGEPs.clear();
@@ -1252,7 +1253,7 @@ public:
/// effectively impossible for the backend to undo.
/// TODO: If load combining is allowed in the IR optimizer, this analysis
/// may not be necessary.
- bool isLoadCombineCandidate() const;
+ bool isLoadCombineCandidate(ArrayRef<Value *> Stores) const;
/// Checks if the given array of loads can be represented as a vectorized,
/// scatter or just simple gather.
@@ -2356,6 +2357,14 @@ public:
bool isAnyGathered(const SmallDenseSet<Value *> &Vals) const {
return any_of(MustGather, [&](Value *V) { return Vals.contains(V); });
}
+ /// Checks if the given value is gathered in one of the nodes.
+ bool isGathered(const Value *V) const {
+ return MustGather.contains(V);
+ }
+ /// Checks if the specified value was not scheduled.
+ bool isNotScheduled(const Value *V) const {
+ return NonScheduledFirst.contains(V);
+ }
/// Check if the value is vectorized in the tree.
bool isVectorized(Value *V) const { return getTreeEntry(V); }
@@ -2478,12 +2487,12 @@ private:
/// which exploits values reused across lanes, and arranges the inserts
/// for ease of later optimization.
template <typename BVTy, typename ResTy, typename... Args>
- ResTy processBuildVector(const TreeEntry *E, Args &...Params);
+ ResTy processBuildVector(const TreeEntry *E, Type *ScalarTy, Args &...Params);
/// Create a new vector from a list of scalar values. Produces a sequence
/// which exploits values reused across lanes, and arranges the inserts
/// for ease of later optimization.
- Value *createBuildVector(const TreeEntry *E);
+ Value *createBuildVector(const TreeEntry *E, Type *ScalarTy);
/// Returns the instruction in the bundle, which can be used as a base point
/// for scheduling. Usually it is the last instruction in the bundle, except
@@ -2547,7 +2556,8 @@ private:
/// this subtree gets vectorized, we may need to extract the values from the
/// roots. This method calculates the cost of extracting the values.
/// \param ForPoisonSrc true if initial vector is poison, false otherwise.
- InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc) const;
+ InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
+ Type *ScalarTy) const;
/// Set the Builder insert point to one after the last instruction in
/// the bundle
@@ -2555,7 +2565,7 @@ private:
/// \returns a vector from a collection of scalars in \p VL. if \p Root is not
/// specified, the starting vector value is poison.
- Value *gather(ArrayRef<Value *> VL, Value *Root);
+ Value *gather(ArrayRef<Value *> VL, Value *Root, Type *ScalarTy);
/// \returns whether the VectorizableTree is fully vectorizable and will
/// be beneficial even the tree height is tiny.
@@ -3071,6 +3081,9 @@ private:
/// A list of scalars that we found that we need to keep as scalars.
ValueSet MustGather;
+ /// A set of first non-schedulable values.
+ ValueSet NonScheduledFirst;
+
/// A map between the vectorized entries and the last instructions in the
/// bundles. The bundles are built in use order, not in the def order of the
/// instructions. So, we cannot rely directly on the last instruction in the
@@ -6646,6 +6659,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
"tryScheduleBundle should cancelScheduling on failure");
newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
+ NonScheduledFirst.insert(VL.front());
return;
}
LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
@@ -7863,6 +7877,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
bool IsFinalized = false;
SmallVector<int> CommonMask;
SmallVector<PointerUnion<Value *, const TreeEntry *>, 2> InVectors;
+ Type *ScalarTy = nullptr;
const TargetTransformInfo &TTI;
InstructionCost Cost = 0;
SmallDenseSet<Value *> VectorizedVals;
@@ -7892,13 +7907,13 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
if ((!Root && allConstant(VL)) || all_of(VL, IsaPred<UndefValue>))
return TTI::TCC_Free;
- auto *VecTy = FixedVectorType::get(VL.front()->getType(), VL.size());
+ auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
InstructionCost GatherCost = 0;
SmallVector<Value *> Gathers(VL.begin(), VL.end());
// Improve gather cost for gather of loads, if we can group some of the
// loads into vector loads.
InstructionsState S = getSameOpcode(VL, *R.TLI);
- const unsigned Sz = R.DL->getTypeSizeInBits(VL.front()->getType());
+ const unsigned Sz = R.DL->getTypeSizeInBits(ScalarTy);
unsigned MinVF = R.getMinVF(2 * Sz);
if (VL.size() > 2 &&
((S.getOpcode() == Instruction::Load && !S.isAltShuffle()) ||
@@ -7912,7 +7927,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
}))) &&
!all_of(Gathers, [&](Value *V) { return R.getTreeEntry(V); }) &&
!isSplat(Gathers)) {
- InstructionCost BaseCost = R.getGatherCost(Gathers, !Root);
+ InstructionCost BaseCost = R.getGatherCost(Gathers, !Root, ScalarTy);
SetVector<Value *> VectorizedLoads;
SmallVector<std::pair<unsigned, LoadsState>> VectorizedStarts;
SmallVector<unsigned> ScatterVectorized;
@@ -8040,7 +8055,8 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
VecTy, Mask, CostKind);
}
} else {
- GatherCost += R.getGatherCost(PointerOps, /*ForPoisonSrc=*/true);
+ GatherCost += R.getGatherCost(PointerOps, /*ForPoisonSrc=*/true,
+ PointerOps.front()->getType());
}
}
if (NeedInsertSubvectorAnalysis) {
@@ -8074,18 +8090,19 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
transform(VL, ShuffleMask.begin(), [](Value *V) {
return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
});
- InstructionCost InsertCost = TTI.getVectorInstrCost(
- Instruction::InsertElement, VecTy, CostKind, 0,
- PoisonValue::get(VecTy), *It);
- return InsertCost +
- TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy,
- ShuffleMask, CostKind, /*Index=*/0,
- /*SubTp=*/nullptr, /*Args=*/*It);
+ InstructionCost InsertCost =
+ TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0,
+ PoisonValue::get(VecTy), *It);
+ return InsertCost + TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast,
+ VecTy, ShuffleMask, CostKind,
+ /*Index=*/0, /*SubTp=*/nullptr,
+ /*Args=*/*It);
}
return GatherCost +
(all_of(Gathers, IsaPred<UndefValue>)
? TTI::TCC_Free
- : R.getGatherCost(Gathers, !Root && VL.equals(Gathers)));
+ : R.getGatherCost(Gathers, !Root && VL.equals(Gathers),
+ ScalarTy));
};
/// Compute the cost of creating a vector containing the extracted values from
@@ -8105,8 +8122,8 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
return Sz;
return std::max(Sz, VecTy->getNumElements());
});
- unsigned NumSrcRegs = TTI.getNumberOfParts(
- FixedVectorType::get(VL.front()->getType(), NumElts));
+ unsigned NumSrcRegs =
+ TTI.getNumberOfParts(FixedVectorType::get(ScalarTy, NumElts));
if (NumSrcRegs == 0)
NumSrcRegs = 1;
// FIXME: this must be moved to TTI for better estimation.
@@ -8152,17 +8169,16 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
std::optional<TTI::ShuffleKind> RegShuffleKind =
CheckPerRegistersShuffle(SubMask);
if (!RegShuffleKind) {
- Cost += ::getShuffleCost(
- TTI, *ShuffleKinds[Part],
- FixedVectorType::get(VL.front()->getType(), NumElts), MaskSlice);
+ Cost += ::getShuffleCost(TTI, *ShuffleKinds[Part],
+ FixedVectorType::get(ScalarTy, NumElts),
+ MaskSlice);
continue;
}
if (*RegShuffleKind != TTI::SK_PermuteSingleSrc ||
!ShuffleVectorInst::isIdentityMask(SubMask, EltsPerVector)) {
- Cost += ::getShuffleCost(
- TTI, *RegShuffleKind,
- FixedVectorType::get(VL.front()->getType(), EltsPerVector),
- SubMask);
+ Cost += ::getShuffleCost(TTI, *RegShuffleKind,
+ FixedVectorType::get(ScalarTy, EltsPerVector),
+ SubMask);
}
}
return Cost;
@@ -8279,6 +8295,48 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
SmallVector<int> CommonMask(Mask.begin(), Mask.end());
Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>();
unsigned CommonVF = Mask.size();
+ InstructionCost ExtraCost = 0;
+ auto GetNodeMinBWAffectedCost = [&](const TreeEntry &E,
+ unsigned VF) -> InstructionCost {
+ if (E.State == TreeEntry::NeedToGather && allConstant(E.Scalars))
+ return TTI::TCC_Free;
+ Type *EScalarTy = E.Scalars.front()->getType();
+ bool IsSigned = true;
+ if (auto It = R.MinBWs.find(&E); It != R.MinBWs.end()) {
+ EScalarTy = IntegerType::get(EScalarTy->getContext(), It->second.first);
+ IsSigned = It->second.second;
+ }
+ if (EScalarTy != ScalarTy) {
+ unsigned CastOpcode = Instruction::Trunc;
+ unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
+ unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
+ if (DstSz > SrcSz)
+ CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
+ return TTI.getCastInstrCost(CastOpcode,
+ FixedVectorType::get(ScalarTy, VF),
+ FixedVectorType::get(EScalarTy, VF),
+ TTI::CastContextHint::None, CostKind);
+ }
+ return TTI::TCC_Free;
+ };
+ auto GetValueMinBWAffectedCost = [&](const Value *V) -> InstructionCost {
+ if (isa<Constant>(V))
+ return TTI::TCC_Free;
+ auto *VecTy = cast<VectorType>(V->getType());
+ Type *EScalarTy = VecTy->getElementType();
+ if (EScalarTy != ScalarTy) {
+ bool IsSigned = !isKnownNonNegative(V, SimplifyQuery(*R.DL));
+ unsigned CastOpcode = Instruction::Trunc;
+ unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
+ unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
+ if (DstSz > SrcSz)
+ CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
+ return TTI.getCastInstrCost(
+ CastOpcode, VectorType::get(ScalarTy, VecTy->getElementCount()),
+ VecTy, TTI::CastContextHint::None, CostKind);
+ }
+ return TTI::TCC_Free;
+ };
if (!V1 && !V2 && !P2.isNull()) {
// Shuffle 2 entry nodes.
const TreeEntry *E = P1.get<const TreeEntry *>();
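GetNodeMinBWAffectedCost above charges for the cast needed when a node's scalars were narrowed by the min-bitwidth analysis to a width other than the shuffle's ScalarTy. A worked standalone sketch of the opcode choice, with widths assumed for illustration:

// Mirrors the rule above: equal widths cost nothing extra; a narrower
// node is widened with sext/zext per its recorded signedness; a wider
// node is truncated. The chosen opcode is then priced via
// TTI.getCastInstrCost on <VF x SrcTy> -> <VF x DstTy>.
enum CastKind { NoCast, Trunc, SExt, ZExt };

static CastKind castNeeded(unsigned DstBits, unsigned SrcBits, bool IsSigned) {
  if (DstBits == SrcBits)
    return NoCast;                 // same type: TCC_Free
  if (DstBits > SrcBits)
    return IsSigned ? SExt : ZExt; // widen minimized scalars
  return Trunc;                    // narrow to the shuffle type
}

// castNeeded(32, 16, /*IsSigned=*/true) == SExt, e.g. an i16-minimized
// node feeding an i32 shuffle; castNeeded(32, 32, false) == NoCast.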
@@ -8305,11 +8363,14 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
}
}
CommonVF = E->Scalars.size();
+ ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
+ GetNodeMinBWAffectedCost(*E2, CommonVF);
+ } else {
+ ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
+ GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
}
- V1 = Constant::getNullValue(
- FixedVectorType::get(E->Scalars.front()->getType(), CommonVF));
- V2 = getAllOnesValue(
- *R.DL, FixedVectorType::get(E->Scalars.front()->getType(), CommonVF));
+ V1 = Constant::getNullValue(FixedVectorType::get(ScalarTy, CommonVF));
+ V2 = getAllOnesValue(*R.DL, FixedVectorType::get(ScalarTy, CommonVF));
} else if (!V1 && P2.isNull()) {
// Shuffle single entry node.
const TreeEntry *E = P1.get<const TreeEntry *>();
@@ -8328,8 +8389,8 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
}
CommonVF = E->Scalars.size();
}
- V1 = Constant::getNullValue(
- FixedVectorType::get(E->Scalars.front()->getType(), CommonVF));
+ ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
+ V1 = Constant::getNullValue(FixedVectorType::get(ScalarTy, CommonVF));
// Not identity/broadcast? Try to see if the original vector is better.
if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
CommonVF == CommonMask.size() &&
@@ -8346,6 +8407,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
}
} else if (V1 && P2.isNull()) {
// Shuffle single vector.
+ ExtraCost += GetValueMinBWAffectedCost(V1);
CommonVF = cast<FixedVectorType>(V1->getType())->getNumElements();
assert(
all_of(Mask,
@@ -8372,11 +8434,11 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
}
CommonVF = VF;
}
- V1 = Constant::getNullValue(
- FixedVectorType::get(E2->Scalars.front()->getType(), CommonVF));
- V2 = getAllOnesValue(
- *R.DL,
- FixedVectorType::get(E2->Scalars.front()->getType(), CommonVF));
+ ExtraCost += GetValueMinBWAffectedCost(V1);
+ V1 = Constant::getNullValue(FixedVectorType::get(ScalarTy, CommonVF));
+ ExtraCost += GetNodeMinBWAffectedCost(
+ *E2, std::min(CommonVF, E2->getVectorFactor()));
+ V2 = getAllOnesValue(*R.DL, FixedVectorType::get(ScalarTy, CommonVF));
} else if (!V1 && V2) {
// Shuffle vector and tree node.
unsigned VF = cast<FixedVectorType>(V2->getType())->getNumElements();
@@ -8400,11 +8462,11 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
}
CommonVF = VF;
}
- V1 = Constant::getNullValue(
- FixedVectorType::get(E1->Scalars.front()->getType(), CommonVF));
- V2 = getAllOnesValue(
- *R.DL,
- FixedVectorType::get(E1->Scalars.front()->getType(), CommonVF));
+ ExtraCost += GetNodeMinBWAffectedCost(
+ *E1, std::min(CommonVF, E1->getVectorFactor()));
+ V1 = Constant::getNullValue(FixedVectorType::get(ScalarTy, CommonVF));
+ ExtraCost += GetValueMinBWAffectedCost(V2);
+ V2 = getAllOnesValue(*R.DL, FixedVectorType::get(ScalarTy, CommonVF));
} else {
assert(V1 && V2 && "Expected both vectors.");
unsigned VF = cast<FixedVectorType>(V1->getType())->getNumElements();
@@ -8415,30 +8477,33 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
return Idx < 2 * static_cast<int>(CommonVF);
}) &&
"All elements in mask must be less than 2 * CommonVF.");
+ ExtraCost +=
+ GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
if (V1->getType() != V2->getType()) {
- V1 = Constant::getNullValue(FixedVectorType::get(
- cast<FixedVectorType>(V1->getType())->getElementType(), CommonVF));
- V2 = getAllOnesValue(
- *R.DL, FixedVectorType::get(
- cast<FixedVectorType>(V1->getType())->getElementType(),
- CommonVF));
+ V1 = Constant::getNullValue(FixedVectorType::get(ScalarTy, CommonVF));
+ V2 = getAllOnesValue(*R.DL, FixedVectorType::get(ScalarTy, CommonVF));
+ } else {
+ if (cast<VectorType>(V1->getType())->getElementType() != ScalarTy)
+ V1 = Constant::getNullValue(FixedVectorType::get(ScalarTy, CommonVF));
+ if (cast<VectorType>(V2->getType())->getElementType() != ScalarTy)
+ V2 = getAllOnesValue(*R.DL, FixedVectorType::get(ScalarTy, CommonVF));
}
}
- InVectors.front() = Constant::getNullValue(FixedVectorType::get(
- cast<FixedVectorType>(V1->getType())->getElementType(),
- CommonMask.size()));
+ InVectors.front() = Constant::getNullValue(
+ FixedVectorType::get(ScalarTy, CommonMask.size()));
if (InVectors.size() == 2)
InVectors.pop_back();
- return BaseShuffleAnalysis::createShuffle<InstructionCost>(
- V1, V2, CommonMask, Builder);
+ return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
+ V1, V2, CommonMask, Builder);
}
public:
- ShuffleCostEstimator(TargetTransformInfo &TTI,
+ ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI,
ArrayRef<Value *> VectorizedVals, BoUpSLP &R,
SmallPtrSetImpl<Value *> &CheckedExtracts)
- : TTI(TTI), VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()),
- R(R), CheckedExtracts(CheckedExtracts) {}
+ : ScalarTy(ScalarTy), TTI(TTI),
+ VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
+ CheckedExtracts(CheckedExtracts) {}
Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
unsigned NumParts, bool &UseVecBaseAsInput) {
@@ -8486,6 +8551,12 @@ public:
const TreeEntry *VE = R.getTreeEntry(V);
if (!CheckedExtracts.insert(V).second ||
!R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) ||
+ any_of(EE->users(),
+ [&](User *U) {
+ return isa<GetElementPtrInst>(U) &&
+ !R.areAllUsersVectorized(cast<Instruction>(U),
+ &VectorizedVals);
+ }) ||
(VE && VE != E))
continue;
std::optional<unsigned> EEIdx = getExtractIndex(EE);
@@ -8528,7 +8599,7 @@ public:
if (NumParts != 1 && UniqueBases.size() != 1) {
UseVecBaseAsInput = true;
VecBase = Constant::getNullValue(
- FixedVectorType::get(VL.front()->getType(), CommonMask.size()));
+ FixedVectorType::get(ScalarTy, CommonMask.size()));
}
return VecBase;
}
@@ -8556,8 +8627,7 @@ public:
return;
}
assert(!CommonMask.empty() && "Expected non-empty common mask.");
- auto *MaskVecTy =
- FixedVectorType::get(E1.Scalars.front()->getType(), Mask.size());
+ auto *MaskVecTy = FixedVectorType::get(ScalarTy, Mask.size());
unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
if (NumParts == 0 || NumParts >= Mask.size())
NumParts = 1;
@@ -8574,8 +8644,7 @@ public:
return;
}
assert(!CommonMask.empty() && "Expected non-empty common mask.");
- auto *MaskVecTy =
- FixedVectorType::get(E1.Scalars.front()->getType(), Mask.size());
+ auto *MaskVecTy = FixedVectorType::get(ScalarTy, Mask.size());
unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
if (NumParts == 0 || NumParts >= Mask.size())
NumParts = 1;
@@ -8675,7 +8744,7 @@ public:
return ConstantVector::getSplat(
ElementCount::getFixed(
cast<FixedVectorType>(Root->getType())->getNumElements()),
- getAllOnesValue(*R.DL, VL.front()->getType()));
+ getAllOnesValue(*R.DL, ScalarTy));
}
InstructionCost createFreeze(InstructionCost Cost) { return Cost; }
/// Finalize emission of the shuffles.
@@ -8821,7 +8890,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
if (isa<InsertElementInst>(VL[0]))
return InstructionCost::getInvalid();
return processBuildVector<ShuffleCostEstimator, InstructionCost>(
- E, *TTI, VectorizedVals, *this, CheckedExtracts);
+ E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts);
}
InstructionCost CommonCost = 0;
SmallVector<int> Mask;
@@ -9587,11 +9656,11 @@ bool BoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const {
/* MatchOr */ false);
}
-bool BoUpSLP::isLoadCombineCandidate() const {
+bool BoUpSLP::isLoadCombineCandidate(ArrayRef<Value *> Stores) const {
// Peek through a final sequence of stores and check if all operations are
// likely to be load-combined.
- unsigned NumElts = VectorizableTree[0]->Scalars.size();
- for (Value *Scalar : VectorizableTree[0]->Scalars) {
+ unsigned NumElts = Stores.size();
+ for (Value *Scalar : Stores) {
Value *X;
if (!match(Scalar, m_Store(m_Value(X), m_Value())) ||
!isLoadCombineCandidateImpl(X, NumElts, TTI, /* MatchOr */ true))
@@ -10861,12 +10930,8 @@ BoUpSLP::isGatherShuffledEntry(
return Res;
}
-InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL,
- bool ForPoisonSrc) const {
- // Find the type of the operands in VL.
- Type *ScalarTy = VL[0]->getType();
- if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
- ScalarTy = SI->getValueOperand()->getType();
+InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
+ Type *ScalarTy) const {
auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
bool DuplicateNonConst = false;
// Find the cost of inserting/extracting values from the vector.
@@ -10877,6 +10942,11 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL,
constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
InstructionCost Cost;
auto EstimateInsertCost = [&](unsigned I, Value *V) {
+ if (V->getType() != ScalarTy) {
+ Cost += TTI->getCastInstrCost(Instruction::Trunc, ScalarTy, V->getType(),
+ TTI::CastContextHint::None, CostKind);
+ V = nullptr;
+ }
if (!ForPoisonSrc)
Cost +=
TTI->getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind,
@@ -11104,7 +11174,7 @@ void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
Builder.SetCurrentDebugLocation(Front->getDebugLoc());
}
-Value *BoUpSLP::gather(ArrayRef<Value *> VL, Value *Root) {
+Value *BoUpSLP::gather(ArrayRef<Value *> VL, Value *Root, Type *ScalarTy) {
// List of instructions/lanes from current block and/or the blocks which are
// part of the current loop. These instructions will be inserted at the end to
// make it possible to optimize loops and hoist invariant instructions out of
@@ -11130,14 +11200,11 @@ Value *BoUpSLP::gather(ArrayRef<Value *> VL, Value *Root) {
auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos,
Type *Ty) {
Value *Scalar = V;
- if (cast<VectorType>(Vec->getType())->getElementType() != Ty) {
- assert(V->getType()->isIntegerTy() && Ty->isIntegerTy() &&
+ if (Scalar->getType() != Ty) {
+ assert(Scalar->getType()->isIntegerTy() && Ty->isIntegerTy() &&
"Expected integer types only.");
- Vec = Builder.CreateIntCast(
- Vec,
- VectorType::get(Ty,
- cast<VectorType>(Vec->getType())->getElementCount()),
- !isKnownNonNegative(Vec, SimplifyQuery(*DL)));
+ Scalar = Builder.CreateIntCast(
+ Scalar, Ty, !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
}
Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
@@ -11165,10 +11232,7 @@ Value *BoUpSLP::gather(ArrayRef<Value *> VL, Value *Root) {
}
return Vec;
};
- Value *Val0 =
- isa<StoreInst>(VL[0]) ? cast<StoreInst>(VL[0])->getValueOperand() : VL[0];
- Type *ScalarTy = Val0->getType();
- FixedVectorType *VecTy = FixedVectorType::get(ScalarTy, VL.size());
+ auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
Value *Vec = Root ? Root : PoisonValue::get(VecTy);
SmallVector<int> NonConsts;
// Insert constant values at first.
@@ -11247,6 +11311,7 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
/// resulting shuffle and the second operand sets to be the newly added
/// operand. The \p CommonMask is transformed in the proper way after that.
SmallVector<Value *, 2> InVectors;
+ Type *ScalarTy = nullptr;
IRBuilderBase &Builder;
BoUpSLP &R;
@@ -11357,9 +11422,20 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
CommonMask[Idx] = Idx;
}
+  /// Cast value \p V to a vector type with the same number of elements, but
+  /// with \p ScalarTy as the element type.
+ Value *castToScalarTyElem(Value *V) {
+ auto *VecTy = cast<VectorType>(V->getType());
+ if (VecTy->getElementType() == ScalarTy)
+ return V;
+ return Builder.CreateIntCast(
+ V, VectorType::get(ScalarTy, VecTy->getElementCount()),
+ !isKnownNonNegative(V, SimplifyQuery(*R.DL)));
+ }
+
public:
- ShuffleInstructionBuilder(IRBuilderBase &Builder, BoUpSLP &R)
- : Builder(Builder), R(R) {}
+ ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
+ : ScalarTy(ScalarTy), Builder(Builder), R(R) {}
/// Adjusts extractelements after reusing them.
Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
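
(The helper above decides between sign- and zero-extension from value-range information. A small self-contained sketch of that decision, assuming a widening cast; Vec and its KnownNonNegative flag are hypothetical stand-ins for an LLVM vector value and the isKnownNonNegative query.)

#include <cstdio>
#include <string>

struct Vec {
  std::string Name;
  int EltBits;
  bool KnownNonNegative; // plays the role of isKnownNonNegative(V, ...)
};

// Keep the element count, retype the elements: values known non-negative
// may be zero-extended, everything else is sign-extended.
std::string castToElemTy(const Vec &V, int DstBits) {
  if (V.EltBits == DstBits)
    return V.Name; // already has the requested element type
  const char *Cast = V.KnownNonNegative ? "zext" : "sext";
  return std::string(Cast) + " " + V.Name + " to i" + std::to_string(DstBits);
}

int main() {
  Vec A{"%a", 8, true}, B{"%b", 8, false};
  std::printf("%s\n%s\n", castToElemTy(A, 16).c_str(),
              castToElemTy(B, 16).c_str());
}
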
@@ -11384,6 +11460,8 @@ public:
any_of(EI->users(), [&](User *U) {
const TreeEntry *UTE = R.getTreeEntry(U);
return !UTE || R.MultiNodeScalars.contains(U) ||
+ (isa<GetElementPtrInst>(U) &&
+ !R.areAllUsersVectorized(cast<Instruction>(U))) ||
count_if(R.VectorizableTree,
[&](const std::unique_ptr<TreeEntry> &TE) {
return any_of(TE->UserTreeIndices,
@@ -11396,8 +11474,10 @@ public:
continue;
R.eraseInstruction(EI);
}
- if (NumParts == 1 || UniqueBases.size() == 1)
+ if (NumParts == 1 || UniqueBases.size() == 1) {
+ VecBase = castToScalarTyElem(VecBase);
return VecBase;
+ }
UseVecBaseAsInput = true;
auto TransformToIdentity = [](MutableArrayRef<int> Mask) {
for (auto [I, Idx] : enumerate(Mask))
@@ -11434,6 +11514,7 @@ public:
"Expected vectors of the same size.");
PrevSize = Size;
#endif // NDEBUG
+ VecOp = castToScalarTyElem(VecOp);
Bases[SubMask[I] < Size ? 0 : 1] = VecOp;
}
if (!Bases.front())
@@ -11489,10 +11570,10 @@ public:
return std::nullopt;
// Postpone gather emission, will be emitted after the end of the
// process to keep correct order.
- auto *VecTy = FixedVectorType::get(E->Scalars.front()->getType(),
- E->getVectorFactor());
+ auto *ResVecTy = FixedVectorType::get(ScalarTy, E->getVectorFactor());
return Builder.CreateAlignedLoad(
- VecTy, PoisonValue::get(PointerType::getUnqual(VecTy->getContext())),
+ ResVecTy,
+ PoisonValue::get(PointerType::getUnqual(ScalarTy->getContext())),
MaybeAlign());
}
/// Adds 2 input vectors (in form of tree entries) and the mask for their
@@ -11508,6 +11589,8 @@ public:
/// Adds 2 input vectors and the mask for their shuffling.
void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
+ V1 = castToScalarTyElem(V1);
+ V2 = castToScalarTyElem(V2);
if (InVectors.empty()) {
InVectors.push_back(V1);
InVectors.push_back(V2);
@@ -11535,6 +11618,7 @@ public:
}
/// Adds another one input vector and the mask for the shuffling.
void add(Value *V1, ArrayRef<int> Mask, bool = false) {
+ V1 = castToScalarTyElem(V1);
if (InVectors.empty()) {
if (!isa<FixedVectorType>(V1->getType())) {
V1 = createShuffle(V1, nullptr, CommonMask);
@@ -11598,7 +11682,7 @@ public:
}
Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
Value *Root = nullptr) {
- return R.gather(VL, Root);
+ return R.gather(VL, Root, ScalarTy);
}
Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); }
/// Finalize emission of the shuffles.
@@ -11698,7 +11782,8 @@ Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx,
}
if (IsSameVE) {
auto FinalShuffle = [&](Value *V, ArrayRef<int> Mask) {
- ShuffleInstructionBuilder ShuffleBuilder(Builder, *this);
+ ShuffleInstructionBuilder ShuffleBuilder(
+ cast<VectorType>(V->getType())->getElementType(), Builder, *this);
ShuffleBuilder.add(V, Mask);
return ShuffleBuilder.finalize(std::nullopt);
};
@@ -11773,7 +11858,8 @@ Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx,
}
template <typename BVTy, typename ResTy, typename... Args>
-ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
+ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
+ Args &...Params) {
assert(E->State == TreeEntry::NeedToGather && "Expected gather node.");
unsigned VF = E->getVectorFactor();
@@ -11821,7 +11907,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
}
return true;
};
- BVTy ShuffleBuilder(Params...);
+ BVTy ShuffleBuilder(ScalarTy, Params...);
ResTy Res = ResTy();
SmallVector<int> Mask;
SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem);
@@ -11830,7 +11916,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
bool UseVecBaseAsInput = false;
SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> GatherShuffles;
SmallVector<SmallVector<const TreeEntry *>> Entries;
- Type *ScalarTy = GatheredScalars.front()->getType();
+ Type *OrigScalarTy = GatheredScalars.front()->getType();
auto *VecTy = FixedVectorType::get(ScalarTy, GatheredScalars.size());
unsigned NumParts = TTI->getNumberOfParts(VecTy);
if (NumParts == 0 || NumParts >= GatheredScalars.size())
@@ -11865,7 +11951,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
GatheredScalars.size() != VF) {
Resized = true;
GatheredScalars.append(VF - GatheredScalars.size(),
- PoisonValue::get(ScalarTy));
+ PoisonValue::get(OrigScalarTy));
}
}
}
@@ -11925,12 +12011,12 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
});
}))
GatheredScalars.append(VF - GatheredScalars.size(),
- PoisonValue::get(ScalarTy));
+ PoisonValue::get(OrigScalarTy));
}
// Remove shuffled elements from list of gathers.
for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
if (Mask[I] != PoisonMaskElem)
- GatheredScalars[I] = PoisonValue::get(ScalarTy);
+ GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
}
}
}
@@ -11941,7 +12027,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
// such sequences.
bool IsSplat = IsRootPoison && isSplat(Scalars) &&
(Scalars.size() > 2 || Scalars.front() == Scalars.back());
- Scalars.append(VF - Scalars.size(), PoisonValue::get(ScalarTy));
+ Scalars.append(VF - Scalars.size(), PoisonValue::get(OrigScalarTy));
SmallVector<int> UndefPos;
DenseMap<Value *, unsigned> UniquePositions;
// Gather unique non-const values and all constant values.
@@ -11963,7 +12049,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
++NumNonConsts;
SinglePos = I;
Value *OrigV = V;
- Scalars[I] = PoisonValue::get(ScalarTy);
+ Scalars[I] = PoisonValue::get(OrigScalarTy);
if (IsSplat) {
Scalars.front() = OrigV;
ReuseMask[I] = 0;
@@ -11979,7 +12065,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
ReuseMask.assign(VF, PoisonMaskElem);
std::swap(Scalars.front(), Scalars[SinglePos]);
if (!UndefPos.empty() && UndefPos.front() == 0)
- Scalars.front() = UndefValue::get(ScalarTy);
+ Scalars.front() = UndefValue::get(OrigScalarTy);
}
ReuseMask[SinglePos] = SinglePos;
} else if (!UndefPos.empty() && IsSplat) {
@@ -12009,7 +12095,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
// Replace the undef by the poison, in the mask it is replaced by
// non-poisoned scalar already.
if (I != Pos)
- Scalars[I] = PoisonValue::get(ScalarTy);
+ Scalars[I] = PoisonValue::get(OrigScalarTy);
}
} else {
// Replace undefs by the poisons, emit broadcast and then emit
@@ -12017,7 +12103,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
for (int I : UndefPos) {
ReuseMask[I] = PoisonMaskElem;
if (isa<UndefValue>(Scalars[I]))
- Scalars[I] = PoisonValue::get(ScalarTy);
+ Scalars[I] = PoisonValue::get(OrigScalarTy);
}
NeedFreeze = true;
}
@@ -12072,9 +12158,8 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1);
} else {
IsUsedInExpr = false;
- ShuffleBuilder.add(PoisonValue::get(FixedVectorType::get(
- ScalarTy, GatheredScalars.size())),
- ExtractMask, /*ForExtracts=*/true);
+ ShuffleBuilder.add(PoisonValue::get(VecTy), ExtractMask,
+ /*ForExtracts=*/true);
}
}
if (!GatherShuffles.empty()) {
@@ -12155,9 +12240,9 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
// contains only constant to build final vector and then shuffle.
for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[I]))
- NonConstants[I] = PoisonValue::get(ScalarTy);
+ NonConstants[I] = PoisonValue::get(OrigScalarTy);
else
- GatheredScalars[I] = PoisonValue::get(ScalarTy);
+ GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
}
// Generate constants for final shuffle and build a mask for them.
if (!all_of(GatheredScalars, IsaPred<PoisonValue>)) {
@@ -12203,9 +12288,9 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
return Res;
}
-Value *BoUpSLP::createBuildVector(const TreeEntry *E) {
- return processBuildVector<ShuffleInstructionBuilder, Value *>(E, Builder,
- *this);
+Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy) {
+ return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
+ Builder, *this);
}
Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
@@ -12218,18 +12303,28 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
return E->VectorizedValue;
}
+ Value *V = E->Scalars.front();
+ Type *ScalarTy = V->getType();
+ if (auto *Store = dyn_cast<StoreInst>(V))
+ ScalarTy = Store->getValueOperand()->getType();
+ else if (auto *IE = dyn_cast<InsertElementInst>(V))
+ ScalarTy = IE->getOperand(1)->getType();
+ auto It = MinBWs.find(E);
+ if (It != MinBWs.end())
+ ScalarTy = IntegerType::get(F->getContext(), It->second.first);
+ auto *VecTy = FixedVectorType::get(ScalarTy, E->Scalars.size());
if (E->State == TreeEntry::NeedToGather) {
// Set insert point for non-reduction initial nodes.
if (E->getMainOp() && E->Idx == 0 && !UserIgnoreList)
setInsertPointAfterBundle(E);
- Value *Vec = createBuildVector(E);
+ Value *Vec = createBuildVector(E, ScalarTy);
E->VectorizedValue = Vec;
return Vec;
}
bool IsReverseOrder = isReverseOrder(E->ReorderIndices);
auto FinalShuffle = [&](Value *V, const TreeEntry *E, VectorType *VecTy) {
- ShuffleInstructionBuilder ShuffleBuilder(Builder, *this);
+ ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this);
if (E->getOpcode() == Instruction::Store) {
ArrayRef<int> Mask =
ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
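
(The hunk above hoists the scalar-type selection so that gathered nodes also honor MinBWs. A sketch of the priority order, with a hypothetical Node standing in for a TreeEntry's first scalar:)

#include <cstdio>
#include <optional>
#include <string>

struct Node {
  std::string Ty;                        // the value's own type
  std::optional<std::string> StoredTy;   // set if the value is a store
  std::optional<std::string> InsertedTy; // set if it is an insertelement
};

// Stores contribute their value operand's type, insertelements their scalar
// operand's type, and a MinBWs entry overrides both with the minimized
// integer type.
std::string pickScalarTy(const Node &N, std::optional<unsigned> MinBW) {
  std::string Ty = N.Ty;
  if (N.StoredTy)
    Ty = *N.StoredTy;
  else if (N.InsertedTy)
    Ty = *N.InsertedTy;
  if (MinBW)
    Ty = "i" + std::to_string(*MinBW); // bitwidth minimization wins
  return Ty;
}

int main() {
  Node Store{"void", "i32", std::nullopt};
  std::printf("%s\n", pickScalarTy(Store, 16u).c_str()); // prints i16
}
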
@@ -12250,14 +12345,6 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
unsigned ShuffleOrOp =
E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
Instruction *VL0 = E->getMainOp();
- Type *ScalarTy = VL0->getType();
- if (auto *Store = dyn_cast<StoreInst>(VL0))
- ScalarTy = Store->getValueOperand()->getType();
- else if (auto *IE = dyn_cast<InsertElementInst>(VL0))
- ScalarTy = IE->getOperand(1)->getType();
- auto It = MinBWs.find(E);
- if (It != MinBWs.end())
- ScalarTy = IntegerType::get(F->getContext(), It->second.first);
auto GetOperandSignedness = [&](unsigned Idx) {
const TreeEntry *OpE = getOperandEntry(E, Idx);
bool IsSigned = false;
@@ -12270,7 +12357,6 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
});
return IsSigned;
};
- auto *VecTy = FixedVectorType::get(ScalarTy, E->Scalars.size());
switch (ShuffleOrOp) {
case Instruction::PHI: {
assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
@@ -13145,7 +13231,8 @@ Value *BoUpSLP::vectorizeTree(
auto *TE = const_cast<TreeEntry *>(E);
if (auto *VecTE = getTreeEntry(TE->Scalars.front()))
if (VecTE->isSame(TE->UserTreeIndices.front().UserTE->getOperand(
- TE->UserTreeIndices.front().EdgeIdx)))
+ TE->UserTreeIndices.front().EdgeIdx)) &&
+ VecTE->isSame(TE->Scalars))
// Found gather node which is absolutely the same as one of the
// vectorized nodes. It may happen after reordering.
continue;
@@ -13316,8 +13403,11 @@ Value *BoUpSLP::vectorizeTree(
// Leave the GEPs as is, they are free in most cases and better to
// keep them as GEPs.
auto *CloneGEP = GEP->clone();
- CloneGEP->insertBefore(*Builder.GetInsertBlock(),
- Builder.GetInsertPoint());
+ if (isa<Instruction>(Vec))
+ CloneGEP->insertBefore(*Builder.GetInsertBlock(),
+ Builder.GetInsertPoint());
+ else
+ CloneGEP->insertBefore(GEP);
if (GEP->hasName())
CloneGEP->takeName(GEP);
Ex = CloneGEP;
@@ -13520,7 +13610,8 @@ Value *BoUpSLP::vectorizeTree(
else
CombinedMask2[I] = Mask[I] - VF;
}
- ShuffleInstructionBuilder ShuffleBuilder(Builder, *this);
+ ShuffleInstructionBuilder ShuffleBuilder(
+ cast<VectorType>(V1->getType())->getElementType(), Builder, *this);
ShuffleBuilder.add(V1, CombinedMask1);
if (V2)
ShuffleBuilder.add(V2, CombinedMask2);
@@ -14555,13 +14646,27 @@ bool BoUpSLP::collectValuesToDemote(
return false;
bool Res = all_of(
E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(BitWidth)));
- // Gather demoted constant operands.
- if (Res && E.State == TreeEntry::NeedToGather &&
- all_of(E.Scalars, IsaPred<Constant>))
- ToDemote.push_back(E.Idx);
+ // Demote gathers.
+ if (Res && E.State == TreeEntry::NeedToGather) {
+ // Check the bases of any extractelement instructions and the final
+ // vector length.
+ SmallPtrSet<Value *, 4> UniqueBases;
+ for (Value *V : E.Scalars) {
+ auto *EE = dyn_cast<ExtractElementInst>(V);
+ if (!EE)
+ continue;
+ UniqueBases.insert(EE->getVectorOperand());
+ }
+ const unsigned VF = E.Scalars.size();
+ Type *OrigScalarTy = E.Scalars.front()->getType();
+ if (UniqueBases.size() <= 2 ||
+ TTI->getNumberOfParts(FixedVectorType::get(OrigScalarTy, VF)) ==
+ TTI->getNumberOfParts(FixedVectorType::get(
+ IntegerType::get(OrigScalarTy->getContext(), BitWidth), VF)))
+ ToDemote.push_back(E.Idx);
+ }
return Res;
};
- // TODO: improve handling of gathered values and others.
if (E.State == TreeEntry::NeedToGather || !Visited.insert(&E).second ||
any_of(E.Scalars, [&](Value *V) {
return all_of(V->users(), [&](User *U) {
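
(A standalone sketch of the demotion rule added above; numberOfParts is a made-up stand-in for TTI->getNumberOfParts assuming 128-bit registers.)

#include <cstdio>
#include <set>
#include <string>

// Hypothetical register-count query with 128-bit vector registers.
unsigned numberOfParts(unsigned EltBits, unsigned VF) {
  return (EltBits * VF + 127) / 128;
}

// Demote a gather when the scalars come from at most two extractelement
// bases, or when demotion does not change the number of vector registers
// the gather occupies.
bool canDemoteGather(const std::set<std::string> &UniqueBases, unsigned VF,
                     unsigned OrigBits, unsigned DemotedBits) {
  return UniqueBases.size() <= 2 ||
         numberOfParts(OrigBits, VF) == numberOfParts(DemotedBits, VF);
}

int main() {
  std::set<std::string> Bases{"%v0", "%v1", "%v2"};
  // 8 x i32 needs two 128-bit parts, 8 x i16 only one, and there are three
  // bases, so this gather is not demoted.
  std::printf("%d\n", canDemoteGather(Bases, 8, 32, 16)); // prints 0
}
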
@@ -15072,11 +15177,16 @@ void BoUpSLP::computeMinimumValueSizes() {
IsSignedCmp =
NodeIdx < VectorizableTree.size() &&
any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
- [](const EdgeInfo &EI) {
+ [&](const EdgeInfo &EI) {
return EI.UserTE->getOpcode() == Instruction::ICmp &&
- any_of(EI.UserTE->Scalars, [](Value *V) {
+ any_of(EI.UserTE->Scalars, [&](Value *V) {
auto *IC = dyn_cast<ICmpInst>(V);
- return IC && IC->isSigned();
+ return IC &&
+ (IC->isSigned() ||
+ !isKnownNonNegative(IC->getOperand(0),
+ SimplifyQuery(*DL)) ||
+ !isKnownNonNegative(IC->getOperand(1),
+ SimplifyQuery(*DL)));
});
});
}
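
(In short, an unsigned compare of possibly-negative minimized operands must still be treated as signed. A one-function sketch, where the known-non-negative flags are hypothetical inputs replacing the isKnownNonNegative queries:)

#include <cstdio>

bool needsSignedCmp(bool IsSignedPredicate, bool LhsKnownNonNeg,
                    bool RhsKnownNonNeg) {
  // Signed handling is needed unless both operands are provably >= 0.
  return IsSignedPredicate || !LhsKnownNonNeg || !RhsKnownNonNeg;
}

int main() {
  // An unsigned predicate still needs signed handling when one operand
  // might be negative after truncation.
  std::printf("%d\n", needsSignedCmp(false, true, false)); // prints 1
}
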
@@ -15205,8 +15315,11 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
return Changed;
}
-bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
- unsigned Idx, unsigned MinVF) {
+std::optional<bool>
+SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
+ unsigned Idx, unsigned MinVF,
+ unsigned &Size) {
+ Size = 0;
LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size()
<< "\n");
const unsigned Sz = R.getVectorElementSize(Chain[0]);
@@ -15223,11 +15336,42 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
<< "\n");
+ SetVector<Value *> ValOps;
+ for (Value *V : Chain)
+ ValOps.insert(cast<StoreInst>(V)->getValueOperand());
+ // Exit if the operands are not same/alternate opcodes or the number of unique values is not a power of two.
+ InstructionsState S = getSameOpcode(ValOps.getArrayRef(), *TLI);
+ if (all_of(ValOps, IsaPred<Instruction>) && ValOps.size() > 1) {
+ DenseSet<Value *> Stores(Chain.begin(), Chain.end());
+ bool IsPowerOf2 =
+ isPowerOf2_32(ValOps.size()) ||
+ (VectorizeNonPowerOf2 && isPowerOf2_32(ValOps.size() + 1));
+ if ((!IsPowerOf2 && S.getOpcode() && S.getOpcode() != Instruction::Load &&
+ (!S.MainOp->isSafeToRemove() ||
+ any_of(ValOps.getArrayRef(),
+ [&](Value *V) {
+ return !isa<ExtractElementInst>(V) &&
+ (V->getNumUses() > Chain.size() ||
+ any_of(V->users(), [&](User *U) {
+ return !Stores.contains(U);
+ }));
+ }))) ||
+ (ValOps.size() > Chain.size() / 2 && !S.getOpcode())) {
+ Size = (!IsPowerOf2 && S.getOpcode()) ? 1 : 2;
+ return false;
+ }
+ }
+ if (R.isLoadCombineCandidate(Chain))
+ return true;
R.buildTree(Chain);
- if (R.isTreeTinyAndNotFullyVectorizable())
- return false;
- if (R.isLoadCombineCandidate())
+ // Check if the tree is tiny and the store itself or its value operand was not vectorized.
+ if (R.isTreeTinyAndNotFullyVectorizable()) {
+ if (R.isGathered(Chain.front()) ||
+ R.isNotScheduled(cast<StoreInst>(Chain.front())->getValueOperand()))
+ return std::nullopt;
+ Size = R.getTreeSize();
return false;
+ }
R.reorderTopToBottom();
R.reorderBottomToTop();
R.buildExternalUses();
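
(vectorizeStoreChain now reports three outcomes instead of two. A minimal sketch of the contract, with tryChain as a hypothetical stand-in: std::nullopt means the chain is non-schedulable, false means analyzed but rejected with Size as a tree-size hint, true means vectorized.)

#include <cstdio>
#include <optional>

std::optional<bool> tryChain(unsigned Len, unsigned &Size) {
  Size = 0;
  if (Len < 2)
    return std::nullopt; // nothing schedulable (hypothetical condition)
  Size = Len;            // report the tree size for later VF filtering
  return Len >= 4;       // profitable only for longer chains (assumption)
}

int main() {
  unsigned Size = 0;
  if (std::optional<bool> Res = tryChain(3, Size); !Res)
    std::puts("non-schedulable");
  else
    std::printf("%s, tree size %u\n", *Res ? "vectorized" : "rejected", Size);
}
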
@@ -15235,6 +15379,9 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
R.computeMinimumValueSizes();
R.transformNodes();
+ Size = R.getTreeSize();
+ if (S.getOpcode() == Instruction::Load)
+ Size = 2; // Cut off small trees that lower to masked gathers.
InstructionCost Cost = R.getTreeCost();
LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF << "\n");
@@ -15256,17 +15403,45 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
return false;
}
-bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
- BoUpSLP &R) {
+/// Checks that the quadratic mean deviation of the tree sizes is within one
+/// ninth (roughly 11%) of the mean size.
+static bool checkTreeSizes(ArrayRef<std::pair<unsigned, unsigned>> Sizes,
+ bool First) {
+ unsigned Num = 0;
+ uint64_t Sum = std::accumulate(
+ Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
+ [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
+ unsigned Size = First ? Val.first : Val.second;
+ if (Size == 1)
+ return V;
+ ++Num;
+ return V + Size;
+ });
+ if (Num == 0)
+ return true;
+ uint64_t Mean = Sum / Num;
+ if (Mean == 0)
+ return true;
+ uint64_t Dev = std::accumulate(
+ Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
+ [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
+ unsigned P = First ? Val.first : Val.second;
+ if (P == 1)
+ return V;
+ return V + (P - Mean) * (P - Mean);
+ }) /
+ Num;
+ return Dev * 81 / (Mean * Mean) == 0;
+}
+
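
(The same computation as checkTreeSizes above, extracted into a runnable standalone form; the helper name and sample data are illustrative only, and the unsigned arithmetic is written with an explicit absolute difference for clarity.)

#include <cstdint>
#include <cstdio>
#include <vector>

// Size-1 entries are ignored; the range passes when the variance of the
// remaining sizes is below (Mean / 9)^2, i.e. the RMS deviation stays
// within roughly 11% of the mean.
bool treeSizesAreUniform(const std::vector<unsigned> &Sizes) {
  uint64_t Num = 0, Sum = 0;
  for (unsigned S : Sizes)
    if (S != 1) {
      ++Num;
      Sum += S;
    }
  if (Num == 0)
    return true;
  uint64_t Mean = Sum / Num;
  if (Mean == 0)
    return true;
  uint64_t Dev = 0;
  for (unsigned S : Sizes)
    if (S != 1) {
      uint64_t D = S > Mean ? S - Mean : Mean - S;
      Dev += D * D;
    }
  Dev /= Num;
  return Dev * 81 / (Mean * Mean) == 0; // integer division: Dev < Mean^2 / 81
}

int main() {
  std::vector<unsigned> Tight{8, 8, 9, 1}, Spread{2, 16, 1};
  std::printf("%d %d\n", treeSizesAreUniform(Tight),  // 1: sizes cluster
              treeSizesAreUniform(Spread));           // 0: too much spread
}
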
+bool SLPVectorizerPass::vectorizeStores(
+ ArrayRef<StoreInst *> Stores, BoUpSLP &R,
+ DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
+ &Visited) {
// We may run into multiple chains that merge into a single chain. We mark the
// stores that we vectorized so that we don't visit the same store twice.
BoUpSLP::ValueSet VectorizedStores;
bool Changed = false;
- // Stores the pair of stores (first_store, last_store) in a range, that were
- // already tried to be vectorized. Allows to skip the store ranges that were
- // already tried to be vectorized but the attempts were unsuccessful.
- DenseSet<std::pair<Value *, Value *>> TriedSequences;
struct StoreDistCompare {
bool operator()(const std::pair<unsigned, int> &Op1,
const std::pair<unsigned, int> &Op2) const {
@@ -15294,7 +15469,14 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
PrevDist = DataVar.second;
});
- if (Operands.size() <= 1)
+ if (Operands.size() <= 1 ||
+ !Visited
+ .insert({Operands.front(),
+ cast<StoreInst>(Operands.front())->getValueOperand(),
+ Operands.back(),
+ cast<StoreInst>(Operands.back())->getValueOperand(),
+ Operands.size()})
+ .second)
continue;
unsigned MaxVecRegSize = R.getMaxVecRegSize();
@@ -15303,13 +15485,19 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
unsigned MaxVF =
std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
+ unsigned MaxRegVF = MaxVF;
auto *Store = cast<StoreInst>(Operands[0]);
Type *StoreTy = Store->getValueOperand()->getType();
Type *ValueTy = StoreTy;
if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
ValueTy = Trunc->getSrcTy();
- unsigned MinVF = PowerOf2Ceil(TTI->getStoreMinimumVF(
- R.getMinVF(DL->getTypeStoreSizeInBits(StoreTy)), StoreTy, ValueTy));
+ if (ValueTy == StoreTy &&
+ R.getVectorElementSize(Store->getValueOperand()) <= EltSize)
+ MaxVF = std::min<unsigned>(MaxVF, bit_floor(Operands.size()));
+ unsigned MinVF = std::max<unsigned>(
+ 2, PowerOf2Ceil(TTI->getStoreMinimumVF(
+ R.getMinVF(DL->getTypeStoreSizeInBits(StoreTy)), StoreTy,
+ ValueTy)));
if (MaxVF < MinVF) {
LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
@@ -15324,7 +15512,7 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
// consider cases where VF + 1 is a power-of-2, i.e. almost all vector
// lanes are used.
unsigned CandVF = Operands.size();
- if (isPowerOf2_32(CandVF + 1) && CandVF <= MaxVF)
+ if (isPowerOf2_32(CandVF + 1) && CandVF <= MaxRegVF)
NonPowerOf2VF = CandVF;
}
@@ -15335,40 +15523,184 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
VF = Size > MaxVF ? NonPowerOf2VF : Size;
Size *= 2;
});
- unsigned StartIdx = 0;
- for (unsigned Size : CandidateVFs) {
- for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + Size <= E;) {
- ArrayRef<Value *> Slice = ArrayRef(Operands).slice(Cnt, Size);
- assert(
- all_of(
- Slice,
- [&](Value *V) {
- return cast<StoreInst>(V)->getValueOperand()->getType() ==
- cast<StoreInst>(Slice.front())
- ->getValueOperand()
- ->getType();
- }) &&
- "Expected all operands of same type.");
- if (!VectorizedStores.count(Slice.front()) &&
- !VectorizedStores.count(Slice.back()) &&
- TriedSequences.insert(std::make_pair(Slice.front(), Slice.back()))
- .second &&
- vectorizeStoreChain(Slice, R, Cnt, MinVF)) {
- // Mark the vectorized stores so that we don't vectorize them again.
- VectorizedStores.insert(Slice.begin(), Slice.end());
- Changed = true;
- // If we vectorized initial block, no need to try to vectorize it
- // again.
- if (Cnt == StartIdx)
- StartIdx += Size;
- Cnt += Size;
- continue;
+ unsigned End = Operands.size();
+ unsigned Repeat = 0;
+ constexpr unsigned MaxAttempts = 4;
+ OwningArrayRef<std::pair<unsigned, unsigned>> RangeSizes(Operands.size());
+ for_each(RangeSizes, [](std::pair<unsigned, unsigned> &P) {
+ P.first = P.second = 1;
+ });
+ DenseMap<Value *, std::pair<unsigned, unsigned>> NonSchedulable;
+ auto IsNotVectorized = [](bool First,
+ const std::pair<unsigned, unsigned> &P) {
+ return First ? P.first > 0 : P.second > 0;
+ };
+ auto IsVectorized = [](bool First,
+ const std::pair<unsigned, unsigned> &P) {
+ return First ? P.first == 0 : P.second == 0;
+ };
+ auto VFIsProfitable = [](bool First, unsigned Size,
+ const std::pair<unsigned, unsigned> &P) {
+ return First ? Size >= P.first : Size >= P.second;
+ };
+ auto FirstSizeSame = [](unsigned Size,
+ const std::pair<unsigned, unsigned> &P) {
+ return Size == P.first;
+ };
+ while (true) {
+ ++Repeat;
+ bool RepeatChanged = false;
+ bool AnyProfitableGraph;
+ for (unsigned Size : CandidateVFs) {
+ AnyProfitableGraph = false;
+ unsigned StartIdx = std::distance(
+ RangeSizes.begin(),
+ find_if(RangeSizes, std::bind(IsNotVectorized, Size >= MaxRegVF,
+ std::placeholders::_1)));
+ while (StartIdx < End) {
+ unsigned EndIdx =
+ std::distance(RangeSizes.begin(),
+ find_if(RangeSizes.drop_front(StartIdx),
+ std::bind(IsVectorized, Size >= MaxRegVF,
+ std::placeholders::_1)));
+ unsigned Sz = EndIdx >= End ? End : EndIdx;
+ for (unsigned Cnt = StartIdx; Cnt + Size <= Sz;) {
+ if (!checkTreeSizes(RangeSizes.slice(Cnt, Size),
+ Size >= MaxRegVF)) {
+ ++Cnt;
+ continue;
+ }
+ ArrayRef<Value *> Slice = ArrayRef(Operands).slice(Cnt, Size);
+ assert(all_of(Slice,
+ [&](Value *V) {
+ return cast<StoreInst>(V)
+ ->getValueOperand()
+ ->getType() ==
+ cast<StoreInst>(Slice.front())
+ ->getValueOperand()
+ ->getType();
+ }) &&
+ "Expected all operands of same type.");
+ if (!NonSchedulable.empty()) {
+ auto [NonSchedSizeMax, NonSchedSizeMin] =
+ NonSchedulable.lookup(Slice.front());
+ if (NonSchedSizeMax > 0 && NonSchedSizeMin <= Size) {
+ Cnt += NonSchedSizeMax;
+ continue;
+ }
+ }
+ unsigned TreeSize;
+ std::optional<bool> Res =
+ vectorizeStoreChain(Slice, R, Cnt, MinVF, TreeSize);
+ if (!Res) {
+ NonSchedulable
+ .try_emplace(Slice.front(), std::make_pair(Size, Size))
+ .first->getSecond()
+ .second = Size;
+ } else if (*Res) {
+ // Mark the vectorized stores so that we don't vectorize them
+ // again.
+ VectorizedStores.insert(Slice.begin(), Slice.end());
+ // Record that this attempt found a profitable graph and changed the IR.
+ AnyProfitableGraph = RepeatChanged = Changed = true;
+ // If we vectorized the initial block, there is no need to try to
+ // vectorize it again.
+ for_each(RangeSizes.slice(Cnt, Size),
+ [](std::pair<unsigned, unsigned> &P) {
+ P.first = P.second = 0;
+ });
+ if (Cnt < StartIdx + MinVF) {
+ for_each(RangeSizes.slice(StartIdx, Cnt - StartIdx),
+ [](std::pair<unsigned, unsigned> &P) {
+ P.first = P.second = 0;
+ });
+ StartIdx = Cnt + Size;
+ }
+ if (Cnt > Sz - Size - MinVF) {
+ for_each(RangeSizes.slice(Cnt + Size, Sz - (Cnt + Size)),
+ [](std::pair<unsigned, unsigned> &P) {
+ P.first = P.second = 0;
+ });
+ if (Sz == End)
+ End = Cnt;
+ Sz = Cnt;
+ }
+ Cnt += Size;
+ continue;
+ }
+ if (Size > 2 && Res &&
+ !all_of(RangeSizes.slice(Cnt, Size),
+ std::bind(VFIsProfitable, Size >= MaxRegVF, TreeSize,
+ std::placeholders::_1))) {
+ Cnt += Size;
+ continue;
+ }
+ // For very big VFs, check that we are not rebuilding the same
+ // trees, just with a larger number of elements.
+ if (Size > MaxRegVF && TreeSize > 1 &&
+ all_of(RangeSizes.slice(Cnt, Size),
+ std::bind(FirstSizeSame, TreeSize,
+ std::placeholders::_1))) {
+ Cnt += Size;
+ while (Cnt != Sz && RangeSizes[Cnt].first == TreeSize)
+ ++Cnt;
+ continue;
+ }
+ if (TreeSize > 1)
+ for_each(RangeSizes.slice(Cnt, Size),
+ [&](std::pair<unsigned, unsigned> &P) {
+ if (Size >= MaxRegVF)
+ P.second = std::max(P.second, TreeSize);
+ else
+ P.first = std::max(P.first, TreeSize);
+ });
+ ++Cnt;
+ AnyProfitableGraph = true;
+ }
+ if (StartIdx >= End)
+ break;
+ if (Sz - StartIdx < Size && Sz - StartIdx >= MinVF)
+ AnyProfitableGraph = true;
+ StartIdx = std::distance(
+ RangeSizes.begin(),
+ find_if(RangeSizes.drop_front(Sz),
+ std::bind(IsNotVectorized, Size >= MaxRegVF,
+ std::placeholders::_1)));
}
- ++Cnt;
+ if (!AnyProfitableGraph && Size >= MaxRegVF)
+ break;
}
- // Check if the whole array was vectorized already - exit.
- if (StartIdx >= Operands.size())
+ // All values vectorized - exit.
+ if (all_of(RangeSizes, [](const std::pair<unsigned, unsigned> &P) {
+ return P.first == 0 && P.second == 0;
+ }))
+ break;
+ // Stop if all attempts have been tried or further attempts cannot help.
+ if (Repeat >= MaxAttempts ||
+ (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
break;
+ constexpr unsigned StoresLimit = 64;
+ const unsigned MaxTotalNum = bit_floor(std::min<unsigned>(
+ Operands.size(),
+ static_cast<unsigned>(
+ End -
+ std::distance(
+ RangeSizes.begin(),
+ find_if(RangeSizes, std::bind(IsNotVectorized, true,
+ std::placeholders::_1))) +
+ 1)));
+ unsigned VF = PowerOf2Ceil(CandidateVFs.front()) * 2;
+ if (VF > MaxTotalNum || VF >= StoresLimit)
+ break;
+ for_each(RangeSizes, [&](std::pair<unsigned, unsigned> &P) {
+ if (P.first != 0)
+ P.first = std::max(P.second, P.first);
+ });
+ // Make a last attempt to vectorize the maximum number of elements, in
+ // case all previous attempts failed only because of cost.
+ CandidateVFs.clear();
+ CandidateVFs.push_back(VF);
}
}
};
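
(A C++20 sketch of how the retry loop above grows the candidate VF; nextCandidateVF is a hypothetical condensation of the MaxTotalNum/StoresLimit bookkeeping, and returning 0 is just this sketch's way of signaling "stop".)

#include <bit>
#include <cstdio>

// After an unprofitable round, try one power-of-two step larger, bounded by
// the number of still-unvectorized stores and a hard limit of 64.
unsigned nextCandidateVF(unsigned PrevVF, unsigned RemainingStores) {
  constexpr unsigned StoresLimit = 64;
  unsigned MaxTotalNum = std::bit_floor(RemainingStores);
  unsigned VF = std::bit_ceil(PrevVF) * 2;
  if (VF > MaxTotalNum || VF >= StoresLimit)
    return 0; // give up
  return VF;
}

int main() {
  std::printf("%u %u\n", nextCandidateVF(8, 40), // 16: keep trying
              nextCandidateVF(32, 100));         // 0: hit the 64 limit
}
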
@@ -18186,6 +18518,7 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
};
// Attempt to sort and vectorize each of the store-groups.
+ DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>> Attempted;
for (auto &Pair : Stores) {
if (Pair.second.size() < 2)
continue;
@@ -18203,8 +18536,8 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
Pair.second.rend());
Changed |= tryToVectorizeSequence<StoreInst>(
ReversedStores, StoreSorter, AreCompatibleStores,
- [this, &R](ArrayRef<StoreInst *> Candidates, bool) {
- return vectorizeStores(Candidates, R);
+ [&](ArrayRef<StoreInst *> Candidates, bool) {
+ return vectorizeStores(Candidates, R, Attempted);
},
/*MaxVFOnly=*/false, R);
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index c74329a0bcc4..71387bf5b7e9 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -3215,6 +3215,9 @@ public:
return Value2VPValue[V];
}
+ /// Return the live-in VPValue for \p V if there is one, or nullptr otherwise.
+ VPValue *getLiveIn(Value *V) const { return Value2VPValue.lookup(V); }
+
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print the live-ins of this VPlan to \p O.
void printLiveIns(raw_ostream &O) const;
@@ -3278,8 +3281,8 @@ public:
private:
/// Add to the given dominator tree the header block and every new basic block
/// that was created between it and the latch block, inclusive.
- static void updateDominatorTree(DominatorTree *DT, BasicBlock *LoopLatchBB,
- BasicBlock *LoopPreHeaderBB,
+ static void updateDominatorTree(DominatorTree *DT, BasicBlock *LoopHeaderBB,
+ BasicBlock *LoopLatchBB,
BasicBlock *LoopExitBB);
};
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index d7bc128dcfe6..a7337f7aa94d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -506,13 +506,12 @@ static void removeDeadRecipes(VPlan &Plan) {
}
}
-static VPValue *createScalarIVSteps(VPlan &Plan,
- InductionDescriptor::InductionKind Kind,
- Instruction::BinaryOps InductionOpcode,
- FPMathOperator *FPBinOp,
- ScalarEvolution &SE, Instruction *TruncI,
- VPValue *StartV, VPValue *Step,
- VPBasicBlock::iterator IP) {
+static VPScalarIVStepsRecipe *
+createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind,
+ Instruction::BinaryOps InductionOpcode,
+ FPMathOperator *FPBinOp, ScalarEvolution &SE,
+ Instruction *TruncI, VPValue *StartV, VPValue *Step,
+ VPBasicBlock::iterator IP) {
VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV();
VPSingleDefRecipe *BaseIV = CanonicalIV;
@@ -579,16 +578,13 @@ static void legalizeAndOptimizeInductions(VPlan &Plan, ScalarEvolution &SE) {
VPValue *StartV =
Plan.getOrAddLiveIn(ConstantInt::get(ID.getStep()->getType(), 0));
VPValue *StepV = PtrIV->getOperand(1);
- VPRecipeBase *Steps =
- createScalarIVSteps(Plan, InductionDescriptor::IK_IntInduction,
- Instruction::Add, nullptr, SE, nullptr, StartV,
- StepV, InsertPt)
- ->getDefiningRecipe();
+ VPScalarIVStepsRecipe *Steps = createScalarIVSteps(
+ Plan, InductionDescriptor::IK_IntInduction, Instruction::Add, nullptr,
+ SE, nullptr, StartV, StepV, InsertPt);
- auto *Recipe =
- new VPInstruction(VPInstruction::PtrAdd,
- {PtrIV->getStartValue(), Steps->getVPSingleValue()},
- PtrIV->getDebugLoc(), "next.gep");
+ auto *Recipe = new VPInstruction(VPInstruction::PtrAdd,
+ {PtrIV->getStartValue(), Steps},
+ PtrIV->getDebugLoc(), "next.gep");
Recipe->insertAfter(Steps);
PtrIV->replaceAllUsesWith(Recipe);
@@ -606,7 +602,7 @@ static void legalizeAndOptimizeInductions(VPlan &Plan, ScalarEvolution &SE) {
continue;
const InductionDescriptor &ID = WideIV->getInductionDescriptor();
- VPValue *Steps = createScalarIVSteps(
+ VPScalarIVStepsRecipe *Steps = createScalarIVSteps(
Plan, ID.getKind(), ID.getInductionOpcode(),
dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()), SE,
WideIV->getTruncInst(), WideIV->getStartValue(), WideIV->getStepValue(),
diff --git a/llvm/test/Analysis/CostModel/AArch64/getIntrinsicInstrCost-vector-reverse.ll b/llvm/test/Analysis/CostModel/AArch64/getIntrinsicInstrCost-vector-reverse.ll
index 836a028ad6aa..f491b086107a 100644
--- a/llvm/test/Analysis/CostModel/AArch64/getIntrinsicInstrCost-vector-reverse.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/getIntrinsicInstrCost-vector-reverse.ll
@@ -7,58 +7,58 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
define void @vector_reverse() #0{
; CHECK-LABEL: 'vector_reverse'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %1 = call <16 x i8> @llvm.experimental.vector.reverse.v16i8(<16 x i8> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %2 = call <32 x i8> @llvm.experimental.vector.reverse.v32i8(<32 x i8> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %3 = call <8 x i16> @llvm.experimental.vector.reverse.v8i16(<8 x i16> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %4 = call <16 x i16> @llvm.experimental.vector.reverse.v16i16(<16 x i16> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %5 = call <4 x i32> @llvm.experimental.vector.reverse.v4i32(<4 x i32> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %6 = call <8 x i32> @llvm.experimental.vector.reverse.v8i32(<8 x i32> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call <2 x i64> @llvm.experimental.vector.reverse.v2i64(<2 x i64> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %8 = call <4 x i64> @llvm.experimental.vector.reverse.v4i64(<4 x i64> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %9 = call <8 x half> @llvm.experimental.vector.reverse.v8f16(<8 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %10 = call <16 x half> @llvm.experimental.vector.reverse.v16f16(<16 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %11 = call <4 x float> @llvm.experimental.vector.reverse.v4f32(<4 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %12 = call <8 x float> @llvm.experimental.vector.reverse.v8f32(<8 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %13 = call <2 x double> @llvm.experimental.vector.reverse.v2f64(<2 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %14 = call <4 x double> @llvm.experimental.vector.reverse.v4f64(<4 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %15 = call <8 x bfloat> @llvm.experimental.vector.reverse.v8bf16(<8 x bfloat> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %16 = call <16 x bfloat> @llvm.experimental.vector.reverse.v16bf16(<16 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %1 = call <16 x i8> @llvm.vector.reverse.v16i8(<16 x i8> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %2 = call <32 x i8> @llvm.vector.reverse.v32i8(<32 x i8> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %3 = call <8 x i16> @llvm.vector.reverse.v8i16(<8 x i16> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %4 = call <16 x i16> @llvm.vector.reverse.v16i16(<16 x i16> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %5 = call <4 x i32> @llvm.vector.reverse.v4i32(<4 x i32> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %6 = call <8 x i32> @llvm.vector.reverse.v8i32(<8 x i32> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call <2 x i64> @llvm.vector.reverse.v2i64(<2 x i64> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %8 = call <4 x i64> @llvm.vector.reverse.v4i64(<4 x i64> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %9 = call <8 x half> @llvm.vector.reverse.v8f16(<8 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %10 = call <16 x half> @llvm.vector.reverse.v16f16(<16 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %11 = call <4 x float> @llvm.vector.reverse.v4f32(<4 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %12 = call <8 x float> @llvm.vector.reverse.v8f32(<8 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %13 = call <2 x double> @llvm.vector.reverse.v2f64(<2 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %14 = call <4 x double> @llvm.vector.reverse.v4f64(<4 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %15 = call <8 x bfloat> @llvm.vector.reverse.v8bf16(<8 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %16 = call <16 x bfloat> @llvm.vector.reverse.v16bf16(<16 x bfloat> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
- call <16 x i8> @llvm.experimental.vector.reverse.v16i8(<16 x i8> undef)
- call <32 x i8> @llvm.experimental.vector.reverse.v32i8(<32 x i8> undef)
- call <8 x i16> @llvm.experimental.vector.reverse.v8i16(<8 x i16> undef)
- call <16 x i16> @llvm.experimental.vector.reverse.v16i16(<16 x i16> undef)
- call <4 x i32> @llvm.experimental.vector.reverse.v4i32(<4 x i32> undef)
- call <8 x i32> @llvm.experimental.vector.reverse.v8i32(<8 x i32> undef)
- call <2 x i64> @llvm.experimental.vector.reverse.v2i64(<2 x i64> undef)
- call <4 x i64> @llvm.experimental.vector.reverse.v4i64(<4 x i64> undef)
- call <8 x half> @llvm.experimental.vector.reverse.v8f16(<8 x half> undef)
- call <16 x half> @llvm.experimental.vector.reverse.v16f16(<16 x half> undef)
- call <4 x float> @llvm.experimental.vector.reverse.v4f32(<4 x float> undef)
- call <8 x float> @llvm.experimental.vector.reverse.v8f32(<8 x float> undef)
- call <2 x double> @llvm.experimental.vector.reverse.v2f64(<2 x double> undef)
- call <4 x double> @llvm.experimental.vector.reverse.v4f64(<4 x double> undef)
- call <8 x bfloat> @llvm.experimental.vector.reverse.v8bf16(<8 x bfloat> undef)
- call <16 x bfloat> @llvm.experimental.vector.reverse.v16bf16(<16 x bfloat> undef)
+ call <16 x i8> @llvm.vector.reverse.v16i8(<16 x i8> undef)
+ call <32 x i8> @llvm.vector.reverse.v32i8(<32 x i8> undef)
+ call <8 x i16> @llvm.vector.reverse.v8i16(<8 x i16> undef)
+ call <16 x i16> @llvm.vector.reverse.v16i16(<16 x i16> undef)
+ call <4 x i32> @llvm.vector.reverse.v4i32(<4 x i32> undef)
+ call <8 x i32> @llvm.vector.reverse.v8i32(<8 x i32> undef)
+ call <2 x i64> @llvm.vector.reverse.v2i64(<2 x i64> undef)
+ call <4 x i64> @llvm.vector.reverse.v4i64(<4 x i64> undef)
+ call <8 x half> @llvm.vector.reverse.v8f16(<8 x half> undef)
+ call <16 x half> @llvm.vector.reverse.v16f16(<16 x half> undef)
+ call <4 x float> @llvm.vector.reverse.v4f32(<4 x float> undef)
+ call <8 x float> @llvm.vector.reverse.v8f32(<8 x float> undef)
+ call <2 x double> @llvm.vector.reverse.v2f64(<2 x double> undef)
+ call <4 x double> @llvm.vector.reverse.v4f64(<4 x double> undef)
+ call <8 x bfloat> @llvm.vector.reverse.v8bf16(<8 x bfloat> undef)
+ call <16 x bfloat> @llvm.vector.reverse.v16bf16(<16 x bfloat> undef)
ret void
}
attributes #0 = { "target-features"="+sve,+bf16" }
-declare <16 x i8> @llvm.experimental.vector.reverse.v16i8(<16 x i8>)
-declare <32 x i8> @llvm.experimental.vector.reverse.v32i8(<32 x i8>)
-declare <8 x i16> @llvm.experimental.vector.reverse.v8i16(<8 x i16>)
-declare <16 x i16> @llvm.experimental.vector.reverse.v16i16(<16 x i16>)
-declare <4 x i32> @llvm.experimental.vector.reverse.v4i32(<4 x i32>)
-declare <8 x i32> @llvm.experimental.vector.reverse.v8i32(<8 x i32>)
-declare <2 x i64> @llvm.experimental.vector.reverse.v2i64(<2 x i64>)
-declare <4 x i64> @llvm.experimental.vector.reverse.v4i64(<4 x i64>)
-declare <8 x half> @llvm.experimental.vector.reverse.v8f16(<8 x half>)
-declare <16 x half> @llvm.experimental.vector.reverse.v16f16(<16 x half>)
-declare <4 x float> @llvm.experimental.vector.reverse.v4f32(<4 x float>)
-declare <8 x float> @llvm.experimental.vector.reverse.v8f32(<8 x float>)
-declare <2 x double> @llvm.experimental.vector.reverse.v2f64(<2 x double>)
-declare <4 x double> @llvm.experimental.vector.reverse.v4f64(<4 x double>)
-declare <8 x bfloat> @llvm.experimental.vector.reverse.v8bf16(<8 x bfloat>)
-declare <16 x bfloat> @llvm.experimental.vector.reverse.v16bf16(<16 x bfloat>)
+declare <16 x i8> @llvm.vector.reverse.v16i8(<16 x i8>)
+declare <32 x i8> @llvm.vector.reverse.v32i8(<32 x i8>)
+declare <8 x i16> @llvm.vector.reverse.v8i16(<8 x i16>)
+declare <16 x i16> @llvm.vector.reverse.v16i16(<16 x i16>)
+declare <4 x i32> @llvm.vector.reverse.v4i32(<4 x i32>)
+declare <8 x i32> @llvm.vector.reverse.v8i32(<8 x i32>)
+declare <2 x i64> @llvm.vector.reverse.v2i64(<2 x i64>)
+declare <4 x i64> @llvm.vector.reverse.v4i64(<4 x i64>)
+declare <8 x half> @llvm.vector.reverse.v8f16(<8 x half>)
+declare <16 x half> @llvm.vector.reverse.v16f16(<16 x half>)
+declare <4 x float> @llvm.vector.reverse.v4f32(<4 x float>)
+declare <8 x float> @llvm.vector.reverse.v8f32(<8 x float>)
+declare <2 x double> @llvm.vector.reverse.v2f64(<2 x double>)
+declare <4 x double> @llvm.vector.reverse.v4f64(<4 x double>)
+declare <8 x bfloat> @llvm.vector.reverse.v8bf16(<8 x bfloat>)
+declare <16 x bfloat> @llvm.vector.reverse.v16bf16(<16 x bfloat>)
diff --git a/llvm/test/Analysis/CostModel/AArch64/splice.ll b/llvm/test/Analysis/CostModel/AArch64/splice.ll
index f5afdff41b1d..1d76a4838cee 100644
--- a/llvm/test/Analysis/CostModel/AArch64/splice.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/splice.ll
@@ -5,96 +5,96 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
define void @vector_splice() #0 {
; CHECK-LABEL: 'vector_splice'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice.v16i8 = call <16 x i8> @llvm.experimental.vector.splice.v16i8(<16 x i8> zeroinitializer, <16 x i8> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.v32i8 = call <32 x i8> @llvm.experimental.vector.splice.v32i8(<32 x i8> zeroinitializer, <32 x i8> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice.v2i16 = call <2 x i16> @llvm.experimental.vector.splice.v2i16(<2 x i16> zeroinitializer, <2 x i16> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice.v4i16 = call <4 x i16> @llvm.experimental.vector.splice.v4i16(<4 x i16> zeroinitializer, <4 x i16> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice.v8i16 = call <8 x i16> @llvm.experimental.vector.splice.v8i16(<8 x i16> zeroinitializer, <8 x i16> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.v16i16 = call <16 x i16> @llvm.experimental.vector.splice.v16i16(<16 x i16> zeroinitializer, <16 x i16> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice.v4i32 = call <4 x i32> @llvm.experimental.vector.splice.v4i32(<4 x i32> zeroinitializer, <4 x i32> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.v8i32 = call <8 x i32> @llvm.experimental.vector.splice.v8i32(<8 x i32> zeroinitializer, <8 x i32> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice.v2i64 = call <2 x i64> @llvm.experimental.vector.splice.v2i64(<2 x i64> zeroinitializer, <2 x i64> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.v4i64 = call <4 x i64> @llvm.experimental.vector.splice.v4i64(<4 x i64> zeroinitializer, <4 x i64> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice.v2f16 = call <2 x half> @llvm.experimental.vector.splice.v2f16(<2 x half> zeroinitializer, <2 x half> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice.v4f16 = call <4 x half> @llvm.experimental.vector.splice.v4f16(<4 x half> zeroinitializer, <4 x half> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice.v8f16 = call <8 x half> @llvm.experimental.vector.splice.v8f16(<8 x half> zeroinitializer, <8 x half> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.v16f16 = call <16 x half> @llvm.experimental.vector.splice.v16f16(<16 x half> zeroinitializer, <16 x half> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice.v2f32 = call <2 x float> @llvm.experimental.vector.splice.v2f32(<2 x float> zeroinitializer, <2 x float> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice.v4f32 = call <4 x float> @llvm.experimental.vector.splice.v4f32(<4 x float> zeroinitializer, <4 x float> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.v8f32 = call <8 x float> @llvm.experimental.vector.splice.v8f32(<8 x float> zeroinitializer, <8 x float> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice.v2f64 = call <2 x double> @llvm.experimental.vector.splice.v2f64(<2 x double> zeroinitializer, <2 x double> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.v4f64 = call <4 x double> @llvm.experimental.vector.splice.v4f64(<4 x double> zeroinitializer, <4 x double> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice.v2bf16 = call <2 x bfloat> @llvm.experimental.vector.splice.v2bf16(<2 x bfloat> zeroinitializer, <2 x bfloat> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice.v4bf16 = call <4 x bfloat> @llvm.experimental.vector.splice.v4bf16(<4 x bfloat> zeroinitializer, <4 x bfloat> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice.v8bf16 = call <8 x bfloat> @llvm.experimental.vector.splice.v8bf16(<8 x bfloat> zeroinitializer, <8 x bfloat> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.v16bf16 = call <16 x bfloat> @llvm.experimental.vector.splice.v16bf16(<16 x bfloat> zeroinitializer, <16 x bfloat> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice.v16i1 = call <16 x i1> @llvm.experimental.vector.splice.v16i1(<16 x i1> zeroinitializer, <16 x i1> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice.v8i1 = call <8 x i1> @llvm.experimental.vector.splice.v8i1(<8 x i1> zeroinitializer, <8 x i1> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice.v4i1 = call <4 x i1> @llvm.experimental.vector.splice.v4i1(<4 x i1> zeroinitializer, <4 x i1> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice.v2i1 = call <2 x i1> @llvm.experimental.vector.splice.v2i1(<2 x i1> zeroinitializer, <2 x i1> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %splice.v2i128 = call <2 x i128> @llvm.experimental.vector.splice.v2i128(<2 x i128> zeroinitializer, <2 x i128> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice.v16i8 = call <16 x i8> @llvm.vector.splice.v16i8(<16 x i8> zeroinitializer, <16 x i8> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.v32i8 = call <32 x i8> @llvm.vector.splice.v32i8(<32 x i8> zeroinitializer, <32 x i8> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice.v2i16 = call <2 x i16> @llvm.vector.splice.v2i16(<2 x i16> zeroinitializer, <2 x i16> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice.v4i16 = call <4 x i16> @llvm.vector.splice.v4i16(<4 x i16> zeroinitializer, <4 x i16> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice.v8i16 = call <8 x i16> @llvm.vector.splice.v8i16(<8 x i16> zeroinitializer, <8 x i16> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.v16i16 = call <16 x i16> @llvm.vector.splice.v16i16(<16 x i16> zeroinitializer, <16 x i16> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice.v4i32 = call <4 x i32> @llvm.vector.splice.v4i32(<4 x i32> zeroinitializer, <4 x i32> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.v8i32 = call <8 x i32> @llvm.vector.splice.v8i32(<8 x i32> zeroinitializer, <8 x i32> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice.v2i64 = call <2 x i64> @llvm.vector.splice.v2i64(<2 x i64> zeroinitializer, <2 x i64> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.v4i64 = call <4 x i64> @llvm.vector.splice.v4i64(<4 x i64> zeroinitializer, <4 x i64> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice.v2f16 = call <2 x half> @llvm.vector.splice.v2f16(<2 x half> zeroinitializer, <2 x half> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice.v4f16 = call <4 x half> @llvm.vector.splice.v4f16(<4 x half> zeroinitializer, <4 x half> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice.v8f16 = call <8 x half> @llvm.vector.splice.v8f16(<8 x half> zeroinitializer, <8 x half> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.v16f16 = call <16 x half> @llvm.vector.splice.v16f16(<16 x half> zeroinitializer, <16 x half> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice.v2f32 = call <2 x float> @llvm.vector.splice.v2f32(<2 x float> zeroinitializer, <2 x float> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice.v4f32 = call <4 x float> @llvm.vector.splice.v4f32(<4 x float> zeroinitializer, <4 x float> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.v8f32 = call <8 x float> @llvm.vector.splice.v8f32(<8 x float> zeroinitializer, <8 x float> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice.v2f64 = call <2 x double> @llvm.vector.splice.v2f64(<2 x double> zeroinitializer, <2 x double> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.v4f64 = call <4 x double> @llvm.vector.splice.v4f64(<4 x double> zeroinitializer, <4 x double> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice.v2bf16 = call <2 x bfloat> @llvm.vector.splice.v2bf16(<2 x bfloat> zeroinitializer, <2 x bfloat> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice.v4bf16 = call <4 x bfloat> @llvm.vector.splice.v4bf16(<4 x bfloat> zeroinitializer, <4 x bfloat> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice.v8bf16 = call <8 x bfloat> @llvm.vector.splice.v8bf16(<8 x bfloat> zeroinitializer, <8 x bfloat> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.v16bf16 = call <16 x bfloat> @llvm.vector.splice.v16bf16(<16 x bfloat> zeroinitializer, <16 x bfloat> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice.v16i1 = call <16 x i1> @llvm.vector.splice.v16i1(<16 x i1> zeroinitializer, <16 x i1> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice.v8i1 = call <8 x i1> @llvm.vector.splice.v8i1(<8 x i1> zeroinitializer, <8 x i1> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice.v4i1 = call <4 x i1> @llvm.vector.splice.v4i1(<4 x i1> zeroinitializer, <4 x i1> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice.v2i1 = call <2 x i1> @llvm.vector.splice.v2i1(<2 x i1> zeroinitializer, <2 x i1> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %splice.v2i128 = call <2 x i128> @llvm.vector.splice.v2i128(<2 x i128> zeroinitializer, <2 x i128> zeroinitializer, i32 1)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
- %splice.v16i8 = call <16 x i8> @llvm.experimental.vector.splice.v16i8(<16 x i8> zeroinitializer, <16 x i8> zeroinitializer, i32 1)
- %splice.v32i8 = call <32 x i8> @llvm.experimental.vector.splice.v32i8(<32 x i8> zeroinitializer, <32 x i8> zeroinitializer, i32 1)
- %splice.v2i16 = call <2 x i16> @llvm.experimental.vector.splice.v2i16(<2 x i16> zeroinitializer, <2 x i16> zeroinitializer, i32 1)
- %splice.v4i16 = call <4 x i16> @llvm.experimental.vector.splice.v4i16(<4 x i16> zeroinitializer, <4 x i16> zeroinitializer, i32 1)
- %splice.v8i16 = call <8 x i16> @llvm.experimental.vector.splice.v8i16(<8 x i16> zeroinitializer, <8 x i16> zeroinitializer, i32 1)
- %splice.v16i16 = call <16 x i16> @llvm.experimental.vector.splice.v16i16(<16 x i16> zeroinitializer, <16 x i16> zeroinitializer, i32 1)
- %splice.v4i32 = call <4 x i32> @llvm.experimental.vector.splice.v4i32(<4 x i32> zeroinitializer, <4 x i32> zeroinitializer, i32 1)
- %splice.v8i32 = call <8 x i32> @llvm.experimental.vector.splice.v8i32(<8 x i32> zeroinitializer, <8 x i32> zeroinitializer, i32 1)
- %splice.v2i64 = call <2 x i64> @llvm.experimental.vector.splice.v2i64(<2 x i64> zeroinitializer, <2 x i64> zeroinitializer, i32 1)
- %splice.v4i64 = call <4 x i64> @llvm.experimental.vector.splice.v4i64(<4 x i64> zeroinitializer, <4 x i64> zeroinitializer, i32 1)
- %splice.v2f16 = call <2 x half> @llvm.experimental.vector.splice.v2f16(<2 x half> zeroinitializer, <2 x half> zeroinitializer, i32 1)
- %splice.v4f16 = call <4 x half> @llvm.experimental.vector.splice.v4f16(<4 x half> zeroinitializer, <4 x half> zeroinitializer, i32 1)
- %splice.v8f16 = call <8 x half> @llvm.experimental.vector.splice.v8f16(<8 x half> zeroinitializer, <8 x half> zeroinitializer, i32 1)
- %splice.v16f16 = call <16 x half> @llvm.experimental.vector.splice.v16f16(<16 x half> zeroinitializer, <16 x half> zeroinitializer, i32 1)
- %splice.v2f32 = call <2 x float> @llvm.experimental.vector.splice.v2f32(<2 x float> zeroinitializer, <2 x float> zeroinitializer, i32 1)
- %splice.v4f32 = call <4 x float> @llvm.experimental.vector.splice.v4f32(<4 x float> zeroinitializer, <4 x float> zeroinitializer, i32 1)
- %splice.v8f32 = call <8 x float> @llvm.experimental.vector.splice.v8f32(<8 x float> zeroinitializer, <8 x float> zeroinitializer, i32 1)
- %splice.v2f64 = call <2 x double> @llvm.experimental.vector.splice.v2f64(<2 x double> zeroinitializer, <2 x double> zeroinitializer, i32 1)
- %splice.v4f64 = call <4 x double> @llvm.experimental.vector.splice.v4f64(<4 x double> zeroinitializer, <4 x double> zeroinitializer, i32 1)
- %splice.v2bf16 = call <2 x bfloat> @llvm.experimental.vector.splice.v2bf16(<2 x bfloat> zeroinitializer, <2 x bfloat> zeroinitializer, i32 1)
- %splice.v4bf16 = call <4 x bfloat> @llvm.experimental.vector.splice.v4bf16(<4 x bfloat> zeroinitializer, <4 x bfloat> zeroinitializer, i32 1)
- %splice.v8bf16 = call <8 x bfloat> @llvm.experimental.vector.splice.v8bf16(<8 x bfloat> zeroinitializer, <8 x bfloat> zeroinitializer, i32 1)
- %splice.v16bf16 = call <16 x bfloat> @llvm.experimental.vector.splice.v16bf16(<16 x bfloat> zeroinitializer, <16 x bfloat> zeroinitializer, i32 1)
- %splice.v16i1 = call <16 x i1> @llvm.experimental.vector.splice.v16i1(<16 x i1> zeroinitializer, <16 x i1> zeroinitializer, i32 1)
- %splice.v8i1 = call <8 x i1> @llvm.experimental.vector.splice.v8i1(<8 x i1> zeroinitializer, <8 x i1> zeroinitializer, i32 1)
- %splice.v4i1 = call <4 x i1> @llvm.experimental.vector.splice.v4i1(<4 x i1> zeroinitializer, <4 x i1> zeroinitializer, i32 1)
- %splice.v2i1 = call <2 x i1> @llvm.experimental.vector.splice.v2i1(<2 x i1> zeroinitializer, <2 x i1> zeroinitializer, i32 1)
- %splice.v2i128 = call <2 x i128> @llvm.experimental.vector.splice.v2i128(<2 x i128> zeroinitializer, <2 x i128> zeroinitializer, i32 1)
+ %splice.v16i8 = call <16 x i8> @llvm.vector.splice.v16i8(<16 x i8> zeroinitializer, <16 x i8> zeroinitializer, i32 1)
+ %splice.v32i8 = call <32 x i8> @llvm.vector.splice.v32i8(<32 x i8> zeroinitializer, <32 x i8> zeroinitializer, i32 1)
+ %splice.v2i16 = call <2 x i16> @llvm.vector.splice.v2i16(<2 x i16> zeroinitializer, <2 x i16> zeroinitializer, i32 1)
+ %splice.v4i16 = call <4 x i16> @llvm.vector.splice.v4i16(<4 x i16> zeroinitializer, <4 x i16> zeroinitializer, i32 1)
+ %splice.v8i16 = call <8 x i16> @llvm.vector.splice.v8i16(<8 x i16> zeroinitializer, <8 x i16> zeroinitializer, i32 1)
+ %splice.v16i16 = call <16 x i16> @llvm.vector.splice.v16i16(<16 x i16> zeroinitializer, <16 x i16> zeroinitializer, i32 1)
+ %splice.v4i32 = call <4 x i32> @llvm.vector.splice.v4i32(<4 x i32> zeroinitializer, <4 x i32> zeroinitializer, i32 1)
+ %splice.v8i32 = call <8 x i32> @llvm.vector.splice.v8i32(<8 x i32> zeroinitializer, <8 x i32> zeroinitializer, i32 1)
+ %splice.v2i64 = call <2 x i64> @llvm.vector.splice.v2i64(<2 x i64> zeroinitializer, <2 x i64> zeroinitializer, i32 1)
+ %splice.v4i64 = call <4 x i64> @llvm.vector.splice.v4i64(<4 x i64> zeroinitializer, <4 x i64> zeroinitializer, i32 1)
+ %splice.v2f16 = call <2 x half> @llvm.vector.splice.v2f16(<2 x half> zeroinitializer, <2 x half> zeroinitializer, i32 1)
+ %splice.v4f16 = call <4 x half> @llvm.vector.splice.v4f16(<4 x half> zeroinitializer, <4 x half> zeroinitializer, i32 1)
+ %splice.v8f16 = call <8 x half> @llvm.vector.splice.v8f16(<8 x half> zeroinitializer, <8 x half> zeroinitializer, i32 1)
+ %splice.v16f16 = call <16 x half> @llvm.vector.splice.v16f16(<16 x half> zeroinitializer, <16 x half> zeroinitializer, i32 1)
+ %splice.v2f32 = call <2 x float> @llvm.vector.splice.v2f32(<2 x float> zeroinitializer, <2 x float> zeroinitializer, i32 1)
+ %splice.v4f32 = call <4 x float> @llvm.vector.splice.v4f32(<4 x float> zeroinitializer, <4 x float> zeroinitializer, i32 1)
+ %splice.v8f32 = call <8 x float> @llvm.vector.splice.v8f32(<8 x float> zeroinitializer, <8 x float> zeroinitializer, i32 1)
+ %splice.v2f64 = call <2 x double> @llvm.vector.splice.v2f64(<2 x double> zeroinitializer, <2 x double> zeroinitializer, i32 1)
+ %splice.v4f64 = call <4 x double> @llvm.vector.splice.v4f64(<4 x double> zeroinitializer, <4 x double> zeroinitializer, i32 1)
+ %splice.v2bf16 = call <2 x bfloat> @llvm.vector.splice.v2bf16(<2 x bfloat> zeroinitializer, <2 x bfloat> zeroinitializer, i32 1)
+ %splice.v4bf16 = call <4 x bfloat> @llvm.vector.splice.v4bf16(<4 x bfloat> zeroinitializer, <4 x bfloat> zeroinitializer, i32 1)
+ %splice.v8bf16 = call <8 x bfloat> @llvm.vector.splice.v8bf16(<8 x bfloat> zeroinitializer, <8 x bfloat> zeroinitializer, i32 1)
+ %splice.v16bf16 = call <16 x bfloat> @llvm.vector.splice.v16bf16(<16 x bfloat> zeroinitializer, <16 x bfloat> zeroinitializer, i32 1)
+ %splice.v16i1 = call <16 x i1> @llvm.vector.splice.v16i1(<16 x i1> zeroinitializer, <16 x i1> zeroinitializer, i32 1)
+ %splice.v8i1 = call <8 x i1> @llvm.vector.splice.v8i1(<8 x i1> zeroinitializer, <8 x i1> zeroinitializer, i32 1)
+ %splice.v4i1 = call <4 x i1> @llvm.vector.splice.v4i1(<4 x i1> zeroinitializer, <4 x i1> zeroinitializer, i32 1)
+ %splice.v2i1 = call <2 x i1> @llvm.vector.splice.v2i1(<2 x i1> zeroinitializer, <2 x i1> zeroinitializer, i32 1)
+ %splice.v2i128 = call <2 x i128> @llvm.vector.splice.v2i128(<2 x i128> zeroinitializer, <2 x i128> zeroinitializer, i32 1)
ret void
}
-declare <2 x i1> @llvm.experimental.vector.splice.v2i1(<2 x i1>, <2 x i1>, i32)
-declare <4 x i1> @llvm.experimental.vector.splice.v4i1(<4 x i1>, <4 x i1>, i32)
-declare <8 x i1> @llvm.experimental.vector.splice.v8i1(<8 x i1>, <8 x i1>, i32)
-declare <16 x i1> @llvm.experimental.vector.splice.v16i1(<16 x i1>, <16 x i1>, i32)
-declare <2 x i8> @llvm.experimental.vector.splice.v2i8(<2 x i8>, <2 x i8>, i32)
-declare <16 x i8> @llvm.experimental.vector.splice.v16i8(<16 x i8>, <16 x i8>, i32)
-declare <32 x i8> @llvm.experimental.vector.splice.v32i8(<32 x i8>, <32 x i8>, i32)
-declare <2 x i16> @llvm.experimental.vector.splice.v2i16(<2 x i16>, <2 x i16>, i32)
-declare <4 x i16> @llvm.experimental.vector.splice.v4i16(<4 x i16>, <4 x i16>, i32)
-declare <8 x i16> @llvm.experimental.vector.splice.v8i16(<8 x i16>, <8 x i16>, i32)
-declare <16 x i16> @llvm.experimental.vector.splice.v16i16(<16 x i16>, <16 x i16>, i32)
-declare <4 x i32> @llvm.experimental.vector.splice.v4i32(<4 x i32>, <4 x i32>, i32)
-declare <8 x i32> @llvm.experimental.vector.splice.v8i32(<8 x i32>, <8 x i32>, i32)
-declare <2 x i64> @llvm.experimental.vector.splice.v2i64(<2 x i64>, <2 x i64>, i32)
-declare <4 x i64> @llvm.experimental.vector.splice.v4i64(<4 x i64>, <4 x i64>, i32)
-declare <2 x half> @llvm.experimental.vector.splice.v2f16(<2 x half>, <2 x half>, i32)
-declare <4 x half> @llvm.experimental.vector.splice.v4f16(<4 x half>, <4 x half>, i32)
-declare <8 x half> @llvm.experimental.vector.splice.v8f16(<8 x half>, <8 x half>, i32)
-declare <16 x half> @llvm.experimental.vector.splice.v16f16(<16 x half>, <16 x half>, i32)
-declare <2 x bfloat> @llvm.experimental.vector.splice.v2bf16(<2 x bfloat>, <2 x bfloat>, i32)
-declare <4 x bfloat> @llvm.experimental.vector.splice.v4bf16(<4 x bfloat>, <4 x bfloat>, i32)
-declare <8 x bfloat> @llvm.experimental.vector.splice.v8bf16(<8 x bfloat>, <8 x bfloat>, i32)
-declare <16 x bfloat> @llvm.experimental.vector.splice.v16bf16(<16 x bfloat>, <16 x bfloat>, i32)
-declare <2 x float> @llvm.experimental.vector.splice.v2f32(<2 x float>, <2 x float>, i32)
-declare <4 x float> @llvm.experimental.vector.splice.v4f32(<4 x float>, <4 x float>, i32)
-declare <8 x float> @llvm.experimental.vector.splice.v8f32(<8 x float>, <8 x float>, i32)
-declare <16 x float> @llvm.experimental.vector.splice.v16f32(<16 x float>, <16 x float>, i32)
-declare <2 x double> @llvm.experimental.vector.splice.v2f64(<2 x double>, <2 x double>, i32)
-declare <4 x double> @llvm.experimental.vector.splice.v4f64(<4 x double>, <4 x double>, i32)
-declare <2 x i128> @llvm.experimental.vector.splice.v2i128(<2 x i128>, <2 x i128>, i32)
+declare <2 x i1> @llvm.vector.splice.v2i1(<2 x i1>, <2 x i1>, i32)
+declare <4 x i1> @llvm.vector.splice.v4i1(<4 x i1>, <4 x i1>, i32)
+declare <8 x i1> @llvm.vector.splice.v8i1(<8 x i1>, <8 x i1>, i32)
+declare <16 x i1> @llvm.vector.splice.v16i1(<16 x i1>, <16 x i1>, i32)
+declare <2 x i8> @llvm.vector.splice.v2i8(<2 x i8>, <2 x i8>, i32)
+declare <16 x i8> @llvm.vector.splice.v16i8(<16 x i8>, <16 x i8>, i32)
+declare <32 x i8> @llvm.vector.splice.v32i8(<32 x i8>, <32 x i8>, i32)
+declare <2 x i16> @llvm.vector.splice.v2i16(<2 x i16>, <2 x i16>, i32)
+declare <4 x i16> @llvm.vector.splice.v4i16(<4 x i16>, <4 x i16>, i32)
+declare <8 x i16> @llvm.vector.splice.v8i16(<8 x i16>, <8 x i16>, i32)
+declare <16 x i16> @llvm.vector.splice.v16i16(<16 x i16>, <16 x i16>, i32)
+declare <4 x i32> @llvm.vector.splice.v4i32(<4 x i32>, <4 x i32>, i32)
+declare <8 x i32> @llvm.vector.splice.v8i32(<8 x i32>, <8 x i32>, i32)
+declare <2 x i64> @llvm.vector.splice.v2i64(<2 x i64>, <2 x i64>, i32)
+declare <4 x i64> @llvm.vector.splice.v4i64(<4 x i64>, <4 x i64>, i32)
+declare <2 x half> @llvm.vector.splice.v2f16(<2 x half>, <2 x half>, i32)
+declare <4 x half> @llvm.vector.splice.v4f16(<4 x half>, <4 x half>, i32)
+declare <8 x half> @llvm.vector.splice.v8f16(<8 x half>, <8 x half>, i32)
+declare <16 x half> @llvm.vector.splice.v16f16(<16 x half>, <16 x half>, i32)
+declare <2 x bfloat> @llvm.vector.splice.v2bf16(<2 x bfloat>, <2 x bfloat>, i32)
+declare <4 x bfloat> @llvm.vector.splice.v4bf16(<4 x bfloat>, <4 x bfloat>, i32)
+declare <8 x bfloat> @llvm.vector.splice.v8bf16(<8 x bfloat>, <8 x bfloat>, i32)
+declare <16 x bfloat> @llvm.vector.splice.v16bf16(<16 x bfloat>, <16 x bfloat>, i32)
+declare <2 x float> @llvm.vector.splice.v2f32(<2 x float>, <2 x float>, i32)
+declare <4 x float> @llvm.vector.splice.v4f32(<4 x float>, <4 x float>, i32)
+declare <8 x float> @llvm.vector.splice.v8f32(<8 x float>, <8 x float>, i32)
+declare <16 x float> @llvm.vector.splice.v16f32(<16 x float>, <16 x float>, i32)
+declare <2 x double> @llvm.vector.splice.v2f64(<2 x double>, <2 x double>, i32)
+declare <4 x double> @llvm.vector.splice.v4f64(<4 x double>, <4 x double>, i32)
+declare <2 x i128> @llvm.vector.splice.v2i128(<2 x i128>, <2 x i128>, i32)
attributes #0 = { "target-features"="+bf16" }
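
(Editor's note, not part of the patch: the hunks above only drop the "experimental" prefix from the intrinsic name, so a minimal sketch of what the renamed intrinsic computes may help. @llvm.vector.splice concatenates its two operands and extracts a window of the original vector length starting at the given immediate index; a negative index counts that many trailing elements from the first operand. The function name @splice_example below is illustrative and does not appear in the tests.)

define <4 x i32> @splice_example(<4 x i32> %a, <4 x i32> %b) {
  ; index 1: from the concatenation a0..a3,b0..b3, take <a1, a2, a3, b0>
  %fwd = call <4 x i32> @llvm.vector.splice.v4i32(<4 x i32> %a, <4 x i32> %b, i32 1)
  ; index -1: keep the last element of %a, then b0..b2, i.e. <a3, b0, b1, b2>
  %bwd = call <4 x i32> @llvm.vector.splice.v4i32(<4 x i32> %a, <4 x i32> %b, i32 -1)
  %res = add <4 x i32> %fwd, %bwd
  ret <4 x i32> %res
}
declare <4 x i32> @llvm.vector.splice.v4i32(<4 x i32>, <4 x i32>, i32)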
diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll b/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
index 7ce3021b0093..15c278b060c9 100644
--- a/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
@@ -270,122 +270,122 @@ declare <vscale x 4 x i32> @llvm.cttz.nxv4i32(<vscale x 4 x i32>, i1)
define void @vector_reverse() #0 {
; CHECK-LABEL: 'vector_reverse'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv16i8 = call <vscale x 16 x i8> @llvm.experimental.vector.reverse.nxv16i8(<vscale x 16 x i8> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %reverse_nxv32i8 = call <vscale x 32 x i8> @llvm.experimental.vector.reverse.nxv32i8(<vscale x 32 x i8> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv2i16 = call <vscale x 2 x i16> @llvm.experimental.vector.reverse.nxv2i16(<vscale x 2 x i16> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv4i16 = call <vscale x 4 x i16> @llvm.experimental.vector.reverse.nxv4i16(<vscale x 4 x i16> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv8i16 = call <vscale x 8 x i16> @llvm.experimental.vector.reverse.nxv8i16(<vscale x 8 x i16> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %reverse_nxv16i16 = call <vscale x 16 x i16> @llvm.experimental.vector.reverse.nxv16i16(<vscale x 16 x i16> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv4i32 = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %reverse_nxv8i32 = call <vscale x 8 x i32> @llvm.experimental.vector.reverse.nxv8i32(<vscale x 8 x i32> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv2i64 = call <vscale x 2 x i64> @llvm.experimental.vector.reverse.nxv2i64(<vscale x 2 x i64> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %reverse_nxv4i64 = call <vscale x 4 x i64> @llvm.experimental.vector.reverse.nxv4i64(<vscale x 4 x i64> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv2f16 = call <vscale x 2 x half> @llvm.experimental.vector.reverse.nxv2f16(<vscale x 2 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv4f16 = call <vscale x 4 x half> @llvm.experimental.vector.reverse.nxv4f16(<vscale x 4 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv8f16 = call <vscale x 8 x half> @llvm.experimental.vector.reverse.nxv8f16(<vscale x 8 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %reverse_nxv16f16 = call <vscale x 16 x half> @llvm.experimental.vector.reverse.nxv16f16(<vscale x 16 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv2f32 = call <vscale x 2 x float> @llvm.experimental.vector.reverse.nxv2f32(<vscale x 2 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv4f32 = call <vscale x 4 x float> @llvm.experimental.vector.reverse.nxv4f32(<vscale x 4 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %reverse_nxv8f32 = call <vscale x 8 x float> @llvm.experimental.vector.reverse.nxv8f32(<vscale x 8 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv2f64 = call <vscale x 2 x double> @llvm.experimental.vector.reverse.nxv2f64(<vscale x 2 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %reverse_nxv4f64 = call <vscale x 4 x double> @llvm.experimental.vector.reverse.nxv4f64(<vscale x 4 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv2bf16 = call <vscale x 2 x bfloat> @llvm.experimental.vector.reverse.nxv2bf16(<vscale x 2 x bfloat> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv4bf16 = call <vscale x 4 x bfloat> @llvm.experimental.vector.reverse.nxv4bf16(<vscale x 4 x bfloat> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv8bf16 = call <vscale x 8 x bfloat> @llvm.experimental.vector.reverse.nxv8bf16(<vscale x 8 x bfloat> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %reverse_nxv16bf16 = call <vscale x 16 x bfloat> @llvm.experimental.vector.reverse.nxv16bf16(<vscale x 16 x bfloat> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv16i1 = call <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv8i1 = call <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv4i1 = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv2i1 = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv16i8 = call <vscale x 16 x i8> @llvm.vector.reverse.nxv16i8(<vscale x 16 x i8> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %reverse_nxv32i8 = call <vscale x 32 x i8> @llvm.vector.reverse.nxv32i8(<vscale x 32 x i8> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv2i16 = call <vscale x 2 x i16> @llvm.vector.reverse.nxv2i16(<vscale x 2 x i16> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv4i16 = call <vscale x 4 x i16> @llvm.vector.reverse.nxv4i16(<vscale x 4 x i16> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv8i16 = call <vscale x 8 x i16> @llvm.vector.reverse.nxv8i16(<vscale x 8 x i16> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %reverse_nxv16i16 = call <vscale x 16 x i16> @llvm.vector.reverse.nxv16i16(<vscale x 16 x i16> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv4i32 = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %reverse_nxv8i32 = call <vscale x 8 x i32> @llvm.vector.reverse.nxv8i32(<vscale x 8 x i32> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv2i64 = call <vscale x 2 x i64> @llvm.vector.reverse.nxv2i64(<vscale x 2 x i64> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %reverse_nxv4i64 = call <vscale x 4 x i64> @llvm.vector.reverse.nxv4i64(<vscale x 4 x i64> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv2f16 = call <vscale x 2 x half> @llvm.vector.reverse.nxv2f16(<vscale x 2 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv4f16 = call <vscale x 4 x half> @llvm.vector.reverse.nxv4f16(<vscale x 4 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv8f16 = call <vscale x 8 x half> @llvm.vector.reverse.nxv8f16(<vscale x 8 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %reverse_nxv16f16 = call <vscale x 16 x half> @llvm.vector.reverse.nxv16f16(<vscale x 16 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv2f32 = call <vscale x 2 x float> @llvm.vector.reverse.nxv2f32(<vscale x 2 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv4f32 = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %reverse_nxv8f32 = call <vscale x 8 x float> @llvm.vector.reverse.nxv8f32(<vscale x 8 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv2f64 = call <vscale x 2 x double> @llvm.vector.reverse.nxv2f64(<vscale x 2 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %reverse_nxv4f64 = call <vscale x 4 x double> @llvm.vector.reverse.nxv4f64(<vscale x 4 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv2bf16 = call <vscale x 2 x bfloat> @llvm.vector.reverse.nxv2bf16(<vscale x 2 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv4bf16 = call <vscale x 4 x bfloat> @llvm.vector.reverse.nxv4bf16(<vscale x 4 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv8bf16 = call <vscale x 8 x bfloat> @llvm.vector.reverse.nxv8bf16(<vscale x 8 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %reverse_nxv16bf16 = call <vscale x 16 x bfloat> @llvm.vector.reverse.nxv16bf16(<vscale x 16 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv16i1 = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv8i1 = call <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv4i1 = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv2i1 = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; TYPE_BASED_ONLY-LABEL: 'vector_reverse'
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv16i8 = call <vscale x 16 x i8> @llvm.experimental.vector.reverse.nxv16i8(<vscale x 16 x i8> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv32i8 = call <vscale x 32 x i8> @llvm.experimental.vector.reverse.nxv32i8(<vscale x 32 x i8> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv2i16 = call <vscale x 2 x i16> @llvm.experimental.vector.reverse.nxv2i16(<vscale x 2 x i16> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv4i16 = call <vscale x 4 x i16> @llvm.experimental.vector.reverse.nxv4i16(<vscale x 4 x i16> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv8i16 = call <vscale x 8 x i16> @llvm.experimental.vector.reverse.nxv8i16(<vscale x 8 x i16> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv16i16 = call <vscale x 16 x i16> @llvm.experimental.vector.reverse.nxv16i16(<vscale x 16 x i16> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv4i32 = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv8i32 = call <vscale x 8 x i32> @llvm.experimental.vector.reverse.nxv8i32(<vscale x 8 x i32> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv2i64 = call <vscale x 2 x i64> @llvm.experimental.vector.reverse.nxv2i64(<vscale x 2 x i64> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv4i64 = call <vscale x 4 x i64> @llvm.experimental.vector.reverse.nxv4i64(<vscale x 4 x i64> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv2f16 = call <vscale x 2 x half> @llvm.experimental.vector.reverse.nxv2f16(<vscale x 2 x half> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv4f16 = call <vscale x 4 x half> @llvm.experimental.vector.reverse.nxv4f16(<vscale x 4 x half> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv8f16 = call <vscale x 8 x half> @llvm.experimental.vector.reverse.nxv8f16(<vscale x 8 x half> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv16f16 = call <vscale x 16 x half> @llvm.experimental.vector.reverse.nxv16f16(<vscale x 16 x half> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv2f32 = call <vscale x 2 x float> @llvm.experimental.vector.reverse.nxv2f32(<vscale x 2 x float> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv4f32 = call <vscale x 4 x float> @llvm.experimental.vector.reverse.nxv4f32(<vscale x 4 x float> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv8f32 = call <vscale x 8 x float> @llvm.experimental.vector.reverse.nxv8f32(<vscale x 8 x float> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv2f64 = call <vscale x 2 x double> @llvm.experimental.vector.reverse.nxv2f64(<vscale x 2 x double> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv4f64 = call <vscale x 4 x double> @llvm.experimental.vector.reverse.nxv4f64(<vscale x 4 x double> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv2bf16 = call <vscale x 2 x bfloat> @llvm.experimental.vector.reverse.nxv2bf16(<vscale x 2 x bfloat> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv4bf16 = call <vscale x 4 x bfloat> @llvm.experimental.vector.reverse.nxv4bf16(<vscale x 4 x bfloat> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv8bf16 = call <vscale x 8 x bfloat> @llvm.experimental.vector.reverse.nxv8bf16(<vscale x 8 x bfloat> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv16bf16 = call <vscale x 16 x bfloat> @llvm.experimental.vector.reverse.nxv16bf16(<vscale x 16 x bfloat> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv16i1 = call <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv8i1 = call <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv4i1 = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> undef)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv2i1 = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv16i8 = call <vscale x 16 x i8> @llvm.vector.reverse.nxv16i8(<vscale x 16 x i8> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv32i8 = call <vscale x 32 x i8> @llvm.vector.reverse.nxv32i8(<vscale x 32 x i8> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv2i16 = call <vscale x 2 x i16> @llvm.vector.reverse.nxv2i16(<vscale x 2 x i16> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv4i16 = call <vscale x 4 x i16> @llvm.vector.reverse.nxv4i16(<vscale x 4 x i16> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv8i16 = call <vscale x 8 x i16> @llvm.vector.reverse.nxv8i16(<vscale x 8 x i16> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv16i16 = call <vscale x 16 x i16> @llvm.vector.reverse.nxv16i16(<vscale x 16 x i16> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv4i32 = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv8i32 = call <vscale x 8 x i32> @llvm.vector.reverse.nxv8i32(<vscale x 8 x i32> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv2i64 = call <vscale x 2 x i64> @llvm.vector.reverse.nxv2i64(<vscale x 2 x i64> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv4i64 = call <vscale x 4 x i64> @llvm.vector.reverse.nxv4i64(<vscale x 4 x i64> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv2f16 = call <vscale x 2 x half> @llvm.vector.reverse.nxv2f16(<vscale x 2 x half> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv4f16 = call <vscale x 4 x half> @llvm.vector.reverse.nxv4f16(<vscale x 4 x half> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv8f16 = call <vscale x 8 x half> @llvm.vector.reverse.nxv8f16(<vscale x 8 x half> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv16f16 = call <vscale x 16 x half> @llvm.vector.reverse.nxv16f16(<vscale x 16 x half> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv2f32 = call <vscale x 2 x float> @llvm.vector.reverse.nxv2f32(<vscale x 2 x float> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv4f32 = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv8f32 = call <vscale x 8 x float> @llvm.vector.reverse.nxv8f32(<vscale x 8 x float> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv2f64 = call <vscale x 2 x double> @llvm.vector.reverse.nxv2f64(<vscale x 2 x double> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv4f64 = call <vscale x 4 x double> @llvm.vector.reverse.nxv4f64(<vscale x 4 x double> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv2bf16 = call <vscale x 2 x bfloat> @llvm.vector.reverse.nxv2bf16(<vscale x 2 x bfloat> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv4bf16 = call <vscale x 4 x bfloat> @llvm.vector.reverse.nxv4bf16(<vscale x 4 x bfloat> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv8bf16 = call <vscale x 8 x bfloat> @llvm.vector.reverse.nxv8bf16(<vscale x 8 x bfloat> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv16bf16 = call <vscale x 16 x bfloat> @llvm.vector.reverse.nxv16bf16(<vscale x 16 x bfloat> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv16i1 = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv8i1 = call <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv4i1 = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> undef)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv2i1 = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> undef)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
- %reverse_nxv16i8 = call <vscale x 16 x i8> @llvm.experimental.vector.reverse.nxv16i8(<vscale x 16 x i8> undef)
- %reverse_nxv32i8 = call <vscale x 32 x i8> @llvm.experimental.vector.reverse.nxv32i8(<vscale x 32 x i8> undef)
- %reverse_nxv2i16 = call <vscale x 2 x i16> @llvm.experimental.vector.reverse.nxv2i16(<vscale x 2 x i16> undef)
- %reverse_nxv4i16 = call <vscale x 4 x i16> @llvm.experimental.vector.reverse.nxv4i16(<vscale x 4 x i16> undef)
- %reverse_nxv8i16 = call <vscale x 8 x i16> @llvm.experimental.vector.reverse.nxv8i16(<vscale x 8 x i16> undef)
- %reverse_nxv16i16 = call <vscale x 16 x i16> @llvm.experimental.vector.reverse.nxv16i16(<vscale x 16 x i16> undef)
- %reverse_nxv4i32 = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> undef)
- %reverse_nxv8i32 = call <vscale x 8 x i32> @llvm.experimental.vector.reverse.nxv8i32(<vscale x 8 x i32> undef)
- %reverse_nxv2i64 = call <vscale x 2 x i64> @llvm.experimental.vector.reverse.nxv2i64(<vscale x 2 x i64> undef)
- %reverse_nxv4i64 = call <vscale x 4 x i64> @llvm.experimental.vector.reverse.nxv4i64(<vscale x 4 x i64> undef)
- %reverse_nxv2f16 = call <vscale x 2 x half> @llvm.experimental.vector.reverse.nxv2f16(<vscale x 2 x half> undef)
- %reverse_nxv4f16 = call <vscale x 4 x half> @llvm.experimental.vector.reverse.nxv4f16(<vscale x 4 x half> undef)
- %reverse_nxv8f16 = call <vscale x 8 x half> @llvm.experimental.vector.reverse.nxv8f16(<vscale x 8 x half> undef)
- %reverse_nxv16f16 = call <vscale x 16 x half> @llvm.experimental.vector.reverse.nxv16f16(<vscale x 16 x half> undef)
- %reverse_nxv2f32 = call <vscale x 2 x float> @llvm.experimental.vector.reverse.nxv2f32(<vscale x 2 x float> undef)
- %reverse_nxv4f32 = call <vscale x 4 x float> @llvm.experimental.vector.reverse.nxv4f32(<vscale x 4 x float> undef)
- %reverse_nxv8f32 = call <vscale x 8 x float> @llvm.experimental.vector.reverse.nxv8f32(<vscale x 8 x float> undef)
- %reverse_nxv2f64 = call <vscale x 2 x double> @llvm.experimental.vector.reverse.nxv2f64(<vscale x 2 x double> undef)
- %reverse_nxv4f64 = call <vscale x 4 x double> @llvm.experimental.vector.reverse.nxv4f64(<vscale x 4 x double> undef)
- %reverse_nxv2bf16 = call <vscale x 2 x bfloat> @llvm.experimental.vector.reverse.nxv2bf16(<vscale x 2 x bfloat> undef)
- %reverse_nxv4bf16 = call <vscale x 4 x bfloat> @llvm.experimental.vector.reverse.nxv4bf16(<vscale x 4 x bfloat> undef)
- %reverse_nxv8bf16 = call <vscale x 8 x bfloat> @llvm.experimental.vector.reverse.nxv8bf16(<vscale x 8 x bfloat> undef)
- %reverse_nxv16bf16 = call <vscale x 16 x bfloat> @llvm.experimental.vector.reverse.nxv16bf16(<vscale x 16 x bfloat> undef)
- %reverse_nxv16i1 = call <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1> undef)
- %reverse_nxv8i1 = call <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1> undef)
- %reverse_nxv4i1 = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> undef)
- %reverse_nxv2i1 = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> undef)
+ %reverse_nxv16i8 = call <vscale x 16 x i8> @llvm.vector.reverse.nxv16i8(<vscale x 16 x i8> undef)
+ %reverse_nxv32i8 = call <vscale x 32 x i8> @llvm.vector.reverse.nxv32i8(<vscale x 32 x i8> undef)
+ %reverse_nxv2i16 = call <vscale x 2 x i16> @llvm.vector.reverse.nxv2i16(<vscale x 2 x i16> undef)
+ %reverse_nxv4i16 = call <vscale x 4 x i16> @llvm.vector.reverse.nxv4i16(<vscale x 4 x i16> undef)
+ %reverse_nxv8i16 = call <vscale x 8 x i16> @llvm.vector.reverse.nxv8i16(<vscale x 8 x i16> undef)
+ %reverse_nxv16i16 = call <vscale x 16 x i16> @llvm.vector.reverse.nxv16i16(<vscale x 16 x i16> undef)
+ %reverse_nxv4i32 = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> undef)
+ %reverse_nxv8i32 = call <vscale x 8 x i32> @llvm.vector.reverse.nxv8i32(<vscale x 8 x i32> undef)
+ %reverse_nxv2i64 = call <vscale x 2 x i64> @llvm.vector.reverse.nxv2i64(<vscale x 2 x i64> undef)
+ %reverse_nxv4i64 = call <vscale x 4 x i64> @llvm.vector.reverse.nxv4i64(<vscale x 4 x i64> undef)
+ %reverse_nxv2f16 = call <vscale x 2 x half> @llvm.vector.reverse.nxv2f16(<vscale x 2 x half> undef)
+ %reverse_nxv4f16 = call <vscale x 4 x half> @llvm.vector.reverse.nxv4f16(<vscale x 4 x half> undef)
+ %reverse_nxv8f16 = call <vscale x 8 x half> @llvm.vector.reverse.nxv8f16(<vscale x 8 x half> undef)
+ %reverse_nxv16f16 = call <vscale x 16 x half> @llvm.vector.reverse.nxv16f16(<vscale x 16 x half> undef)
+ %reverse_nxv2f32 = call <vscale x 2 x float> @llvm.vector.reverse.nxv2f32(<vscale x 2 x float> undef)
+ %reverse_nxv4f32 = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> undef)
+ %reverse_nxv8f32 = call <vscale x 8 x float> @llvm.vector.reverse.nxv8f32(<vscale x 8 x float> undef)
+ %reverse_nxv2f64 = call <vscale x 2 x double> @llvm.vector.reverse.nxv2f64(<vscale x 2 x double> undef)
+ %reverse_nxv4f64 = call <vscale x 4 x double> @llvm.vector.reverse.nxv4f64(<vscale x 4 x double> undef)
+ %reverse_nxv2bf16 = call <vscale x 2 x bfloat> @llvm.vector.reverse.nxv2bf16(<vscale x 2 x bfloat> undef)
+ %reverse_nxv4bf16 = call <vscale x 4 x bfloat> @llvm.vector.reverse.nxv4bf16(<vscale x 4 x bfloat> undef)
+ %reverse_nxv8bf16 = call <vscale x 8 x bfloat> @llvm.vector.reverse.nxv8bf16(<vscale x 8 x bfloat> undef)
+ %reverse_nxv16bf16 = call <vscale x 16 x bfloat> @llvm.vector.reverse.nxv16bf16(<vscale x 16 x bfloat> undef)
+ %reverse_nxv16i1 = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> undef)
+ %reverse_nxv8i1 = call <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1> undef)
+ %reverse_nxv4i1 = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> undef)
+ %reverse_nxv2i1 = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> undef)
ret void
}
-declare <vscale x 16 x i8> @llvm.experimental.vector.reverse.nxv16i8(<vscale x 16 x i8>)
-declare <vscale x 32 x i8> @llvm.experimental.vector.reverse.nxv32i8(<vscale x 32 x i8>)
-declare <vscale x 2 x i16> @llvm.experimental.vector.reverse.nxv2i16(<vscale x 2 x i16>)
-declare <vscale x 4 x i16> @llvm.experimental.vector.reverse.nxv4i16(<vscale x 4 x i16>)
-declare <vscale x 8 x i16> @llvm.experimental.vector.reverse.nxv8i16(<vscale x 8 x i16>)
-declare <vscale x 16 x i16> @llvm.experimental.vector.reverse.nxv16i16(<vscale x 16 x i16>)
-declare <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32>)
-declare <vscale x 8 x i32> @llvm.experimental.vector.reverse.nxv8i32(<vscale x 8 x i32>)
-declare <vscale x 2 x i64> @llvm.experimental.vector.reverse.nxv2i64(<vscale x 2 x i64>)
-declare <vscale x 4 x i64> @llvm.experimental.vector.reverse.nxv4i64(<vscale x 4 x i64>)
-declare <vscale x 2 x half> @llvm.experimental.vector.reverse.nxv2f16(<vscale x 2 x half>)
-declare <vscale x 4 x half> @llvm.experimental.vector.reverse.nxv4f16(<vscale x 4 x half>)
-declare <vscale x 8 x half> @llvm.experimental.vector.reverse.nxv8f16(<vscale x 8 x half>)
-declare <vscale x 16 x half> @llvm.experimental.vector.reverse.nxv16f16(<vscale x 16 x half>)
-declare <vscale x 2 x float> @llvm.experimental.vector.reverse.nxv2f32(<vscale x 2 x float>)
-declare <vscale x 4 x float> @llvm.experimental.vector.reverse.nxv4f32(<vscale x 4 x float>)
-declare <vscale x 8 x float> @llvm.experimental.vector.reverse.nxv8f32(<vscale x 8 x float>)
-declare <vscale x 2 x double> @llvm.experimental.vector.reverse.nxv2f64(<vscale x 2 x double>)
-declare <vscale x 4 x double> @llvm.experimental.vector.reverse.nxv4f64(<vscale x 4 x double>)
-declare <vscale x 2 x bfloat> @llvm.experimental.vector.reverse.nxv2bf16(<vscale x 2 x bfloat>)
-declare <vscale x 4 x bfloat> @llvm.experimental.vector.reverse.nxv4bf16(<vscale x 4 x bfloat>)
-declare <vscale x 8 x bfloat> @llvm.experimental.vector.reverse.nxv8bf16(<vscale x 8 x bfloat>)
-declare <vscale x 16 x bfloat> @llvm.experimental.vector.reverse.nxv16bf16(<vscale x 16 x bfloat>)
-declare <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1>)
-declare <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1>)
-declare <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1>)
-declare <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1>)
+declare <vscale x 16 x i8> @llvm.vector.reverse.nxv16i8(<vscale x 16 x i8>)
+declare <vscale x 32 x i8> @llvm.vector.reverse.nxv32i8(<vscale x 32 x i8>)
+declare <vscale x 2 x i16> @llvm.vector.reverse.nxv2i16(<vscale x 2 x i16>)
+declare <vscale x 4 x i16> @llvm.vector.reverse.nxv4i16(<vscale x 4 x i16>)
+declare <vscale x 8 x i16> @llvm.vector.reverse.nxv8i16(<vscale x 8 x i16>)
+declare <vscale x 16 x i16> @llvm.vector.reverse.nxv16i16(<vscale x 16 x i16>)
+declare <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32>)
+declare <vscale x 8 x i32> @llvm.vector.reverse.nxv8i32(<vscale x 8 x i32>)
+declare <vscale x 2 x i64> @llvm.vector.reverse.nxv2i64(<vscale x 2 x i64>)
+declare <vscale x 4 x i64> @llvm.vector.reverse.nxv4i64(<vscale x 4 x i64>)
+declare <vscale x 2 x half> @llvm.vector.reverse.nxv2f16(<vscale x 2 x half>)
+declare <vscale x 4 x half> @llvm.vector.reverse.nxv4f16(<vscale x 4 x half>)
+declare <vscale x 8 x half> @llvm.vector.reverse.nxv8f16(<vscale x 8 x half>)
+declare <vscale x 16 x half> @llvm.vector.reverse.nxv16f16(<vscale x 16 x half>)
+declare <vscale x 2 x float> @llvm.vector.reverse.nxv2f32(<vscale x 2 x float>)
+declare <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float>)
+declare <vscale x 8 x float> @llvm.vector.reverse.nxv8f32(<vscale x 8 x float>)
+declare <vscale x 2 x double> @llvm.vector.reverse.nxv2f64(<vscale x 2 x double>)
+declare <vscale x 4 x double> @llvm.vector.reverse.nxv4f64(<vscale x 4 x double>)
+declare <vscale x 2 x bfloat> @llvm.vector.reverse.nxv2bf16(<vscale x 2 x bfloat>)
+declare <vscale x 4 x bfloat> @llvm.vector.reverse.nxv4bf16(<vscale x 4 x bfloat>)
+declare <vscale x 8 x bfloat> @llvm.vector.reverse.nxv8bf16(<vscale x 8 x bfloat>)
+declare <vscale x 16 x bfloat> @llvm.vector.reverse.nxv16bf16(<vscale x 16 x bfloat>)
+declare <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1>)
+declare <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1>)
+declare <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1>)
+declare <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1>)
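
(Editor's note, not part of the patch: for reference, @llvm.vector.reverse, as renamed in the declares above, returns its operand with the element order reversed. @reverse_example is an illustrative name, not taken from this test.)

define <4 x i32> @reverse_example(<4 x i32> %v) {
  ; e.g. <1, 2, 3, 4> becomes <4, 3, 2, 1>
  %rev = call <4 x i32> @llvm.vector.reverse.v4i32(<4 x i32> %v)
  ret <4 x i32> %rev
}
declare <4 x i32> @llvm.vector.reverse.v4i32(<4 x i32>)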
define void @unsupported_fp_ops(<vscale x 4 x float> %vec, i32 %extraarg) {
; CHECK-LABEL: 'unsupported_fp_ops'
@@ -450,236 +450,236 @@ declare <vscale x 4 x float> @llvm.log10.nxv4f32(<vscale x 4 x float>)
define void @vector_splice() #0 {
; CHECK-LABEL: 'vector_splice'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv16i8 = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i8> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv32i8 = call <vscale x 32 x i8> @llvm.experimental.vector.splice.nxv32i8(<vscale x 32 x i8> zeroinitializer, <vscale x 32 x i8> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv2i16 = call <vscale x 2 x i16> @llvm.experimental.vector.splice.nxv2i16(<vscale x 2 x i16> zeroinitializer, <vscale x 2 x i16> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv4i16 = call <vscale x 4 x i16> @llvm.experimental.vector.splice.nxv4i16(<vscale x 4 x i16> zeroinitializer, <vscale x 4 x i16> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv8i16 = call <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i16> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv16i16 = call <vscale x 16 x i16> @llvm.experimental.vector.splice.nxv16i16(<vscale x 16 x i16> zeroinitializer, <vscale x 16 x i16> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv4i32 = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv8i32 = call <vscale x 8 x i32> @llvm.experimental.vector.splice.nxv8i32(<vscale x 8 x i32> zeroinitializer, <vscale x 8 x i32> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv2i64 = call <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv4i64 = call <vscale x 4 x i64> @llvm.experimental.vector.splice.nxv4i64(<vscale x 4 x i64> zeroinitializer, <vscale x 4 x i64> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv2f16 = call <vscale x 2 x half> @llvm.experimental.vector.splice.nxv2f16(<vscale x 2 x half> zeroinitializer, <vscale x 2 x half> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv4f16 = call <vscale x 4 x half> @llvm.experimental.vector.splice.nxv4f16(<vscale x 4 x half> zeroinitializer, <vscale x 4 x half> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv8f16 = call <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half> zeroinitializer, <vscale x 8 x half> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv16f16 = call <vscale x 16 x half> @llvm.experimental.vector.splice.nxv16f16(<vscale x 16 x half> zeroinitializer, <vscale x 16 x half> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv2f32 = call <vscale x 2 x float> @llvm.experimental.vector.splice.nxv2f32(<vscale x 2 x float> zeroinitializer, <vscale x 2 x float> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv4f32 = call <vscale x 4 x float> @llvm.experimental.vector.splice.nxv4f32(<vscale x 4 x float> zeroinitializer, <vscale x 4 x float> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv8f32 = call <vscale x 8 x float> @llvm.experimental.vector.splice.nxv8f32(<vscale x 8 x float> zeroinitializer, <vscale x 8 x float> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv2f64 = call <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double> zeroinitializer, <vscale x 2 x double> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv4f64 = call <vscale x 4 x double> @llvm.experimental.vector.splice.nxv4f64(<vscale x 4 x double> zeroinitializer, <vscale x 4 x double> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv2bf16 = call <vscale x 2 x bfloat> @llvm.experimental.vector.splice.nxv2bf16(<vscale x 2 x bfloat> zeroinitializer, <vscale x 2 x bfloat> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv4bf16 = call <vscale x 4 x bfloat> @llvm.experimental.vector.splice.nxv4bf16(<vscale x 4 x bfloat> zeroinitializer, <vscale x 4 x bfloat> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv8bf16 = call <vscale x 8 x bfloat> @llvm.experimental.vector.splice.nxv8bf16(<vscale x 8 x bfloat> zeroinitializer, <vscale x 8 x bfloat> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv16bf16 = call <vscale x 16 x bfloat> @llvm.experimental.vector.splice.nxv16bf16(<vscale x 16 x bfloat> zeroinitializer, <vscale x 16 x bfloat> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv16i1 = call <vscale x 16 x i1> @llvm.experimental.vector.splice.nxv16i1(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i1> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv8i1 = call <vscale x 8 x i1> @llvm.experimental.vector.splice.nxv8i1(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x i1> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv4i1 = call <vscale x 4 x i1> @llvm.experimental.vector.splice.nxv4i1(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x i1> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv2i1 = call <vscale x 2 x i1> @llvm.experimental.vector.splice.nxv2i1(<vscale x 2 x i1> zeroinitializer, <vscale x 2 x i1> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv16i8_neg = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i8> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %splice_nxv32i8_neg = call <vscale x 32 x i8> @llvm.experimental.vector.splice.nxv32i8(<vscale x 32 x i8> zeroinitializer, <vscale x 32 x i8> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv1i16_neg = call <vscale x 1 x i16> @llvm.experimental.vector.splice.nxv1i16(<vscale x 1 x i16> zeroinitializer, <vscale x 1 x i16> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv2i16_neg = call <vscale x 2 x i16> @llvm.experimental.vector.splice.nxv2i16(<vscale x 2 x i16> zeroinitializer, <vscale x 2 x i16> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv4i16_neg = call <vscale x 4 x i16> @llvm.experimental.vector.splice.nxv4i16(<vscale x 4 x i16> zeroinitializer, <vscale x 4 x i16> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv8i16_neg = call <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i16> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %splice_nxv16i16_neg = call <vscale x 16 x i16> @llvm.experimental.vector.splice.nxv16i16(<vscale x 16 x i16> zeroinitializer, <vscale x 16 x i16> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv4i32_neg = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %splice_nxv8i32_neg = call <vscale x 8 x i32> @llvm.experimental.vector.splice.nxv8i32(<vscale x 8 x i32> zeroinitializer, <vscale x 8 x i32> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv1i64_neg = call <vscale x 1 x i64> @llvm.experimental.vector.splice.nxv1i64(<vscale x 1 x i64> zeroinitializer, <vscale x 1 x i64> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv2i64_neg = call <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %splice_nxv4i64_neg = call <vscale x 4 x i64> @llvm.experimental.vector.splice.nxv4i64(<vscale x 4 x i64> zeroinitializer, <vscale x 4 x i64> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv1f16_neg = call <vscale x 1 x half> @llvm.experimental.vector.splice.nxv1f16(<vscale x 1 x half> zeroinitializer, <vscale x 1 x half> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv2f16_neg = call <vscale x 2 x half> @llvm.experimental.vector.splice.nxv2f16(<vscale x 2 x half> zeroinitializer, <vscale x 2 x half> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv4f16_neg = call <vscale x 4 x half> @llvm.experimental.vector.splice.nxv4f16(<vscale x 4 x half> zeroinitializer, <vscale x 4 x half> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv8f16_neg = call <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half> zeroinitializer, <vscale x 8 x half> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %splice_nxv16f16_neg = call <vscale x 16 x half> @llvm.experimental.vector.splice.nxv16f16(<vscale x 16 x half> zeroinitializer, <vscale x 16 x half> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv1f32_neg = call <vscale x 1 x float> @llvm.experimental.vector.splice.nxv1f32(<vscale x 1 x float> zeroinitializer, <vscale x 1 x float> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv2f32_neg = call <vscale x 2 x float> @llvm.experimental.vector.splice.nxv2f32(<vscale x 2 x float> zeroinitializer, <vscale x 2 x float> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv4f32_neg = call <vscale x 4 x float> @llvm.experimental.vector.splice.nxv4f32(<vscale x 4 x float> zeroinitializer, <vscale x 4 x float> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %splice_nxv8f32_neg = call <vscale x 8 x float> @llvm.experimental.vector.splice.nxv8f32(<vscale x 8 x float> zeroinitializer, <vscale x 8 x float> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv1f64_neg = call <vscale x 1 x double> @llvm.experimental.vector.splice.nxv1f64(<vscale x 1 x double> zeroinitializer, <vscale x 1 x double> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv2f64_neg = call <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double> zeroinitializer, <vscale x 2 x double> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %splice_nxv4f64_neg = call <vscale x 4 x double> @llvm.experimental.vector.splice.nxv4f64(<vscale x 4 x double> zeroinitializer, <vscale x 4 x double> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv1bf16_neg = call <vscale x 1 x bfloat> @llvm.experimental.vector.splice.nxv1bf16(<vscale x 1 x bfloat> zeroinitializer, <vscale x 1 x bfloat> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv2bf16_neg = call <vscale x 2 x bfloat> @llvm.experimental.vector.splice.nxv2bf16(<vscale x 2 x bfloat> zeroinitializer, <vscale x 2 x bfloat> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv4bf16_neg = call <vscale x 4 x bfloat> @llvm.experimental.vector.splice.nxv4bf16(<vscale x 4 x bfloat> zeroinitializer, <vscale x 4 x bfloat> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv8bf16_neg = call <vscale x 8 x bfloat> @llvm.experimental.vector.splice.nxv8bf16(<vscale x 8 x bfloat> zeroinitializer, <vscale x 8 x bfloat> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %splice_nxv16bf16_neg = call <vscale x 16 x bfloat> @llvm.experimental.vector.splice.nxv16bf16(<vscale x 16 x bfloat> zeroinitializer, <vscale x 16 x bfloat> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %splice_nxv16i1_neg = call <vscale x 16 x i1> @llvm.experimental.vector.splice.nxv16i1(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i1> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %splice_nxv8i1_neg = call <vscale x 8 x i1> @llvm.experimental.vector.splice.nxv8i1(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x i1> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %splice_nxv4i1_neg = call <vscale x 4 x i1> @llvm.experimental.vector.splice.nxv4i1(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x i1> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %splice_nxv2i1_neg = call <vscale x 2 x i1> @llvm.experimental.vector.splice.nxv2i1(<vscale x 2 x i1> zeroinitializer, <vscale x 2 x i1> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv1i1_neg = call <vscale x 1 x i1> @llvm.experimental.vector.splice.nxv1i1(<vscale x 1 x i1> zeroinitializer, <vscale x 1 x i1> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv16i8 = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i8> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv32i8 = call <vscale x 32 x i8> @llvm.vector.splice.nxv32i8(<vscale x 32 x i8> zeroinitializer, <vscale x 32 x i8> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv2i16 = call <vscale x 2 x i16> @llvm.vector.splice.nxv2i16(<vscale x 2 x i16> zeroinitializer, <vscale x 2 x i16> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv4i16 = call <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16> zeroinitializer, <vscale x 4 x i16> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv8i16 = call <vscale x 8 x i16> @llvm.vector.splice.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i16> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv16i16 = call <vscale x 16 x i16> @llvm.vector.splice.nxv16i16(<vscale x 16 x i16> zeroinitializer, <vscale x 16 x i16> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv4i32 = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv8i32 = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> zeroinitializer, <vscale x 8 x i32> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv2i64 = call <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv4i64 = call <vscale x 4 x i64> @llvm.vector.splice.nxv4i64(<vscale x 4 x i64> zeroinitializer, <vscale x 4 x i64> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv2f16 = call <vscale x 2 x half> @llvm.vector.splice.nxv2f16(<vscale x 2 x half> zeroinitializer, <vscale x 2 x half> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv4f16 = call <vscale x 4 x half> @llvm.vector.splice.nxv4f16(<vscale x 4 x half> zeroinitializer, <vscale x 4 x half> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv8f16 = call <vscale x 8 x half> @llvm.vector.splice.nxv8f16(<vscale x 8 x half> zeroinitializer, <vscale x 8 x half> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv16f16 = call <vscale x 16 x half> @llvm.vector.splice.nxv16f16(<vscale x 16 x half> zeroinitializer, <vscale x 16 x half> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv2f32 = call <vscale x 2 x float> @llvm.vector.splice.nxv2f32(<vscale x 2 x float> zeroinitializer, <vscale x 2 x float> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv4f32 = call <vscale x 4 x float> @llvm.vector.splice.nxv4f32(<vscale x 4 x float> zeroinitializer, <vscale x 4 x float> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv8f32 = call <vscale x 8 x float> @llvm.vector.splice.nxv8f32(<vscale x 8 x float> zeroinitializer, <vscale x 8 x float> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv2f64 = call <vscale x 2 x double> @llvm.vector.splice.nxv2f64(<vscale x 2 x double> zeroinitializer, <vscale x 2 x double> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv4f64 = call <vscale x 4 x double> @llvm.vector.splice.nxv4f64(<vscale x 4 x double> zeroinitializer, <vscale x 4 x double> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv2bf16 = call <vscale x 2 x bfloat> @llvm.vector.splice.nxv2bf16(<vscale x 2 x bfloat> zeroinitializer, <vscale x 2 x bfloat> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv4bf16 = call <vscale x 4 x bfloat> @llvm.vector.splice.nxv4bf16(<vscale x 4 x bfloat> zeroinitializer, <vscale x 4 x bfloat> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splice_nxv8bf16 = call <vscale x 8 x bfloat> @llvm.vector.splice.nxv8bf16(<vscale x 8 x bfloat> zeroinitializer, <vscale x 8 x bfloat> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv16bf16 = call <vscale x 16 x bfloat> @llvm.vector.splice.nxv16bf16(<vscale x 16 x bfloat> zeroinitializer, <vscale x 16 x bfloat> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv16i1 = call <vscale x 16 x i1> @llvm.vector.splice.nxv16i1(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i1> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv8i1 = call <vscale x 8 x i1> @llvm.vector.splice.nxv8i1(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x i1> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv4i1 = call <vscale x 4 x i1> @llvm.vector.splice.nxv4i1(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x i1> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv2i1 = call <vscale x 2 x i1> @llvm.vector.splice.nxv2i1(<vscale x 2 x i1> zeroinitializer, <vscale x 2 x i1> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv16i8_neg = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i8> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %splice_nxv32i8_neg = call <vscale x 32 x i8> @llvm.vector.splice.nxv32i8(<vscale x 32 x i8> zeroinitializer, <vscale x 32 x i8> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv1i16_neg = call <vscale x 1 x i16> @llvm.vector.splice.nxv1i16(<vscale x 1 x i16> zeroinitializer, <vscale x 1 x i16> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv2i16_neg = call <vscale x 2 x i16> @llvm.vector.splice.nxv2i16(<vscale x 2 x i16> zeroinitializer, <vscale x 2 x i16> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv4i16_neg = call <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16> zeroinitializer, <vscale x 4 x i16> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv8i16_neg = call <vscale x 8 x i16> @llvm.vector.splice.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i16> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %splice_nxv16i16_neg = call <vscale x 16 x i16> @llvm.vector.splice.nxv16i16(<vscale x 16 x i16> zeroinitializer, <vscale x 16 x i16> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv4i32_neg = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %splice_nxv8i32_neg = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> zeroinitializer, <vscale x 8 x i32> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv1i64_neg = call <vscale x 1 x i64> @llvm.vector.splice.nxv1i64(<vscale x 1 x i64> zeroinitializer, <vscale x 1 x i64> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv2i64_neg = call <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %splice_nxv4i64_neg = call <vscale x 4 x i64> @llvm.vector.splice.nxv4i64(<vscale x 4 x i64> zeroinitializer, <vscale x 4 x i64> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv1f16_neg = call <vscale x 1 x half> @llvm.vector.splice.nxv1f16(<vscale x 1 x half> zeroinitializer, <vscale x 1 x half> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv2f16_neg = call <vscale x 2 x half> @llvm.vector.splice.nxv2f16(<vscale x 2 x half> zeroinitializer, <vscale x 2 x half> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv4f16_neg = call <vscale x 4 x half> @llvm.vector.splice.nxv4f16(<vscale x 4 x half> zeroinitializer, <vscale x 4 x half> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv8f16_neg = call <vscale x 8 x half> @llvm.vector.splice.nxv8f16(<vscale x 8 x half> zeroinitializer, <vscale x 8 x half> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %splice_nxv16f16_neg = call <vscale x 16 x half> @llvm.vector.splice.nxv16f16(<vscale x 16 x half> zeroinitializer, <vscale x 16 x half> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv1f32_neg = call <vscale x 1 x float> @llvm.vector.splice.nxv1f32(<vscale x 1 x float> zeroinitializer, <vscale x 1 x float> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv2f32_neg = call <vscale x 2 x float> @llvm.vector.splice.nxv2f32(<vscale x 2 x float> zeroinitializer, <vscale x 2 x float> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv4f32_neg = call <vscale x 4 x float> @llvm.vector.splice.nxv4f32(<vscale x 4 x float> zeroinitializer, <vscale x 4 x float> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %splice_nxv8f32_neg = call <vscale x 8 x float> @llvm.vector.splice.nxv8f32(<vscale x 8 x float> zeroinitializer, <vscale x 8 x float> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv1f64_neg = call <vscale x 1 x double> @llvm.vector.splice.nxv1f64(<vscale x 1 x double> zeroinitializer, <vscale x 1 x double> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv2f64_neg = call <vscale x 2 x double> @llvm.vector.splice.nxv2f64(<vscale x 2 x double> zeroinitializer, <vscale x 2 x double> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %splice_nxv4f64_neg = call <vscale x 4 x double> @llvm.vector.splice.nxv4f64(<vscale x 4 x double> zeroinitializer, <vscale x 4 x double> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv1bf16_neg = call <vscale x 1 x bfloat> @llvm.vector.splice.nxv1bf16(<vscale x 1 x bfloat> zeroinitializer, <vscale x 1 x bfloat> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv2bf16_neg = call <vscale x 2 x bfloat> @llvm.vector.splice.nxv2bf16(<vscale x 2 x bfloat> zeroinitializer, <vscale x 2 x bfloat> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv4bf16_neg = call <vscale x 4 x bfloat> @llvm.vector.splice.nxv4bf16(<vscale x 4 x bfloat> zeroinitializer, <vscale x 4 x bfloat> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %splice_nxv8bf16_neg = call <vscale x 8 x bfloat> @llvm.vector.splice.nxv8bf16(<vscale x 8 x bfloat> zeroinitializer, <vscale x 8 x bfloat> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %splice_nxv16bf16_neg = call <vscale x 16 x bfloat> @llvm.vector.splice.nxv16bf16(<vscale x 16 x bfloat> zeroinitializer, <vscale x 16 x bfloat> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %splice_nxv16i1_neg = call <vscale x 16 x i1> @llvm.vector.splice.nxv16i1(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i1> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %splice_nxv8i1_neg = call <vscale x 8 x i1> @llvm.vector.splice.nxv8i1(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x i1> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %splice_nxv4i1_neg = call <vscale x 4 x i1> @llvm.vector.splice.nxv4i1(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x i1> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %splice_nxv2i1_neg = call <vscale x 2 x i1> @llvm.vector.splice.nxv2i1(<vscale x 2 x i1> zeroinitializer, <vscale x 2 x i1> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv1i1_neg = call <vscale x 1 x i1> @llvm.vector.splice.nxv1i1(<vscale x 1 x i1> zeroinitializer, <vscale x 1 x i1> zeroinitializer, i32 -1)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; TYPE_BASED_ONLY-LABEL: 'vector_splice'
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv16i8 = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i8> zeroinitializer, i32 1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv32i8 = call <vscale x 32 x i8> @llvm.experimental.vector.splice.nxv32i8(<vscale x 32 x i8> zeroinitializer, <vscale x 32 x i8> zeroinitializer, i32 1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv2i16 = call <vscale x 2 x i16> @llvm.experimental.vector.splice.nxv2i16(<vscale x 2 x i16> zeroinitializer, <vscale x 2 x i16> zeroinitializer, i32 1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv4i16 = call <vscale x 4 x i16> @llvm.experimental.vector.splice.nxv4i16(<vscale x 4 x i16> zeroinitializer, <vscale x 4 x i16> zeroinitializer, i32 1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv8i16 = call <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i16> zeroinitializer, i32 1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv16i16 = call <vscale x 16 x i16> @llvm.experimental.vector.splice.nxv16i16(<vscale x 16 x i16> zeroinitializer, <vscale x 16 x i16> zeroinitializer, i32 1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv4i32 = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> zeroinitializer, i32 1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv8i32 = call <vscale x 8 x i32> @llvm.experimental.vector.splice.nxv8i32(<vscale x 8 x i32> zeroinitializer, <vscale x 8 x i32> zeroinitializer, i32 1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv2i64 = call <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> zeroinitializer, i32 1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv4i64 = call <vscale x 4 x i64> @llvm.experimental.vector.splice.nxv4i64(<vscale x 4 x i64> zeroinitializer, <vscale x 4 x i64> zeroinitializer, i32 1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv2f16 = call <vscale x 2 x half> @llvm.experimental.vector.splice.nxv2f16(<vscale x 2 x half> zeroinitializer, <vscale x 2 x half> zeroinitializer, i32 1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv4f16 = call <vscale x 4 x half> @llvm.experimental.vector.splice.nxv4f16(<vscale x 4 x half> zeroinitializer, <vscale x 4 x half> zeroinitializer, i32 1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv8f16 = call <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half> zeroinitializer, <vscale x 8 x half> zeroinitializer, i32 1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv16f16 = call <vscale x 16 x half> @llvm.experimental.vector.splice.nxv16f16(<vscale x 16 x half> zeroinitializer, <vscale x 16 x half> zeroinitializer, i32 1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv2f32 = call <vscale x 2 x float> @llvm.experimental.vector.splice.nxv2f32(<vscale x 2 x float> zeroinitializer, <vscale x 2 x float> zeroinitializer, i32 1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv4f32 = call <vscale x 4 x float> @llvm.experimental.vector.splice.nxv4f32(<vscale x 4 x float> zeroinitializer, <vscale x 4 x float> zeroinitializer, i32 1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv8f32 = call <vscale x 8 x float> @llvm.experimental.vector.splice.nxv8f32(<vscale x 8 x float> zeroinitializer, <vscale x 8 x float> zeroinitializer, i32 1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv2f64 = call <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double> zeroinitializer, <vscale x 2 x double> zeroinitializer, i32 1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv4f64 = call <vscale x 4 x double> @llvm.experimental.vector.splice.nxv4f64(<vscale x 4 x double> zeroinitializer, <vscale x 4 x double> zeroinitializer, i32 1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv2bf16 = call <vscale x 2 x bfloat> @llvm.experimental.vector.splice.nxv2bf16(<vscale x 2 x bfloat> zeroinitializer, <vscale x 2 x bfloat> zeroinitializer, i32 1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv4bf16 = call <vscale x 4 x bfloat> @llvm.experimental.vector.splice.nxv4bf16(<vscale x 4 x bfloat> zeroinitializer, <vscale x 4 x bfloat> zeroinitializer, i32 1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv8bf16 = call <vscale x 8 x bfloat> @llvm.experimental.vector.splice.nxv8bf16(<vscale x 8 x bfloat> zeroinitializer, <vscale x 8 x bfloat> zeroinitializer, i32 1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv16bf16 = call <vscale x 16 x bfloat> @llvm.experimental.vector.splice.nxv16bf16(<vscale x 16 x bfloat> zeroinitializer, <vscale x 16 x bfloat> zeroinitializer, i32 1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv16i1 = call <vscale x 16 x i1> @llvm.experimental.vector.splice.nxv16i1(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i1> zeroinitializer, i32 1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv8i1 = call <vscale x 8 x i1> @llvm.experimental.vector.splice.nxv8i1(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x i1> zeroinitializer, i32 1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv4i1 = call <vscale x 4 x i1> @llvm.experimental.vector.splice.nxv4i1(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x i1> zeroinitializer, i32 1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv2i1 = call <vscale x 2 x i1> @llvm.experimental.vector.splice.nxv2i1(<vscale x 2 x i1> zeroinitializer, <vscale x 2 x i1> zeroinitializer, i32 1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv16i8_neg = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i8> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv32i8_neg = call <vscale x 32 x i8> @llvm.experimental.vector.splice.nxv32i8(<vscale x 32 x i8> zeroinitializer, <vscale x 32 x i8> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv1i16_neg = call <vscale x 1 x i16> @llvm.experimental.vector.splice.nxv1i16(<vscale x 1 x i16> zeroinitializer, <vscale x 1 x i16> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv2i16_neg = call <vscale x 2 x i16> @llvm.experimental.vector.splice.nxv2i16(<vscale x 2 x i16> zeroinitializer, <vscale x 2 x i16> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv4i16_neg = call <vscale x 4 x i16> @llvm.experimental.vector.splice.nxv4i16(<vscale x 4 x i16> zeroinitializer, <vscale x 4 x i16> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv8i16_neg = call <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i16> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv16i16_neg = call <vscale x 16 x i16> @llvm.experimental.vector.splice.nxv16i16(<vscale x 16 x i16> zeroinitializer, <vscale x 16 x i16> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv4i32_neg = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv8i32_neg = call <vscale x 8 x i32> @llvm.experimental.vector.splice.nxv8i32(<vscale x 8 x i32> zeroinitializer, <vscale x 8 x i32> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv1i64_neg = call <vscale x 1 x i64> @llvm.experimental.vector.splice.nxv1i64(<vscale x 1 x i64> zeroinitializer, <vscale x 1 x i64> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv2i64_neg = call <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv4i64_neg = call <vscale x 4 x i64> @llvm.experimental.vector.splice.nxv4i64(<vscale x 4 x i64> zeroinitializer, <vscale x 4 x i64> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv1f16_neg = call <vscale x 1 x half> @llvm.experimental.vector.splice.nxv1f16(<vscale x 1 x half> zeroinitializer, <vscale x 1 x half> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv2f16_neg = call <vscale x 2 x half> @llvm.experimental.vector.splice.nxv2f16(<vscale x 2 x half> zeroinitializer, <vscale x 2 x half> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv4f16_neg = call <vscale x 4 x half> @llvm.experimental.vector.splice.nxv4f16(<vscale x 4 x half> zeroinitializer, <vscale x 4 x half> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv8f16_neg = call <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half> zeroinitializer, <vscale x 8 x half> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv16f16_neg = call <vscale x 16 x half> @llvm.experimental.vector.splice.nxv16f16(<vscale x 16 x half> zeroinitializer, <vscale x 16 x half> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv1f32_neg = call <vscale x 1 x float> @llvm.experimental.vector.splice.nxv1f32(<vscale x 1 x float> zeroinitializer, <vscale x 1 x float> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv2f32_neg = call <vscale x 2 x float> @llvm.experimental.vector.splice.nxv2f32(<vscale x 2 x float> zeroinitializer, <vscale x 2 x float> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv4f32_neg = call <vscale x 4 x float> @llvm.experimental.vector.splice.nxv4f32(<vscale x 4 x float> zeroinitializer, <vscale x 4 x float> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv8f32_neg = call <vscale x 8 x float> @llvm.experimental.vector.splice.nxv8f32(<vscale x 8 x float> zeroinitializer, <vscale x 8 x float> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv1f64_neg = call <vscale x 1 x double> @llvm.experimental.vector.splice.nxv1f64(<vscale x 1 x double> zeroinitializer, <vscale x 1 x double> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv2f64_neg = call <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double> zeroinitializer, <vscale x 2 x double> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv4f64_neg = call <vscale x 4 x double> @llvm.experimental.vector.splice.nxv4f64(<vscale x 4 x double> zeroinitializer, <vscale x 4 x double> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv1bf16_neg = call <vscale x 1 x bfloat> @llvm.experimental.vector.splice.nxv1bf16(<vscale x 1 x bfloat> zeroinitializer, <vscale x 1 x bfloat> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv2bf16_neg = call <vscale x 2 x bfloat> @llvm.experimental.vector.splice.nxv2bf16(<vscale x 2 x bfloat> zeroinitializer, <vscale x 2 x bfloat> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv4bf16_neg = call <vscale x 4 x bfloat> @llvm.experimental.vector.splice.nxv4bf16(<vscale x 4 x bfloat> zeroinitializer, <vscale x 4 x bfloat> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv8bf16_neg = call <vscale x 8 x bfloat> @llvm.experimental.vector.splice.nxv8bf16(<vscale x 8 x bfloat> zeroinitializer, <vscale x 8 x bfloat> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv16bf16_neg = call <vscale x 16 x bfloat> @llvm.experimental.vector.splice.nxv16bf16(<vscale x 16 x bfloat> zeroinitializer, <vscale x 16 x bfloat> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv16i1_neg = call <vscale x 16 x i1> @llvm.experimental.vector.splice.nxv16i1(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i1> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv8i1_neg = call <vscale x 8 x i1> @llvm.experimental.vector.splice.nxv8i1(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x i1> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv4i1_neg = call <vscale x 4 x i1> @llvm.experimental.vector.splice.nxv4i1(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x i1> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv2i1_neg = call <vscale x 2 x i1> @llvm.experimental.vector.splice.nxv2i1(<vscale x 2 x i1> zeroinitializer, <vscale x 2 x i1> zeroinitializer, i32 -1)
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv1i1_neg = call <vscale x 1 x i1> @llvm.experimental.vector.splice.nxv1i1(<vscale x 1 x i1> zeroinitializer, <vscale x 1 x i1> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv16i8 = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i8> zeroinitializer, i32 1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv32i8 = call <vscale x 32 x i8> @llvm.vector.splice.nxv32i8(<vscale x 32 x i8> zeroinitializer, <vscale x 32 x i8> zeroinitializer, i32 1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv2i16 = call <vscale x 2 x i16> @llvm.vector.splice.nxv2i16(<vscale x 2 x i16> zeroinitializer, <vscale x 2 x i16> zeroinitializer, i32 1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv4i16 = call <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16> zeroinitializer, <vscale x 4 x i16> zeroinitializer, i32 1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv8i16 = call <vscale x 8 x i16> @llvm.vector.splice.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i16> zeroinitializer, i32 1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv16i16 = call <vscale x 16 x i16> @llvm.vector.splice.nxv16i16(<vscale x 16 x i16> zeroinitializer, <vscale x 16 x i16> zeroinitializer, i32 1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv4i32 = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> zeroinitializer, i32 1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv8i32 = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> zeroinitializer, <vscale x 8 x i32> zeroinitializer, i32 1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv2i64 = call <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> zeroinitializer, i32 1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv4i64 = call <vscale x 4 x i64> @llvm.vector.splice.nxv4i64(<vscale x 4 x i64> zeroinitializer, <vscale x 4 x i64> zeroinitializer, i32 1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv2f16 = call <vscale x 2 x half> @llvm.vector.splice.nxv2f16(<vscale x 2 x half> zeroinitializer, <vscale x 2 x half> zeroinitializer, i32 1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv4f16 = call <vscale x 4 x half> @llvm.vector.splice.nxv4f16(<vscale x 4 x half> zeroinitializer, <vscale x 4 x half> zeroinitializer, i32 1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv8f16 = call <vscale x 8 x half> @llvm.vector.splice.nxv8f16(<vscale x 8 x half> zeroinitializer, <vscale x 8 x half> zeroinitializer, i32 1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv16f16 = call <vscale x 16 x half> @llvm.vector.splice.nxv16f16(<vscale x 16 x half> zeroinitializer, <vscale x 16 x half> zeroinitializer, i32 1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv2f32 = call <vscale x 2 x float> @llvm.vector.splice.nxv2f32(<vscale x 2 x float> zeroinitializer, <vscale x 2 x float> zeroinitializer, i32 1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv4f32 = call <vscale x 4 x float> @llvm.vector.splice.nxv4f32(<vscale x 4 x float> zeroinitializer, <vscale x 4 x float> zeroinitializer, i32 1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv8f32 = call <vscale x 8 x float> @llvm.vector.splice.nxv8f32(<vscale x 8 x float> zeroinitializer, <vscale x 8 x float> zeroinitializer, i32 1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv2f64 = call <vscale x 2 x double> @llvm.vector.splice.nxv2f64(<vscale x 2 x double> zeroinitializer, <vscale x 2 x double> zeroinitializer, i32 1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv4f64 = call <vscale x 4 x double> @llvm.vector.splice.nxv4f64(<vscale x 4 x double> zeroinitializer, <vscale x 4 x double> zeroinitializer, i32 1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv2bf16 = call <vscale x 2 x bfloat> @llvm.vector.splice.nxv2bf16(<vscale x 2 x bfloat> zeroinitializer, <vscale x 2 x bfloat> zeroinitializer, i32 1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv4bf16 = call <vscale x 4 x bfloat> @llvm.vector.splice.nxv4bf16(<vscale x 4 x bfloat> zeroinitializer, <vscale x 4 x bfloat> zeroinitializer, i32 1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv8bf16 = call <vscale x 8 x bfloat> @llvm.vector.splice.nxv8bf16(<vscale x 8 x bfloat> zeroinitializer, <vscale x 8 x bfloat> zeroinitializer, i32 1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv16bf16 = call <vscale x 16 x bfloat> @llvm.vector.splice.nxv16bf16(<vscale x 16 x bfloat> zeroinitializer, <vscale x 16 x bfloat> zeroinitializer, i32 1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv16i1 = call <vscale x 16 x i1> @llvm.vector.splice.nxv16i1(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i1> zeroinitializer, i32 1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv8i1 = call <vscale x 8 x i1> @llvm.vector.splice.nxv8i1(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x i1> zeroinitializer, i32 1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv4i1 = call <vscale x 4 x i1> @llvm.vector.splice.nxv4i1(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x i1> zeroinitializer, i32 1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv2i1 = call <vscale x 2 x i1> @llvm.vector.splice.nxv2i1(<vscale x 2 x i1> zeroinitializer, <vscale x 2 x i1> zeroinitializer, i32 1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv16i8_neg = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i8> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv32i8_neg = call <vscale x 32 x i8> @llvm.vector.splice.nxv32i8(<vscale x 32 x i8> zeroinitializer, <vscale x 32 x i8> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv1i16_neg = call <vscale x 1 x i16> @llvm.vector.splice.nxv1i16(<vscale x 1 x i16> zeroinitializer, <vscale x 1 x i16> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv2i16_neg = call <vscale x 2 x i16> @llvm.vector.splice.nxv2i16(<vscale x 2 x i16> zeroinitializer, <vscale x 2 x i16> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv4i16_neg = call <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16> zeroinitializer, <vscale x 4 x i16> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv8i16_neg = call <vscale x 8 x i16> @llvm.vector.splice.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i16> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv16i16_neg = call <vscale x 16 x i16> @llvm.vector.splice.nxv16i16(<vscale x 16 x i16> zeroinitializer, <vscale x 16 x i16> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv4i32_neg = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv8i32_neg = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> zeroinitializer, <vscale x 8 x i32> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv1i64_neg = call <vscale x 1 x i64> @llvm.vector.splice.nxv1i64(<vscale x 1 x i64> zeroinitializer, <vscale x 1 x i64> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv2i64_neg = call <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv4i64_neg = call <vscale x 4 x i64> @llvm.vector.splice.nxv4i64(<vscale x 4 x i64> zeroinitializer, <vscale x 4 x i64> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv1f16_neg = call <vscale x 1 x half> @llvm.vector.splice.nxv1f16(<vscale x 1 x half> zeroinitializer, <vscale x 1 x half> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv2f16_neg = call <vscale x 2 x half> @llvm.vector.splice.nxv2f16(<vscale x 2 x half> zeroinitializer, <vscale x 2 x half> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv4f16_neg = call <vscale x 4 x half> @llvm.vector.splice.nxv4f16(<vscale x 4 x half> zeroinitializer, <vscale x 4 x half> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv8f16_neg = call <vscale x 8 x half> @llvm.vector.splice.nxv8f16(<vscale x 8 x half> zeroinitializer, <vscale x 8 x half> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv16f16_neg = call <vscale x 16 x half> @llvm.vector.splice.nxv16f16(<vscale x 16 x half> zeroinitializer, <vscale x 16 x half> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv1f32_neg = call <vscale x 1 x float> @llvm.vector.splice.nxv1f32(<vscale x 1 x float> zeroinitializer, <vscale x 1 x float> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv2f32_neg = call <vscale x 2 x float> @llvm.vector.splice.nxv2f32(<vscale x 2 x float> zeroinitializer, <vscale x 2 x float> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv4f32_neg = call <vscale x 4 x float> @llvm.vector.splice.nxv4f32(<vscale x 4 x float> zeroinitializer, <vscale x 4 x float> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv8f32_neg = call <vscale x 8 x float> @llvm.vector.splice.nxv8f32(<vscale x 8 x float> zeroinitializer, <vscale x 8 x float> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv1f64_neg = call <vscale x 1 x double> @llvm.vector.splice.nxv1f64(<vscale x 1 x double> zeroinitializer, <vscale x 1 x double> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv2f64_neg = call <vscale x 2 x double> @llvm.vector.splice.nxv2f64(<vscale x 2 x double> zeroinitializer, <vscale x 2 x double> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv4f64_neg = call <vscale x 4 x double> @llvm.vector.splice.nxv4f64(<vscale x 4 x double> zeroinitializer, <vscale x 4 x double> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv1bf16_neg = call <vscale x 1 x bfloat> @llvm.vector.splice.nxv1bf16(<vscale x 1 x bfloat> zeroinitializer, <vscale x 1 x bfloat> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv2bf16_neg = call <vscale x 2 x bfloat> @llvm.vector.splice.nxv2bf16(<vscale x 2 x bfloat> zeroinitializer, <vscale x 2 x bfloat> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv4bf16_neg = call <vscale x 4 x bfloat> @llvm.vector.splice.nxv4bf16(<vscale x 4 x bfloat> zeroinitializer, <vscale x 4 x bfloat> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv8bf16_neg = call <vscale x 8 x bfloat> @llvm.vector.splice.nxv8bf16(<vscale x 8 x bfloat> zeroinitializer, <vscale x 8 x bfloat> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv16bf16_neg = call <vscale x 16 x bfloat> @llvm.vector.splice.nxv16bf16(<vscale x 16 x bfloat> zeroinitializer, <vscale x 16 x bfloat> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv16i1_neg = call <vscale x 16 x i1> @llvm.vector.splice.nxv16i1(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i1> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv8i1_neg = call <vscale x 8 x i1> @llvm.vector.splice.nxv8i1(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x i1> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv4i1_neg = call <vscale x 4 x i1> @llvm.vector.splice.nxv4i1(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x i1> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv2i1_neg = call <vscale x 2 x i1> @llvm.vector.splice.nxv2i1(<vscale x 2 x i1> zeroinitializer, <vscale x 2 x i1> zeroinitializer, i32 -1)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %splice_nxv1i1_neg = call <vscale x 1 x i1> @llvm.vector.splice.nxv1i1(<vscale x 1 x i1> zeroinitializer, <vscale x 1 x i1> zeroinitializer, i32 -1)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
- %splice_nxv16i8 = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i8> zeroinitializer, i32 1)
- %splice_nxv32i8 = call <vscale x 32 x i8> @llvm.experimental.vector.splice.nxv32i8(<vscale x 32 x i8> zeroinitializer, <vscale x 32 x i8> zeroinitializer, i32 1)
- %splice_nxv2i16 = call <vscale x 2 x i16> @llvm.experimental.vector.splice.nxv2i16(<vscale x 2 x i16> zeroinitializer, <vscale x 2 x i16> zeroinitializer, i32 1)
- %splice_nxv4i16 = call <vscale x 4 x i16> @llvm.experimental.vector.splice.nxv4i16(<vscale x 4 x i16> zeroinitializer, <vscale x 4 x i16> zeroinitializer, i32 1)
- %splice_nxv8i16 = call <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i16> zeroinitializer, i32 1)
- %splice_nxv16i16 = call <vscale x 16 x i16> @llvm.experimental.vector.splice.nxv16i16(<vscale x 16 x i16> zeroinitializer, <vscale x 16 x i16> zeroinitializer, i32 1)
- %splice_nxv4i32 = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> zeroinitializer, i32 1)
- %splice_nxv8i32 = call <vscale x 8 x i32> @llvm.experimental.vector.splice.nxv8i32(<vscale x 8 x i32> zeroinitializer, <vscale x 8 x i32> zeroinitializer, i32 1)
- %splice_nxv2i64 = call <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> zeroinitializer, i32 1)
- %splice_nxv4i64 = call <vscale x 4 x i64> @llvm.experimental.vector.splice.nxv4i64(<vscale x 4 x i64> zeroinitializer, <vscale x 4 x i64> zeroinitializer, i32 1)
- %splice_nxv2f16 = call <vscale x 2 x half> @llvm.experimental.vector.splice.nxv2f16(<vscale x 2 x half> zeroinitializer, <vscale x 2 x half> zeroinitializer, i32 1)
- %splice_nxv4f16 = call <vscale x 4 x half> @llvm.experimental.vector.splice.nxv4f16(<vscale x 4 x half> zeroinitializer, <vscale x 4 x half> zeroinitializer, i32 1)
- %splice_nxv8f16 = call <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half> zeroinitializer, <vscale x 8 x half> zeroinitializer, i32 1)
- %splice_nxv16f16 = call <vscale x 16 x half> @llvm.experimental.vector.splice.nxv16f16(<vscale x 16 x half> zeroinitializer, <vscale x 16 x half> zeroinitializer, i32 1)
- %splice_nxv2f32 = call <vscale x 2 x float> @llvm.experimental.vector.splice.nxv2f32(<vscale x 2 x float> zeroinitializer, <vscale x 2 x float> zeroinitializer, i32 1)
- %splice_nxv4f32 = call <vscale x 4 x float> @llvm.experimental.vector.splice.nxv4f32(<vscale x 4 x float> zeroinitializer, <vscale x 4 x float> zeroinitializer, i32 1)
- %splice_nxv8f32 = call <vscale x 8 x float> @llvm.experimental.vector.splice.nxv8f32(<vscale x 8 x float> zeroinitializer, <vscale x 8 x float> zeroinitializer, i32 1)
- %splice_nxv2f64 = call <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double> zeroinitializer, <vscale x 2 x double> zeroinitializer, i32 1)
- %splice_nxv4f64 = call <vscale x 4 x double> @llvm.experimental.vector.splice.nxv4f64(<vscale x 4 x double> zeroinitializer, <vscale x 4 x double> zeroinitializer, i32 1)
- %splice_nxv2bf16 = call <vscale x 2 x bfloat> @llvm.experimental.vector.splice.nxv2bf16(<vscale x 2 x bfloat> zeroinitializer, <vscale x 2 x bfloat> zeroinitializer, i32 1)
- %splice_nxv4bf16 = call <vscale x 4 x bfloat> @llvm.experimental.vector.splice.nxv4bf16(<vscale x 4 x bfloat> zeroinitializer, <vscale x 4 x bfloat> zeroinitializer, i32 1)
- %splice_nxv8bf16 = call <vscale x 8 x bfloat> @llvm.experimental.vector.splice.nxv8bf16(<vscale x 8 x bfloat> zeroinitializer, <vscale x 8 x bfloat> zeroinitializer, i32 1)
- %splice_nxv16bf16 = call <vscale x 16 x bfloat> @llvm.experimental.vector.splice.nxv16bf16(<vscale x 16 x bfloat> zeroinitializer, <vscale x 16 x bfloat> zeroinitializer, i32 1)
- %splice_nxv16i1 = call <vscale x 16 x i1> @llvm.experimental.vector.splice.nxv16i1(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i1> zeroinitializer, i32 1)
- %splice_nxv8i1 = call <vscale x 8 x i1> @llvm.experimental.vector.splice.nxv8i1(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x i1> zeroinitializer, i32 1)
- %splice_nxv4i1 = call <vscale x 4 x i1> @llvm.experimental.vector.splice.nxv4i1(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x i1> zeroinitializer, i32 1)
- %splice_nxv2i1 = call <vscale x 2 x i1> @llvm.experimental.vector.splice.nxv2i1(<vscale x 2 x i1> zeroinitializer, <vscale x 2 x i1> zeroinitializer, i32 1)
+ %splice_nxv16i8 = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i8> zeroinitializer, i32 1)
+ %splice_nxv32i8 = call <vscale x 32 x i8> @llvm.vector.splice.nxv32i8(<vscale x 32 x i8> zeroinitializer, <vscale x 32 x i8> zeroinitializer, i32 1)
+ %splice_nxv2i16 = call <vscale x 2 x i16> @llvm.vector.splice.nxv2i16(<vscale x 2 x i16> zeroinitializer, <vscale x 2 x i16> zeroinitializer, i32 1)
+ %splice_nxv4i16 = call <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16> zeroinitializer, <vscale x 4 x i16> zeroinitializer, i32 1)
+ %splice_nxv8i16 = call <vscale x 8 x i16> @llvm.vector.splice.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i16> zeroinitializer, i32 1)
+ %splice_nxv16i16 = call <vscale x 16 x i16> @llvm.vector.splice.nxv16i16(<vscale x 16 x i16> zeroinitializer, <vscale x 16 x i16> zeroinitializer, i32 1)
+ %splice_nxv4i32 = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> zeroinitializer, i32 1)
+ %splice_nxv8i32 = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> zeroinitializer, <vscale x 8 x i32> zeroinitializer, i32 1)
+ %splice_nxv2i64 = call <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> zeroinitializer, i32 1)
+ %splice_nxv4i64 = call <vscale x 4 x i64> @llvm.vector.splice.nxv4i64(<vscale x 4 x i64> zeroinitializer, <vscale x 4 x i64> zeroinitializer, i32 1)
+ %splice_nxv2f16 = call <vscale x 2 x half> @llvm.vector.splice.nxv2f16(<vscale x 2 x half> zeroinitializer, <vscale x 2 x half> zeroinitializer, i32 1)
+ %splice_nxv4f16 = call <vscale x 4 x half> @llvm.vector.splice.nxv4f16(<vscale x 4 x half> zeroinitializer, <vscale x 4 x half> zeroinitializer, i32 1)
+ %splice_nxv8f16 = call <vscale x 8 x half> @llvm.vector.splice.nxv8f16(<vscale x 8 x half> zeroinitializer, <vscale x 8 x half> zeroinitializer, i32 1)
+ %splice_nxv16f16 = call <vscale x 16 x half> @llvm.vector.splice.nxv16f16(<vscale x 16 x half> zeroinitializer, <vscale x 16 x half> zeroinitializer, i32 1)
+ %splice_nxv2f32 = call <vscale x 2 x float> @llvm.vector.splice.nxv2f32(<vscale x 2 x float> zeroinitializer, <vscale x 2 x float> zeroinitializer, i32 1)
+ %splice_nxv4f32 = call <vscale x 4 x float> @llvm.vector.splice.nxv4f32(<vscale x 4 x float> zeroinitializer, <vscale x 4 x float> zeroinitializer, i32 1)
+ %splice_nxv8f32 = call <vscale x 8 x float> @llvm.vector.splice.nxv8f32(<vscale x 8 x float> zeroinitializer, <vscale x 8 x float> zeroinitializer, i32 1)
+ %splice_nxv2f64 = call <vscale x 2 x double> @llvm.vector.splice.nxv2f64(<vscale x 2 x double> zeroinitializer, <vscale x 2 x double> zeroinitializer, i32 1)
+ %splice_nxv4f64 = call <vscale x 4 x double> @llvm.vector.splice.nxv4f64(<vscale x 4 x double> zeroinitializer, <vscale x 4 x double> zeroinitializer, i32 1)
+ %splice_nxv2bf16 = call <vscale x 2 x bfloat> @llvm.vector.splice.nxv2bf16(<vscale x 2 x bfloat> zeroinitializer, <vscale x 2 x bfloat> zeroinitializer, i32 1)
+ %splice_nxv4bf16 = call <vscale x 4 x bfloat> @llvm.vector.splice.nxv4bf16(<vscale x 4 x bfloat> zeroinitializer, <vscale x 4 x bfloat> zeroinitializer, i32 1)
+ %splice_nxv8bf16 = call <vscale x 8 x bfloat> @llvm.vector.splice.nxv8bf16(<vscale x 8 x bfloat> zeroinitializer, <vscale x 8 x bfloat> zeroinitializer, i32 1)
+ %splice_nxv16bf16 = call <vscale x 16 x bfloat> @llvm.vector.splice.nxv16bf16(<vscale x 16 x bfloat> zeroinitializer, <vscale x 16 x bfloat> zeroinitializer, i32 1)
+ %splice_nxv16i1 = call <vscale x 16 x i1> @llvm.vector.splice.nxv16i1(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i1> zeroinitializer, i32 1)
+ %splice_nxv8i1 = call <vscale x 8 x i1> @llvm.vector.splice.nxv8i1(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x i1> zeroinitializer, i32 1)
+ %splice_nxv4i1 = call <vscale x 4 x i1> @llvm.vector.splice.nxv4i1(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x i1> zeroinitializer, i32 1)
+ %splice_nxv2i1 = call <vscale x 2 x i1> @llvm.vector.splice.nxv2i1(<vscale x 2 x i1> zeroinitializer, <vscale x 2 x i1> zeroinitializer, i32 1)
;; negative index
- %splice_nxv16i8_neg = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i8> zeroinitializer, i32 -1)
- %splice_nxv32i8_neg = call <vscale x 32 x i8> @llvm.experimental.vector.splice.nxv32i8(<vscale x 32 x i8> zeroinitializer, <vscale x 32 x i8> zeroinitializer, i32 -1)
- %splice_nxv1i16_neg = call <vscale x 1 x i16> @llvm.experimental.vector.splice.nxv1i16(<vscale x 1 x i16> zeroinitializer, <vscale x 1 x i16> zeroinitializer, i32 -1)
- %splice_nxv2i16_neg = call <vscale x 2 x i16> @llvm.experimental.vector.splice.nxv2i16(<vscale x 2 x i16> zeroinitializer, <vscale x 2 x i16> zeroinitializer, i32 -1)
- %splice_nxv4i16_neg = call <vscale x 4 x i16> @llvm.experimental.vector.splice.nxv4i16(<vscale x 4 x i16> zeroinitializer, <vscale x 4 x i16> zeroinitializer, i32 -1)
- %splice_nxv8i16_neg = call <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i16> zeroinitializer, i32 -1)
- %splice_nxv16i16_neg = call <vscale x 16 x i16> @llvm.experimental.vector.splice.nxv16i16(<vscale x 16 x i16> zeroinitializer, <vscale x 16 x i16> zeroinitializer, i32 -1)
- %splice_nxv4i32_neg = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> zeroinitializer, i32 -1)
- %splice_nxv8i32_neg = call <vscale x 8 x i32> @llvm.experimental.vector.splice.nxv8i32(<vscale x 8 x i32> zeroinitializer, <vscale x 8 x i32> zeroinitializer, i32 -1)
- %splice_nxv1i64_neg= call <vscale x 1 x i64> @llvm.experimental.vector.splice.nxv1i64(<vscale x 1 x i64> zeroinitializer, <vscale x 1 x i64> zeroinitializer, i32 -1)
- %splice_nxv2i64_neg= call <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> zeroinitializer, i32 -1)
- %splice_nxv4i64_neg = call <vscale x 4 x i64> @llvm.experimental.vector.splice.nxv4i64(<vscale x 4 x i64> zeroinitializer, <vscale x 4 x i64> zeroinitializer, i32 -1)
- %splice_nxv1f16_neg = call <vscale x 1 x half> @llvm.experimental.vector.splice.nxv1f16(<vscale x 1 x half> zeroinitializer, <vscale x 1 x half> zeroinitializer, i32 -1)
- %splice_nxv2f16_neg = call <vscale x 2 x half> @llvm.experimental.vector.splice.nxv2f16(<vscale x 2 x half> zeroinitializer, <vscale x 2 x half> zeroinitializer, i32 -1)
- %splice_nxv4f16_neg = call <vscale x 4 x half> @llvm.experimental.vector.splice.nxv4f16(<vscale x 4 x half> zeroinitializer, <vscale x 4 x half> zeroinitializer, i32 -1)
- %splice_nxv8f16_neg = call <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half> zeroinitializer, <vscale x 8 x half> zeroinitializer, i32 -1)
- %splice_nxv16f16_neg = call <vscale x 16 x half> @llvm.experimental.vector.splice.nxv16f16(<vscale x 16 x half> zeroinitializer, <vscale x 16 x half> zeroinitializer, i32 -1)
- %splice_nxv1f32_neg = call <vscale x 1 x float> @llvm.experimental.vector.splice.nxv1f32(<vscale x 1 x float> zeroinitializer, <vscale x 1 x float> zeroinitializer, i32 -1)
- %splice_nxv2f32_neg = call <vscale x 2 x float> @llvm.experimental.vector.splice.nxv2f32(<vscale x 2 x float> zeroinitializer, <vscale x 2 x float> zeroinitializer, i32 -1)
- %splice_nxv4f32_neg = call <vscale x 4 x float> @llvm.experimental.vector.splice.nxv4f32(<vscale x 4 x float> zeroinitializer, <vscale x 4 x float> zeroinitializer, i32 -1)
- %splice_nxv8f32_neg = call <vscale x 8 x float> @llvm.experimental.vector.splice.nxv8f32(<vscale x 8 x float> zeroinitializer, <vscale x 8 x float> zeroinitializer, i32 -1)
- %splice_nxv1f64_neg = call <vscale x 1 x double> @llvm.experimental.vector.splice.nxv1f64(<vscale x 1 x double> zeroinitializer, <vscale x 1 x double> zeroinitializer, i32 -1)
- %splice_nxv2f64_neg = call <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double> zeroinitializer, <vscale x 2 x double> zeroinitializer, i32 -1)
- %splice_nxv4f64_neg = call <vscale x 4 x double> @llvm.experimental.vector.splice.nxv4f64(<vscale x 4 x double> zeroinitializer, <vscale x 4 x double> zeroinitializer, i32 -1)
- %splice_nxv1bf16_neg = call <vscale x 1 x bfloat> @llvm.experimental.vector.splice.nxv1bf16(<vscale x 1 x bfloat> zeroinitializer, <vscale x 1 x bfloat> zeroinitializer, i32 -1)
- %splice_nxv2bf16_neg = call <vscale x 2 x bfloat> @llvm.experimental.vector.splice.nxv2bf16(<vscale x 2 x bfloat> zeroinitializer, <vscale x 2 x bfloat> zeroinitializer, i32 -1)
- %splice_nxv4bf16_neg = call <vscale x 4 x bfloat> @llvm.experimental.vector.splice.nxv4bf16(<vscale x 4 x bfloat> zeroinitializer, <vscale x 4 x bfloat> zeroinitializer, i32 -1)
- %splice_nxv8bf16_neg = call <vscale x 8 x bfloat> @llvm.experimental.vector.splice.nxv8bf16(<vscale x 8 x bfloat> zeroinitializer, <vscale x 8 x bfloat> zeroinitializer, i32 -1)
- %splice_nxv16bf16_neg = call <vscale x 16 x bfloat> @llvm.experimental.vector.splice.nxv16bf16(<vscale x 16 x bfloat> zeroinitializer, <vscale x 16 x bfloat> zeroinitializer, i32 -1)
- %splice_nxv16i1_neg = call <vscale x 16 x i1> @llvm.experimental.vector.splice.nxv16i1(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i1> zeroinitializer, i32 -1)
- %splice_nxv8i1_neg = call <vscale x 8 x i1> @llvm.experimental.vector.splice.nxv8i1(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x i1> zeroinitializer, i32 -1)
- %splice_nxv4i1_neg = call <vscale x 4 x i1> @llvm.experimental.vector.splice.nxv4i1(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x i1> zeroinitializer, i32 -1)
- %splice_nxv2i1_neg = call <vscale x 2 x i1> @llvm.experimental.vector.splice.nxv2i1(<vscale x 2 x i1> zeroinitializer, <vscale x 2 x i1> zeroinitializer, i32 -1)
- %splice_nxv1i1_neg = call <vscale x 1 x i1> @llvm.experimental.vector.splice.nxv1i1(<vscale x 1 x i1> zeroinitializer, <vscale x 1 x i1> zeroinitializer, i32 -1)
+ %splice_nxv16i8_neg = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i8> zeroinitializer, i32 -1)
+ %splice_nxv32i8_neg = call <vscale x 32 x i8> @llvm.vector.splice.nxv32i8(<vscale x 32 x i8> zeroinitializer, <vscale x 32 x i8> zeroinitializer, i32 -1)
+ %splice_nxv1i16_neg = call <vscale x 1 x i16> @llvm.vector.splice.nxv1i16(<vscale x 1 x i16> zeroinitializer, <vscale x 1 x i16> zeroinitializer, i32 -1)
+ %splice_nxv2i16_neg = call <vscale x 2 x i16> @llvm.vector.splice.nxv2i16(<vscale x 2 x i16> zeroinitializer, <vscale x 2 x i16> zeroinitializer, i32 -1)
+ %splice_nxv4i16_neg = call <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16> zeroinitializer, <vscale x 4 x i16> zeroinitializer, i32 -1)
+ %splice_nxv8i16_neg = call <vscale x 8 x i16> @llvm.vector.splice.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i16> zeroinitializer, i32 -1)
+ %splice_nxv16i16_neg = call <vscale x 16 x i16> @llvm.vector.splice.nxv16i16(<vscale x 16 x i16> zeroinitializer, <vscale x 16 x i16> zeroinitializer, i32 -1)
+ %splice_nxv4i32_neg = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> zeroinitializer, i32 -1)
+ %splice_nxv8i32_neg = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> zeroinitializer, <vscale x 8 x i32> zeroinitializer, i32 -1)
+ %splice_nxv1i64_neg = call <vscale x 1 x i64> @llvm.vector.splice.nxv1i64(<vscale x 1 x i64> zeroinitializer, <vscale x 1 x i64> zeroinitializer, i32 -1)
+ %splice_nxv2i64_neg = call <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> zeroinitializer, i32 -1)
+ %splice_nxv4i64_neg = call <vscale x 4 x i64> @llvm.vector.splice.nxv4i64(<vscale x 4 x i64> zeroinitializer, <vscale x 4 x i64> zeroinitializer, i32 -1)
+ %splice_nxv1f16_neg = call <vscale x 1 x half> @llvm.vector.splice.nxv1f16(<vscale x 1 x half> zeroinitializer, <vscale x 1 x half> zeroinitializer, i32 -1)
+ %splice_nxv2f16_neg = call <vscale x 2 x half> @llvm.vector.splice.nxv2f16(<vscale x 2 x half> zeroinitializer, <vscale x 2 x half> zeroinitializer, i32 -1)
+ %splice_nxv4f16_neg = call <vscale x 4 x half> @llvm.vector.splice.nxv4f16(<vscale x 4 x half> zeroinitializer, <vscale x 4 x half> zeroinitializer, i32 -1)
+ %splice_nxv8f16_neg = call <vscale x 8 x half> @llvm.vector.splice.nxv8f16(<vscale x 8 x half> zeroinitializer, <vscale x 8 x half> zeroinitializer, i32 -1)
+ %splice_nxv16f16_neg = call <vscale x 16 x half> @llvm.vector.splice.nxv16f16(<vscale x 16 x half> zeroinitializer, <vscale x 16 x half> zeroinitializer, i32 -1)
+ %splice_nxv1f32_neg = call <vscale x 1 x float> @llvm.vector.splice.nxv1f32(<vscale x 1 x float> zeroinitializer, <vscale x 1 x float> zeroinitializer, i32 -1)
+ %splice_nxv2f32_neg = call <vscale x 2 x float> @llvm.vector.splice.nxv2f32(<vscale x 2 x float> zeroinitializer, <vscale x 2 x float> zeroinitializer, i32 -1)
+ %splice_nxv4f32_neg = call <vscale x 4 x float> @llvm.vector.splice.nxv4f32(<vscale x 4 x float> zeroinitializer, <vscale x 4 x float> zeroinitializer, i32 -1)
+ %splice_nxv8f32_neg = call <vscale x 8 x float> @llvm.vector.splice.nxv8f32(<vscale x 8 x float> zeroinitializer, <vscale x 8 x float> zeroinitializer, i32 -1)
+ %splice_nxv1f64_neg = call <vscale x 1 x double> @llvm.vector.splice.nxv1f64(<vscale x 1 x double> zeroinitializer, <vscale x 1 x double> zeroinitializer, i32 -1)
+ %splice_nxv2f64_neg = call <vscale x 2 x double> @llvm.vector.splice.nxv2f64(<vscale x 2 x double> zeroinitializer, <vscale x 2 x double> zeroinitializer, i32 -1)
+ %splice_nxv4f64_neg = call <vscale x 4 x double> @llvm.vector.splice.nxv4f64(<vscale x 4 x double> zeroinitializer, <vscale x 4 x double> zeroinitializer, i32 -1)
+ %splice_nxv1bf16_neg = call <vscale x 1 x bfloat> @llvm.vector.splice.nxv1bf16(<vscale x 1 x bfloat> zeroinitializer, <vscale x 1 x bfloat> zeroinitializer, i32 -1)
+ %splice_nxv2bf16_neg = call <vscale x 2 x bfloat> @llvm.vector.splice.nxv2bf16(<vscale x 2 x bfloat> zeroinitializer, <vscale x 2 x bfloat> zeroinitializer, i32 -1)
+ %splice_nxv4bf16_neg = call <vscale x 4 x bfloat> @llvm.vector.splice.nxv4bf16(<vscale x 4 x bfloat> zeroinitializer, <vscale x 4 x bfloat> zeroinitializer, i32 -1)
+ %splice_nxv8bf16_neg = call <vscale x 8 x bfloat> @llvm.vector.splice.nxv8bf16(<vscale x 8 x bfloat> zeroinitializer, <vscale x 8 x bfloat> zeroinitializer, i32 -1)
+ %splice_nxv16bf16_neg = call <vscale x 16 x bfloat> @llvm.vector.splice.nxv16bf16(<vscale x 16 x bfloat> zeroinitializer, <vscale x 16 x bfloat> zeroinitializer, i32 -1)
+ %splice_nxv16i1_neg = call <vscale x 16 x i1> @llvm.vector.splice.nxv16i1(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i1> zeroinitializer, i32 -1)
+ %splice_nxv8i1_neg = call <vscale x 8 x i1> @llvm.vector.splice.nxv8i1(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x i1> zeroinitializer, i32 -1)
+ %splice_nxv4i1_neg = call <vscale x 4 x i1> @llvm.vector.splice.nxv4i1(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x i1> zeroinitializer, i32 -1)
+ %splice_nxv2i1_neg = call <vscale x 2 x i1> @llvm.vector.splice.nxv2i1(<vscale x 2 x i1> zeroinitializer, <vscale x 2 x i1> zeroinitializer, i32 -1)
+ %splice_nxv1i1_neg = call <vscale x 1 x i1> @llvm.vector.splice.nxv1i1(<vscale x 1 x i1> zeroinitializer, <vscale x 1 x i1> zeroinitializer, i32 -1)
ret void
}
-declare <vscale x 1 x i1> @llvm.experimental.vector.splice.nxv1i1(<vscale x 1 x i1>, <vscale x 1 x i1>, i32)
-declare <vscale x 2 x i1> @llvm.experimental.vector.splice.nxv2i1(<vscale x 2 x i1>, <vscale x 2 x i1>, i32)
-declare <vscale x 4 x i1> @llvm.experimental.vector.splice.nxv4i1(<vscale x 4 x i1>, <vscale x 4 x i1>, i32)
-declare <vscale x 8 x i1> @llvm.experimental.vector.splice.nxv8i1(<vscale x 8 x i1>, <vscale x 8 x i1>, i32)
-declare <vscale x 16 x i1> @llvm.experimental.vector.splice.nxv16i1(<vscale x 16 x i1>, <vscale x 16 x i1>, i32)
-declare <vscale x 1 x i8> @llvm.experimental.vector.splice.nxv1i8(<vscale x 1 x i8>, <vscale x 1 x i8>, i32)
-declare <vscale x 2 x i8> @llvm.experimental.vector.splice.nxv2i8(<vscale x 2 x i8>, <vscale x 2 x i8>, i32)
-declare <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, i32)
-declare <vscale x 32 x i8> @llvm.experimental.vector.splice.nxv32i8(<vscale x 32 x i8>, <vscale x 32 x i8>, i32)
-declare <vscale x 1 x i16> @llvm.experimental.vector.splice.nxv1i16(<vscale x 1 x i16>, <vscale x 1 x i16>, i32)
-declare <vscale x 2 x i16> @llvm.experimental.vector.splice.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i16>, i32)
-declare <vscale x 4 x i16> @llvm.experimental.vector.splice.nxv4i16(<vscale x 4 x i16>, <vscale x 4 x i16>, i32)
-declare <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, i32)
-declare <vscale x 16 x i16> @llvm.experimental.vector.splice.nxv16i16(<vscale x 16 x i16>, <vscale x 16 x i16>, i32)
-declare <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, i32)
-declare <vscale x 8 x i32> @llvm.experimental.vector.splice.nxv8i32(<vscale x 8 x i32>, <vscale x 8 x i32>, i32)
-declare <vscale x 1 x i64> @llvm.experimental.vector.splice.nxv1i64(<vscale x 1 x i64>, <vscale x 1 x i64>, i32)
-declare <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, i32)
-declare <vscale x 4 x i64> @llvm.experimental.vector.splice.nxv4i64(<vscale x 4 x i64>, <vscale x 4 x i64>, i32)
-declare <vscale x 1 x half> @llvm.experimental.vector.splice.nxv1f16(<vscale x 1 x half>, <vscale x 1 x half>, i32)
-declare <vscale x 2 x half> @llvm.experimental.vector.splice.nxv2f16(<vscale x 2 x half>, <vscale x 2 x half>, i32)
-declare <vscale x 4 x half> @llvm.experimental.vector.splice.nxv4f16(<vscale x 4 x half>, <vscale x 4 x half>, i32)
-declare <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, i32)
-declare <vscale x 16 x half> @llvm.experimental.vector.splice.nxv16f16(<vscale x 16 x half>, <vscale x 16 x half>, i32)
-declare <vscale x 1 x bfloat> @llvm.experimental.vector.splice.nxv1bf16(<vscale x 1 x bfloat>, <vscale x 1 x bfloat>, i32)
-declare <vscale x 2 x bfloat> @llvm.experimental.vector.splice.nxv2bf16(<vscale x 2 x bfloat>, <vscale x 2 x bfloat>, i32)
-declare <vscale x 4 x bfloat> @llvm.experimental.vector.splice.nxv4bf16(<vscale x 4 x bfloat>, <vscale x 4 x bfloat>, i32)
-declare <vscale x 8 x bfloat> @llvm.experimental.vector.splice.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i32)
-declare <vscale x 16 x bfloat> @llvm.experimental.vector.splice.nxv16bf16(<vscale x 16 x bfloat>, <vscale x 16 x bfloat>, i32)
-declare <vscale x 1 x float> @llvm.experimental.vector.splice.nxv1f32(<vscale x 1 x float>, <vscale x 1 x float>, i32)
-declare <vscale x 2 x float> @llvm.experimental.vector.splice.nxv2f32(<vscale x 2 x float>, <vscale x 2 x float>, i32)
-declare <vscale x 4 x float> @llvm.experimental.vector.splice.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, i32)
-declare <vscale x 8 x float> @llvm.experimental.vector.splice.nxv8f32(<vscale x 8 x float>, <vscale x 8 x float>, i32)
-declare <vscale x 16 x float> @llvm.experimental.vector.splice.nxv16f32(<vscale x 16 x float>, <vscale x 16 x float>, i32)
-declare <vscale x 1 x double> @llvm.experimental.vector.splice.nxv1f64(<vscale x 1 x double>, <vscale x 1 x double>, i32)
-declare <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, i32)
-declare <vscale x 4 x double> @llvm.experimental.vector.splice.nxv4f64(<vscale x 4 x double>, <vscale x 4 x double>, i32)
+declare <vscale x 1 x i1> @llvm.vector.splice.nxv1i1(<vscale x 1 x i1>, <vscale x 1 x i1>, i32)
+declare <vscale x 2 x i1> @llvm.vector.splice.nxv2i1(<vscale x 2 x i1>, <vscale x 2 x i1>, i32)
+declare <vscale x 4 x i1> @llvm.vector.splice.nxv4i1(<vscale x 4 x i1>, <vscale x 4 x i1>, i32)
+declare <vscale x 8 x i1> @llvm.vector.splice.nxv8i1(<vscale x 8 x i1>, <vscale x 8 x i1>, i32)
+declare <vscale x 16 x i1> @llvm.vector.splice.nxv16i1(<vscale x 16 x i1>, <vscale x 16 x i1>, i32)
+declare <vscale x 1 x i8> @llvm.vector.splice.nxv1i8(<vscale x 1 x i8>, <vscale x 1 x i8>, i32)
+declare <vscale x 2 x i8> @llvm.vector.splice.nxv2i8(<vscale x 2 x i8>, <vscale x 2 x i8>, i32)
+declare <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, i32)
+declare <vscale x 32 x i8> @llvm.vector.splice.nxv32i8(<vscale x 32 x i8>, <vscale x 32 x i8>, i32)
+declare <vscale x 1 x i16> @llvm.vector.splice.nxv1i16(<vscale x 1 x i16>, <vscale x 1 x i16>, i32)
+declare <vscale x 2 x i16> @llvm.vector.splice.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i16>, i32)
+declare <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16>, <vscale x 4 x i16>, i32)
+declare <vscale x 8 x i16> @llvm.vector.splice.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, i32)
+declare <vscale x 16 x i16> @llvm.vector.splice.nxv16i16(<vscale x 16 x i16>, <vscale x 16 x i16>, i32)
+declare <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, i32)
+declare <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32>, <vscale x 8 x i32>, i32)
+declare <vscale x 1 x i64> @llvm.vector.splice.nxv1i64(<vscale x 1 x i64>, <vscale x 1 x i64>, i32)
+declare <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, i32)
+declare <vscale x 4 x i64> @llvm.vector.splice.nxv4i64(<vscale x 4 x i64>, <vscale x 4 x i64>, i32)
+declare <vscale x 1 x half> @llvm.vector.splice.nxv1f16(<vscale x 1 x half>, <vscale x 1 x half>, i32)
+declare <vscale x 2 x half> @llvm.vector.splice.nxv2f16(<vscale x 2 x half>, <vscale x 2 x half>, i32)
+declare <vscale x 4 x half> @llvm.vector.splice.nxv4f16(<vscale x 4 x half>, <vscale x 4 x half>, i32)
+declare <vscale x 8 x half> @llvm.vector.splice.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, i32)
+declare <vscale x 16 x half> @llvm.vector.splice.nxv16f16(<vscale x 16 x half>, <vscale x 16 x half>, i32)
+declare <vscale x 1 x bfloat> @llvm.vector.splice.nxv1bf16(<vscale x 1 x bfloat>, <vscale x 1 x bfloat>, i32)
+declare <vscale x 2 x bfloat> @llvm.vector.splice.nxv2bf16(<vscale x 2 x bfloat>, <vscale x 2 x bfloat>, i32)
+declare <vscale x 4 x bfloat> @llvm.vector.splice.nxv4bf16(<vscale x 4 x bfloat>, <vscale x 4 x bfloat>, i32)
+declare <vscale x 8 x bfloat> @llvm.vector.splice.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i32)
+declare <vscale x 16 x bfloat> @llvm.vector.splice.nxv16bf16(<vscale x 16 x bfloat>, <vscale x 16 x bfloat>, i32)
+declare <vscale x 1 x float> @llvm.vector.splice.nxv1f32(<vscale x 1 x float>, <vscale x 1 x float>, i32)
+declare <vscale x 2 x float> @llvm.vector.splice.nxv2f32(<vscale x 2 x float>, <vscale x 2 x float>, i32)
+declare <vscale x 4 x float> @llvm.vector.splice.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, i32)
+declare <vscale x 8 x float> @llvm.vector.splice.nxv8f32(<vscale x 8 x float>, <vscale x 8 x float>, i32)
+declare <vscale x 16 x float> @llvm.vector.splice.nxv16f32(<vscale x 16 x float>, <vscale x 16 x float>, i32)
+declare <vscale x 1 x double> @llvm.vector.splice.nxv1f64(<vscale x 1 x double>, <vscale x 1 x double>, i32)
+declare <vscale x 2 x double> @llvm.vector.splice.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, i32)
+declare <vscale x 4 x double> @llvm.vector.splice.nxv4f64(<vscale x 4 x double>, <vscale x 4 x double>, i32)
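The hunks above track the rename of llvm.experimental.vector.splice to llvm.vector.splice; the same prefix drop shows up for llvm.vector.reverse further down in this diff. As a reminder of what the trailing-element form used throughout this test computes, a negative third operand counts from the end of the first operand, so each of the -1 calls above yields the last lane of the first vector followed by all but the last lane of the second (a sketch assuming the LangRef semantics; %a, %b, and %r are placeholder names, not from the test):

  ; splice(%a, %b, -1): last lane of %a, then the first
  ; (vscale * 4) - 1 lanes of %b
  %r = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 -1)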
define void @get_lane_mask() #0 {
; CHECK-LABEL: 'get_lane_mask'
diff --git a/llvm/test/Analysis/CostModel/RISCV/arith-fp.ll b/llvm/test/Analysis/CostModel/RISCV/arith-fp.ll
index 306277e46fa5..1dde88f366a3 100644
--- a/llvm/test/Analysis/CostModel/RISCV/arith-fp.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/arith-fp.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=riscv64 -mattr=+v,+f,+d,+zfh,+zvfh -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=1 < %s | FileCheck %s
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=riscv64 -mattr=+v,+f,+d,+zfh,+zvfh -riscv-v-vector-bits-min=128 < %s | FileCheck %s
; Check that we don't crash querying costs when vectors are not enabled.
; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=riscv64
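The dropped -riscv-v-fixed-length-vector-lmul-max=1 flag had capped fixed-length vectors at LMUL 1, so any type wider than one vector register was split and its cost grew with each split. With the cap gone, a wider fixed-length type can occupy a single register group, which is why the <16 x half> and <32 x half> fdiv costs below fall from 8 and 16 to 4. A rough lowering sketch at an assumed VLEN of 128 (the vsetvli operands are illustrative, not taken from the test):

  ; LMUL capped at 1: <16 x half> fdiv splits into two m1 ops
  ;   vsetivli zero, 8, e16, m1   +  2 x vfdiv.vv
  ; cap removed: one m2 op covers all 16 lanes
  ;   vsetivli zero, 16, e16, m2  +  1 x vfdiv.vv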
@@ -252,8 +252,8 @@ define i32 @fdiv() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F16 = fdiv <2 x half> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4F16 = fdiv <4 x half> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8F16 = fdiv <8 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16F16 = fdiv <16 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32F16 = fdiv <32 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16F16 = fdiv <16 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32F16 = fdiv <32 x half> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1F16 = fdiv <vscale x 1 x half> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV2F16 = fdiv <vscale x 2 x half> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV4F16 = fdiv <vscale x 4 x half> undef, undef
@@ -263,8 +263,8 @@ define i32 @fdiv() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V1F32 = fdiv <1 x float> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = fdiv <2 x float> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4F32 = fdiv <4 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8F32 = fdiv <8 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16F32 = fdiv <16 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fdiv <8 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = fdiv <16 x float> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1F32 = fdiv <vscale x 1 x float> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV2F32 = fdiv <vscale x 2 x float> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV4F32 = fdiv <vscale x 4 x float> undef, undef
@@ -272,8 +272,8 @@ define i32 @fdiv() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV16F32 = fdiv <vscale x 16 x float> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V1F64 = fdiv <1 x double> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = fdiv <2 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = fdiv <4 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8F64 = fdiv <8 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fdiv <4 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = fdiv <8 x double> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1F64 = fdiv <vscale x 1 x double> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV2F64 = fdiv <vscale x 2 x double> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV4F64 = fdiv <vscale x 4 x double> undef, undef
@@ -332,8 +332,8 @@ define i32 @frem() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2F16 = frem <2 x half> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4F16 = frem <4 x half> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V8F16 = frem <8 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16F16 = frem <16 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32F16 = frem <32 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %V16F16 = frem <16 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 127 for instruction: %V32F16 = frem <32 x half> undef, undef
; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV1F16 = frem <vscale x 1 x half> undef, undef
; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV2F16 = frem <vscale x 2 x half> undef, undef
; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV4F16 = frem <vscale x 4 x half> undef, undef
@@ -343,8 +343,8 @@ define i32 @frem() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = frem <1 x float> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2F32 = frem <2 x float> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4F32 = frem <4 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8F32 = frem <8 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16F32 = frem <16 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V8F32 = frem <8 x float> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %V16F32 = frem <16 x float> undef, undef
; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV1F32 = frem <vscale x 1 x float> undef, undef
; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV2F32 = frem <vscale x 2 x float> undef, undef
; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV4F32 = frem <vscale x 4 x float> undef, undef
@@ -352,8 +352,8 @@ define i32 @frem() {
; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV16F32 = frem <vscale x 16 x float> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = frem <1 x double> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2F64 = frem <2 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4F64 = frem <4 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8F64 = frem <8 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4F64 = frem <4 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V8F64 = frem <8 x double> undef, undef
; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV1F64 = frem <vscale x 1 x double> undef, undef
; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV2F64 = frem <vscale x 2 x double> undef, undef
; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV4F64 = frem <vscale x 4 x double> undef, undef
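RVV has no frem instruction, so fixed-length frem is scalarized, and the updated values above all fit the pattern cost = 4*N - 1 for N elements: 4*4 - 1 = 15, 4*8 - 1 = 31, 4*16 - 1 = 63, 4*32 - 1 = 127, consistent with the unchanged N=1 and N=2 lines (3 and 7). The pre-change wide-vector values (48, 96, 24) broke that pattern, presumably an artifact of the LMUL cap; this is an observation about the checked numbers, not a claim about the cost model's internal formula.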
@@ -492,8 +492,8 @@ define i32 @fcopysign() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F16 = call <2 x half> @llvm.copysign.v2f16(<2 x half> undef, <2 x half> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F16 = call <4 x half> @llvm.copysign.v4f16(<4 x half> undef, <4 x half> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F16 = call <8 x half> @llvm.copysign.v8f16(<8 x half> undef, <8 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16F16 = call <16 x half> @llvm.copysign.v16f16(<16 x half> undef, <16 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32F16 = call <32 x half> @llvm.copysign.v32f16(<32 x half> undef, <32 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16F16 = call <16 x half> @llvm.copysign.v16f16(<16 x half> undef, <16 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32F16 = call <32 x half> @llvm.copysign.v32f16(<32 x half> undef, <32 x half> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV1F16 = call <vscale x 1 x half> @llvm.copysign.nxv1f16(<vscale x 1 x half> undef, <vscale x 1 x half> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV2F16 = call <vscale x 2 x half> @llvm.copysign.nxv2f16(<vscale x 2 x half> undef, <vscale x 2 x half> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV4F16 = call <vscale x 4 x half> @llvm.copysign.nxv4f16(<vscale x 4 x half> undef, <vscale x 4 x half> undef)
@@ -503,8 +503,8 @@ define i32 @fcopysign() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F32 = call <1 x float> @llvm.copysign.v1f32(<1 x float> undef, <1 x float> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = call <2 x float> @llvm.copysign.v2f32(<2 x float> undef, <2 x float> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = call <4 x float> @llvm.copysign.v4f32(<4 x float> undef, <4 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = call <8 x float> @llvm.copysign.v8f32(<8 x float> undef, <8 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = call <16 x float> @llvm.copysign.v16f32(<16 x float> undef, <16 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = call <8 x float> @llvm.copysign.v8f32(<8 x float> undef, <8 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16F32 = call <16 x float> @llvm.copysign.v16f32(<16 x float> undef, <16 x float> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV1F32 = call <vscale x 1 x float> @llvm.copysign.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x float> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV2F32 = call <vscale x 2 x float> @llvm.copysign.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x float> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV4F32 = call <vscale x 4 x float> @llvm.copysign.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x float> undef)
@@ -512,8 +512,8 @@ define i32 @fcopysign() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV16F32 = call <vscale x 16 x float> @llvm.copysign.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x float> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.copysign.v1f64(<1 x double> undef, <1 x double> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = call <2 x double> @llvm.copysign.v2f64(<2 x double> undef, <2 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = call <4 x double> @llvm.copysign.v4f64(<4 x double> undef, <4 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = call <8 x double> @llvm.copysign.v8f64(<8 x double> undef, <8 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = call <4 x double> @llvm.copysign.v4f64(<4 x double> undef, <4 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F64 = call <8 x double> @llvm.copysign.v8f64(<8 x double> undef, <8 x double> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV1F64 = call <vscale x 1 x double> @llvm.copysign.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x double> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV2F64 = call <vscale x 2 x double> @llvm.copysign.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x double> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV4F64 = call <vscale x 4 x double> @llvm.copysign.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x double> undef)
@@ -572,8 +572,8 @@ define i32 @fma() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F16 = call <2 x half> @llvm.fma.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F16 = call <4 x half> @llvm.fma.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F16 = call <8 x half> @llvm.fma.v8f16(<8 x half> undef, <8 x half> undef, <8 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16F16 = call <16 x half> @llvm.fma.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32F16 = call <32 x half> @llvm.fma.v32f16(<32 x half> undef, <32 x half> undef, <32 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16F16 = call <16 x half> @llvm.fma.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32F16 = call <32 x half> @llvm.fma.v32f16(<32 x half> undef, <32 x half> undef, <32 x half> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV1F16 = call <vscale x 1 x half> @llvm.fma.nxv1f16(<vscale x 1 x half> undef, <vscale x 1 x half> undef, <vscale x 1 x half> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV2F16 = call <vscale x 2 x half> @llvm.fma.nxv2f16(<vscale x 2 x half> undef, <vscale x 2 x half> undef, <vscale x 2 x half> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV4F16 = call <vscale x 4 x half> @llvm.fma.nxv4f16(<vscale x 4 x half> undef, <vscale x 4 x half> undef, <vscale x 4 x half> undef)
@@ -583,8 +583,8 @@ define i32 @fma() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F32 = call <1 x float> @llvm.fma.v1f32(<1 x float> undef, <1 x float> undef, <1 x float> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = call <2 x float> @llvm.fma.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16F32 = call <16 x float> @llvm.fma.v16f32(<16 x float> undef, <16 x float> undef, <16 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16F32 = call <16 x float> @llvm.fma.v16f32(<16 x float> undef, <16 x float> undef, <16 x float> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV1F32 = call <vscale x 1 x float> @llvm.fma.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x float> undef, <vscale x 1 x float> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV2F32 = call <vscale x 2 x float> @llvm.fma.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x float> undef, <vscale x 2 x float> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV4F32 = call <vscale x 4 x float> @llvm.fma.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x float> undef, <vscale x 4 x float> undef)
@@ -592,8 +592,8 @@ define i32 @fma() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV16F32 = call <vscale x 16 x float> @llvm.fma.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x float> undef, <vscale x 16 x float> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.fma.v1f64(<1 x double> undef, <1 x double> undef, <1 x double> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = call <8 x double> @llvm.fma.v8f64(<8 x double> undef, <8 x double> undef, <8 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F64 = call <8 x double> @llvm.fma.v8f64(<8 x double> undef, <8 x double> undef, <8 x double> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV1F64 = call <vscale x 1 x double> @llvm.fma.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x double> undef, <vscale x 1 x double> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV2F64 = call <vscale x 2 x double> @llvm.fma.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x double> undef, <vscale x 2 x double> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV4F64 = call <vscale x 4 x double> @llvm.fma.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x double> undef, <vscale x 4 x double> undef)
@@ -651,15 +651,15 @@ define void @fmuladd() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %4 = call <2 x half> @llvm.fmuladd.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %5 = call <4 x half> @llvm.fmuladd.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %6 = call <8 x half> @llvm.fmuladd.v8f16(<8 x half> undef, <8 x half> undef, <8 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %7 = call <16 x half> @llvm.fmuladd.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %7 = call <16 x half> @llvm.fmuladd.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %8 = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %9 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %10 = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %11 = call <16 x float> @llvm.fmuladd.v16f32(<16 x float> undef, <16 x float> undef, <16 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %10 = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %11 = call <16 x float> @llvm.fmuladd.v16f32(<16 x float> undef, <16 x float> undef, <16 x float> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %12 = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %13 = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %14 = call <8 x double> @llvm.fmuladd.v8f64(<8 x double> undef, <8 x double> undef, <8 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %15 = call <16 x double> @llvm.fmuladd.v16f64(<16 x double> undef, <16 x double> undef, <16 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %13 = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %14 = call <8 x double> @llvm.fmuladd.v8f64(<8 x double> undef, <8 x double> undef, <8 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %15 = call <16 x double> @llvm.fmuladd.v16f64(<16 x double> undef, <16 x double> undef, <16 x double> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %16 = call <vscale x 1 x half> @llvm.fmuladd.nxv1f16(<vscale x 1 x half> undef, <vscale x 1 x half> undef, <vscale x 1 x half> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %17 = call <vscale x 2 x half> @llvm.fmuladd.nxv2f16(<vscale x 2 x half> undef, <vscale x 2 x half> undef, <vscale x 2 x half> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %18 = call <vscale x 4 x half> @llvm.fmuladd.nxv4f16(<vscale x 4 x half> undef, <vscale x 4 x half> undef, <vscale x 4 x half> undef)
diff --git a/llvm/test/Analysis/CostModel/RISCV/arith-int.ll b/llvm/test/Analysis/CostModel/RISCV/arith-int.ll
index 00f2cd7b63a4..b4afbb513166 100644
--- a/llvm/test/Analysis/CostModel/RISCV/arith-int.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/arith-int.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=riscv64 -mattr=+v,+f,+d,+zfh,+zvfh -riscv-v-fixed-length-vector-lmul-max=1 < %s | FileCheck %s
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=riscv64 -mcpu=sifive-x280 -riscv-v-fixed-length-vector-lmul-max=1 < %s | FileCheck %s --check-prefix=SIFIVE-X280
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=riscv64 -mattr=+v,+f,+d,+zfh,+zvfh < %s | FileCheck %s
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=riscv64 -mcpu=sifive-x280 < %s | FileCheck %s --check-prefix=SIFIVE-X280
; Check that we don't crash querying costs when vectors are not enabled.
; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=riscv64
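The same LMUL-cap removal applies to the integer tests: the fixed-length udiv/urem/sdiv/srem costs below settle at 2 regardless of width, since e.g. a <16 x i32> udiv can map to a single grouped-register divide instead of four split LMUL-1 ops (a sketch at an assumed VLEN of 128; register choices are illustrative):

  ;   vsetivli zero, 16, e32, m4
  ;   vdivu.vv v8, v8, v12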
@@ -709,8 +709,8 @@ define i32 @udiv() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I16 = udiv <2 x i16> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = udiv <4 x i16> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = udiv <8 x i16> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = udiv <16 x i16> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = udiv <32 x i16> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = udiv <16 x i16> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = udiv <32 x i16> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV1I16 = udiv <vscale x 1 x i16> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV2I16 = udiv <vscale x 2 x i16> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV4I16 = udiv <vscale x 4 x i16> undef, undef
@@ -720,8 +720,8 @@ define i32 @udiv() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I32 = udiv <1 x i32> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = udiv <2 x i32> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = udiv <4 x i32> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = udiv <8 x i32> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = udiv <16 x i32> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = udiv <8 x i32> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = udiv <16 x i32> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV1I32 = udiv <vscale x 1 x i32> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV2I32 = udiv <vscale x 2 x i32> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV4I32 = udiv <vscale x 4 x i32> undef, undef
@@ -729,8 +729,8 @@ define i32 @udiv() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV16I32 = udiv <vscale x 16 x i32> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = udiv <1 x i64> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = udiv <2 x i64> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = udiv <4 x i64> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = udiv <8 x i64> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = udiv <4 x i64> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = udiv <8 x i64> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV1I64 = udiv <vscale x 1 x i64> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV2I64 = udiv <vscale x 2 x i64> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV4I64 = udiv <vscale x 4 x i64> undef, undef
@@ -825,8 +825,8 @@ define i32 @urem() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I16 = urem <2 x i16> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = urem <4 x i16> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = urem <8 x i16> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = urem <16 x i16> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = urem <32 x i16> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = urem <16 x i16> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = urem <32 x i16> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV1I16 = urem <vscale x 1 x i16> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV2I16 = urem <vscale x 2 x i16> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV4I16 = urem <vscale x 4 x i16> undef, undef
@@ -836,8 +836,8 @@ define i32 @urem() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I32 = urem <1 x i32> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = urem <2 x i32> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = urem <4 x i32> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = urem <8 x i32> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = urem <16 x i32> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = urem <8 x i32> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = urem <16 x i32> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV1I32 = urem <vscale x 1 x i32> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV2I32 = urem <vscale x 2 x i32> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV4I32 = urem <vscale x 4 x i32> undef, undef
@@ -845,8 +845,8 @@ define i32 @urem() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV16I32 = urem <vscale x 16 x i32> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = urem <1 x i64> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = urem <2 x i64> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = urem <4 x i64> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = urem <8 x i64> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = urem <4 x i64> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = urem <8 x i64> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV1I64 = urem <vscale x 1 x i64> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV2I64 = urem <vscale x 2 x i64> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV4I64 = urem <vscale x 4 x i64> undef, undef
@@ -941,8 +941,8 @@ define i32 @sdiv() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I16 = sdiv <2 x i16> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = sdiv <4 x i16> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = sdiv <8 x i16> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = sdiv <16 x i16> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = sdiv <32 x i16> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = sdiv <16 x i16> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = sdiv <32 x i16> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV1I16 = sdiv <vscale x 1 x i16> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV2I16 = sdiv <vscale x 2 x i16> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV4I16 = sdiv <vscale x 4 x i16> undef, undef
@@ -952,8 +952,8 @@ define i32 @sdiv() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I32 = sdiv <1 x i32> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = sdiv <2 x i32> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = sdiv <4 x i32> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = sdiv <8 x i32> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = sdiv <16 x i32> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = sdiv <8 x i32> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = sdiv <16 x i32> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV1I32 = sdiv <vscale x 1 x i32> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV2I32 = sdiv <vscale x 2 x i32> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV4I32 = sdiv <vscale x 4 x i32> undef, undef
@@ -961,8 +961,8 @@ define i32 @sdiv() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV16I32 = sdiv <vscale x 16 x i32> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = sdiv <1 x i64> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = sdiv <2 x i64> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = sdiv <4 x i64> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = sdiv <8 x i64> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = sdiv <4 x i64> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = sdiv <8 x i64> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV1I64 = sdiv <vscale x 1 x i64> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV2I64 = sdiv <vscale x 2 x i64> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV4I64 = sdiv <vscale x 4 x i64> undef, undef
@@ -1057,8 +1057,8 @@ define i32 @srem() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I16 = srem <2 x i16> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = srem <4 x i16> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = srem <8 x i16> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = srem <16 x i16> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = srem <32 x i16> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = srem <16 x i16> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = srem <32 x i16> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV1I16 = srem <vscale x 1 x i16> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV2I16 = srem <vscale x 2 x i16> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV4I16 = srem <vscale x 4 x i16> undef, undef
@@ -1068,8 +1068,8 @@ define i32 @srem() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I32 = srem <1 x i32> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = srem <2 x i32> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = srem <4 x i32> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = srem <8 x i32> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = srem <16 x i32> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = srem <8 x i32> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = srem <16 x i32> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV1I32 = srem <vscale x 1 x i32> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV2I32 = srem <vscale x 2 x i32> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV4I32 = srem <vscale x 4 x i32> undef, undef
@@ -1077,8 +1077,8 @@ define i32 @srem() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV16I32 = srem <vscale x 16 x i32> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = srem <1 x i64> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = srem <2 x i64> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = srem <4 x i64> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = srem <8 x i64> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = srem <4 x i64> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = srem <8 x i64> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV1I64 = srem <vscale x 1 x i64> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV2I64 = srem <vscale x 2 x i64> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV4I64 = srem <vscale x 4 x i64> undef, undef
diff --git a/llvm/test/Analysis/CostModel/RISCV/rvv-shuffle.ll b/llvm/test/Analysis/CostModel/RISCV/rvv-shuffle.ll
index 7cc7cff0e6e8..e068ab638d3a 100644
--- a/llvm/test/Analysis/CostModel/RISCV/rvv-shuffle.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/rvv-shuffle.ll
@@ -78,148 +78,148 @@ declare <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x
define void @vector_reverse() {
; CHECK-LABEL: 'vector_reverse'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %reverse_nxv16i8 = call <vscale x 16 x i8> @llvm.experimental.vector.reverse.nxv16i8(<vscale x 16 x i8> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %reverse_nxv32i8 = call <vscale x 32 x i8> @llvm.experimental.vector.reverse.nxv32i8(<vscale x 32 x i8> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv2i16 = call <vscale x 2 x i16> @llvm.experimental.vector.reverse.nxv2i16(<vscale x 2 x i16> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv4i16 = call <vscale x 4 x i16> @llvm.experimental.vector.reverse.nxv4i16(<vscale x 4 x i16> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %reverse_nxv8i16 = call <vscale x 8 x i16> @llvm.experimental.vector.reverse.nxv8i16(<vscale x 8 x i16> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %reverse_nxv16i16 = call <vscale x 16 x i16> @llvm.experimental.vector.reverse.nxv16i16(<vscale x 16 x i16> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %reverse_nxv4i32 = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %reverse_nxv8i32 = call <vscale x 8 x i32> @llvm.experimental.vector.reverse.nxv8i32(<vscale x 8 x i32> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %reverse_nxv2i64 = call <vscale x 2 x i64> @llvm.experimental.vector.reverse.nxv2i64(<vscale x 2 x i64> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %reverse_nxv4i64 = call <vscale x 4 x i64> @llvm.experimental.vector.reverse.nxv4i64(<vscale x 4 x i64> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 83 for instruction: %reverse_nxv8i64 = call <vscale x 8 x i64> @llvm.experimental.vector.reverse.nxv8i64(<vscale x 8 x i64> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 166 for instruction: %reverse_nxv16i64 = call <vscale x 16 x i64> @llvm.experimental.vector.reverse.nxv16i64(<vscale x 16 x i64> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 332 for instruction: %reverse_nxv32i64 = call <vscale x 32 x i64> @llvm.experimental.vector.reverse.nxv32i64(<vscale x 32 x i64> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %reverse_nxv16i1 = call <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv8i1 = call <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv4i1 = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv2i1 = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %reverse_nxv16i8 = call <vscale x 16 x i8> @llvm.vector.reverse.nxv16i8(<vscale x 16 x i8> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %reverse_nxv32i8 = call <vscale x 32 x i8> @llvm.vector.reverse.nxv32i8(<vscale x 32 x i8> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv2i16 = call <vscale x 2 x i16> @llvm.vector.reverse.nxv2i16(<vscale x 2 x i16> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv4i16 = call <vscale x 4 x i16> @llvm.vector.reverse.nxv4i16(<vscale x 4 x i16> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %reverse_nxv8i16 = call <vscale x 8 x i16> @llvm.vector.reverse.nxv8i16(<vscale x 8 x i16> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %reverse_nxv16i16 = call <vscale x 16 x i16> @llvm.vector.reverse.nxv16i16(<vscale x 16 x i16> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %reverse_nxv4i32 = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %reverse_nxv8i32 = call <vscale x 8 x i32> @llvm.vector.reverse.nxv8i32(<vscale x 8 x i32> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %reverse_nxv2i64 = call <vscale x 2 x i64> @llvm.vector.reverse.nxv2i64(<vscale x 2 x i64> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %reverse_nxv4i64 = call <vscale x 4 x i64> @llvm.vector.reverse.nxv4i64(<vscale x 4 x i64> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 83 for instruction: %reverse_nxv8i64 = call <vscale x 8 x i64> @llvm.vector.reverse.nxv8i64(<vscale x 8 x i64> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 166 for instruction: %reverse_nxv16i64 = call <vscale x 16 x i64> @llvm.vector.reverse.nxv16i64(<vscale x 16 x i64> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 332 for instruction: %reverse_nxv32i64 = call <vscale x 32 x i64> @llvm.vector.reverse.nxv32i64(<vscale x 32 x i64> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %reverse_nxv16i1 = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv8i1 = call <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv4i1 = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv2i1 = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; SIZE-LABEL: 'vector_reverse'
-; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv16i8 = call <vscale x 16 x i8> @llvm.experimental.vector.reverse.nxv16i8(<vscale x 16 x i8> undef)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv32i8 = call <vscale x 32 x i8> @llvm.experimental.vector.reverse.nxv32i8(<vscale x 32 x i8> undef)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv2i16 = call <vscale x 2 x i16> @llvm.experimental.vector.reverse.nxv2i16(<vscale x 2 x i16> undef)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv4i16 = call <vscale x 4 x i16> @llvm.experimental.vector.reverse.nxv4i16(<vscale x 4 x i16> undef)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv8i16 = call <vscale x 8 x i16> @llvm.experimental.vector.reverse.nxv8i16(<vscale x 8 x i16> undef)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv16i16 = call <vscale x 16 x i16> @llvm.experimental.vector.reverse.nxv16i16(<vscale x 16 x i16> undef)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv4i32 = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> undef)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv8i32 = call <vscale x 8 x i32> @llvm.experimental.vector.reverse.nxv8i32(<vscale x 8 x i32> undef)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv2i64 = call <vscale x 2 x i64> @llvm.experimental.vector.reverse.nxv2i64(<vscale x 2 x i64> undef)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv4i64 = call <vscale x 4 x i64> @llvm.experimental.vector.reverse.nxv4i64(<vscale x 4 x i64> undef)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv8i64 = call <vscale x 8 x i64> @llvm.experimental.vector.reverse.nxv8i64(<vscale x 8 x i64> undef)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %reverse_nxv16i64 = call <vscale x 16 x i64> @llvm.experimental.vector.reverse.nxv16i64(<vscale x 16 x i64> undef)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %reverse_nxv32i64 = call <vscale x 32 x i64> @llvm.experimental.vector.reverse.nxv32i64(<vscale x 32 x i64> undef)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv16i1 = call <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1> undef)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv8i1 = call <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1> undef)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv4i1 = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> undef)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv2i1 = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv16i8 = call <vscale x 16 x i8> @llvm.vector.reverse.nxv16i8(<vscale x 16 x i8> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv32i8 = call <vscale x 32 x i8> @llvm.vector.reverse.nxv32i8(<vscale x 32 x i8> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv2i16 = call <vscale x 2 x i16> @llvm.vector.reverse.nxv2i16(<vscale x 2 x i16> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv4i16 = call <vscale x 4 x i16> @llvm.vector.reverse.nxv4i16(<vscale x 4 x i16> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv8i16 = call <vscale x 8 x i16> @llvm.vector.reverse.nxv8i16(<vscale x 8 x i16> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv16i16 = call <vscale x 16 x i16> @llvm.vector.reverse.nxv16i16(<vscale x 16 x i16> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv4i32 = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv8i32 = call <vscale x 8 x i32> @llvm.vector.reverse.nxv8i32(<vscale x 8 x i32> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv2i64 = call <vscale x 2 x i64> @llvm.vector.reverse.nxv2i64(<vscale x 2 x i64> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv4i64 = call <vscale x 4 x i64> @llvm.vector.reverse.nxv4i64(<vscale x 4 x i64> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv8i64 = call <vscale x 8 x i64> @llvm.vector.reverse.nxv8i64(<vscale x 8 x i64> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %reverse_nxv16i64 = call <vscale x 16 x i64> @llvm.vector.reverse.nxv16i64(<vscale x 16 x i64> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %reverse_nxv32i64 = call <vscale x 32 x i64> @llvm.vector.reverse.nxv32i64(<vscale x 32 x i64> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv16i1 = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv8i1 = call <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv4i1 = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv2i1 = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> undef)
; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
- %reverse_nxv16i8 = call <vscale x 16 x i8> @llvm.experimental.vector.reverse.nxv16i8(<vscale x 16 x i8> undef)
- %reverse_nxv32i8 = call <vscale x 32 x i8> @llvm.experimental.vector.reverse.nxv32i8(<vscale x 32 x i8> undef)
- %reverse_nxv2i16 = call <vscale x 2 x i16> @llvm.experimental.vector.reverse.nxv2i16(<vscale x 2 x i16> undef)
- %reverse_nxv4i16 = call <vscale x 4 x i16> @llvm.experimental.vector.reverse.nxv4i16(<vscale x 4 x i16> undef)
- %reverse_nxv8i16 = call <vscale x 8 x i16> @llvm.experimental.vector.reverse.nxv8i16(<vscale x 8 x i16> undef)
- %reverse_nxv16i16 = call <vscale x 16 x i16> @llvm.experimental.vector.reverse.nxv16i16(<vscale x 16 x i16> undef)
- %reverse_nxv4i32 = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> undef)
- %reverse_nxv8i32 = call <vscale x 8 x i32> @llvm.experimental.vector.reverse.nxv8i32(<vscale x 8 x i32> undef)
- %reverse_nxv2i64 = call <vscale x 2 x i64> @llvm.experimental.vector.reverse.nxv2i64(<vscale x 2 x i64> undef)
- %reverse_nxv4i64 = call <vscale x 4 x i64> @llvm.experimental.vector.reverse.nxv4i64(<vscale x 4 x i64> undef)
- %reverse_nxv8i64 = call <vscale x 8 x i64> @llvm.experimental.vector.reverse.nxv8i64(<vscale x 8 x i64> undef)
- %reverse_nxv16i64 = call <vscale x 16 x i64> @llvm.experimental.vector.reverse.nxv16i64(<vscale x 16 x i64> undef)
- %reverse_nxv32i64 = call <vscale x 32 x i64> @llvm.experimental.vector.reverse.nxv32i64(<vscale x 32 x i64> undef)
- %reverse_nxv16i1 = call <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1> undef)
- %reverse_nxv8i1 = call <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1> undef)
- %reverse_nxv4i1 = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> undef)
- %reverse_nxv2i1 = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> undef)
+ %reverse_nxv16i8 = call <vscale x 16 x i8> @llvm.vector.reverse.nxv16i8(<vscale x 16 x i8> undef)
+ %reverse_nxv32i8 = call <vscale x 32 x i8> @llvm.vector.reverse.nxv32i8(<vscale x 32 x i8> undef)
+ %reverse_nxv2i16 = call <vscale x 2 x i16> @llvm.vector.reverse.nxv2i16(<vscale x 2 x i16> undef)
+ %reverse_nxv4i16 = call <vscale x 4 x i16> @llvm.vector.reverse.nxv4i16(<vscale x 4 x i16> undef)
+ %reverse_nxv8i16 = call <vscale x 8 x i16> @llvm.vector.reverse.nxv8i16(<vscale x 8 x i16> undef)
+ %reverse_nxv16i16 = call <vscale x 16 x i16> @llvm.vector.reverse.nxv16i16(<vscale x 16 x i16> undef)
+ %reverse_nxv4i32 = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> undef)
+ %reverse_nxv8i32 = call <vscale x 8 x i32> @llvm.vector.reverse.nxv8i32(<vscale x 8 x i32> undef)
+ %reverse_nxv2i64 = call <vscale x 2 x i64> @llvm.vector.reverse.nxv2i64(<vscale x 2 x i64> undef)
+ %reverse_nxv4i64 = call <vscale x 4 x i64> @llvm.vector.reverse.nxv4i64(<vscale x 4 x i64> undef)
+ %reverse_nxv8i64 = call <vscale x 8 x i64> @llvm.vector.reverse.nxv8i64(<vscale x 8 x i64> undef)
+ %reverse_nxv16i64 = call <vscale x 16 x i64> @llvm.vector.reverse.nxv16i64(<vscale x 16 x i64> undef)
+ %reverse_nxv32i64 = call <vscale x 32 x i64> @llvm.vector.reverse.nxv32i64(<vscale x 32 x i64> undef)
+ %reverse_nxv16i1 = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> undef)
+ %reverse_nxv8i1 = call <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1> undef)
+ %reverse_nxv4i1 = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> undef)
+ %reverse_nxv2i1 = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> undef)
ret void
}
-declare <vscale x 16 x i8> @llvm.experimental.vector.reverse.nxv16i8(<vscale x 16 x i8>)
-declare <vscale x 32 x i8> @llvm.experimental.vector.reverse.nxv32i8(<vscale x 32 x i8>)
-declare <vscale x 2 x i16> @llvm.experimental.vector.reverse.nxv2i16(<vscale x 2 x i16>)
-declare <vscale x 4 x i16> @llvm.experimental.vector.reverse.nxv4i16(<vscale x 4 x i16>)
-declare <vscale x 8 x i16> @llvm.experimental.vector.reverse.nxv8i16(<vscale x 8 x i16>)
-declare <vscale x 16 x i16> @llvm.experimental.vector.reverse.nxv16i16(<vscale x 16 x i16>)
-declare <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32>)
-declare <vscale x 8 x i32> @llvm.experimental.vector.reverse.nxv8i32(<vscale x 8 x i32>)
-declare <vscale x 2 x i64> @llvm.experimental.vector.reverse.nxv2i64(<vscale x 2 x i64>)
-declare <vscale x 4 x i64> @llvm.experimental.vector.reverse.nxv4i64(<vscale x 4 x i64>)
-declare <vscale x 8 x i64> @llvm.experimental.vector.reverse.nxv8i64(<vscale x 8 x i64>)
-declare <vscale x 16 x i64> @llvm.experimental.vector.reverse.nxv16i64(<vscale x 16 x i64>)
-declare <vscale x 32 x i64> @llvm.experimental.vector.reverse.nxv32i64(<vscale x 32 x i64>)
-declare <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1>)
-declare <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1>)
-declare <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1>)
-declare <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1>)
+declare <vscale x 16 x i8> @llvm.vector.reverse.nxv16i8(<vscale x 16 x i8>)
+declare <vscale x 32 x i8> @llvm.vector.reverse.nxv32i8(<vscale x 32 x i8>)
+declare <vscale x 2 x i16> @llvm.vector.reverse.nxv2i16(<vscale x 2 x i16>)
+declare <vscale x 4 x i16> @llvm.vector.reverse.nxv4i16(<vscale x 4 x i16>)
+declare <vscale x 8 x i16> @llvm.vector.reverse.nxv8i16(<vscale x 8 x i16>)
+declare <vscale x 16 x i16> @llvm.vector.reverse.nxv16i16(<vscale x 16 x i16>)
+declare <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32>)
+declare <vscale x 8 x i32> @llvm.vector.reverse.nxv8i32(<vscale x 8 x i32>)
+declare <vscale x 2 x i64> @llvm.vector.reverse.nxv2i64(<vscale x 2 x i64>)
+declare <vscale x 4 x i64> @llvm.vector.reverse.nxv4i64(<vscale x 4 x i64>)
+declare <vscale x 8 x i64> @llvm.vector.reverse.nxv8i64(<vscale x 8 x i64>)
+declare <vscale x 16 x i64> @llvm.vector.reverse.nxv16i64(<vscale x 16 x i64>)
+declare <vscale x 32 x i64> @llvm.vector.reverse.nxv32i64(<vscale x 32 x i64>)
+declare <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1>)
+declare <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1>)
+declare <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1>)
+declare <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1>)
define void @vector_splice() {
; CHECK-LABEL: 'vector_splice'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice_nxv16i8 = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i8> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %splice_nxv32i8 = call <vscale x 32 x i8> @llvm.experimental.vector.splice.nxv32i8(<vscale x 32 x i8> zeroinitializer, <vscale x 32 x i8> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv2i16 = call <vscale x 2 x i16> @llvm.experimental.vector.splice.nxv2i16(<vscale x 2 x i16> zeroinitializer, <vscale x 2 x i16> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv4i16 = call <vscale x 4 x i16> @llvm.experimental.vector.splice.nxv4i16(<vscale x 4 x i16> zeroinitializer, <vscale x 4 x i16> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice_nxv8i16 = call <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i16> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %splice_nxv16i16 = call <vscale x 16 x i16> @llvm.experimental.vector.splice.nxv16i16(<vscale x 16 x i16> zeroinitializer, <vscale x 16 x i16> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice_nxv4i32 = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %splice_nxv8i32 = call <vscale x 8 x i32> @llvm.experimental.vector.splice.nxv8i32(<vscale x 8 x i32> zeroinitializer, <vscale x 8 x i32> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice_nxv2i64 = call <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %splice_nxv4i64 = call <vscale x 4 x i64> @llvm.experimental.vector.splice.nxv4i64(<vscale x 4 x i64> zeroinitializer, <vscale x 4 x i64> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice_nxv16i1 = call <vscale x 16 x i1> @llvm.experimental.vector.splice.nxv16i1(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i1> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv8i1 = call <vscale x 8 x i1> @llvm.experimental.vector.splice.nxv8i1(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x i1> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv4i1 = call <vscale x 4 x i1> @llvm.experimental.vector.splice.nxv4i1(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x i1> zeroinitializer, i32 1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv2i1 = call <vscale x 2 x i1> @llvm.experimental.vector.splice.nxv2i1(<vscale x 2 x i1> zeroinitializer, <vscale x 2 x i1> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice_nxv16i8 = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i8> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %splice_nxv32i8 = call <vscale x 32 x i8> @llvm.vector.splice.nxv32i8(<vscale x 32 x i8> zeroinitializer, <vscale x 32 x i8> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv2i16 = call <vscale x 2 x i16> @llvm.vector.splice.nxv2i16(<vscale x 2 x i16> zeroinitializer, <vscale x 2 x i16> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv4i16 = call <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16> zeroinitializer, <vscale x 4 x i16> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice_nxv8i16 = call <vscale x 8 x i16> @llvm.vector.splice.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i16> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %splice_nxv16i16 = call <vscale x 16 x i16> @llvm.vector.splice.nxv16i16(<vscale x 16 x i16> zeroinitializer, <vscale x 16 x i16> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice_nxv4i32 = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %splice_nxv8i32 = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> zeroinitializer, <vscale x 8 x i32> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice_nxv2i64 = call <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %splice_nxv4i64 = call <vscale x 4 x i64> @llvm.vector.splice.nxv4i64(<vscale x 4 x i64> zeroinitializer, <vscale x 4 x i64> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice_nxv16i1 = call <vscale x 16 x i1> @llvm.vector.splice.nxv16i1(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i1> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv8i1 = call <vscale x 8 x i1> @llvm.vector.splice.nxv8i1(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x i1> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv4i1 = call <vscale x 4 x i1> @llvm.vector.splice.nxv4i1(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x i1> zeroinitializer, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv2i1 = call <vscale x 2 x i1> @llvm.vector.splice.nxv2i1(<vscale x 2 x i1> zeroinitializer, <vscale x 2 x i1> zeroinitializer, i32 1)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; SIZE-LABEL: 'vector_splice'
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv16i8 = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i8> zeroinitializer, i32 1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv32i8 = call <vscale x 32 x i8> @llvm.experimental.vector.splice.nxv32i8(<vscale x 32 x i8> zeroinitializer, <vscale x 32 x i8> zeroinitializer, i32 1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv2i16 = call <vscale x 2 x i16> @llvm.experimental.vector.splice.nxv2i16(<vscale x 2 x i16> zeroinitializer, <vscale x 2 x i16> zeroinitializer, i32 1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv4i16 = call <vscale x 4 x i16> @llvm.experimental.vector.splice.nxv4i16(<vscale x 4 x i16> zeroinitializer, <vscale x 4 x i16> zeroinitializer, i32 1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv8i16 = call <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i16> zeroinitializer, i32 1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv16i16 = call <vscale x 16 x i16> @llvm.experimental.vector.splice.nxv16i16(<vscale x 16 x i16> zeroinitializer, <vscale x 16 x i16> zeroinitializer, i32 1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv4i32 = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> zeroinitializer, i32 1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv8i32 = call <vscale x 8 x i32> @llvm.experimental.vector.splice.nxv8i32(<vscale x 8 x i32> zeroinitializer, <vscale x 8 x i32> zeroinitializer, i32 1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv2i64 = call <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> zeroinitializer, i32 1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv4i64 = call <vscale x 4 x i64> @llvm.experimental.vector.splice.nxv4i64(<vscale x 4 x i64> zeroinitializer, <vscale x 4 x i64> zeroinitializer, i32 1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv16i1 = call <vscale x 16 x i1> @llvm.experimental.vector.splice.nxv16i1(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i1> zeroinitializer, i32 1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv8i1 = call <vscale x 8 x i1> @llvm.experimental.vector.splice.nxv8i1(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x i1> zeroinitializer, i32 1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv4i1 = call <vscale x 4 x i1> @llvm.experimental.vector.splice.nxv4i1(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x i1> zeroinitializer, i32 1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv2i1 = call <vscale x 2 x i1> @llvm.experimental.vector.splice.nxv2i1(<vscale x 2 x i1> zeroinitializer, <vscale x 2 x i1> zeroinitializer, i32 1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv16i8 = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i8> zeroinitializer, i32 1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv32i8 = call <vscale x 32 x i8> @llvm.vector.splice.nxv32i8(<vscale x 32 x i8> zeroinitializer, <vscale x 32 x i8> zeroinitializer, i32 1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv2i16 = call <vscale x 2 x i16> @llvm.vector.splice.nxv2i16(<vscale x 2 x i16> zeroinitializer, <vscale x 2 x i16> zeroinitializer, i32 1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv4i16 = call <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16> zeroinitializer, <vscale x 4 x i16> zeroinitializer, i32 1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv8i16 = call <vscale x 8 x i16> @llvm.vector.splice.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i16> zeroinitializer, i32 1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv16i16 = call <vscale x 16 x i16> @llvm.vector.splice.nxv16i16(<vscale x 16 x i16> zeroinitializer, <vscale x 16 x i16> zeroinitializer, i32 1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv4i32 = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> zeroinitializer, i32 1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv8i32 = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> zeroinitializer, <vscale x 8 x i32> zeroinitializer, i32 1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv2i64 = call <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> zeroinitializer, i32 1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv4i64 = call <vscale x 4 x i64> @llvm.vector.splice.nxv4i64(<vscale x 4 x i64> zeroinitializer, <vscale x 4 x i64> zeroinitializer, i32 1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv16i1 = call <vscale x 16 x i1> @llvm.vector.splice.nxv16i1(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i1> zeroinitializer, i32 1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv8i1 = call <vscale x 8 x i1> @llvm.vector.splice.nxv8i1(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x i1> zeroinitializer, i32 1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv4i1 = call <vscale x 4 x i1> @llvm.vector.splice.nxv4i1(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x i1> zeroinitializer, i32 1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice_nxv2i1 = call <vscale x 2 x i1> @llvm.vector.splice.nxv2i1(<vscale x 2 x i1> zeroinitializer, <vscale x 2 x i1> zeroinitializer, i32 1)
; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
- %splice_nxv16i8 = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i8> zeroinitializer, i32 1)
- %splice_nxv32i8 = call <vscale x 32 x i8> @llvm.experimental.vector.splice.nxv32i8(<vscale x 32 x i8> zeroinitializer, <vscale x 32 x i8> zeroinitializer, i32 1)
- %splice_nxv2i16 = call <vscale x 2 x i16> @llvm.experimental.vector.splice.nxv2i16(<vscale x 2 x i16> zeroinitializer, <vscale x 2 x i16> zeroinitializer, i32 1)
- %splice_nxv4i16 = call <vscale x 4 x i16> @llvm.experimental.vector.splice.nxv4i16(<vscale x 4 x i16> zeroinitializer, <vscale x 4 x i16> zeroinitializer, i32 1)
- %splice_nxv8i16 = call <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i16> zeroinitializer, i32 1)
- %splice_nxv16i16 = call <vscale x 16 x i16> @llvm.experimental.vector.splice.nxv16i16(<vscale x 16 x i16> zeroinitializer, <vscale x 16 x i16> zeroinitializer, i32 1)
- %splice_nxv4i32 = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> zeroinitializer, i32 1)
- %splice_nxv8i32 = call <vscale x 8 x i32> @llvm.experimental.vector.splice.nxv8i32(<vscale x 8 x i32> zeroinitializer, <vscale x 8 x i32> zeroinitializer, i32 1)
- %splice_nxv2i64 = call <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> zeroinitializer, i32 1)
- %splice_nxv4i64 = call <vscale x 4 x i64> @llvm.experimental.vector.splice.nxv4i64(<vscale x 4 x i64> zeroinitializer, <vscale x 4 x i64> zeroinitializer, i32 1)
- %splice_nxv16i1 = call <vscale x 16 x i1> @llvm.experimental.vector.splice.nxv16i1(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i1> zeroinitializer, i32 1)
- %splice_nxv8i1 = call <vscale x 8 x i1> @llvm.experimental.vector.splice.nxv8i1(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x i1> zeroinitializer, i32 1)
- %splice_nxv4i1 = call <vscale x 4 x i1> @llvm.experimental.vector.splice.nxv4i1(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x i1> zeroinitializer, i32 1)
- %splice_nxv2i1 = call <vscale x 2 x i1> @llvm.experimental.vector.splice.nxv2i1(<vscale x 2 x i1> zeroinitializer, <vscale x 2 x i1> zeroinitializer, i32 1)
+ %splice_nxv16i8 = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i8> zeroinitializer, i32 1)
+ %splice_nxv32i8 = call <vscale x 32 x i8> @llvm.vector.splice.nxv32i8(<vscale x 32 x i8> zeroinitializer, <vscale x 32 x i8> zeroinitializer, i32 1)
+ %splice_nxv2i16 = call <vscale x 2 x i16> @llvm.vector.splice.nxv2i16(<vscale x 2 x i16> zeroinitializer, <vscale x 2 x i16> zeroinitializer, i32 1)
+ %splice_nxv4i16 = call <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16> zeroinitializer, <vscale x 4 x i16> zeroinitializer, i32 1)
+ %splice_nxv8i16 = call <vscale x 8 x i16> @llvm.vector.splice.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i16> zeroinitializer, i32 1)
+ %splice_nxv16i16 = call <vscale x 16 x i16> @llvm.vector.splice.nxv16i16(<vscale x 16 x i16> zeroinitializer, <vscale x 16 x i16> zeroinitializer, i32 1)
+ %splice_nxv4i32 = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> zeroinitializer, i32 1)
+ %splice_nxv8i32 = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> zeroinitializer, <vscale x 8 x i32> zeroinitializer, i32 1)
+ %splice_nxv2i64 = call <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> zeroinitializer, i32 1)
+ %splice_nxv4i64 = call <vscale x 4 x i64> @llvm.vector.splice.nxv4i64(<vscale x 4 x i64> zeroinitializer, <vscale x 4 x i64> zeroinitializer, i32 1)
+ %splice_nxv16i1 = call <vscale x 16 x i1> @llvm.vector.splice.nxv16i1(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i1> zeroinitializer, i32 1)
+ %splice_nxv8i1 = call <vscale x 8 x i1> @llvm.vector.splice.nxv8i1(<vscale x 8 x i1> zeroinitializer, <vscale x 8 x i1> zeroinitializer, i32 1)
+ %splice_nxv4i1 = call <vscale x 4 x i1> @llvm.vector.splice.nxv4i1(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x i1> zeroinitializer, i32 1)
+ %splice_nxv2i1 = call <vscale x 2 x i1> @llvm.vector.splice.nxv2i1(<vscale x 2 x i1> zeroinitializer, <vscale x 2 x i1> zeroinitializer, i32 1)
ret void
}
-declare <vscale x 2 x i1> @llvm.experimental.vector.splice.nxv2i1(<vscale x 2 x i1>, <vscale x 2 x i1>, i32)
-declare <vscale x 4 x i1> @llvm.experimental.vector.splice.nxv4i1(<vscale x 4 x i1>, <vscale x 4 x i1>, i32)
-declare <vscale x 8 x i1> @llvm.experimental.vector.splice.nxv8i1(<vscale x 8 x i1>, <vscale x 8 x i1>, i32)
-declare <vscale x 16 x i1> @llvm.experimental.vector.splice.nxv16i1(<vscale x 16 x i1>, <vscale x 16 x i1>, i32)
-declare <vscale x 2 x i8> @llvm.experimental.vector.splice.nxv2i8(<vscale x 2 x i8>, <vscale x 2 x i8>, i32)
-declare <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, i32)
-declare <vscale x 32 x i8> @llvm.experimental.vector.splice.nxv32i8(<vscale x 32 x i8>, <vscale x 32 x i8>, i32)
-declare <vscale x 2 x i16> @llvm.experimental.vector.splice.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i16>, i32)
-declare <vscale x 4 x i16> @llvm.experimental.vector.splice.nxv4i16(<vscale x 4 x i16>, <vscale x 4 x i16>, i32)
-declare <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, i32)
-declare <vscale x 16 x i16> @llvm.experimental.vector.splice.nxv16i16(<vscale x 16 x i16>, <vscale x 16 x i16>, i32)
-declare <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, i32)
-declare <vscale x 8 x i32> @llvm.experimental.vector.splice.nxv8i32(<vscale x 8 x i32>, <vscale x 8 x i32>, i32)
-declare <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, i32)
-declare <vscale x 4 x i64> @llvm.experimental.vector.splice.nxv4i64(<vscale x 4 x i64>, <vscale x 4 x i64>, i32)
+declare <vscale x 2 x i1> @llvm.vector.splice.nxv2i1(<vscale x 2 x i1>, <vscale x 2 x i1>, i32)
+declare <vscale x 4 x i1> @llvm.vector.splice.nxv4i1(<vscale x 4 x i1>, <vscale x 4 x i1>, i32)
+declare <vscale x 8 x i1> @llvm.vector.splice.nxv8i1(<vscale x 8 x i1>, <vscale x 8 x i1>, i32)
+declare <vscale x 16 x i1> @llvm.vector.splice.nxv16i1(<vscale x 16 x i1>, <vscale x 16 x i1>, i32)
+declare <vscale x 2 x i8> @llvm.vector.splice.nxv2i8(<vscale x 2 x i8>, <vscale x 2 x i8>, i32)
+declare <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, i32)
+declare <vscale x 32 x i8> @llvm.vector.splice.nxv32i8(<vscale x 32 x i8>, <vscale x 32 x i8>, i32)
+declare <vscale x 2 x i16> @llvm.vector.splice.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i16>, i32)
+declare <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16>, <vscale x 4 x i16>, i32)
+declare <vscale x 8 x i16> @llvm.vector.splice.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, i32)
+declare <vscale x 16 x i16> @llvm.vector.splice.nxv16i16(<vscale x 16 x i16>, <vscale x 16 x i16>, i32)
+declare <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, i32)
+declare <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32>, <vscale x 8 x i32>, i32)
+declare <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, i32)
+declare <vscale x 4 x i64> @llvm.vector.splice.nxv4i64(<vscale x 4 x i64>, <vscale x 4 x i64>, i32)
diff --git a/llvm/test/Analysis/CostModel/RISCV/splice.ll b/llvm/test/Analysis/CostModel/RISCV/splice.ll
index c70c879dba5a..9acccef9c4f6 100644
--- a/llvm/test/Analysis/CostModel/RISCV/splice.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/splice.ll
@@ -4,220 +4,220 @@
define void @vector_splice() {
; CHECK-LABEL: 'vector_splice'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1i8 = call <vscale x 1 x i8> @llvm.experimental.vector.splice.nxv1i8(<vscale x 1 x i8> zeroinitializer, <vscale x 1 x i8> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2i8 = call <vscale x 2 x i8> @llvm.experimental.vector.splice.nxv2i8(<vscale x 2 x i8> zeroinitializer, <vscale x 2 x i8> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv4i8 = call <vscale x 4 x i8> @llvm.experimental.vector.splice.nxv4i8(<vscale x 4 x i8> zeroinitializer, <vscale x 4 x i8> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv8i8 = call <vscale x 8 x i8> @llvm.experimental.vector.splice.nxv8i8(<vscale x 8 x i8> zeroinitializer, <vscale x 8 x i8> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv16i8 = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i8> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %splice.nxv32i8 = call <vscale x 32 x i8> @llvm.experimental.vector.splice.nxv32i8(<vscale x 32 x i8> zeroinitializer, <vscale x 32 x i8> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %splice.nxv64i8 = call <vscale x 64 x i8> @llvm.experimental.vector.splice.nxv64i8(<vscale x 64 x i8> zeroinitializer, <vscale x 64 x i8> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1i16 = call <vscale x 1 x i16> @llvm.experimental.vector.splice.nxv1i16(<vscale x 1 x i16> zeroinitializer, <vscale x 1 x i16> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2i16 = call <vscale x 2 x i16> @llvm.experimental.vector.splice.nxv2i16(<vscale x 2 x i16> zeroinitializer, <vscale x 2 x i16> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv4i16 = call <vscale x 4 x i16> @llvm.experimental.vector.splice.nxv4i16(<vscale x 4 x i16> zeroinitializer, <vscale x 4 x i16> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv8i16 = call <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i16> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %splice.nxv16i16 = call <vscale x 16 x i16> @llvm.experimental.vector.splice.nxv16i16(<vscale x 16 x i16> zeroinitializer, <vscale x 16 x i16> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %splice.nxv32i16 = call <vscale x 32 x i16> @llvm.experimental.vector.splice.nxv32i16(<vscale x 32 x i16> zeroinitializer, <vscale x 32 x i16> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %splice.nxv64i16 = call <vscale x 64 x i16> @llvm.experimental.vector.splice.nxv64i16(<vscale x 64 x i16> zeroinitializer, <vscale x 64 x i16> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1i32 = call <vscale x 1 x i32> @llvm.experimental.vector.splice.nxv1i32(<vscale x 1 x i32> zeroinitializer, <vscale x 1 x i32> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2i32 = call <vscale x 2 x i32> @llvm.experimental.vector.splice.nxv2i32(<vscale x 2 x i32> zeroinitializer, <vscale x 2 x i32> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv4i32 = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %splice.nxv8i32 = call <vscale x 8 x i32> @llvm.experimental.vector.splice.nxv8i32(<vscale x 8 x i32> zeroinitializer, <vscale x 8 x i32> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %splice.nxv16i32 = call <vscale x 16 x i32> @llvm.experimental.vector.splice.nxv16i32(<vscale x 16 x i32> zeroinitializer, <vscale x 16 x i32> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %splice.nxv32i32 = call <vscale x 32 x i32> @llvm.experimental.vector.splice.nxv32i32(<vscale x 32 x i32> zeroinitializer, <vscale x 32 x i32> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %splice.nxv64i32 = call <vscale x 64 x i32> @llvm.experimental.vector.splice.nxv64i32(<vscale x 64 x i32> zeroinitializer, <vscale x 64 x i32> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1i64 = call <vscale x 1 x i64> @llvm.experimental.vector.splice.nxv1i64(<vscale x 1 x i64> zeroinitializer, <vscale x 1 x i64> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv2i64 = call <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %splice.nxv4i64 = call <vscale x 4 x i64> @llvm.experimental.vector.splice.nxv4i64(<vscale x 4 x i64> zeroinitializer, <vscale x 4 x i64> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %splice.nxv8i64 = call <vscale x 8 x i64> @llvm.experimental.vector.splice.nxv8i64(<vscale x 8 x i64> zeroinitializer, <vscale x 8 x i64> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %splice.nxv16i64 = call <vscale x 16 x i64> @llvm.experimental.vector.splice.nxv16i64(<vscale x 16 x i64> zeroinitializer, <vscale x 16 x i64> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %splice.nxv32i64 = call <vscale x 32 x i64> @llvm.experimental.vector.splice.nxv32i64(<vscale x 32 x i64> zeroinitializer, <vscale x 32 x i64> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %splice.nxv64i64 = call <vscale x 64 x i64> @llvm.experimental.vector.splice.nxv64i64(<vscale x 64 x i64> zeroinitializer, <vscale x 64 x i64> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1f16 = call <vscale x 1 x half> @llvm.experimental.vector.splice.nxv1f16(<vscale x 1 x half> zeroinitializer, <vscale x 1 x half> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2f16 = call <vscale x 2 x half> @llvm.experimental.vector.splice.nxv2f16(<vscale x 2 x half> zeroinitializer, <vscale x 2 x half> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv4f16 = call <vscale x 4 x half> @llvm.experimental.vector.splice.nxv4f16(<vscale x 4 x half> zeroinitializer, <vscale x 4 x half> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv8f16 = call <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half> zeroinitializer, <vscale x 8 x half> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %splice.nxv16f16 = call <vscale x 16 x half> @llvm.experimental.vector.splice.nxv16f16(<vscale x 16 x half> zeroinitializer, <vscale x 16 x half> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %splice.nxv32f16 = call <vscale x 32 x half> @llvm.experimental.vector.splice.nxv32f16(<vscale x 32 x half> zeroinitializer, <vscale x 32 x half> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %splice.nxv64f16 = call <vscale x 64 x half> @llvm.experimental.vector.splice.nxv64f16(<vscale x 64 x half> zeroinitializer, <vscale x 64 x half> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1f32 = call <vscale x 1 x float> @llvm.experimental.vector.splice.nxv1f32(<vscale x 1 x float> zeroinitializer, <vscale x 1 x float> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2f32 = call <vscale x 2 x float> @llvm.experimental.vector.splice.nxv2f32(<vscale x 2 x float> zeroinitializer, <vscale x 2 x float> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv4f32 = call <vscale x 4 x float> @llvm.experimental.vector.splice.nxv4f32(<vscale x 4 x float> zeroinitializer, <vscale x 4 x float> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %splice.nxv8f32 = call <vscale x 8 x float> @llvm.experimental.vector.splice.nxv8f32(<vscale x 8 x float> zeroinitializer, <vscale x 8 x float> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %splice.nxv16f32 = call <vscale x 16 x float> @llvm.experimental.vector.splice.nxv16f32(<vscale x 16 x float> zeroinitializer, <vscale x 16 x float> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %splice.nxv32f32 = call <vscale x 32 x float> @llvm.experimental.vector.splice.nxv32f32(<vscale x 32 x float> zeroinitializer, <vscale x 32 x float> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %splice.nxv64f32 = call <vscale x 64 x float> @llvm.experimental.vector.splice.nxv64f32(<vscale x 64 x float> zeroinitializer, <vscale x 64 x float> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1f64 = call <vscale x 1 x double> @llvm.experimental.vector.splice.nxv1f64(<vscale x 1 x double> zeroinitializer, <vscale x 1 x double> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv2f64 = call <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double> zeroinitializer, <vscale x 2 x double> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %splice.nxv4f64 = call <vscale x 4 x double> @llvm.experimental.vector.splice.nxv4f64(<vscale x 4 x double> zeroinitializer, <vscale x 4 x double> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %splice.nxv8f64 = call <vscale x 8 x double> @llvm.experimental.vector.splice.nxv8f64(<vscale x 8 x double> zeroinitializer, <vscale x 8 x double> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %splice.nxv16f64 = call <vscale x 16 x double> @llvm.experimental.vector.splice.nxv16f64(<vscale x 16 x double> zeroinitializer, <vscale x 16 x double> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %splice.nxv32f64 = call <vscale x 32 x double> @llvm.experimental.vector.splice.nxv32f64(<vscale x 32 x double> zeroinitializer, <vscale x 32 x double> zeroinitializer, i32 -1)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %splice.nxv64f64 = call <vscale x 64 x double> @llvm.experimental.vector.splice.nxv64f64(<vscale x 64 x double> zeroinitializer, <vscale x 64 x double> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1i8 = call <vscale x 1 x i8> @llvm.vector.splice.nxv1i8(<vscale x 1 x i8> zeroinitializer, <vscale x 1 x i8> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2i8 = call <vscale x 2 x i8> @llvm.vector.splice.nxv2i8(<vscale x 2 x i8> zeroinitializer, <vscale x 2 x i8> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv4i8 = call <vscale x 4 x i8> @llvm.vector.splice.nxv4i8(<vscale x 4 x i8> zeroinitializer, <vscale x 4 x i8> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv8i8 = call <vscale x 8 x i8> @llvm.vector.splice.nxv8i8(<vscale x 8 x i8> zeroinitializer, <vscale x 8 x i8> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv16i8 = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i8> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %splice.nxv32i8 = call <vscale x 32 x i8> @llvm.vector.splice.nxv32i8(<vscale x 32 x i8> zeroinitializer, <vscale x 32 x i8> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %splice.nxv64i8 = call <vscale x 64 x i8> @llvm.vector.splice.nxv64i8(<vscale x 64 x i8> zeroinitializer, <vscale x 64 x i8> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1i16 = call <vscale x 1 x i16> @llvm.vector.splice.nxv1i16(<vscale x 1 x i16> zeroinitializer, <vscale x 1 x i16> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2i16 = call <vscale x 2 x i16> @llvm.vector.splice.nxv2i16(<vscale x 2 x i16> zeroinitializer, <vscale x 2 x i16> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv4i16 = call <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16> zeroinitializer, <vscale x 4 x i16> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv8i16 = call <vscale x 8 x i16> @llvm.vector.splice.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i16> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %splice.nxv16i16 = call <vscale x 16 x i16> @llvm.vector.splice.nxv16i16(<vscale x 16 x i16> zeroinitializer, <vscale x 16 x i16> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %splice.nxv32i16 = call <vscale x 32 x i16> @llvm.vector.splice.nxv32i16(<vscale x 32 x i16> zeroinitializer, <vscale x 32 x i16> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %splice.nxv64i16 = call <vscale x 64 x i16> @llvm.vector.splice.nxv64i16(<vscale x 64 x i16> zeroinitializer, <vscale x 64 x i16> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1i32 = call <vscale x 1 x i32> @llvm.vector.splice.nxv1i32(<vscale x 1 x i32> zeroinitializer, <vscale x 1 x i32> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2i32 = call <vscale x 2 x i32> @llvm.vector.splice.nxv2i32(<vscale x 2 x i32> zeroinitializer, <vscale x 2 x i32> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv4i32 = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %splice.nxv8i32 = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> zeroinitializer, <vscale x 8 x i32> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %splice.nxv16i32 = call <vscale x 16 x i32> @llvm.vector.splice.nxv16i32(<vscale x 16 x i32> zeroinitializer, <vscale x 16 x i32> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %splice.nxv32i32 = call <vscale x 32 x i32> @llvm.vector.splice.nxv32i32(<vscale x 32 x i32> zeroinitializer, <vscale x 32 x i32> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %splice.nxv64i32 = call <vscale x 64 x i32> @llvm.vector.splice.nxv64i32(<vscale x 64 x i32> zeroinitializer, <vscale x 64 x i32> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1i64 = call <vscale x 1 x i64> @llvm.vector.splice.nxv1i64(<vscale x 1 x i64> zeroinitializer, <vscale x 1 x i64> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv2i64 = call <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %splice.nxv4i64 = call <vscale x 4 x i64> @llvm.vector.splice.nxv4i64(<vscale x 4 x i64> zeroinitializer, <vscale x 4 x i64> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %splice.nxv8i64 = call <vscale x 8 x i64> @llvm.vector.splice.nxv8i64(<vscale x 8 x i64> zeroinitializer, <vscale x 8 x i64> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %splice.nxv16i64 = call <vscale x 16 x i64> @llvm.vector.splice.nxv16i64(<vscale x 16 x i64> zeroinitializer, <vscale x 16 x i64> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %splice.nxv32i64 = call <vscale x 32 x i64> @llvm.vector.splice.nxv32i64(<vscale x 32 x i64> zeroinitializer, <vscale x 32 x i64> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %splice.nxv64i64 = call <vscale x 64 x i64> @llvm.vector.splice.nxv64i64(<vscale x 64 x i64> zeroinitializer, <vscale x 64 x i64> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1f16 = call <vscale x 1 x half> @llvm.vector.splice.nxv1f16(<vscale x 1 x half> zeroinitializer, <vscale x 1 x half> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2f16 = call <vscale x 2 x half> @llvm.vector.splice.nxv2f16(<vscale x 2 x half> zeroinitializer, <vscale x 2 x half> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv4f16 = call <vscale x 4 x half> @llvm.vector.splice.nxv4f16(<vscale x 4 x half> zeroinitializer, <vscale x 4 x half> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv8f16 = call <vscale x 8 x half> @llvm.vector.splice.nxv8f16(<vscale x 8 x half> zeroinitializer, <vscale x 8 x half> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %splice.nxv16f16 = call <vscale x 16 x half> @llvm.vector.splice.nxv16f16(<vscale x 16 x half> zeroinitializer, <vscale x 16 x half> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %splice.nxv32f16 = call <vscale x 32 x half> @llvm.vector.splice.nxv32f16(<vscale x 32 x half> zeroinitializer, <vscale x 32 x half> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %splice.nxv64f16 = call <vscale x 64 x half> @llvm.vector.splice.nxv64f16(<vscale x 64 x half> zeroinitializer, <vscale x 64 x half> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1f32 = call <vscale x 1 x float> @llvm.vector.splice.nxv1f32(<vscale x 1 x float> zeroinitializer, <vscale x 1 x float> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2f32 = call <vscale x 2 x float> @llvm.vector.splice.nxv2f32(<vscale x 2 x float> zeroinitializer, <vscale x 2 x float> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv4f32 = call <vscale x 4 x float> @llvm.vector.splice.nxv4f32(<vscale x 4 x float> zeroinitializer, <vscale x 4 x float> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %splice.nxv8f32 = call <vscale x 8 x float> @llvm.vector.splice.nxv8f32(<vscale x 8 x float> zeroinitializer, <vscale x 8 x float> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %splice.nxv16f32 = call <vscale x 16 x float> @llvm.vector.splice.nxv16f32(<vscale x 16 x float> zeroinitializer, <vscale x 16 x float> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %splice.nxv32f32 = call <vscale x 32 x float> @llvm.vector.splice.nxv32f32(<vscale x 32 x float> zeroinitializer, <vscale x 32 x float> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %splice.nxv64f32 = call <vscale x 64 x float> @llvm.vector.splice.nxv64f32(<vscale x 64 x float> zeroinitializer, <vscale x 64 x float> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1f64 = call <vscale x 1 x double> @llvm.vector.splice.nxv1f64(<vscale x 1 x double> zeroinitializer, <vscale x 1 x double> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv2f64 = call <vscale x 2 x double> @llvm.vector.splice.nxv2f64(<vscale x 2 x double> zeroinitializer, <vscale x 2 x double> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %splice.nxv4f64 = call <vscale x 4 x double> @llvm.vector.splice.nxv4f64(<vscale x 4 x double> zeroinitializer, <vscale x 4 x double> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %splice.nxv8f64 = call <vscale x 8 x double> @llvm.vector.splice.nxv8f64(<vscale x 8 x double> zeroinitializer, <vscale x 8 x double> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %splice.nxv16f64 = call <vscale x 16 x double> @llvm.vector.splice.nxv16f64(<vscale x 16 x double> zeroinitializer, <vscale x 16 x double> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %splice.nxv32f64 = call <vscale x 32 x double> @llvm.vector.splice.nxv32f64(<vscale x 32 x double> zeroinitializer, <vscale x 32 x double> zeroinitializer, i32 -1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %splice.nxv64f64 = call <vscale x 64 x double> @llvm.vector.splice.nxv64f64(<vscale x 64 x double> zeroinitializer, <vscale x 64 x double> zeroinitializer, i32 -1)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; SIZE-LABEL: 'vector_splice'
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1i8 = call <vscale x 1 x i8> @llvm.experimental.vector.splice.nxv1i8(<vscale x 1 x i8> zeroinitializer, <vscale x 1 x i8> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2i8 = call <vscale x 2 x i8> @llvm.experimental.vector.splice.nxv2i8(<vscale x 2 x i8> zeroinitializer, <vscale x 2 x i8> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv4i8 = call <vscale x 4 x i8> @llvm.experimental.vector.splice.nxv4i8(<vscale x 4 x i8> zeroinitializer, <vscale x 4 x i8> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv8i8 = call <vscale x 8 x i8> @llvm.experimental.vector.splice.nxv8i8(<vscale x 8 x i8> zeroinitializer, <vscale x 8 x i8> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv16i8 = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i8> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv32i8 = call <vscale x 32 x i8> @llvm.experimental.vector.splice.nxv32i8(<vscale x 32 x i8> zeroinitializer, <vscale x 32 x i8> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv64i8 = call <vscale x 64 x i8> @llvm.experimental.vector.splice.nxv64i8(<vscale x 64 x i8> zeroinitializer, <vscale x 64 x i8> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1i16 = call <vscale x 1 x i16> @llvm.experimental.vector.splice.nxv1i16(<vscale x 1 x i16> zeroinitializer, <vscale x 1 x i16> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2i16 = call <vscale x 2 x i16> @llvm.experimental.vector.splice.nxv2i16(<vscale x 2 x i16> zeroinitializer, <vscale x 2 x i16> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv4i16 = call <vscale x 4 x i16> @llvm.experimental.vector.splice.nxv4i16(<vscale x 4 x i16> zeroinitializer, <vscale x 4 x i16> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv8i16 = call <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i16> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv16i16 = call <vscale x 16 x i16> @llvm.experimental.vector.splice.nxv16i16(<vscale x 16 x i16> zeroinitializer, <vscale x 16 x i16> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv32i16 = call <vscale x 32 x i16> @llvm.experimental.vector.splice.nxv32i16(<vscale x 32 x i16> zeroinitializer, <vscale x 32 x i16> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv64i16 = call <vscale x 64 x i16> @llvm.experimental.vector.splice.nxv64i16(<vscale x 64 x i16> zeroinitializer, <vscale x 64 x i16> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1i32 = call <vscale x 1 x i32> @llvm.experimental.vector.splice.nxv1i32(<vscale x 1 x i32> zeroinitializer, <vscale x 1 x i32> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2i32 = call <vscale x 2 x i32> @llvm.experimental.vector.splice.nxv2i32(<vscale x 2 x i32> zeroinitializer, <vscale x 2 x i32> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv4i32 = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv8i32 = call <vscale x 8 x i32> @llvm.experimental.vector.splice.nxv8i32(<vscale x 8 x i32> zeroinitializer, <vscale x 8 x i32> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv16i32 = call <vscale x 16 x i32> @llvm.experimental.vector.splice.nxv16i32(<vscale x 16 x i32> zeroinitializer, <vscale x 16 x i32> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv32i32 = call <vscale x 32 x i32> @llvm.experimental.vector.splice.nxv32i32(<vscale x 32 x i32> zeroinitializer, <vscale x 32 x i32> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %splice.nxv64i32 = call <vscale x 64 x i32> @llvm.experimental.vector.splice.nxv64i32(<vscale x 64 x i32> zeroinitializer, <vscale x 64 x i32> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1i64 = call <vscale x 1 x i64> @llvm.experimental.vector.splice.nxv1i64(<vscale x 1 x i64> zeroinitializer, <vscale x 1 x i64> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2i64 = call <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv4i64 = call <vscale x 4 x i64> @llvm.experimental.vector.splice.nxv4i64(<vscale x 4 x i64> zeroinitializer, <vscale x 4 x i64> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv8i64 = call <vscale x 8 x i64> @llvm.experimental.vector.splice.nxv8i64(<vscale x 8 x i64> zeroinitializer, <vscale x 8 x i64> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv16i64 = call <vscale x 16 x i64> @llvm.experimental.vector.splice.nxv16i64(<vscale x 16 x i64> zeroinitializer, <vscale x 16 x i64> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %splice.nxv32i64 = call <vscale x 32 x i64> @llvm.experimental.vector.splice.nxv32i64(<vscale x 32 x i64> zeroinitializer, <vscale x 32 x i64> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %splice.nxv64i64 = call <vscale x 64 x i64> @llvm.experimental.vector.splice.nxv64i64(<vscale x 64 x i64> zeroinitializer, <vscale x 64 x i64> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1f16 = call <vscale x 1 x half> @llvm.experimental.vector.splice.nxv1f16(<vscale x 1 x half> zeroinitializer, <vscale x 1 x half> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2f16 = call <vscale x 2 x half> @llvm.experimental.vector.splice.nxv2f16(<vscale x 2 x half> zeroinitializer, <vscale x 2 x half> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv4f16 = call <vscale x 4 x half> @llvm.experimental.vector.splice.nxv4f16(<vscale x 4 x half> zeroinitializer, <vscale x 4 x half> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv8f16 = call <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half> zeroinitializer, <vscale x 8 x half> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv16f16 = call <vscale x 16 x half> @llvm.experimental.vector.splice.nxv16f16(<vscale x 16 x half> zeroinitializer, <vscale x 16 x half> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv32f16 = call <vscale x 32 x half> @llvm.experimental.vector.splice.nxv32f16(<vscale x 32 x half> zeroinitializer, <vscale x 32 x half> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv64f16 = call <vscale x 64 x half> @llvm.experimental.vector.splice.nxv64f16(<vscale x 64 x half> zeroinitializer, <vscale x 64 x half> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1f32 = call <vscale x 1 x float> @llvm.experimental.vector.splice.nxv1f32(<vscale x 1 x float> zeroinitializer, <vscale x 1 x float> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2f32 = call <vscale x 2 x float> @llvm.experimental.vector.splice.nxv2f32(<vscale x 2 x float> zeroinitializer, <vscale x 2 x float> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv4f32 = call <vscale x 4 x float> @llvm.experimental.vector.splice.nxv4f32(<vscale x 4 x float> zeroinitializer, <vscale x 4 x float> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv8f32 = call <vscale x 8 x float> @llvm.experimental.vector.splice.nxv8f32(<vscale x 8 x float> zeroinitializer, <vscale x 8 x float> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv16f32 = call <vscale x 16 x float> @llvm.experimental.vector.splice.nxv16f32(<vscale x 16 x float> zeroinitializer, <vscale x 16 x float> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv32f32 = call <vscale x 32 x float> @llvm.experimental.vector.splice.nxv32f32(<vscale x 32 x float> zeroinitializer, <vscale x 32 x float> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %splice.nxv64f32 = call <vscale x 64 x float> @llvm.experimental.vector.splice.nxv64f32(<vscale x 64 x float> zeroinitializer, <vscale x 64 x float> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1f64 = call <vscale x 1 x double> @llvm.experimental.vector.splice.nxv1f64(<vscale x 1 x double> zeroinitializer, <vscale x 1 x double> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2f64 = call <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double> zeroinitializer, <vscale x 2 x double> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv4f64 = call <vscale x 4 x double> @llvm.experimental.vector.splice.nxv4f64(<vscale x 4 x double> zeroinitializer, <vscale x 4 x double> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv8f64 = call <vscale x 8 x double> @llvm.experimental.vector.splice.nxv8f64(<vscale x 8 x double> zeroinitializer, <vscale x 8 x double> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv16f64 = call <vscale x 16 x double> @llvm.experimental.vector.splice.nxv16f64(<vscale x 16 x double> zeroinitializer, <vscale x 16 x double> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %splice.nxv32f64 = call <vscale x 32 x double> @llvm.experimental.vector.splice.nxv32f64(<vscale x 32 x double> zeroinitializer, <vscale x 32 x double> zeroinitializer, i32 -1)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %splice.nxv64f64 = call <vscale x 64 x double> @llvm.experimental.vector.splice.nxv64f64(<vscale x 64 x double> zeroinitializer, <vscale x 64 x double> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1i8 = call <vscale x 1 x i8> @llvm.vector.splice.nxv1i8(<vscale x 1 x i8> zeroinitializer, <vscale x 1 x i8> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2i8 = call <vscale x 2 x i8> @llvm.vector.splice.nxv2i8(<vscale x 2 x i8> zeroinitializer, <vscale x 2 x i8> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv4i8 = call <vscale x 4 x i8> @llvm.vector.splice.nxv4i8(<vscale x 4 x i8> zeroinitializer, <vscale x 4 x i8> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv8i8 = call <vscale x 8 x i8> @llvm.vector.splice.nxv8i8(<vscale x 8 x i8> zeroinitializer, <vscale x 8 x i8> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv16i8 = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i8> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv32i8 = call <vscale x 32 x i8> @llvm.vector.splice.nxv32i8(<vscale x 32 x i8> zeroinitializer, <vscale x 32 x i8> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv64i8 = call <vscale x 64 x i8> @llvm.vector.splice.nxv64i8(<vscale x 64 x i8> zeroinitializer, <vscale x 64 x i8> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1i16 = call <vscale x 1 x i16> @llvm.vector.splice.nxv1i16(<vscale x 1 x i16> zeroinitializer, <vscale x 1 x i16> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2i16 = call <vscale x 2 x i16> @llvm.vector.splice.nxv2i16(<vscale x 2 x i16> zeroinitializer, <vscale x 2 x i16> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv4i16 = call <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16> zeroinitializer, <vscale x 4 x i16> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv8i16 = call <vscale x 8 x i16> @llvm.vector.splice.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i16> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv16i16 = call <vscale x 16 x i16> @llvm.vector.splice.nxv16i16(<vscale x 16 x i16> zeroinitializer, <vscale x 16 x i16> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv32i16 = call <vscale x 32 x i16> @llvm.vector.splice.nxv32i16(<vscale x 32 x i16> zeroinitializer, <vscale x 32 x i16> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv64i16 = call <vscale x 64 x i16> @llvm.vector.splice.nxv64i16(<vscale x 64 x i16> zeroinitializer, <vscale x 64 x i16> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1i32 = call <vscale x 1 x i32> @llvm.vector.splice.nxv1i32(<vscale x 1 x i32> zeroinitializer, <vscale x 1 x i32> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2i32 = call <vscale x 2 x i32> @llvm.vector.splice.nxv2i32(<vscale x 2 x i32> zeroinitializer, <vscale x 2 x i32> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv4i32 = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv8i32 = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> zeroinitializer, <vscale x 8 x i32> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv16i32 = call <vscale x 16 x i32> @llvm.vector.splice.nxv16i32(<vscale x 16 x i32> zeroinitializer, <vscale x 16 x i32> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv32i32 = call <vscale x 32 x i32> @llvm.vector.splice.nxv32i32(<vscale x 32 x i32> zeroinitializer, <vscale x 32 x i32> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %splice.nxv64i32 = call <vscale x 64 x i32> @llvm.vector.splice.nxv64i32(<vscale x 64 x i32> zeroinitializer, <vscale x 64 x i32> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1i64 = call <vscale x 1 x i64> @llvm.vector.splice.nxv1i64(<vscale x 1 x i64> zeroinitializer, <vscale x 1 x i64> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2i64 = call <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv4i64 = call <vscale x 4 x i64> @llvm.vector.splice.nxv4i64(<vscale x 4 x i64> zeroinitializer, <vscale x 4 x i64> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv8i64 = call <vscale x 8 x i64> @llvm.vector.splice.nxv8i64(<vscale x 8 x i64> zeroinitializer, <vscale x 8 x i64> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv16i64 = call <vscale x 16 x i64> @llvm.vector.splice.nxv16i64(<vscale x 16 x i64> zeroinitializer, <vscale x 16 x i64> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %splice.nxv32i64 = call <vscale x 32 x i64> @llvm.vector.splice.nxv32i64(<vscale x 32 x i64> zeroinitializer, <vscale x 32 x i64> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %splice.nxv64i64 = call <vscale x 64 x i64> @llvm.vector.splice.nxv64i64(<vscale x 64 x i64> zeroinitializer, <vscale x 64 x i64> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1f16 = call <vscale x 1 x half> @llvm.vector.splice.nxv1f16(<vscale x 1 x half> zeroinitializer, <vscale x 1 x half> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2f16 = call <vscale x 2 x half> @llvm.vector.splice.nxv2f16(<vscale x 2 x half> zeroinitializer, <vscale x 2 x half> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv4f16 = call <vscale x 4 x half> @llvm.vector.splice.nxv4f16(<vscale x 4 x half> zeroinitializer, <vscale x 4 x half> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv8f16 = call <vscale x 8 x half> @llvm.vector.splice.nxv8f16(<vscale x 8 x half> zeroinitializer, <vscale x 8 x half> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv16f16 = call <vscale x 16 x half> @llvm.vector.splice.nxv16f16(<vscale x 16 x half> zeroinitializer, <vscale x 16 x half> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv32f16 = call <vscale x 32 x half> @llvm.vector.splice.nxv32f16(<vscale x 32 x half> zeroinitializer, <vscale x 32 x half> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv64f16 = call <vscale x 64 x half> @llvm.vector.splice.nxv64f16(<vscale x 64 x half> zeroinitializer, <vscale x 64 x half> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1f32 = call <vscale x 1 x float> @llvm.vector.splice.nxv1f32(<vscale x 1 x float> zeroinitializer, <vscale x 1 x float> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2f32 = call <vscale x 2 x float> @llvm.vector.splice.nxv2f32(<vscale x 2 x float> zeroinitializer, <vscale x 2 x float> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv4f32 = call <vscale x 4 x float> @llvm.vector.splice.nxv4f32(<vscale x 4 x float> zeroinitializer, <vscale x 4 x float> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv8f32 = call <vscale x 8 x float> @llvm.vector.splice.nxv8f32(<vscale x 8 x float> zeroinitializer, <vscale x 8 x float> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv16f32 = call <vscale x 16 x float> @llvm.vector.splice.nxv16f32(<vscale x 16 x float> zeroinitializer, <vscale x 16 x float> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv32f32 = call <vscale x 32 x float> @llvm.vector.splice.nxv32f32(<vscale x 32 x float> zeroinitializer, <vscale x 32 x float> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %splice.nxv64f32 = call <vscale x 64 x float> @llvm.vector.splice.nxv64f32(<vscale x 64 x float> zeroinitializer, <vscale x 64 x float> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1f64 = call <vscale x 1 x double> @llvm.vector.splice.nxv1f64(<vscale x 1 x double> zeroinitializer, <vscale x 1 x double> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2f64 = call <vscale x 2 x double> @llvm.vector.splice.nxv2f64(<vscale x 2 x double> zeroinitializer, <vscale x 2 x double> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv4f64 = call <vscale x 4 x double> @llvm.vector.splice.nxv4f64(<vscale x 4 x double> zeroinitializer, <vscale x 4 x double> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv8f64 = call <vscale x 8 x double> @llvm.vector.splice.nxv8f64(<vscale x 8 x double> zeroinitializer, <vscale x 8 x double> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv16f64 = call <vscale x 16 x double> @llvm.vector.splice.nxv16f64(<vscale x 16 x double> zeroinitializer, <vscale x 16 x double> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %splice.nxv32f64 = call <vscale x 32 x double> @llvm.vector.splice.nxv32f64(<vscale x 32 x double> zeroinitializer, <vscale x 32 x double> zeroinitializer, i32 -1)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %splice.nxv64f64 = call <vscale x 64 x double> @llvm.vector.splice.nxv64f64(<vscale x 64 x double> zeroinitializer, <vscale x 64 x double> zeroinitializer, i32 -1)
; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
- %splice.nxv1i8 = call <vscale x 1 x i8> @llvm.experimental.vector.splice.nxv1i8(<vscale x 1 x i8> zeroinitializer, <vscale x 1 x i8> zeroinitializer, i32 -1)
- %splice.nxv2i8 = call <vscale x 2 x i8> @llvm.experimental.vector.splice.nxv2i8(<vscale x 2 x i8> zeroinitializer, <vscale x 2 x i8> zeroinitializer, i32 -1)
- %splice.nxv4i8 = call <vscale x 4 x i8> @llvm.experimental.vector.splice.nxv4i8(<vscale x 4 x i8> zeroinitializer, <vscale x 4 x i8> zeroinitializer, i32 -1)
- %splice.nxv8i8 = call <vscale x 8 x i8> @llvm.experimental.vector.splice.nxv8i8(<vscale x 8 x i8> zeroinitializer, <vscale x 8 x i8> zeroinitializer, i32 -1)
- %splice.nxv16i8 = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i8> zeroinitializer, i32 -1)
- %splice.nxv32i8 = call <vscale x 32 x i8> @llvm.experimental.vector.splice.nxv32i8(<vscale x 32 x i8> zeroinitializer, <vscale x 32 x i8> zeroinitializer, i32 -1)
- %splice.nxv64i8 = call <vscale x 64 x i8> @llvm.experimental.vector.splice.nxv64i8(<vscale x 64 x i8> zeroinitializer, <vscale x 64 x i8> zeroinitializer, i32 -1)
+ %splice.nxv1i8 = call <vscale x 1 x i8> @llvm.vector.splice.nxv1i8(<vscale x 1 x i8> zeroinitializer, <vscale x 1 x i8> zeroinitializer, i32 -1)
+ %splice.nxv2i8 = call <vscale x 2 x i8> @llvm.vector.splice.nxv2i8(<vscale x 2 x i8> zeroinitializer, <vscale x 2 x i8> zeroinitializer, i32 -1)
+ %splice.nxv4i8 = call <vscale x 4 x i8> @llvm.vector.splice.nxv4i8(<vscale x 4 x i8> zeroinitializer, <vscale x 4 x i8> zeroinitializer, i32 -1)
+ %splice.nxv8i8 = call <vscale x 8 x i8> @llvm.vector.splice.nxv8i8(<vscale x 8 x i8> zeroinitializer, <vscale x 8 x i8> zeroinitializer, i32 -1)
+ %splice.nxv16i8 = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i8> zeroinitializer, i32 -1)
+ %splice.nxv32i8 = call <vscale x 32 x i8> @llvm.vector.splice.nxv32i8(<vscale x 32 x i8> zeroinitializer, <vscale x 32 x i8> zeroinitializer, i32 -1)
+ %splice.nxv64i8 = call <vscale x 64 x i8> @llvm.vector.splice.nxv64i8(<vscale x 64 x i8> zeroinitializer, <vscale x 64 x i8> zeroinitializer, i32 -1)
- %splice.nxv1i16 = call <vscale x 1 x i16> @llvm.experimental.vector.splice.nxv1i16(<vscale x 1 x i16> zeroinitializer, <vscale x 1 x i16> zeroinitializer, i32 -1)
- %splice.nxv2i16 = call <vscale x 2 x i16> @llvm.experimental.vector.splice.nxv2i16(<vscale x 2 x i16> zeroinitializer, <vscale x 2 x i16> zeroinitializer, i32 -1)
- %splice.nxv4i16 = call <vscale x 4 x i16> @llvm.experimental.vector.splice.nxv4i16(<vscale x 4 x i16> zeroinitializer, <vscale x 4 x i16> zeroinitializer, i32 -1)
- %splice.nxv8i16 = call <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i16> zeroinitializer, i32 -1)
- %splice.nxv16i16 = call <vscale x 16 x i16> @llvm.experimental.vector.splice.nxv16i16(<vscale x 16 x i16> zeroinitializer, <vscale x 16 x i16> zeroinitializer, i32 -1)
- %splice.nxv32i16 = call <vscale x 32 x i16> @llvm.experimental.vector.splice.nxv32i16(<vscale x 32 x i16> zeroinitializer, <vscale x 32 x i16> zeroinitializer, i32 -1)
- %splice.nxv64i16 = call <vscale x 64 x i16> @llvm.experimental.vector.splice.nxv64i16(<vscale x 64 x i16> zeroinitializer, <vscale x 64 x i16> zeroinitializer, i32 -1)
+ %splice.nxv1i16 = call <vscale x 1 x i16> @llvm.vector.splice.nxv1i16(<vscale x 1 x i16> zeroinitializer, <vscale x 1 x i16> zeroinitializer, i32 -1)
+ %splice.nxv2i16 = call <vscale x 2 x i16> @llvm.vector.splice.nxv2i16(<vscale x 2 x i16> zeroinitializer, <vscale x 2 x i16> zeroinitializer, i32 -1)
+ %splice.nxv4i16 = call <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16> zeroinitializer, <vscale x 4 x i16> zeroinitializer, i32 -1)
+ %splice.nxv8i16 = call <vscale x 8 x i16> @llvm.vector.splice.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i16> zeroinitializer, i32 -1)
+ %splice.nxv16i16 = call <vscale x 16 x i16> @llvm.vector.splice.nxv16i16(<vscale x 16 x i16> zeroinitializer, <vscale x 16 x i16> zeroinitializer, i32 -1)
+ %splice.nxv32i16 = call <vscale x 32 x i16> @llvm.vector.splice.nxv32i16(<vscale x 32 x i16> zeroinitializer, <vscale x 32 x i16> zeroinitializer, i32 -1)
+ %splice.nxv64i16 = call <vscale x 64 x i16> @llvm.vector.splice.nxv64i16(<vscale x 64 x i16> zeroinitializer, <vscale x 64 x i16> zeroinitializer, i32 -1)
- %splice.nxv1i32 = call <vscale x 1 x i32> @llvm.experimental.vector.splice.nxv1i32(<vscale x 1 x i32> zeroinitializer, <vscale x 1 x i32> zeroinitializer, i32 -1)
- %splice.nxv2i32 = call <vscale x 2 x i32> @llvm.experimental.vector.splice.nxv2i32(<vscale x 2 x i32> zeroinitializer, <vscale x 2 x i32> zeroinitializer, i32 -1)
- %splice.nxv4i32 = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> zeroinitializer, i32 -1)
- %splice.nxv8i32 = call <vscale x 8 x i32> @llvm.experimental.vector.splice.nxv8i32(<vscale x 8 x i32> zeroinitializer, <vscale x 8 x i32> zeroinitializer, i32 -1)
- %splice.nxv16i32 = call <vscale x 16 x i32> @llvm.experimental.vector.splice.nxv16i32(<vscale x 16 x i32> zeroinitializer, <vscale x 16 x i32> zeroinitializer, i32 -1)
- %splice.nxv32i32 = call <vscale x 32 x i32> @llvm.experimental.vector.splice.nxv32i32(<vscale x 32 x i32> zeroinitializer, <vscale x 32 x i32> zeroinitializer, i32 -1)
- %splice.nxv64i32 = call <vscale x 64 x i32> @llvm.experimental.vector.splice.nxv64i32(<vscale x 64 x i32> zeroinitializer, <vscale x 64 x i32> zeroinitializer, i32 -1)
+ %splice.nxv1i32 = call <vscale x 1 x i32> @llvm.vector.splice.nxv1i32(<vscale x 1 x i32> zeroinitializer, <vscale x 1 x i32> zeroinitializer, i32 -1)
+ %splice.nxv2i32 = call <vscale x 2 x i32> @llvm.vector.splice.nxv2i32(<vscale x 2 x i32> zeroinitializer, <vscale x 2 x i32> zeroinitializer, i32 -1)
+ %splice.nxv4i32 = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> zeroinitializer, i32 -1)
+ %splice.nxv8i32 = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> zeroinitializer, <vscale x 8 x i32> zeroinitializer, i32 -1)
+ %splice.nxv16i32 = call <vscale x 16 x i32> @llvm.vector.splice.nxv16i32(<vscale x 16 x i32> zeroinitializer, <vscale x 16 x i32> zeroinitializer, i32 -1)
+ %splice.nxv32i32 = call <vscale x 32 x i32> @llvm.vector.splice.nxv32i32(<vscale x 32 x i32> zeroinitializer, <vscale x 32 x i32> zeroinitializer, i32 -1)
+ %splice.nxv64i32 = call <vscale x 64 x i32> @llvm.vector.splice.nxv64i32(<vscale x 64 x i32> zeroinitializer, <vscale x 64 x i32> zeroinitializer, i32 -1)
- %splice.nxv1i64 = call <vscale x 1 x i64> @llvm.experimental.vector.splice.nxv1i64(<vscale x 1 x i64> zeroinitializer, <vscale x 1 x i64> zeroinitializer, i32 -1)
- %splice.nxv2i64 = call <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> zeroinitializer, i32 -1)
- %splice.nxv4i64 = call <vscale x 4 x i64> @llvm.experimental.vector.splice.nxv4i64(<vscale x 4 x i64> zeroinitializer, <vscale x 4 x i64> zeroinitializer, i32 -1)
- %splice.nxv8i64 = call <vscale x 8 x i64> @llvm.experimental.vector.splice.nxv8i64(<vscale x 8 x i64> zeroinitializer, <vscale x 8 x i64> zeroinitializer, i32 -1)
- %splice.nxv16i64 = call <vscale x 16 x i64> @llvm.experimental.vector.splice.nxv16i64(<vscale x 16 x i64> zeroinitializer, <vscale x 16 x i64> zeroinitializer, i32 -1)
- %splice.nxv32i64 = call <vscale x 32 x i64> @llvm.experimental.vector.splice.nxv32i64(<vscale x 32 x i64> zeroinitializer, <vscale x 32 x i64> zeroinitializer, i32 -1)
- %splice.nxv64i64 = call <vscale x 64 x i64> @llvm.experimental.vector.splice.nxv64i64(<vscale x 64 x i64> zeroinitializer, <vscale x 64 x i64> zeroinitializer, i32 -1)
+ %splice.nxv1i64 = call <vscale x 1 x i64> @llvm.vector.splice.nxv1i64(<vscale x 1 x i64> zeroinitializer, <vscale x 1 x i64> zeroinitializer, i32 -1)
+ %splice.nxv2i64 = call <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> zeroinitializer, i32 -1)
+ %splice.nxv4i64 = call <vscale x 4 x i64> @llvm.vector.splice.nxv4i64(<vscale x 4 x i64> zeroinitializer, <vscale x 4 x i64> zeroinitializer, i32 -1)
+ %splice.nxv8i64 = call <vscale x 8 x i64> @llvm.vector.splice.nxv8i64(<vscale x 8 x i64> zeroinitializer, <vscale x 8 x i64> zeroinitializer, i32 -1)
+ %splice.nxv16i64 = call <vscale x 16 x i64> @llvm.vector.splice.nxv16i64(<vscale x 16 x i64> zeroinitializer, <vscale x 16 x i64> zeroinitializer, i32 -1)
+ %splice.nxv32i64 = call <vscale x 32 x i64> @llvm.vector.splice.nxv32i64(<vscale x 32 x i64> zeroinitializer, <vscale x 32 x i64> zeroinitializer, i32 -1)
+ %splice.nxv64i64 = call <vscale x 64 x i64> @llvm.vector.splice.nxv64i64(<vscale x 64 x i64> zeroinitializer, <vscale x 64 x i64> zeroinitializer, i32 -1)
- %splice.nxv1f16 = call <vscale x 1 x half> @llvm.experimental.vector.splice.nxv1f16(<vscale x 1 x half> zeroinitializer, <vscale x 1 x half> zeroinitializer, i32 -1)
- %splice.nxv2f16 = call <vscale x 2 x half> @llvm.experimental.vector.splice.nxv2f16(<vscale x 2 x half> zeroinitializer, <vscale x 2 x half> zeroinitializer, i32 -1)
- %splice.nxv4f16 = call <vscale x 4 x half> @llvm.experimental.vector.splice.nxv4f16(<vscale x 4 x half> zeroinitializer, <vscale x 4 x half> zeroinitializer, i32 -1)
- %splice.nxv8f16 = call <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half> zeroinitializer, <vscale x 8 x half> zeroinitializer, i32 -1)
- %splice.nxv16f16 = call <vscale x 16 x half> @llvm.experimental.vector.splice.nxv16f16(<vscale x 16 x half> zeroinitializer, <vscale x 16 x half> zeroinitializer, i32 -1)
- %splice.nxv32f16 = call <vscale x 32 x half> @llvm.experimental.vector.splice.nxv32f16(<vscale x 32 x half> zeroinitializer, <vscale x 32 x half> zeroinitializer, i32 -1)
- %splice.nxv64f16 = call <vscale x 64 x half> @llvm.experimental.vector.splice.nxv64f16(<vscale x 64 x half> zeroinitializer, <vscale x 64 x half> zeroinitializer, i32 -1)
+ %splice.nxv1f16 = call <vscale x 1 x half> @llvm.vector.splice.nxv1f16(<vscale x 1 x half> zeroinitializer, <vscale x 1 x half> zeroinitializer, i32 -1)
+ %splice.nxv2f16 = call <vscale x 2 x half> @llvm.vector.splice.nxv2f16(<vscale x 2 x half> zeroinitializer, <vscale x 2 x half> zeroinitializer, i32 -1)
+ %splice.nxv4f16 = call <vscale x 4 x half> @llvm.vector.splice.nxv4f16(<vscale x 4 x half> zeroinitializer, <vscale x 4 x half> zeroinitializer, i32 -1)
+ %splice.nxv8f16 = call <vscale x 8 x half> @llvm.vector.splice.nxv8f16(<vscale x 8 x half> zeroinitializer, <vscale x 8 x half> zeroinitializer, i32 -1)
+ %splice.nxv16f16 = call <vscale x 16 x half> @llvm.vector.splice.nxv16f16(<vscale x 16 x half> zeroinitializer, <vscale x 16 x half> zeroinitializer, i32 -1)
+ %splice.nxv32f16 = call <vscale x 32 x half> @llvm.vector.splice.nxv32f16(<vscale x 32 x half> zeroinitializer, <vscale x 32 x half> zeroinitializer, i32 -1)
+ %splice.nxv64f16 = call <vscale x 64 x half> @llvm.vector.splice.nxv64f16(<vscale x 64 x half> zeroinitializer, <vscale x 64 x half> zeroinitializer, i32 -1)
- %splice.nxv1f32 = call <vscale x 1 x float> @llvm.experimental.vector.splice.nxv1f32(<vscale x 1 x float> zeroinitializer, <vscale x 1 x float> zeroinitializer, i32 -1)
- %splice.nxv2f32 = call <vscale x 2 x float> @llvm.experimental.vector.splice.nxv2f32(<vscale x 2 x float> zeroinitializer, <vscale x 2 x float> zeroinitializer, i32 -1)
- %splice.nxv4f32 = call <vscale x 4 x float> @llvm.experimental.vector.splice.nxv4f32(<vscale x 4 x float> zeroinitializer, <vscale x 4 x float> zeroinitializer, i32 -1)
- %splice.nxv8f32 = call <vscale x 8 x float> @llvm.experimental.vector.splice.nxv8f32(<vscale x 8 x float> zeroinitializer, <vscale x 8 x float> zeroinitializer, i32 -1)
- %splice.nxv16f32 = call <vscale x 16 x float> @llvm.experimental.vector.splice.nxv16f32(<vscale x 16 x float> zeroinitializer, <vscale x 16 x float> zeroinitializer, i32 -1)
- %splice.nxv32f32 = call <vscale x 32 x float> @llvm.experimental.vector.splice.nxv32f32(<vscale x 32 x float> zeroinitializer, <vscale x 32 x float> zeroinitializer, i32 -1)
- %splice.nxv64f32 = call <vscale x 64 x float> @llvm.experimental.vector.splice.nxv64f32(<vscale x 64 x float> zeroinitializer, <vscale x 64 x float> zeroinitializer, i32 -1)
+ %splice.nxv1f32 = call <vscale x 1 x float> @llvm.vector.splice.nxv1f32(<vscale x 1 x float> zeroinitializer, <vscale x 1 x float> zeroinitializer, i32 -1)
+ %splice.nxv2f32 = call <vscale x 2 x float> @llvm.vector.splice.nxv2f32(<vscale x 2 x float> zeroinitializer, <vscale x 2 x float> zeroinitializer, i32 -1)
+ %splice.nxv4f32 = call <vscale x 4 x float> @llvm.vector.splice.nxv4f32(<vscale x 4 x float> zeroinitializer, <vscale x 4 x float> zeroinitializer, i32 -1)
+ %splice.nxv8f32 = call <vscale x 8 x float> @llvm.vector.splice.nxv8f32(<vscale x 8 x float> zeroinitializer, <vscale x 8 x float> zeroinitializer, i32 -1)
+ %splice.nxv16f32 = call <vscale x 16 x float> @llvm.vector.splice.nxv16f32(<vscale x 16 x float> zeroinitializer, <vscale x 16 x float> zeroinitializer, i32 -1)
+ %splice.nxv32f32 = call <vscale x 32 x float> @llvm.vector.splice.nxv32f32(<vscale x 32 x float> zeroinitializer, <vscale x 32 x float> zeroinitializer, i32 -1)
+ %splice.nxv64f32 = call <vscale x 64 x float> @llvm.vector.splice.nxv64f32(<vscale x 64 x float> zeroinitializer, <vscale x 64 x float> zeroinitializer, i32 -1)
- %splice.nxv1f64 = call <vscale x 1 x double> @llvm.experimental.vector.splice.nxv1f64(<vscale x 1 x double> zeroinitializer, <vscale x 1 x double> zeroinitializer, i32 -1)
- %splice.nxv2f64 = call <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double> zeroinitializer, <vscale x 2 x double> zeroinitializer, i32 -1)
- %splice.nxv4f64 = call <vscale x 4 x double> @llvm.experimental.vector.splice.nxv4f64(<vscale x 4 x double> zeroinitializer, <vscale x 4 x double> zeroinitializer, i32 -1)
- %splice.nxv8f64 = call <vscale x 8 x double> @llvm.experimental.vector.splice.nxv8f64(<vscale x 8 x double> zeroinitializer, <vscale x 8 x double> zeroinitializer, i32 -1)
- %splice.nxv16f64 = call <vscale x 16 x double> @llvm.experimental.vector.splice.nxv16f64(<vscale x 16 x double> zeroinitializer, <vscale x 16 x double> zeroinitializer, i32 -1)
- %splice.nxv32f64 = call <vscale x 32 x double> @llvm.experimental.vector.splice.nxv32f64(<vscale x 32 x double> zeroinitializer, <vscale x 32 x double> zeroinitializer, i32 -1)
- %splice.nxv64f64 = call <vscale x 64 x double> @llvm.experimental.vector.splice.nxv64f64(<vscale x 64 x double> zeroinitializer, <vscale x 64 x double> zeroinitializer, i32 -1)
+ %splice.nxv1f64 = call <vscale x 1 x double> @llvm.vector.splice.nxv1f64(<vscale x 1 x double> zeroinitializer, <vscale x 1 x double> zeroinitializer, i32 -1)
+ %splice.nxv2f64 = call <vscale x 2 x double> @llvm.vector.splice.nxv2f64(<vscale x 2 x double> zeroinitializer, <vscale x 2 x double> zeroinitializer, i32 -1)
+ %splice.nxv4f64 = call <vscale x 4 x double> @llvm.vector.splice.nxv4f64(<vscale x 4 x double> zeroinitializer, <vscale x 4 x double> zeroinitializer, i32 -1)
+ %splice.nxv8f64 = call <vscale x 8 x double> @llvm.vector.splice.nxv8f64(<vscale x 8 x double> zeroinitializer, <vscale x 8 x double> zeroinitializer, i32 -1)
+ %splice.nxv16f64 = call <vscale x 16 x double> @llvm.vector.splice.nxv16f64(<vscale x 16 x double> zeroinitializer, <vscale x 16 x double> zeroinitializer, i32 -1)
+ %splice.nxv32f64 = call <vscale x 32 x double> @llvm.vector.splice.nxv32f64(<vscale x 32 x double> zeroinitializer, <vscale x 32 x double> zeroinitializer, i32 -1)
+ %splice.nxv64f64 = call <vscale x 64 x double> @llvm.vector.splice.nxv64f64(<vscale x 64 x double> zeroinitializer, <vscale x 64 x double> zeroinitializer, i32 -1)
ret void
}
-declare <vscale x 1 x i8> @llvm.experimental.vector.splice.nxv1i8(<vscale x 1 x i8>, <vscale x 1 x i8>, i32)
-declare <vscale x 2 x i8> @llvm.experimental.vector.splice.nxv2i8(<vscale x 2 x i8>, <vscale x 2 x i8>, i32)
-declare <vscale x 4 x i8> @llvm.experimental.vector.splice.nxv4i8(<vscale x 4 x i8>, <vscale x 4 x i8>, i32)
-declare <vscale x 8 x i8> @llvm.experimental.vector.splice.nxv8i8(<vscale x 8 x i8>, <vscale x 8 x i8>, i32)
-declare <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, i32)
-declare <vscale x 32 x i8> @llvm.experimental.vector.splice.nxv32i8(<vscale x 32 x i8>, <vscale x 32 x i8>, i32)
-declare <vscale x 64 x i8> @llvm.experimental.vector.splice.nxv64i8(<vscale x 64 x i8>, <vscale x 64 x i8>, i32)
+declare <vscale x 1 x i8> @llvm.vector.splice.nxv1i8(<vscale x 1 x i8>, <vscale x 1 x i8>, i32)
+declare <vscale x 2 x i8> @llvm.vector.splice.nxv2i8(<vscale x 2 x i8>, <vscale x 2 x i8>, i32)
+declare <vscale x 4 x i8> @llvm.vector.splice.nxv4i8(<vscale x 4 x i8>, <vscale x 4 x i8>, i32)
+declare <vscale x 8 x i8> @llvm.vector.splice.nxv8i8(<vscale x 8 x i8>, <vscale x 8 x i8>, i32)
+declare <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, i32)
+declare <vscale x 32 x i8> @llvm.vector.splice.nxv32i8(<vscale x 32 x i8>, <vscale x 32 x i8>, i32)
+declare <vscale x 64 x i8> @llvm.vector.splice.nxv64i8(<vscale x 64 x i8>, <vscale x 64 x i8>, i32)
-declare <vscale x 1 x i16> @llvm.experimental.vector.splice.nxv1i16(<vscale x 1 x i16>, <vscale x 1 x i16>, i32)
-declare <vscale x 2 x i16> @llvm.experimental.vector.splice.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i16>, i32)
-declare <vscale x 4 x i16> @llvm.experimental.vector.splice.nxv4i16(<vscale x 4 x i16>, <vscale x 4 x i16>, i32)
-declare <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, i32)
-declare <vscale x 16 x i16> @llvm.experimental.vector.splice.nxv16i16(<vscale x 16 x i16>, <vscale x 16 x i16>, i32)
-declare <vscale x 32 x i16> @llvm.experimental.vector.splice.nxv32i16(<vscale x 32 x i16>, <vscale x 32 x i16>, i32)
-declare <vscale x 64 x i16> @llvm.experimental.vector.splice.nxv64i16(<vscale x 64 x i16>, <vscale x 64 x i16>, i32)
+declare <vscale x 1 x i16> @llvm.vector.splice.nxv1i16(<vscale x 1 x i16>, <vscale x 1 x i16>, i32)
+declare <vscale x 2 x i16> @llvm.vector.splice.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i16>, i32)
+declare <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16>, <vscale x 4 x i16>, i32)
+declare <vscale x 8 x i16> @llvm.vector.splice.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, i32)
+declare <vscale x 16 x i16> @llvm.vector.splice.nxv16i16(<vscale x 16 x i16>, <vscale x 16 x i16>, i32)
+declare <vscale x 32 x i16> @llvm.vector.splice.nxv32i16(<vscale x 32 x i16>, <vscale x 32 x i16>, i32)
+declare <vscale x 64 x i16> @llvm.vector.splice.nxv64i16(<vscale x 64 x i16>, <vscale x 64 x i16>, i32)
-declare <vscale x 1 x i32> @llvm.experimental.vector.splice.nxv1i32(<vscale x 1 x i32>, <vscale x 1 x i32>, i32)
-declare <vscale x 2 x i32> @llvm.experimental.vector.splice.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i32>, i32)
-declare <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, i32)
-declare <vscale x 8 x i32> @llvm.experimental.vector.splice.nxv8i32(<vscale x 8 x i32>, <vscale x 8 x i32>, i32)
-declare <vscale x 16 x i32> @llvm.experimental.vector.splice.nxv16i32(<vscale x 16 x i32>, <vscale x 16 x i32>, i32)
-declare <vscale x 32 x i32> @llvm.experimental.vector.splice.nxv32i32(<vscale x 32 x i32>, <vscale x 32 x i32>, i32)
-declare <vscale x 64 x i32> @llvm.experimental.vector.splice.nxv64i32(<vscale x 64 x i32>, <vscale x 64 x i32>, i32)
+declare <vscale x 1 x i32> @llvm.vector.splice.nxv1i32(<vscale x 1 x i32>, <vscale x 1 x i32>, i32)
+declare <vscale x 2 x i32> @llvm.vector.splice.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i32>, i32)
+declare <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, i32)
+declare <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32>, <vscale x 8 x i32>, i32)
+declare <vscale x 16 x i32> @llvm.vector.splice.nxv16i32(<vscale x 16 x i32>, <vscale x 16 x i32>, i32)
+declare <vscale x 32 x i32> @llvm.vector.splice.nxv32i32(<vscale x 32 x i32>, <vscale x 32 x i32>, i32)
+declare <vscale x 64 x i32> @llvm.vector.splice.nxv64i32(<vscale x 64 x i32>, <vscale x 64 x i32>, i32)
-declare <vscale x 1 x i64> @llvm.experimental.vector.splice.nxv1i64(<vscale x 1 x i64>, <vscale x 1 x i64>, i32)
-declare <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, i32)
-declare <vscale x 4 x i64> @llvm.experimental.vector.splice.nxv4i64(<vscale x 4 x i64>, <vscale x 4 x i64>, i32)
-declare <vscale x 8 x i64> @llvm.experimental.vector.splice.nxv8i64(<vscale x 8 x i64>, <vscale x 8 x i64>, i32)
-declare <vscale x 16 x i64> @llvm.experimental.vector.splice.nxv16i64(<vscale x 16 x i64>, <vscale x 16 x i64>, i32)
-declare <vscale x 32 x i64> @llvm.experimental.vector.splice.nxv32i64(<vscale x 32 x i64>, <vscale x 32 x i64>, i32)
-declare <vscale x 64 x i64> @llvm.experimental.vector.splice.nxv64i64(<vscale x 64 x i64>, <vscale x 64 x i64>, i32)
+declare <vscale x 1 x i64> @llvm.vector.splice.nxv1i64(<vscale x 1 x i64>, <vscale x 1 x i64>, i32)
+declare <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, i32)
+declare <vscale x 4 x i64> @llvm.vector.splice.nxv4i64(<vscale x 4 x i64>, <vscale x 4 x i64>, i32)
+declare <vscale x 8 x i64> @llvm.vector.splice.nxv8i64(<vscale x 8 x i64>, <vscale x 8 x i64>, i32)
+declare <vscale x 16 x i64> @llvm.vector.splice.nxv16i64(<vscale x 16 x i64>, <vscale x 16 x i64>, i32)
+declare <vscale x 32 x i64> @llvm.vector.splice.nxv32i64(<vscale x 32 x i64>, <vscale x 32 x i64>, i32)
+declare <vscale x 64 x i64> @llvm.vector.splice.nxv64i64(<vscale x 64 x i64>, <vscale x 64 x i64>, i32)
-declare <vscale x 1 x half> @llvm.experimental.vector.splice.nxv1f16(<vscale x 1 x half>, <vscale x 1 x half>, i32)
-declare <vscale x 2 x half> @llvm.experimental.vector.splice.nxv2f16(<vscale x 2 x half>, <vscale x 2 x half>, i32)
-declare <vscale x 4 x half> @llvm.experimental.vector.splice.nxv4f16(<vscale x 4 x half>, <vscale x 4 x half>, i32)
-declare <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, i32)
-declare <vscale x 16 x half> @llvm.experimental.vector.splice.nxv16f16(<vscale x 16 x half>, <vscale x 16 x half>, i32)
-declare <vscale x 32 x half> @llvm.experimental.vector.splice.nxv32f16(<vscale x 32 x half>, <vscale x 32 x half>, i32)
-declare <vscale x 64 x half> @llvm.experimental.vector.splice.nxv64f16(<vscale x 64 x half>, <vscale x 64 x half>, i32)
+declare <vscale x 1 x half> @llvm.vector.splice.nxv1f16(<vscale x 1 x half>, <vscale x 1 x half>, i32)
+declare <vscale x 2 x half> @llvm.vector.splice.nxv2f16(<vscale x 2 x half>, <vscale x 2 x half>, i32)
+declare <vscale x 4 x half> @llvm.vector.splice.nxv4f16(<vscale x 4 x half>, <vscale x 4 x half>, i32)
+declare <vscale x 8 x half> @llvm.vector.splice.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, i32)
+declare <vscale x 16 x half> @llvm.vector.splice.nxv16f16(<vscale x 16 x half>, <vscale x 16 x half>, i32)
+declare <vscale x 32 x half> @llvm.vector.splice.nxv32f16(<vscale x 32 x half>, <vscale x 32 x half>, i32)
+declare <vscale x 64 x half> @llvm.vector.splice.nxv64f16(<vscale x 64 x half>, <vscale x 64 x half>, i32)
-declare <vscale x 1 x float> @llvm.experimental.vector.splice.nxv1f32(<vscale x 1 x float>, <vscale x 1 x float>, i32)
-declare <vscale x 2 x float> @llvm.experimental.vector.splice.nxv2f32(<vscale x 2 x float>, <vscale x 2 x float>, i32)
-declare <vscale x 4 x float> @llvm.experimental.vector.splice.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, i32)
-declare <vscale x 8 x float> @llvm.experimental.vector.splice.nxv8f32(<vscale x 8 x float>, <vscale x 8 x float>, i32)
-declare <vscale x 16 x float> @llvm.experimental.vector.splice.nxv16f32(<vscale x 16 x float>, <vscale x 16 x float>, i32)
-declare <vscale x 32 x float> @llvm.experimental.vector.splice.nxv32f32(<vscale x 32 x float>, <vscale x 32 x float>, i32)
-declare <vscale x 64 x float> @llvm.experimental.vector.splice.nxv64f32(<vscale x 64 x float>, <vscale x 64 x float>, i32)
+declare <vscale x 1 x float> @llvm.vector.splice.nxv1f32(<vscale x 1 x float>, <vscale x 1 x float>, i32)
+declare <vscale x 2 x float> @llvm.vector.splice.nxv2f32(<vscale x 2 x float>, <vscale x 2 x float>, i32)
+declare <vscale x 4 x float> @llvm.vector.splice.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, i32)
+declare <vscale x 8 x float> @llvm.vector.splice.nxv8f32(<vscale x 8 x float>, <vscale x 8 x float>, i32)
+declare <vscale x 16 x float> @llvm.vector.splice.nxv16f32(<vscale x 16 x float>, <vscale x 16 x float>, i32)
+declare <vscale x 32 x float> @llvm.vector.splice.nxv32f32(<vscale x 32 x float>, <vscale x 32 x float>, i32)
+declare <vscale x 64 x float> @llvm.vector.splice.nxv64f32(<vscale x 64 x float>, <vscale x 64 x float>, i32)
-declare <vscale x 1 x double> @llvm.experimental.vector.splice.nxv1f64(<vscale x 1 x double>, <vscale x 1 x double>, i32)
-declare <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, i32)
-declare <vscale x 4 x double> @llvm.experimental.vector.splice.nxv4f64(<vscale x 4 x double>, <vscale x 4 x double>, i32)
-declare <vscale x 8 x double> @llvm.experimental.vector.splice.nxv8f64(<vscale x 8 x double>, <vscale x 8 x double>, i32)
-declare <vscale x 16 x double> @llvm.experimental.vector.splice.nxv16f64(<vscale x 16 x double>, <vscale x 16 x double>, i32)
-declare <vscale x 32 x double> @llvm.experimental.vector.splice.nxv32f64(<vscale x 32 x double>, <vscale x 32 x double>, i32)
-declare <vscale x 64 x double> @llvm.experimental.vector.splice.nxv64f64(<vscale x 64 x double>, <vscale x 64 x double>, i32)
+declare <vscale x 1 x double> @llvm.vector.splice.nxv1f64(<vscale x 1 x double>, <vscale x 1 x double>, i32)
+declare <vscale x 2 x double> @llvm.vector.splice.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, i32)
+declare <vscale x 4 x double> @llvm.vector.splice.nxv4f64(<vscale x 4 x double>, <vscale x 4 x double>, i32)
+declare <vscale x 8 x double> @llvm.vector.splice.nxv8f64(<vscale x 8 x double>, <vscale x 8 x double>, i32)
+declare <vscale x 16 x double> @llvm.vector.splice.nxv16f64(<vscale x 16 x double>, <vscale x 16 x double>, i32)
+declare <vscale x 32 x double> @llvm.vector.splice.nxv32f64(<vscale x 32 x double>, <vscale x 32 x double>, i32)
+declare <vscale x 64 x double> @llvm.vector.splice.nxv64f64(<vscale x 64 x double>, <vscale x 64 x double>, i32)
diff --git a/llvm/test/Bitcode/upgrade-vector-interleave2-deinterleave2-intrinsics.ll b/llvm/test/Bitcode/upgrade-vector-interleave2-deinterleave2-intrinsics.ll
new file mode 100644
index 000000000000..f06395945297
--- /dev/null
+++ b/llvm/test/Bitcode/upgrade-vector-interleave2-deinterleave2-intrinsics.ll
@@ -0,0 +1,46 @@
+; RUN: opt -S < %s | FileCheck %s
+; RUN: llvm-as %s -o - | llvm-dis | FileCheck %s
+
+define <8 x i32> @interleave_fixed(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: @interleave_fixed
+; CHECK: %res = call <8 x i32> @llvm.vector.interleave2.v8i32(<4 x i32> %a, <4 x i32> %b)
+
+ %res = call <8 x i32> @llvm.experimental.vector.interleave2.v8i32(<4 x i32> %a, <4 x i32> %b)
+ ret <8 x i32> %res
+}
+
+define { <4 x i32>, <4 x i32> } @deinterleave_fixed(<8 x i32> %a) {
+; CHECK-LABEL: @deinterleave_fixed
+; CHECK: %res = call { <4 x i32>, <4 x i32> } @llvm.vector.deinterleave2.v8i32(<8 x i32> %a)
+
+ %res = call { <4 x i32>, <4 x i32> } @llvm.experimental.vector.deinterleave2.v8i32(<8 x i32> %a)
+ ret { <4 x i32>, <4 x i32> } %res
+}
+
+define <vscale x 8 x i32> @interleave_scalable(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: @interleave_scalable
+; CHECK: %res = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
+
+ %res = call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
+ ret <vscale x 8 x i32> %res
+}
+
+define { <vscale x 4 x i32>, <vscale x 4 x i32> } @deinterleave_scalable(<vscale x 8 x i32> %a) {
+; CHECK-LABEL: @deinterleave_scalable
+; CHECK: %res = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %a)
+
+ %res = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %a)
+ ret { <vscale x 4 x i32>, <vscale x 4 x i32> } %res
+}
+
+declare <8 x i32> @llvm.experimental.vector.interleave2.v8i32(<4 x i32>, <4 x i32>)
+; CHECK: declare <8 x i32> @llvm.vector.interleave2.v8i32(<4 x i32>, <4 x i32>)
+
+declare { <4 x i32>, <4 x i32> } @llvm.experimental.vector.deinterleave2.v8i32(<8 x i32>)
+; CHECK: declare { <4 x i32>, <4 x i32> } @llvm.vector.deinterleave2.v8i32(<8 x i32>)
+
+declare <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
+; CHECK: declare <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
+
+declare { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32>)
+; CHECK: declare { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32>)
diff --git a/llvm/test/Bitcode/upgrade-vector-reverse-intrinsic.ll b/llvm/test/Bitcode/upgrade-vector-reverse-intrinsic.ll
new file mode 100644
index 000000000000..6b853eaf4175
--- /dev/null
+++ b/llvm/test/Bitcode/upgrade-vector-reverse-intrinsic.ll
@@ -0,0 +1,24 @@
+; RUN: opt -S < %s | FileCheck %s
+; RUN: llvm-as %s -o - | llvm-dis | FileCheck %s
+
+define <16 x i8> @reverse_fixed(<16 x i8> %a) {
+; CHECK-LABEL: @reverse_fixed
+; CHECK: %res = call <16 x i8> @llvm.vector.reverse.v16i8(<16 x i8> %a)
+
+ %res = call <16 x i8> @llvm.experimental.vector.reverse.v16i8(<16 x i8> %a)
+ ret <16 x i8> %res
+}
+
+define <vscale x 16 x i8> @reverse_scalable(<vscale x 16 x i8> %a) {
+; CHECK-LABEL: @reverse_scalable
+; CHECK: %res = call <vscale x 16 x i8> @llvm.vector.reverse.nxv16i8(<vscale x 16 x i8> %a)
+
+ %res = call <vscale x 16 x i8> @llvm.experimental.vector.reverse.nxv16i8(<vscale x 16 x i8> %a)
+ ret <vscale x 16 x i8> %res
+}
+
+declare <16 x i8> @llvm.experimental.vector.reverse.v16i8(<16 x i8>)
+; CHECK: declare <16 x i8> @llvm.vector.reverse.v16i8(<16 x i8>)
+
+declare <vscale x 16 x i8> @llvm.experimental.vector.reverse.nxv16i8(<vscale x 16 x i8>)
+; CHECK: declare <vscale x 16 x i8> @llvm.vector.reverse.nxv16i8(<vscale x 16 x i8>)
diff --git a/llvm/test/Bitcode/upgrade-vector-splice-intrinsic.ll b/llvm/test/Bitcode/upgrade-vector-splice-intrinsic.ll
new file mode 100644
index 000000000000..1b55da21ecd2
--- /dev/null
+++ b/llvm/test/Bitcode/upgrade-vector-splice-intrinsic.ll
@@ -0,0 +1,24 @@
+; RUN: opt -S < %s | FileCheck %s
+; RUN: llvm-as %s -o - | llvm-dis | FileCheck %s
+
+define <8 x half> @splice_fixed(<8 x half> %a, <8 x half> %b) {
+; CHECK-LABEL: @splice_fixed
+; CHECK: %res = call <8 x half> @llvm.vector.splice.v8f16(<8 x half> %a, <8 x half> %b, i32 2)
+
+ %res = call <8 x half> @llvm.experimental.vector.splice.v8f16(<8 x half> %a, <8 x half> %b, i32 2)
+ ret <8 x half> %res
+}
+
+define <vscale x 8 x half> @splice_scalable(<vscale x 8 x half> %a, <vscale x 8 x half> %b) {
+; CHECK-LABEL: @splice_scalable
+; CHECK: %res = call <vscale x 8 x half> @llvm.vector.splice.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, i32 2)
+
+ %res = call <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, i32 2)
+ ret <vscale x 8 x half> %res
+}
+
+declare <8 x half> @llvm.experimental.vector.splice.v8f16(<8 x half>, <8 x half>, i32 immarg)
+; CHECK: declare <8 x half> @llvm.vector.splice.v8f16(<8 x half>, <8 x half>, i32 immarg)
+
+declare <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, i32 immarg)
+; CHECK: declare <vscale x 8 x half> @llvm.vector.splice.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, i32 immarg)
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-insert-vec-elt.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-insert-vec-elt.mir
index 06fb2ce161c2..0c67a867580c 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-insert-vec-elt.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-insert-vec-elt.mir
@@ -201,3 +201,113 @@ body: |
RET_ReallyLR
...
+---
+name: test_idx_undef
+body: |
+ bb.1:
+ liveins: $x0
+ ; CHECK-LABEL: name: test_idx_undef
+ ; CHECK: liveins: $x0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: RET_ReallyLR
+ %3:_(s8) = G_CONSTANT i8 127
+ %2:_(<32 x s8>) = G_BUILD_VECTOR %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8)
+ %4:_(s8) = G_CONSTANT i8 -128
+ %5:_(s64) = G_IMPLICIT_DEF
+ %0:_(p0) = COPY $x0
+ %1:_(<32 x s8>) = G_INSERT_VECTOR_ELT %2, %4(s8), %5(s64)
+ G_STORE %1(<32 x s8>), %0(p0) :: (store (<32 x s8>))
+ RET_ReallyLR
+
+...
+---
+name: test_elt_undef
+body: |
+ bb.1:
+ liveins: $x0
+ ; CHECK-LABEL: name: test_elt_undef
+ ; CHECK: liveins: $x0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 127
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s8) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
+ ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s8>) = G_BUILD_VECTOR [[C]](s8), [[C]](s8), [[C]](s8), [[DEF]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8)
+ ; CHECK-NEXT: G_STORE [[BUILD_VECTOR]](<32 x s8>), [[COPY]](p0) :: (store (<32 x s8>))
+ ; CHECK-NEXT: RET_ReallyLR
+ %3:_(s8) = G_CONSTANT i8 127
+ %2:_(<32 x s8>) = G_BUILD_VECTOR %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8)
+ %4:_(s8) = G_IMPLICIT_DEF
+ %5:_(s64) = G_CONSTANT i64 3
+ %0:_(p0) = COPY $x0
+ %1:_(<32 x s8>) = G_INSERT_VECTOR_ELT %2, %4(s8), %5(s64)
+ G_STORE %1(<32 x s8>), %0(p0) :: (store (<32 x s8>))
+ RET_ReallyLR
+
+...
+---
+name: test_elt_undef_with_freeze
+body: |
+ bb.1:
+ liveins: $x0
+ ; CHECK-LABEL: name: test_elt_undef_with_freeze
+ ; CHECK: liveins: $x0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 127
+ ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s8>) = G_BUILD_VECTOR [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8)
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
+ ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(<32 x s8>) = G_FREEZE [[BUILD_VECTOR]]
+ ; CHECK-NEXT: G_STORE [[FREEZE]](<32 x s8>), [[COPY]](p0) :: (store (<32 x s8>))
+ ; CHECK-NEXT: RET_ReallyLR
+ %3:_(s8) = G_CONSTANT i8 127
+ %2:_(<32 x s8>) = G_BUILD_VECTOR %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8)
+ %4:_(s8) = G_IMPLICIT_DEF
+ %5:_(s64) = G_CONSTANT i64 3
+ %0:_(p0) = COPY $x0
+ %9:_(<32 x s8>) = G_FREEZE %2
+ %1:_(<32 x s8>) = G_INSERT_VECTOR_ELT %9, %4(s8), %5(s64)
+ G_STORE %1(<32 x s8>), %0(p0) :: (store (<32 x s8>))
+ RET_ReallyLR
+
+...
+---
+name: test_insert_extract
+body: |
+ bb.1:
+ liveins: $x0
+ ; CHECK-LABEL: name: test_insert_extract
+ ; CHECK: liveins: $x0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 127
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
+ ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s8>) = G_BUILD_VECTOR [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8)
+ ; CHECK-NEXT: G_STORE [[BUILD_VECTOR]](<32 x s8>), [[COPY]](p0) :: (store (<32 x s8>))
+ ; CHECK-NEXT: RET_ReallyLR
+ %3:_(s8) = G_CONSTANT i8 127
+ %2:_(<32 x s8>) = G_BUILD_VECTOR %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8)
+ %5:_(s64) = G_CONSTANT i64 3
+ %4:_(s8) = G_EXTRACT_VECTOR_ELT %2, %5
+ %0:_(p0) = COPY $x0
+ %1:_(<32 x s8>) = G_INSERT_VECTOR_ELT %2, %4(s8), %5(s64)
+ G_STORE %1(<32 x s8>), %0(p0) :: (store (<32 x s8>))
+ RET_ReallyLR
+
+...
+---
+name: test_idx_oob
+body: |
+ bb.1:
+ liveins: $x0
+ ; CHECK-LABEL: name: test_idx_oob
+ ; CHECK: liveins: $x0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: RET_ReallyLR
+ %3:_(s8) = G_CONSTANT i8 127
+ %2:_(<32 x s8>) = G_BUILD_VECTOR %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8)
+ %4:_(s8) = G_CONSTANT i8 -128
+ %5:_(s64) = G_CONSTANT i64 1024
+ %0:_(p0) = COPY $x0
+ %1:_(<32 x s8>) = G_INSERT_VECTOR_ELT %2, %4(s8), %5(s64)
+ G_STORE %1(<32 x s8>), %0(p0) :: (store (<32 x s8>))
+ RET_ReallyLR
+
+...
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-vector-deinterleave2.ll b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-vector-deinterleave2.ll
index 10882a06af1b..0b7fae47a65a 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-vector-deinterleave2.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-vector-deinterleave2.ll
@@ -11,7 +11,7 @@ define void @vector_deinterleave2_v4i32(<4 x i32> %a) {
; CHECK-NEXT: [[SHUF:%[0-9]+]]:_(<2 x s32>) = G_SHUFFLE_VECTOR [[COPY]](<4 x s32>), [[DEF]], shufflemask(0, 2)
; CHECK-NEXT: [[SHUF1:%[0-9]+]]:_(<2 x s32>) = G_SHUFFLE_VECTOR [[COPY]](<4 x s32>), [[DEF]], shufflemask(1, 3)
; CHECK-NEXT: RET_ReallyLR
- %res = call {<2 x i32>, <2 x i32>} @llvm.experimental.vector.deinterleave2.v4i32(<4 x i32> %a)
+ %res = call {<2 x i32>, <2 x i32>} @llvm.vector.deinterleave2.v4i32(<4 x i32> %a)
ret void
}

@@ -29,6 +29,6 @@ define void @vector_deinterleave2_v8f32(<8 x float> %a) {
; CHECK-NEXT: [[SHUF:%[0-9]+]]:_(<4 x s32>) = G_SHUFFLE_VECTOR [[CONCAT_VECTORS]](<8 x s32>), [[DEF]], shufflemask(0, 2, 4, 6)
; CHECK-NEXT: [[SHUF1:%[0-9]+]]:_(<4 x s32>) = G_SHUFFLE_VECTOR [[CONCAT_VECTORS]](<8 x s32>), [[DEF]], shufflemask(1, 3, 5, 7)
; CHECK-NEXT: RET_ReallyLR
- %res = call {<4 x float>, <4 x float>} @llvm.experimental.vector.deinterleave2.v8f32(<8 x float> %a)
+ %res = call {<4 x float>, <4 x float>} @llvm.vector.deinterleave2.v8f32(<8 x float> %a)
ret void
}
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-vector-interleave2.ll b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-vector-interleave2.ll
index f51e47a428d1..0d8ac82c1051 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-vector-interleave2.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-vector-interleave2.ll
@@ -10,7 +10,7 @@ define void @vector_interleave2_v4i32(<2 x i32> %a, <2 x i32> %b) {
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $d1
; CHECK-NEXT: [[SHUF:%[0-9]+]]:_(<4 x s32>) = G_SHUFFLE_VECTOR [[COPY]](<2 x s32>), [[COPY1]], shufflemask(0, 2, 1, 3)
; CHECK-NEXT: RET_ReallyLR
- %res = call <4 x i32> @llvm.experimental.vector.interleave2.v4i32(<2 x i32> %a, <2 x i32> %b)
+ %res = call <4 x i32> @llvm.vector.interleave2.v4i32(<2 x i32> %a, <2 x i32> %b)
ret void
}

@@ -25,6 +25,6 @@ define void @vector_interleave2_v8f32(<4 x float> %a, <4 x float> %b) {
; CHECK-NEXT: [[BITCAST1:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[COPY1]](<2 x s64>)
; CHECK-NEXT: [[SHUF:%[0-9]+]]:_(<8 x s32>) = G_SHUFFLE_VECTOR [[BITCAST]](<4 x s32>), [[BITCAST1]], shufflemask(0, 4, 1, 5, 2, 6, 3, 7)
; CHECK-NEXT: RET_ReallyLR
- %res = call <8 x float> @llvm.experimental.vector.interleave2.v8f32(<4 x float> %a, <4 x float> %b)
+ %res = call <8 x float> @llvm.vector.interleave2.v8f32(<4 x float> %a, <4 x float> %b)
ret void
}
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-bitcast.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-bitcast.mir
new file mode 100644
index 000000000000..e3a633c9e035
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-bitcast.mir
@@ -0,0 +1,54 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
+# RUN: llc -mtriple=aarch64 -run-pass=legalizer -global-isel-abort=1 %s -o - | FileCheck %s
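+# Check that a G_BITCAST from an oversized scalar (s512) to <16 x s32> is
+# legalized by unmerging the scalar into s32 pieces and rebuilding <4 x s32>
+# chunks, and that the resulting wide store is split into four 16-byte stores
+# at offsets 0, 16, 32 and 48.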
+---
+name: scalar_to_oversize_vector
+tracksRegLiveness: true
+body: |
+ ; CHECK-LABEL: name: scalar_to_oversize_vector
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.1(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: G_BR %bb.1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: successors: %bb.1(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C]](s64)
+ ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C]](s64)
+ ; CHECK-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C]](s64)
+ ; CHECK-NEXT: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C]](s64)
+ ; CHECK-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C]](s64)
+ ; CHECK-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C]](s64)
+ ; CHECK-NEXT: [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C]](s64)
+ ; CHECK-NEXT: [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C]](s64)
+ ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[UV2]](s32), [[UV3]](s32)
+ ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[UV4]](s32), [[UV5]](s32), [[UV6]](s32), [[UV7]](s32)
+ ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[UV8]](s32), [[UV9]](s32), [[UV10]](s32), [[UV11]](s32)
+ ; CHECK-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[UV12]](s32), [[UV13]](s32), [[UV14]](s32), [[UV15]](s32)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(p0) = G_CONSTANT i64 0
+ ; CHECK-NEXT: G_STORE [[BUILD_VECTOR]](<4 x s32>), [[C1]](p0) :: (store (<4 x s32>), align 64)
+ ; CHECK-NEXT: [[C2:%[0-9]+]]:_(p0) = G_CONSTANT i64 16
+ ; CHECK-NEXT: G_STORE [[BUILD_VECTOR1]](<4 x s32>), [[C2]](p0) :: (store (<4 x s32>) into unknown-address + 16)
+ ; CHECK-NEXT: [[C3:%[0-9]+]]:_(p0) = G_CONSTANT i64 32
+ ; CHECK-NEXT: G_STORE [[BUILD_VECTOR2]](<4 x s32>), [[C3]](p0) :: (store (<4 x s32>) into unknown-address + 32, align 32)
+ ; CHECK-NEXT: [[C4:%[0-9]+]]:_(p0) = G_CONSTANT i64 48
+ ; CHECK-NEXT: G_STORE [[BUILD_VECTOR3]](<4 x s32>), [[C4]](p0) :: (store (<4 x s32>) into unknown-address + 48)
+ ; CHECK-NEXT: G_BR %bb.1
+ bb.1:
+ %0:_(s512) = G_CONSTANT i512 0
+ %2:_(p0) = G_CONSTANT i64 0
+ G_BR %bb.2
+
+ bb.2:
+ %4:_(s512) = G_CONSTANT i512 0
+ %1:_(<16 x s32>) = G_BITCAST %4(s512)
+ %3:_(p0) = G_CONSTANT i64 0
+ G_STORE %1(<16 x s32>), %3(p0) :: (store (<16 x s32>))
+ G_BR %bb.2
+
+...
diff --git a/llvm/test/CodeGen/AArch64/arm64ec-entry-thunks.ll b/llvm/test/CodeGen/AArch64/arm64ec-entry-thunks.ll
index bb9ba05f7a27..c00c9bfe127e 100644
--- a/llvm/test/CodeGen/AArch64/arm64ec-entry-thunks.ll
+++ b/llvm/test/CodeGen/AArch64/arm64ec-entry-thunks.ll
@@ -223,8 +223,8 @@ define i8 @matches_has_sret() nounwind {
%TSRet = type { i64, i64 }

define void @has_aligned_sret(ptr align 32 sret(%TSRet)) nounwind {
-; CHECK-LABEL: .def $ientry_thunk$cdecl$m16a32$v;
-; CHECK: .section .wowthk$aa,"xr",discard,$ientry_thunk$cdecl$m16a32$v
+; CHECK-LABEL: .def $ientry_thunk$cdecl$m16$v;
+; CHECK: .section .wowthk$aa,"xr",discard,$ientry_thunk$cdecl$m16$v
; CHECK: // %bb.0:
; CHECK-NEXT: stp q6, q7, [sp, #-176]! // 32-byte Folded Spill
; CHECK-NEXT: .seh_save_any_reg_px q6, 176
@@ -457,7 +457,7 @@ define %T2 @simple_struct(%T1 %0, %T2 %1, %T3, %T4) nounwind {
; CHECK-NEXT: .symidx $ientry_thunk$cdecl$i8$v
; CHECK-NEXT: .word 1
; CHECK-NEXT: .symidx "#has_aligned_sret"
-; CHECK-NEXT: .symidx $ientry_thunk$cdecl$m16a32$v
+; CHECK-NEXT: .symidx $ientry_thunk$cdecl$m16$v
; CHECK-NEXT: .word 1
; CHECK-NEXT: .symidx "#small_array"
; CHECK-NEXT: .symidx $ientry_thunk$cdecl$m2$m2F8
diff --git a/llvm/test/CodeGen/AArch64/arm64ec-exit-thunks.ll b/llvm/test/CodeGen/AArch64/arm64ec-exit-thunks.ll
index 3b911e78aff2..7a40fcd85ac5 100644
--- a/llvm/test/CodeGen/AArch64/arm64ec-exit-thunks.ll
+++ b/llvm/test/CodeGen/AArch64/arm64ec-exit-thunks.ll
@@ -236,8 +236,8 @@ declare void @has_sret(ptr sret([100 x i8])) nounwind;
%TSRet = type { i64, i64 }

declare void @has_aligned_sret(ptr align 32 sret(%TSRet)) nounwind;
-; CHECK-LABEL: .def $iexit_thunk$cdecl$m16a32$v;
-; CHECK: .section .wowthk$aa,"xr",discard,$iexit_thunk$cdecl$m16a32$v
+; CHECK-LABEL: .def $iexit_thunk$cdecl$m16$v;
+; CHECK: .section .wowthk$aa,"xr",discard,$iexit_thunk$cdecl$m16$v
; CHECK: // %bb.0:
; CHECK-NEXT: sub sp, sp, #48
; CHECK-NEXT: .seh_stackalloc 48
@@ -271,8 +271,8 @@ declare void @has_aligned_sret(ptr align 32 sret(%TSRet)) nounwind;
; CHECK: adrp x11, has_aligned_sret
; CHECK: add x11, x11, :lo12:has_aligned_sret
; CHECK: ldr x9, [x9, :lo12:__os_arm64x_check_icall]
-; CHECK: adrp x10, ($iexit_thunk$cdecl$m16a32$v)
-; CHECK: add x10, x10, :lo12:($iexit_thunk$cdecl$m16a32$v)
+; CHECK: adrp x10, ($iexit_thunk$cdecl$m16$v)
+; CHECK: add x10, x10, :lo12:($iexit_thunk$cdecl$m16$v)
; CHECK: blr x9
; CHECK: .seh_startepilogue
; CHECK: ldr x30, [sp], #16 // 8-byte Folded Reload
@@ -492,7 +492,7 @@ declare %T2 @simple_struct(%T1, %T2, %T3, %T4) nounwind;
; CHECK-NEXT: .symidx has_sret
; CHECK-NEXT: .word 0
; CHECK-NEXT: .symidx has_aligned_sret
-; CHECK-NEXT: .symidx $iexit_thunk$cdecl$m16a32$v
+; CHECK-NEXT: .symidx $iexit_thunk$cdecl$m16$v
; CHECK-NEXT: .word 4
; CHECK-NEXT: .symidx "#has_aligned_sret$exit_thunk"
; CHECK-NEXT: .symidx has_aligned_sret
diff --git a/llvm/test/CodeGen/AArch64/combine-mul.ll b/llvm/test/CodeGen/AArch64/combine-mul.ll
index a2b042530809..c49e5ae6620a 100644
--- a/llvm/test/CodeGen/AArch64/combine-mul.ll
+++ b/llvm/test/CodeGen/AArch64/combine-mul.ll
@@ -44,8 +44,7 @@ define <4 x i1> @PR48683_vec_undef(<4 x i32> %x) {
define i64 @combine_mul_self_demandedbits(i64 %x) {
; CHECK-LABEL: combine_mul_self_demandedbits:
; CHECK: // %bb.0:
-; CHECK-NEXT: mul x8, x0, x0
-; CHECK-NEXT: and x0, x8, #0xfffffffffffffffd
+; CHECK-NEXT: mul x0, x0, x0
; CHECK-NEXT: ret
%1 = mul i64 %x, %x
%2 = and i64 %1, -3
@@ -77,7 +76,7 @@ define i8 @one_demanded_bit(i8 %x) {
define <2 x i64> @one_demanded_bit_splat(<2 x i64> %x) {
; CHECK-LABEL: one_demanded_bit_splat:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #32
+; CHECK-NEXT: mov w8, #32 // =0x20
; CHECK-NEXT: shl v0.2d, v0.2d, #5
; CHECK-NEXT: dup v1.2d, x8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
@@ -131,7 +130,7 @@ define i32 @squared_demanded_2_low_bits(i32 %x) {
define <2 x i64> @squared_demanded_2_low_bits_splat(<2 x i64> %x) {
; CHECK-LABEL: squared_demanded_2_low_bits_splat:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov x8, #-2
+; CHECK-NEXT: mov x8, #-2 // =0xfffffffffffffffe
; CHECK-NEXT: dup v1.2d, x8
; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-contract.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-contract.ll
index 86b1d5d195ff..0485d530fd06 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-contract.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-contract.ll
@@ -25,10 +25,10 @@ define <vscale x 4 x double> @mull_add(<vscale x 4 x double> %a, <vscale x 4 x d
; CHECK-NEXT: zip2 z1.d, z2.d, z1.d
; CHECK-NEXT: ret
entry:
- %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
+ %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
%0 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 0
%1 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 1
- %strided.vec29 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
+ %strided.vec29 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
%2 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec29, 0
%3 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec29, 1
%4 = fmul contract <vscale x 2 x double> %0, %3
@@ -37,12 +37,12 @@ entry:
%7 = fmul contract <vscale x 2 x double> %0, %2
%8 = fmul contract <vscale x 2 x double> %1, %3
%9 = fsub contract <vscale x 2 x double> %7, %8
- %strided.vec31 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %c)
+ %strided.vec31 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %c)
%10 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec31, 0
%11 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec31, 1
%12 = fadd contract <vscale x 2 x double> %10, %9
%13 = fadd contract <vscale x 2 x double> %6, %11
- %interleaved.vec = tail call <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double> %12, <vscale x 2 x double> %13)
+ %interleaved.vec = tail call <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double> %12, <vscale x 2 x double> %13)
ret <vscale x 4 x double> %interleaved.vec
}

@@ -67,10 +67,10 @@ define <vscale x 4 x double> @mul_add_mull(<vscale x 4 x double> %a, <vscale x 4
; CHECK-NEXT: fadd z0.d, z25.d, z27.d
; CHECK-NEXT: ret
entry:
- %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
+ %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
%0 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 0
%1 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 1
- %strided.vec52 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
+ %strided.vec52 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
%2 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec52, 0
%3 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec52, 1
%4 = fmul contract <vscale x 2 x double> %0, %3
@@ -79,10 +79,10 @@ entry:
%7 = fmul contract <vscale x 2 x double> %0, %2
%8 = fmul contract <vscale x 2 x double> %1, %3
%9 = fsub contract <vscale x 2 x double> %7, %8
- %strided.vec54 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %c)
+ %strided.vec54 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %c)
%10 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec54, 0
%11 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec54, 1
- %strided.vec56 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %d)
+ %strided.vec56 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %d)
%12 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec56, 0
%13 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec56, 1
%14 = fmul contract <vscale x 2 x double> %10, %13
@@ -93,7 +93,7 @@ entry:
%19 = fsub contract <vscale x 2 x double> %17, %18
%20 = fadd contract <vscale x 2 x double> %9, %19
%21 = fadd contract <vscale x 2 x double> %6, %16
- %interleaved.vec = tail call <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double> %20, <vscale x 2 x double> %21)
+ %interleaved.vec = tail call <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double> %20, <vscale x 2 x double> %21)
ret <vscale x 4 x double> %interleaved.vec
}

@@ -118,10 +118,10 @@ define <vscale x 4 x double> @mul_sub_mull(<vscale x 4 x double> %a, <vscale x 4
; CHECK-NEXT: fsub z0.d, z25.d, z27.d
; CHECK-NEXT: ret
entry:
- %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
+ %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
%0 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 0
%1 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 1
- %strided.vec52 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
+ %strided.vec52 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
%2 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec52, 0
%3 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec52, 1
%4 = fmul contract <vscale x 2 x double> %0, %3
@@ -130,10 +130,10 @@ entry:
%7 = fmul contract <vscale x 2 x double> %0, %2
%8 = fmul contract <vscale x 2 x double> %1, %3
%9 = fsub contract <vscale x 2 x double> %7, %8
- %strided.vec54 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %c)
+ %strided.vec54 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %c)
%10 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec54, 0
%11 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec54, 1
- %strided.vec56 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %d)
+ %strided.vec56 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %d)
%12 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec56, 0
%13 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec56, 1
%14 = fmul contract <vscale x 2 x double> %10, %13
@@ -144,7 +144,7 @@ entry:
%19 = fsub contract <vscale x 2 x double> %17, %18
%20 = fsub contract <vscale x 2 x double> %9, %19
%21 = fsub contract <vscale x 2 x double> %6, %16
- %interleaved.vec = tail call <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double> %20, <vscale x 2 x double> %21)
+ %interleaved.vec = tail call <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double> %20, <vscale x 2 x double> %21)
ret <vscale x 4 x double> %interleaved.vec
}

@@ -169,10 +169,10 @@ define <vscale x 4 x double> @mul_conj_mull(<vscale x 4 x double> %a, <vscale x
; CHECK-NEXT: fadd z0.d, z25.d, z27.d
; CHECK-NEXT: ret
entry:
- %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
+ %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
%0 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 0
%1 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 1
- %strided.vec60 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
+ %strided.vec60 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
%2 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec60, 0
%3 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec60, 1
%4 = fmul contract <vscale x 2 x double> %0, %3
@@ -181,10 +181,10 @@ entry:
%7 = fmul contract <vscale x 2 x double> %0, %2
%8 = fmul contract <vscale x 2 x double> %1, %3
%9 = fsub contract <vscale x 2 x double> %7, %8
- %strided.vec62 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %c)
+ %strided.vec62 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %c)
%10 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec62, 0
%11 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec62, 1
- %strided.vec64 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %d)
+ %strided.vec64 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %d)
%12 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec64, 0
%13 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec64, 1
%14 = fmul contract <vscale x 2 x double> %10, %13
@@ -195,7 +195,7 @@ entry:
%19 = fadd contract <vscale x 2 x double> %17, %18
%20 = fadd contract <vscale x 2 x double> %9, %19
%21 = fadd contract <vscale x 2 x double> %6, %16
- %interleaved.vec = tail call <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double> %20, <vscale x 2 x double> %21)
+ %interleaved.vec = tail call <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double> %20, <vscale x 2 x double> %21)
ret <vscale x 4 x double> %interleaved.vec
}

@@ -238,10 +238,10 @@ define <vscale x 4 x double> @mul_add_rot_mull(<vscale x 4 x double> %a, <vscale
; CHECK-NEXT: zip2 z1.d, z2.d, z1.d
; CHECK-NEXT: ret
entry:
- %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
+ %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
%0 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 0
%1 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 1
- %strided.vec78 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
+ %strided.vec78 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
%2 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec78, 0
%3 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec78, 1
%4 = fmul contract <vscale x 2 x double> %0, %3
@@ -250,14 +250,14 @@ entry:
%7 = fmul contract <vscale x 2 x double> %0, %2
%8 = fmul contract <vscale x 2 x double> %1, %3
%9 = fsub contract <vscale x 2 x double> %7, %8
- %strided.vec80 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %c)
+ %strided.vec80 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %c)
%10 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec80, 0
%11 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec80, 1
%12 = tail call contract <vscale x 2 x double> @llvm.copysign.nxv2f64(<vscale x 2 x double> zeroinitializer, <vscale x 2 x double> %11)
%13 = fadd contract <vscale x 2 x double> %10, %12
%14 = tail call contract <vscale x 2 x double> @llvm.copysign.nxv2f64(<vscale x 2 x double> zeroinitializer, <vscale x 2 x double> %10)
%15 = fsub contract <vscale x 2 x double> %14, %11
- %strided.vec82 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %d)
+ %strided.vec82 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %d)
%16 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec82, 0
%17 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec82, 1
%18 = fmul contract <vscale x 2 x double> %15, %17
@@ -268,10 +268,10 @@ entry:
%23 = fsub contract <vscale x 2 x double> %21, %22
%24 = fadd contract <vscale x 2 x double> %9, %23
%25 = fadd contract <vscale x 2 x double> %6, %20
- %interleaved.vec = tail call <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double> %24, <vscale x 2 x double> %25)
+ %interleaved.vec = tail call <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double> %24, <vscale x 2 x double> %25)
ret <vscale x 4 x double> %interleaved.vec
}

-declare { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
-declare <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double>, <vscale x 2 x double>)
+declare { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
+declare <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double>, <vscale x 2 x double>)
declare <vscale x 2 x double> @llvm.copysign.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>)
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-fast.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-fast.ll
index edf580e334e8..c643ae9265c0 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-fast.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-fast.ll
@@ -16,24 +16,24 @@ define <vscale x 4 x double> @mull_add(<vscale x 4 x double> %a, <vscale x 4 x d
; CHECK-NEXT: mov z1.d, z5.d
; CHECK-NEXT: ret
entry:
- %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
+ %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
%0 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 0
%1 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 1
- %strided.vec29 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
+ %strided.vec29 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
%2 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec29, 0
%3 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec29, 1
%4 = fmul fast <vscale x 2 x double> %3, %0
%5 = fmul fast <vscale x 2 x double> %2, %1
%6 = fadd fast <vscale x 2 x double> %4, %5
%7 = fmul fast <vscale x 2 x double> %2, %0
- %strided.vec31 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %c)
+ %strided.vec31 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %c)
%8 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec31, 0
%9 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec31, 1
%10 = fadd fast <vscale x 2 x double> %8, %7
%11 = fmul fast <vscale x 2 x double> %3, %1
%12 = fsub fast <vscale x 2 x double> %10, %11
%13 = fadd fast <vscale x 2 x double> %6, %9
- %interleaved.vec = tail call <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double> %12, <vscale x 2 x double> %13)
+ %interleaved.vec = tail call <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double> %12, <vscale x 2 x double> %13)
ret <vscale x 4 x double> %interleaved.vec
}

@@ -56,20 +56,20 @@ define <vscale x 4 x double> @mul_add_mull(<vscale x 4 x double> %a, <vscale x 4
; CHECK-NEXT: mov z0.d, z25.d
; CHECK-NEXT: ret
entry:
- %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
+ %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
%0 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 0
%1 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 1
- %strided.vec52 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
+ %strided.vec52 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
%2 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec52, 0
%3 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec52, 1
%4 = fmul fast <vscale x 2 x double> %3, %0
%5 = fmul fast <vscale x 2 x double> %2, %1
%6 = fmul fast <vscale x 2 x double> %2, %0
%7 = fmul fast <vscale x 2 x double> %3, %1
- %strided.vec54 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %c)
+ %strided.vec54 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %c)
%8 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec54, 0
%9 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec54, 1
- %strided.vec56 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %d)
+ %strided.vec56 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %d)
%10 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec56, 0
%11 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec56, 1
%12 = fmul fast <vscale x 2 x double> %11, %8
@@ -82,7 +82,7 @@ entry:
%19 = fadd fast <vscale x 2 x double> %4, %5
%20 = fadd fast <vscale x 2 x double> %19, %13
%21 = fadd fast <vscale x 2 x double> %20, %12
- %interleaved.vec = tail call <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double> %18, <vscale x 2 x double> %21)
+ %interleaved.vec = tail call <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double> %18, <vscale x 2 x double> %21)
ret <vscale x 4 x double> %interleaved.vec
}

@@ -105,20 +105,20 @@ define <vscale x 4 x double> @mul_sub_mull(<vscale x 4 x double> %a, <vscale x 4
; CHECK-NEXT: mov z0.d, z25.d
; CHECK-NEXT: ret
entry:
- %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
+ %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
%0 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 0
%1 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 1
- %strided.vec54 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
+ %strided.vec54 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
%2 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec54, 0
%3 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec54, 1
%4 = fmul fast <vscale x 2 x double> %3, %0
%5 = fmul fast <vscale x 2 x double> %2, %1
%6 = fmul fast <vscale x 2 x double> %2, %0
%7 = fmul fast <vscale x 2 x double> %3, %1
- %strided.vec56 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %c)
+ %strided.vec56 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %c)
%8 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec56, 0
%9 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec56, 1
- %strided.vec58 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %d)
+ %strided.vec58 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %d)
%10 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec58, 0
%11 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec58, 1
%12 = fmul fast <vscale x 2 x double> %11, %9
@@ -131,7 +131,7 @@ entry:
%19 = fadd fast <vscale x 2 x double> %18, %17
%20 = fadd fast <vscale x 2 x double> %4, %5
%21 = fsub fast <vscale x 2 x double> %20, %19
- %interleaved.vec = tail call <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double> %16, <vscale x 2 x double> %21)
+ %interleaved.vec = tail call <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double> %16, <vscale x 2 x double> %21)
ret <vscale x 4 x double> %interleaved.vec
}

@@ -154,19 +154,19 @@ define <vscale x 4 x double> @mul_conj_mull(<vscale x 4 x double> %a, <vscale x
; CHECK-NEXT: mov z0.d, z25.d
; CHECK-NEXT: ret
entry:
- %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
+ %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
%0 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 0
%1 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 1
- %strided.vec60 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
+ %strided.vec60 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
%2 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec60, 0
%3 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec60, 1
%4 = fmul fast <vscale x 2 x double> %3, %0
%5 = fmul fast <vscale x 2 x double> %2, %1
%6 = fmul fast <vscale x 2 x double> %2, %0
- %strided.vec62 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %c)
+ %strided.vec62 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %c)
%7 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec62, 0
%8 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec62, 1
- %strided.vec64 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %d)
+ %strided.vec64 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %d)
%9 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec64, 0
%10 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec64, 1
%11 = fmul fast <vscale x 2 x double> %10, %7
@@ -180,7 +180,7 @@ entry:
%19 = fmul fast <vscale x 2 x double> %9, %8
%20 = fsub fast <vscale x 2 x double> %18, %19
%21 = fadd fast <vscale x 2 x double> %20, %11
- %interleaved.vec = tail call <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double> %17, <vscale x 2 x double> %21)
+ %interleaved.vec = tail call <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double> %17, <vscale x 2 x double> %21)
ret <vscale x 4 x double> %interleaved.vec
}

@@ -211,20 +211,20 @@ define <vscale x 4 x double> @mul_add_rot_mull(<vscale x 4 x double> %a, <vscale
; CHECK-NEXT: zip2 z1.d, z2.d, z1.d
; CHECK-NEXT: ret
entry:
- %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
+ %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
%0 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 0
%1 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 1
- %strided.vec80 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
+ %strided.vec80 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
%2 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec80, 0
%3 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec80, 1
%4 = fmul fast <vscale x 2 x double> %3, %0
%5 = fmul fast <vscale x 2 x double> %2, %1
%6 = fmul fast <vscale x 2 x double> %2, %0
%7 = fmul fast <vscale x 2 x double> %3, %1
- %strided.vec82 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %c)
+ %strided.vec82 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %c)
%8 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec82, 0
%9 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec82, 1
- %strided.vec84 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %d)
+ %strided.vec84 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %d)
%10 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec84, 0
%11 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec84, 1
%12 = fmul fast <vscale x 2 x double> %10, %8
@@ -237,9 +237,9 @@ entry:
%19 = fadd fast <vscale x 2 x double> %18, %12
%20 = fmul fast <vscale x 2 x double> %11, %9
%21 = fsub fast <vscale x 2 x double> %19, %20
- %interleaved.vec = tail call <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double> %17, <vscale x 2 x double> %21)
+ %interleaved.vec = tail call <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double> %17, <vscale x 2 x double> %21)
ret <vscale x 4 x double> %interleaved.vec
}

-declare { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
-declare <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double>, <vscale x 2 x double>)
+declare { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
+declare <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double>, <vscale x 2 x double>)
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add-scalable.ll
index 48b5756b01fb..dae8d9f89e99 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add-scalable.ll
@@ -24,15 +24,15 @@ define <vscale x 4 x half> @complex_add_v4f16(<vscale x 4 x half> %a, <vscale x
; CHECK-NEXT: uzp1 z0.s, z0.s, z2.s
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <vscale x 2 x half>, <vscale x 2 x half> } @llvm.experimental.vector.deinterleave2.nxv4f16(<vscale x 4 x half> %a)
+ %a.deinterleaved = tail call { <vscale x 2 x half>, <vscale x 2 x half> } @llvm.vector.deinterleave2.nxv4f16(<vscale x 4 x half> %a)
%a.real = extractvalue { <vscale x 2 x half>, <vscale x 2 x half> } %a.deinterleaved, 0
%a.imag = extractvalue { <vscale x 2 x half>, <vscale x 2 x half> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <vscale x 2 x half>, <vscale x 2 x half> } @llvm.experimental.vector.deinterleave2.nxv4f16(<vscale x 4 x half> %b)
+ %b.deinterleaved = tail call { <vscale x 2 x half>, <vscale x 2 x half> } @llvm.vector.deinterleave2.nxv4f16(<vscale x 4 x half> %b)
%b.real = extractvalue { <vscale x 2 x half>, <vscale x 2 x half> } %b.deinterleaved, 0
%b.imag = extractvalue { <vscale x 2 x half>, <vscale x 2 x half> } %b.deinterleaved, 1
%0 = fsub fast <vscale x 2 x half> %b.real, %a.imag
%1 = fadd fast <vscale x 2 x half> %b.imag, %a.real
- %interleaved.vec = tail call <vscale x 4 x half> @llvm.experimental.vector.interleave2.nxv4f16(<vscale x 2 x half> %0, <vscale x 2 x half> %1)
+ %interleaved.vec = tail call <vscale x 4 x half> @llvm.vector.interleave2.nxv4f16(<vscale x 2 x half> %0, <vscale x 2 x half> %1)
ret <vscale x 4 x half> %interleaved.vec
}

@@ -45,15 +45,15 @@ define <vscale x 8 x half> @complex_add_v8f16(<vscale x 8 x half> %a, <vscale x
; CHECK-NEXT: mov z0.d, z1.d
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <vscale x 4 x half>, <vscale x 4 x half> } @llvm.experimental.vector.deinterleave2.nxv8f16(<vscale x 8 x half> %a)
+ %a.deinterleaved = tail call { <vscale x 4 x half>, <vscale x 4 x half> } @llvm.vector.deinterleave2.nxv8f16(<vscale x 8 x half> %a)
%a.real = extractvalue { <vscale x 4 x half>, <vscale x 4 x half> } %a.deinterleaved, 0
%a.imag = extractvalue { <vscale x 4 x half>, <vscale x 4 x half> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <vscale x 4 x half>, <vscale x 4 x half> } @llvm.experimental.vector.deinterleave2.nxv8f16(<vscale x 8 x half> %b)
+ %b.deinterleaved = tail call { <vscale x 4 x half>, <vscale x 4 x half> } @llvm.vector.deinterleave2.nxv8f16(<vscale x 8 x half> %b)
%b.real = extractvalue { <vscale x 4 x half>, <vscale x 4 x half> } %b.deinterleaved, 0
%b.imag = extractvalue { <vscale x 4 x half>, <vscale x 4 x half> } %b.deinterleaved, 1
%0 = fsub fast <vscale x 4 x half> %b.real, %a.imag
%1 = fadd fast <vscale x 4 x half> %b.imag, %a.real
- %interleaved.vec = tail call <vscale x 8 x half> @llvm.experimental.vector.interleave2.nxv8f16(<vscale x 4 x half> %0, <vscale x 4 x half> %1)
+ %interleaved.vec = tail call <vscale x 8 x half> @llvm.vector.interleave2.nxv8f16(<vscale x 4 x half> %0, <vscale x 4 x half> %1)
ret <vscale x 8 x half> %interleaved.vec
}

@@ -68,15 +68,15 @@ define <vscale x 16 x half> @complex_add_v16f16(<vscale x 16 x half> %a, <vscale
; CHECK-NEXT: mov z1.d, z3.d
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.experimental.vector.deinterleave2.nxv16f16(<vscale x 16 x half> %a)
+ %a.deinterleaved = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.vector.deinterleave2.nxv16f16(<vscale x 16 x half> %a)
%a.real = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } %a.deinterleaved, 0
%a.imag = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.experimental.vector.deinterleave2.nxv16f16(<vscale x 16 x half> %b)
+ %b.deinterleaved = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.vector.deinterleave2.nxv16f16(<vscale x 16 x half> %b)
%b.real = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } %b.deinterleaved, 0
%b.imag = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } %b.deinterleaved, 1
%0 = fsub fast <vscale x 8 x half> %b.real, %a.imag
%1 = fadd fast <vscale x 8 x half> %b.imag, %a.real
- %interleaved.vec = tail call <vscale x 16 x half> @llvm.experimental.vector.interleave2.nxv16f16(<vscale x 8 x half> %0, <vscale x 8 x half> %1)
+ %interleaved.vec = tail call <vscale x 16 x half> @llvm.vector.interleave2.nxv16f16(<vscale x 8 x half> %0, <vscale x 8 x half> %1)
ret <vscale x 16 x half> %interleaved.vec
}

@@ -95,26 +95,26 @@ define <vscale x 32 x half> @complex_add_v32f16(<vscale x 32 x half> %a, <vscale
; CHECK-NEXT: mov z3.d, z7.d
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <vscale x 16 x half>, <vscale x 16 x half> } @llvm.experimental.vector.deinterleave2.nxv32f16(<vscale x 32 x half> %a)
+ %a.deinterleaved = tail call { <vscale x 16 x half>, <vscale x 16 x half> } @llvm.vector.deinterleave2.nxv32f16(<vscale x 32 x half> %a)
%a.real = extractvalue { <vscale x 16 x half>, <vscale x 16 x half> } %a.deinterleaved, 0
%a.imag = extractvalue { <vscale x 16 x half>, <vscale x 16 x half> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <vscale x 16 x half>, <vscale x 16 x half> } @llvm.experimental.vector.deinterleave2.nxv32f16(<vscale x 32 x half> %b)
+ %b.deinterleaved = tail call { <vscale x 16 x half>, <vscale x 16 x half> } @llvm.vector.deinterleave2.nxv32f16(<vscale x 32 x half> %b)
%b.real = extractvalue { <vscale x 16 x half>, <vscale x 16 x half> } %b.deinterleaved, 0
%b.imag = extractvalue { <vscale x 16 x half>, <vscale x 16 x half> } %b.deinterleaved, 1
%0 = fsub fast <vscale x 16 x half> %b.real, %a.imag
%1 = fadd fast <vscale x 16 x half> %b.imag, %a.real
- %interleaved.vec = tail call <vscale x 32 x half> @llvm.experimental.vector.interleave2.nxv32f16(<vscale x 16 x half> %0, <vscale x 16 x half> %1)
+ %interleaved.vec = tail call <vscale x 32 x half> @llvm.vector.interleave2.nxv32f16(<vscale x 16 x half> %0, <vscale x 16 x half> %1)
ret <vscale x 32 x half> %interleaved.vec
}

-declare { <vscale x 2 x half>, <vscale x 2 x half> } @llvm.experimental.vector.deinterleave2.nxv4f16(<vscale x 4 x half>)
-declare <vscale x 4 x half> @llvm.experimental.vector.interleave2.nxv4f16(<vscale x 2 x half>, <vscale x 2 x half>)
+declare { <vscale x 2 x half>, <vscale x 2 x half> } @llvm.vector.deinterleave2.nxv4f16(<vscale x 4 x half>)
+declare <vscale x 4 x half> @llvm.vector.interleave2.nxv4f16(<vscale x 2 x half>, <vscale x 2 x half>)

-declare { <vscale x 4 x half>, <vscale x 4 x half> } @llvm.experimental.vector.deinterleave2.nxv8f16(<vscale x 8 x half>)
-declare <vscale x 8 x half> @llvm.experimental.vector.interleave2.nxv8f16(<vscale x 4 x half>, <vscale x 4 x half>)
+declare { <vscale x 4 x half>, <vscale x 4 x half> } @llvm.vector.deinterleave2.nxv8f16(<vscale x 8 x half>)
+declare <vscale x 8 x half> @llvm.vector.interleave2.nxv8f16(<vscale x 4 x half>, <vscale x 4 x half>)

-declare { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.experimental.vector.deinterleave2.nxv16f16(<vscale x 16 x half>)
-declare <vscale x 16 x half> @llvm.experimental.vector.interleave2.nxv16f16(<vscale x 8 x half>, <vscale x 8 x half>)
+declare { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.vector.deinterleave2.nxv16f16(<vscale x 16 x half>)
+declare <vscale x 16 x half> @llvm.vector.interleave2.nxv16f16(<vscale x 8 x half>, <vscale x 8 x half>)

-declare { <vscale x 16 x half>, <vscale x 16 x half> } @llvm.experimental.vector.deinterleave2.nxv32f16(<vscale x 32 x half>)
-declare <vscale x 32 x half> @llvm.experimental.vector.interleave2.nxv32f16(<vscale x 16 x half>, <vscale x 16 x half>)
+declare { <vscale x 16 x half>, <vscale x 16 x half> } @llvm.vector.deinterleave2.nxv32f16(<vscale x 32 x half>)
+declare <vscale x 32 x half> @llvm.vector.interleave2.nxv32f16(<vscale x 16 x half>, <vscale x 16 x half>)
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add.ll
index 7cdb10e7159f..a5c64c0982d0 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add.ll
@@ -127,15 +127,15 @@ define <4 x half> @complex_add_v4f16_with_intrinsic(<4 x half> %a, <4 x half> %b
; CHECK-NEXT: fcadd v0.4h, v1.4h, v0.4h, #90
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <2 x half>, <2 x half> } @llvm.experimental.vector.deinterleave2.v4f16(<4 x half> %a)
+ %a.deinterleaved = tail call { <2 x half>, <2 x half> } @llvm.vector.deinterleave2.v4f16(<4 x half> %a)
%a.real = extractvalue { <2 x half>, <2 x half> } %a.deinterleaved, 0
%a.imag = extractvalue { <2 x half>, <2 x half> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <2 x half>, <2 x half> } @llvm.experimental.vector.deinterleave2.v4f16(<4 x half> %b)
+ %b.deinterleaved = tail call { <2 x half>, <2 x half> } @llvm.vector.deinterleave2.v4f16(<4 x half> %b)
%b.real = extractvalue { <2 x half>, <2 x half> } %b.deinterleaved, 0
%b.imag = extractvalue { <2 x half>, <2 x half> } %b.deinterleaved, 1
%0 = fsub fast <2 x half> %b.real, %a.imag
%1 = fadd fast <2 x half> %b.imag, %a.real
- %interleaved.vec = tail call <4 x half> @llvm.experimental.vector.interleave2.v4f16(<2 x half> %0, <2 x half> %1)
+ %interleaved.vec = tail call <4 x half> @llvm.vector.interleave2.v4f16(<2 x half> %0, <2 x half> %1)
ret <4 x half> %interleaved.vec
}

@@ -146,15 +146,15 @@ define <8 x half> @complex_add_v8f16_with_intrinsic(<8 x half> %a, <8 x half> %b
; CHECK-NEXT: fcadd v0.8h, v1.8h, v0.8h, #90
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <4 x half>, <4 x half> } @llvm.experimental.vector.deinterleave2.v8f16(<8 x half> %a)
+ %a.deinterleaved = tail call { <4 x half>, <4 x half> } @llvm.vector.deinterleave2.v8f16(<8 x half> %a)
%a.real = extractvalue { <4 x half>, <4 x half> } %a.deinterleaved, 0
%a.imag = extractvalue { <4 x half>, <4 x half> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <4 x half>, <4 x half> } @llvm.experimental.vector.deinterleave2.v8f16(<8 x half> %b)
+ %b.deinterleaved = tail call { <4 x half>, <4 x half> } @llvm.vector.deinterleave2.v8f16(<8 x half> %b)
%b.real = extractvalue { <4 x half>, <4 x half> } %b.deinterleaved, 0
%b.imag = extractvalue { <4 x half>, <4 x half> } %b.deinterleaved, 1
%0 = fsub fast <4 x half> %b.real, %a.imag
%1 = fadd fast <4 x half> %b.imag, %a.real
- %interleaved.vec = tail call <8 x half> @llvm.experimental.vector.interleave2.v8f16(<4 x half> %0, <4 x half> %1)
+ %interleaved.vec = tail call <8 x half> @llvm.vector.interleave2.v8f16(<4 x half> %0, <4 x half> %1)
ret <8 x half> %interleaved.vec
}

@@ -166,15 +166,15 @@ define <16 x half> @complex_add_v16f16_with_intrinsic(<16 x half> %a, <16 x half
; CHECK-NEXT: fcadd v0.8h, v2.8h, v0.8h, #90
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <8 x half>, <8 x half> } @llvm.experimental.vector.deinterleave2.v16f16(<16 x half> %a)
+ %a.deinterleaved = tail call { <8 x half>, <8 x half> } @llvm.vector.deinterleave2.v16f16(<16 x half> %a)
%a.real = extractvalue { <8 x half>, <8 x half> } %a.deinterleaved, 0
%a.imag = extractvalue { <8 x half>, <8 x half> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <8 x half>, <8 x half> } @llvm.experimental.vector.deinterleave2.v16f16(<16 x half> %b)
+ %b.deinterleaved = tail call { <8 x half>, <8 x half> } @llvm.vector.deinterleave2.v16f16(<16 x half> %b)
%b.real = extractvalue { <8 x half>, <8 x half> } %b.deinterleaved, 0
%b.imag = extractvalue { <8 x half>, <8 x half> } %b.deinterleaved, 1
%0 = fsub fast <8 x half> %b.real, %a.imag
%1 = fadd fast <8 x half> %b.imag, %a.real
- %interleaved.vec = tail call <16 x half> @llvm.experimental.vector.interleave2.v16f16(<8 x half> %0, <8 x half> %1)
+ %interleaved.vec = tail call <16 x half> @llvm.vector.interleave2.v16f16(<8 x half> %0, <8 x half> %1)
ret <16 x half> %interleaved.vec
}

@@ -216,11 +216,11 @@ entry:
}

-declare { <2 x half>, <2 x half> } @llvm.experimental.vector.deinterleave2.v4f16(<4 x half>)
-declare <4 x half> @llvm.experimental.vector.interleave2.v4f16(<2 x half>, <2 x half>)
+declare { <2 x half>, <2 x half> } @llvm.vector.deinterleave2.v4f16(<4 x half>)
+declare <4 x half> @llvm.vector.interleave2.v4f16(<2 x half>, <2 x half>)

-declare { <4 x half>, <4 x half> } @llvm.experimental.vector.deinterleave2.v8f16(<8 x half>)
-declare <8 x half> @llvm.experimental.vector.interleave2.v8f16(<4 x half>, <4 x half>)
+declare { <4 x half>, <4 x half> } @llvm.vector.deinterleave2.v8f16(<8 x half>)
+declare <8 x half> @llvm.vector.interleave2.v8f16(<4 x half>, <4 x half>)

-declare { <8 x half>, <8 x half> } @llvm.experimental.vector.deinterleave2.v16f16(<16 x half>)
-declare <16 x half> @llvm.experimental.vector.interleave2.v16f16(<8 x half>, <8 x half>)
+declare { <8 x half>, <8 x half> } @llvm.vector.deinterleave2.v16f16(<16 x half>)
+declare <16 x half> @llvm.vector.interleave2.v16f16(<8 x half>, <8 x half>)

diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul-scalable.ll
index cb285c05b2e8..c09ec616b015 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul-scalable.ll
@@ -27,10 +27,10 @@ define <vscale x 4 x half> @complex_mul_v4f16(<vscale x 4 x half> %a, <vscale x
; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <vscale x 2 x half>, <vscale x 2 x half> } @llvm.experimental.vector.deinterleave2.nxv4f16(<vscale x 4 x half> %a)
+ %a.deinterleaved = tail call { <vscale x 2 x half>, <vscale x 2 x half> } @llvm.vector.deinterleave2.nxv4f16(<vscale x 4 x half> %a)
%a.real = extractvalue { <vscale x 2 x half>, <vscale x 2 x half> } %a.deinterleaved, 0
%a.imag = extractvalue { <vscale x 2 x half>, <vscale x 2 x half> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <vscale x 2 x half>, <vscale x 2 x half> } @llvm.experimental.vector.deinterleave2.nxv4f16(<vscale x 4 x half> %b)
+ %b.deinterleaved = tail call { <vscale x 2 x half>, <vscale x 2 x half> } @llvm.vector.deinterleave2.nxv4f16(<vscale x 4 x half> %b)
%b.real = extractvalue { <vscale x 2 x half>, <vscale x 2 x half> } %b.deinterleaved, 0
%b.imag = extractvalue { <vscale x 2 x half>, <vscale x 2 x half> } %b.deinterleaved, 1
%0 = fmul fast <vscale x 2 x half> %b.imag, %a.real
@@ -39,7 +39,7 @@ entry:
%3 = fmul fast <vscale x 2 x half> %b.real, %a.real
%4 = fmul fast <vscale x 2 x half> %a.imag, %b.imag
%5 = fsub fast <vscale x 2 x half> %3, %4
- %interleaved.vec = tail call <vscale x 4 x half> @llvm.experimental.vector.interleave2.nxv4f16(<vscale x 2 x half> %5, <vscale x 2 x half> %2)
+ %interleaved.vec = tail call <vscale x 4 x half> @llvm.vector.interleave2.nxv4f16(<vscale x 2 x half> %5, <vscale x 2 x half> %2)
ret <vscale x 4 x half> %interleaved.vec
}

@@ -54,10 +54,10 @@ define <vscale x 8 x half> @complex_mul_v8f16(<vscale x 8 x half> %a, <vscale x
; CHECK-NEXT: mov z0.d, z2.d
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <vscale x 4 x half>, <vscale x 4 x half> } @llvm.experimental.vector.deinterleave2.nxv8f16(<vscale x 8 x half> %a)
+ %a.deinterleaved = tail call { <vscale x 4 x half>, <vscale x 4 x half> } @llvm.vector.deinterleave2.nxv8f16(<vscale x 8 x half> %a)
%a.real = extractvalue { <vscale x 4 x half>, <vscale x 4 x half> } %a.deinterleaved, 0
%a.imag = extractvalue { <vscale x 4 x half>, <vscale x 4 x half> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <vscale x 4 x half>, <vscale x 4 x half> } @llvm.experimental.vector.deinterleave2.nxv8f16(<vscale x 8 x half> %b)
+ %b.deinterleaved = tail call { <vscale x 4 x half>, <vscale x 4 x half> } @llvm.vector.deinterleave2.nxv8f16(<vscale x 8 x half> %b)
%b.real = extractvalue { <vscale x 4 x half>, <vscale x 4 x half> } %b.deinterleaved, 0
%b.imag = extractvalue { <vscale x 4 x half>, <vscale x 4 x half> } %b.deinterleaved, 1
%0 = fmul fast <vscale x 4 x half> %b.imag, %a.real
@@ -66,7 +66,7 @@ entry:
%3 = fmul fast <vscale x 4 x half> %b.real, %a.real
%4 = fmul fast <vscale x 4 x half> %a.imag, %b.imag
%5 = fsub fast <vscale x 4 x half> %3, %4
- %interleaved.vec = tail call <vscale x 8 x half> @llvm.experimental.vector.interleave2.nxv8f16(<vscale x 4 x half> %5, <vscale x 4 x half> %2)
+ %interleaved.vec = tail call <vscale x 8 x half> @llvm.vector.interleave2.nxv8f16(<vscale x 4 x half> %5, <vscale x 4 x half> %2)
ret <vscale x 8 x half> %interleaved.vec
}
; Expected to transform
@@ -84,10 +84,10 @@ define <vscale x 16 x half> @complex_mul_v16f16(<vscale x 16 x half> %a, <vscale
; CHECK-NEXT: mov z0.d, z5.d
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.experimental.vector.deinterleave2.nxv16f16(<vscale x 16 x half> %a)
+ %a.deinterleaved = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.vector.deinterleave2.nxv16f16(<vscale x 16 x half> %a)
%a.real = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } %a.deinterleaved, 0
%a.imag = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.experimental.vector.deinterleave2.nxv16f16(<vscale x 16 x half> %b)
+ %b.deinterleaved = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.vector.deinterleave2.nxv16f16(<vscale x 16 x half> %b)
%b.real = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } %b.deinterleaved, 0
%b.imag = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } %b.deinterleaved, 1
%0 = fmul fast <vscale x 8 x half> %b.imag, %a.real
@@ -96,7 +96,7 @@ entry:
%3 = fmul fast <vscale x 8 x half> %b.real, %a.real
%4 = fmul fast <vscale x 8 x half> %a.imag, %b.imag
%5 = fsub fast <vscale x 8 x half> %3, %4
- %interleaved.vec = tail call <vscale x 16 x half> @llvm.experimental.vector.interleave2.nxv16f16(<vscale x 8 x half> %5, <vscale x 8 x half> %2)
+ %interleaved.vec = tail call <vscale x 16 x half> @llvm.vector.interleave2.nxv16f16(<vscale x 8 x half> %5, <vscale x 8 x half> %2)
ret <vscale x 16 x half> %interleaved.vec
}
@@ -123,10 +123,10 @@ define <vscale x 32 x half> @complex_mul_v32f16(<vscale x 32 x half> %a, <vscale
; CHECK-NEXT: mov z2.d, z27.d
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <vscale x 16 x half>, <vscale x 16 x half> } @llvm.experimental.vector.deinterleave2.nxv32f16(<vscale x 32 x half> %a)
+ %a.deinterleaved = tail call { <vscale x 16 x half>, <vscale x 16 x half> } @llvm.vector.deinterleave2.nxv32f16(<vscale x 32 x half> %a)
%a.real = extractvalue { <vscale x 16 x half>, <vscale x 16 x half> } %a.deinterleaved, 0
%a.imag = extractvalue { <vscale x 16 x half>, <vscale x 16 x half> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <vscale x 16 x half>, <vscale x 16 x half> } @llvm.experimental.vector.deinterleave2.nxv32f16(<vscale x 32 x half> %b)
+ %b.deinterleaved = tail call { <vscale x 16 x half>, <vscale x 16 x half> } @llvm.vector.deinterleave2.nxv32f16(<vscale x 32 x half> %b)
%b.real = extractvalue { <vscale x 16 x half>, <vscale x 16 x half> } %b.deinterleaved, 0
%b.imag = extractvalue { <vscale x 16 x half>, <vscale x 16 x half> } %b.deinterleaved, 1
%0 = fmul fast <vscale x 16 x half> %b.imag, %a.real
@@ -135,20 +135,20 @@ entry:
%3 = fmul fast <vscale x 16 x half> %b.real, %a.real
%4 = fmul fast <vscale x 16 x half> %a.imag, %b.imag
%5 = fsub fast <vscale x 16 x half> %3, %4
- %interleaved.vec = tail call <vscale x 32 x half> @llvm.experimental.vector.interleave2.nxv32f16(<vscale x 16 x half> %5, <vscale x 16 x half> %2)
+ %interleaved.vec = tail call <vscale x 32 x half> @llvm.vector.interleave2.nxv32f16(<vscale x 16 x half> %5, <vscale x 16 x half> %2)
ret <vscale x 32 x half> %interleaved.vec
}
-declare { <vscale x 2 x half>, <vscale x 2 x half> } @llvm.experimental.vector.deinterleave2.nxv4f16(<vscale x 4 x half>)
-declare <vscale x 4 x half> @llvm.experimental.vector.interleave2.nxv4f16(<vscale x 2 x half>, <vscale x 2 x half>)
+declare { <vscale x 2 x half>, <vscale x 2 x half> } @llvm.vector.deinterleave2.nxv4f16(<vscale x 4 x half>)
+declare <vscale x 4 x half> @llvm.vector.interleave2.nxv4f16(<vscale x 2 x half>, <vscale x 2 x half>)
-declare { <vscale x 4 x half>, <vscale x 4 x half> } @llvm.experimental.vector.deinterleave2.nxv8f16(<vscale x 8 x half>)
-declare <vscale x 8 x half> @llvm.experimental.vector.interleave2.nxv8f16(<vscale x 4 x half>, <vscale x 4 x half>)
+declare { <vscale x 4 x half>, <vscale x 4 x half> } @llvm.vector.deinterleave2.nxv8f16(<vscale x 8 x half>)
+declare <vscale x 8 x half> @llvm.vector.interleave2.nxv8f16(<vscale x 4 x half>, <vscale x 4 x half>)
-declare { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.experimental.vector.deinterleave2.nxv16f16(<vscale x 16 x half>)
-declare <vscale x 16 x half> @llvm.experimental.vector.interleave2.nxv16f16(<vscale x 8 x half>, <vscale x 8 x half>)
+declare { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.vector.deinterleave2.nxv16f16(<vscale x 16 x half>)
+declare <vscale x 16 x half> @llvm.vector.interleave2.nxv16f16(<vscale x 8 x half>, <vscale x 8 x half>)
-declare { <vscale x 16 x half>, <vscale x 16 x half> } @llvm.experimental.vector.deinterleave2.nxv32f16(<vscale x 32 x half>)
-declare <vscale x 32 x half> @llvm.experimental.vector.interleave2.nxv32f16(<vscale x 16 x half>, <vscale x 16 x half>)
+declare { <vscale x 16 x half>, <vscale x 16 x half> } @llvm.vector.deinterleave2.nxv32f16(<vscale x 32 x half>)
+declare <vscale x 32 x half> @llvm.vector.interleave2.nxv32f16(<vscale x 16 x half>, <vscale x 16 x half>)
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-add-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-add-scalable.ll
index ab764a58a770..47ad9ea2451a 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-add-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-add-scalable.ll
@@ -12,15 +12,15 @@ define <vscale x 4 x float> @complex_add_v4f32(<vscale x 4 x float> %a, <vscale
; CHECK-NEXT: mov z0.d, z1.d
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <vscale x 2 x float>, <vscale x 2 x float> } @llvm.experimental.vector.deinterleave2.nxv4f32(<vscale x 4 x float> %a)
+ %a.deinterleaved = tail call { <vscale x 2 x float>, <vscale x 2 x float> } @llvm.vector.deinterleave2.nxv4f32(<vscale x 4 x float> %a)
%a.real = extractvalue { <vscale x 2 x float>, <vscale x 2 x float> } %a.deinterleaved, 0
%a.imag = extractvalue { <vscale x 2 x float>, <vscale x 2 x float> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <vscale x 2 x float>, <vscale x 2 x float> } @llvm.experimental.vector.deinterleave2.nxv4f32(<vscale x 4 x float> %b)
+ %b.deinterleaved = tail call { <vscale x 2 x float>, <vscale x 2 x float> } @llvm.vector.deinterleave2.nxv4f32(<vscale x 4 x float> %b)
%b.real = extractvalue { <vscale x 2 x float>, <vscale x 2 x float> } %b.deinterleaved, 0
%b.imag = extractvalue { <vscale x 2 x float>, <vscale x 2 x float> } %b.deinterleaved, 1
%0 = fsub fast <vscale x 2 x float> %b.real, %a.imag
%1 = fadd fast <vscale x 2 x float> %b.imag, %a.real
- %interleaved.vec = tail call <vscale x 4 x float> @llvm.experimental.vector.interleave2.nxv4f32(<vscale x 2 x float> %0, <vscale x 2 x float> %1)
+ %interleaved.vec = tail call <vscale x 4 x float> @llvm.vector.interleave2.nxv4f32(<vscale x 2 x float> %0, <vscale x 2 x float> %1)
ret <vscale x 4 x float> %interleaved.vec
}
@@ -35,15 +35,15 @@ define <vscale x 8 x float> @complex_add_v8f32(<vscale x 8 x float> %a, <vscale
; CHECK-NEXT: mov z1.d, z3.d
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.experimental.vector.deinterleave2.nxv8f32(<vscale x 8 x float> %a)
+ %a.deinterleaved = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.vector.deinterleave2.nxv8f32(<vscale x 8 x float> %a)
%a.real = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %a.deinterleaved, 0
%a.imag = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.experimental.vector.deinterleave2.nxv8f32(<vscale x 8 x float> %b)
+ %b.deinterleaved = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.vector.deinterleave2.nxv8f32(<vscale x 8 x float> %b)
%b.real = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %b.deinterleaved, 0
%b.imag = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %b.deinterleaved, 1
%0 = fsub fast <vscale x 4 x float> %b.real, %a.imag
%1 = fadd fast <vscale x 4 x float> %b.imag, %a.real
- %interleaved.vec = tail call <vscale x 8 x float> @llvm.experimental.vector.interleave2.nxv8f32(<vscale x 4 x float> %0, <vscale x 4 x float> %1)
+ %interleaved.vec = tail call <vscale x 8 x float> @llvm.vector.interleave2.nxv8f32(<vscale x 4 x float> %0, <vscale x 4 x float> %1)
ret <vscale x 8 x float> %interleaved.vec
}
; Expected to transform
@@ -61,23 +61,23 @@ define <vscale x 16 x float> @complex_add_v16f32(<vscale x 16 x float> %a, <vsca
; CHECK-NEXT: mov z3.d, z7.d
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <vscale x 8 x float>, <vscale x 8 x float> } @llvm.experimental.vector.deinterleave2.nxv16f32(<vscale x 16 x float> %a)
+ %a.deinterleaved = tail call { <vscale x 8 x float>, <vscale x 8 x float> } @llvm.vector.deinterleave2.nxv16f32(<vscale x 16 x float> %a)
%a.real = extractvalue { <vscale x 8 x float>, <vscale x 8 x float> } %a.deinterleaved, 0
%a.imag = extractvalue { <vscale x 8 x float>, <vscale x 8 x float> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <vscale x 8 x float>, <vscale x 8 x float> } @llvm.experimental.vector.deinterleave2.nxv16f32(<vscale x 16 x float> %b)
+ %b.deinterleaved = tail call { <vscale x 8 x float>, <vscale x 8 x float> } @llvm.vector.deinterleave2.nxv16f32(<vscale x 16 x float> %b)
%b.real = extractvalue { <vscale x 8 x float>, <vscale x 8 x float> } %b.deinterleaved, 0
%b.imag = extractvalue { <vscale x 8 x float>, <vscale x 8 x float> } %b.deinterleaved, 1
%0 = fsub fast <vscale x 8 x float> %b.real, %a.imag
%1 = fadd fast <vscale x 8 x float> %b.imag, %a.real
- %interleaved.vec = tail call <vscale x 16 x float> @llvm.experimental.vector.interleave2.nxv16f32(<vscale x 8 x float> %0, <vscale x 8 x float> %1)
+ %interleaved.vec = tail call <vscale x 16 x float> @llvm.vector.interleave2.nxv16f32(<vscale x 8 x float> %0, <vscale x 8 x float> %1)
ret <vscale x 16 x float> %interleaved.vec
}
-declare { <vscale x 2 x float>, <vscale x 2 x float> } @llvm.experimental.vector.deinterleave2.nxv4f32(<vscale x 4 x float>)
-declare <vscale x 4 x float> @llvm.experimental.vector.interleave2.nxv4f32(<vscale x 2 x float>, <vscale x 2 x float>)
+declare { <vscale x 2 x float>, <vscale x 2 x float> } @llvm.vector.deinterleave2.nxv4f32(<vscale x 4 x float>)
+declare <vscale x 4 x float> @llvm.vector.interleave2.nxv4f32(<vscale x 2 x float>, <vscale x 2 x float>)
-declare { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.experimental.vector.deinterleave2.nxv8f32(<vscale x 8 x float>)
-declare <vscale x 8 x float> @llvm.experimental.vector.interleave2.nxv8f32(<vscale x 4 x float>, <vscale x 4 x float>)
+declare { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.vector.deinterleave2.nxv8f32(<vscale x 8 x float>)
+declare <vscale x 8 x float> @llvm.vector.interleave2.nxv8f32(<vscale x 4 x float>, <vscale x 4 x float>)
-declare { <vscale x 8 x float>, <vscale x 8 x float> } @llvm.experimental.vector.deinterleave2.nxv16f32(<vscale x 16 x float>)
-declare <vscale x 16 x float> @llvm.experimental.vector.interleave2.nxv16f32(<vscale x 8 x float>, <vscale x 8 x float>)
+declare { <vscale x 8 x float>, <vscale x 8 x float> } @llvm.vector.deinterleave2.nxv16f32(<vscale x 16 x float>)
+declare <vscale x 16 x float> @llvm.vector.interleave2.nxv16f32(<vscale x 8 x float>, <vscale x 8 x float>)
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-mul-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-mul-scalable.ll
index 1e2afb78de1b..bcd46aa182b5 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-mul-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-mul-scalable.ll
@@ -14,10 +14,10 @@ define <vscale x 4 x float> @complex_mul_v4f32(<vscale x 4 x float> %a, <vscale
; CHECK-NEXT: mov z0.d, z2.d
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <vscale x 2 x float>, <vscale x 2 x float> } @llvm.experimental.vector.deinterleave2.nxv4f32(<vscale x 4 x float> %a)
+ %a.deinterleaved = tail call { <vscale x 2 x float>, <vscale x 2 x float> } @llvm.vector.deinterleave2.nxv4f32(<vscale x 4 x float> %a)
%a.real = extractvalue { <vscale x 2 x float>, <vscale x 2 x float> } %a.deinterleaved, 0
%a.imag = extractvalue { <vscale x 2 x float>, <vscale x 2 x float> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <vscale x 2 x float>, <vscale x 2 x float> } @llvm.experimental.vector.deinterleave2.nxv4f32(<vscale x 4 x float> %b)
+ %b.deinterleaved = tail call { <vscale x 2 x float>, <vscale x 2 x float> } @llvm.vector.deinterleave2.nxv4f32(<vscale x 4 x float> %b)
%b.real = extractvalue { <vscale x 2 x float>, <vscale x 2 x float> } %b.deinterleaved, 0
%b.imag = extractvalue { <vscale x 2 x float>, <vscale x 2 x float> } %b.deinterleaved, 1
%0 = fmul fast <vscale x 2 x float> %b.imag, %a.real
@@ -26,7 +26,7 @@ entry:
%3 = fmul fast <vscale x 2 x float> %b.real, %a.real
%4 = fmul fast <vscale x 2 x float> %a.imag, %b.imag
%5 = fsub fast <vscale x 2 x float> %3, %4
- %interleaved.vec = tail call <vscale x 4 x float> @llvm.experimental.vector.interleave2.nxv4f32(<vscale x 2 x float> %5, <vscale x 2 x float> %2)
+ %interleaved.vec = tail call <vscale x 4 x float> @llvm.vector.interleave2.nxv4f32(<vscale x 2 x float> %5, <vscale x 2 x float> %2)
ret <vscale x 4 x float> %interleaved.vec
}
@@ -45,10 +45,10 @@ define <vscale x 8 x float> @complex_mul_v8f32(<vscale x 8 x float> %a, <vscale
; CHECK-NEXT: mov z0.d, z5.d
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.experimental.vector.deinterleave2.nxv8f32(<vscale x 8 x float> %a)
+ %a.deinterleaved = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.vector.deinterleave2.nxv8f32(<vscale x 8 x float> %a)
%a.real = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %a.deinterleaved, 0
%a.imag = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.experimental.vector.deinterleave2.nxv8f32(<vscale x 8 x float> %b)
+ %b.deinterleaved = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.vector.deinterleave2.nxv8f32(<vscale x 8 x float> %b)
%b.real = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %b.deinterleaved, 0
%b.imag = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %b.deinterleaved, 1
%0 = fmul fast <vscale x 4 x float> %b.imag, %a.real
@@ -57,7 +57,7 @@ entry:
%3 = fmul fast <vscale x 4 x float> %b.real, %a.real
%4 = fmul fast <vscale x 4 x float> %a.imag, %b.imag
%5 = fsub fast <vscale x 4 x float> %3, %4
- %interleaved.vec = tail call <vscale x 8 x float> @llvm.experimental.vector.interleave2.nxv8f32(<vscale x 4 x float> %5, <vscale x 4 x float> %2)
+ %interleaved.vec = tail call <vscale x 8 x float> @llvm.vector.interleave2.nxv8f32(<vscale x 4 x float> %5, <vscale x 4 x float> %2)
ret <vscale x 8 x float> %interleaved.vec
}
@@ -84,10 +84,10 @@ define <vscale x 16 x float> @complex_mul_v16f32(<vscale x 16 x float> %a, <vsca
; CHECK-NEXT: mov z2.d, z27.d
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <vscale x 8 x float>, <vscale x 8 x float> } @llvm.experimental.vector.deinterleave2.nxv16f32(<vscale x 16 x float> %a)
+ %a.deinterleaved = tail call { <vscale x 8 x float>, <vscale x 8 x float> } @llvm.vector.deinterleave2.nxv16f32(<vscale x 16 x float> %a)
%a.real = extractvalue { <vscale x 8 x float>, <vscale x 8 x float> } %a.deinterleaved, 0
%a.imag = extractvalue { <vscale x 8 x float>, <vscale x 8 x float> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <vscale x 8 x float>, <vscale x 8 x float> } @llvm.experimental.vector.deinterleave2.nxv16f32(<vscale x 16 x float> %b)
+ %b.deinterleaved = tail call { <vscale x 8 x float>, <vscale x 8 x float> } @llvm.vector.deinterleave2.nxv16f32(<vscale x 16 x float> %b)
%b.real = extractvalue { <vscale x 8 x float>, <vscale x 8 x float> } %b.deinterleaved, 0
%b.imag = extractvalue { <vscale x 8 x float>, <vscale x 8 x float> } %b.deinterleaved, 1
%0 = fmul fast <vscale x 8 x float> %b.imag, %a.real
@@ -96,16 +96,16 @@ entry:
%3 = fmul fast <vscale x 8 x float> %b.real, %a.real
%4 = fmul fast <vscale x 8 x float> %a.imag, %b.imag
%5 = fsub fast <vscale x 8 x float> %3, %4
- %interleaved.vec = tail call <vscale x 16 x float> @llvm.experimental.vector.interleave2.nxv16f32(<vscale x 8 x float> %5, <vscale x 8 x float> %2)
+ %interleaved.vec = tail call <vscale x 16 x float> @llvm.vector.interleave2.nxv16f32(<vscale x 8 x float> %5, <vscale x 8 x float> %2)
ret <vscale x 16 x float> %interleaved.vec
}
-declare { <vscale x 2 x float>, <vscale x 2 x float> } @llvm.experimental.vector.deinterleave2.nxv4f32(<vscale x 4 x float>)
-declare <vscale x 4 x float> @llvm.experimental.vector.interleave2.nxv4f32(<vscale x 2 x float>, <vscale x 2 x float>)
+declare { <vscale x 2 x float>, <vscale x 2 x float> } @llvm.vector.deinterleave2.nxv4f32(<vscale x 4 x float>)
+declare <vscale x 4 x float> @llvm.vector.interleave2.nxv4f32(<vscale x 2 x float>, <vscale x 2 x float>)
-declare { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.experimental.vector.deinterleave2.nxv8f32(<vscale x 8 x float>)
-declare <vscale x 8 x float> @llvm.experimental.vector.interleave2.nxv8f32(<vscale x 4 x float>, <vscale x 4 x float>)
+declare { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.vector.deinterleave2.nxv8f32(<vscale x 8 x float>)
+declare <vscale x 8 x float> @llvm.vector.interleave2.nxv8f32(<vscale x 4 x float>, <vscale x 4 x float>)
-declare { <vscale x 8 x float>, <vscale x 8 x float> } @llvm.experimental.vector.deinterleave2.nxv16f32(<vscale x 16 x float>)
-declare <vscale x 16 x float> @llvm.experimental.vector.interleave2.nxv16f32(<vscale x 8 x float>, <vscale x 8 x float>)
+declare { <vscale x 8 x float>, <vscale x 8 x float> } @llvm.vector.deinterleave2.nxv16f32(<vscale x 16 x float>)
+declare <vscale x 16 x float> @llvm.vector.interleave2.nxv16f32(<vscale x 8 x float>, <vscale x 8 x float>)
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-add-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-add-scalable.ll
index 46a15f489d2b..c992d63ca283 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-add-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-add-scalable.ll
@@ -12,15 +12,15 @@ define <vscale x 2 x double> @complex_add_v2f64(<vscale x 2 x double> %a, <vscal
; CHECK-NEXT: mov z0.d, z1.d
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <vscale x 1 x double>, <vscale x 1 x double> } @llvm.experimental.vector.deinterleave2.nxv2f64(<vscale x 2 x double> %a)
+ %a.deinterleaved = tail call { <vscale x 1 x double>, <vscale x 1 x double> } @llvm.vector.deinterleave2.nxv2f64(<vscale x 2 x double> %a)
%a.real = extractvalue { <vscale x 1 x double>, <vscale x 1 x double> } %a.deinterleaved, 0
%a.imag = extractvalue { <vscale x 1 x double>, <vscale x 1 x double> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <vscale x 1 x double>, <vscale x 1 x double> } @llvm.experimental.vector.deinterleave2.nxv2f64(<vscale x 2 x double> %b)
+ %b.deinterleaved = tail call { <vscale x 1 x double>, <vscale x 1 x double> } @llvm.vector.deinterleave2.nxv2f64(<vscale x 2 x double> %b)
%b.real = extractvalue { <vscale x 1 x double>, <vscale x 1 x double> } %b.deinterleaved, 0
%b.imag = extractvalue { <vscale x 1 x double>, <vscale x 1 x double> } %b.deinterleaved, 1
%0 = fsub fast <vscale x 1 x double> %b.real, %a.imag
%1 = fadd fast <vscale x 1 x double> %b.imag, %a.real
- %interleaved.vec = tail call <vscale x 2 x double> @llvm.experimental.vector.interleave2.nxv2f64(<vscale x 1 x double> %0, <vscale x 1 x double> %1)
+ %interleaved.vec = tail call <vscale x 2 x double> @llvm.vector.interleave2.nxv2f64(<vscale x 1 x double> %0, <vscale x 1 x double> %1)
ret <vscale x 2 x double> %interleaved.vec
}
@@ -35,15 +35,15 @@ define <vscale x 4 x double> @complex_add_v4f64(<vscale x 4 x double> %a, <vscal
; CHECK-NEXT: mov z1.d, z3.d
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
+ %a.deinterleaved = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
%a.real = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %a.deinterleaved, 0
%a.imag = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
+ %b.deinterleaved = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
%b.real = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %b.deinterleaved, 0
%b.imag = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %b.deinterleaved, 1
%0 = fsub fast <vscale x 2 x double> %b.real, %a.imag
%1 = fadd fast <vscale x 2 x double> %b.imag, %a.real
- %interleaved.vec = tail call <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double> %0, <vscale x 2 x double> %1)
+ %interleaved.vec = tail call <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double> %0, <vscale x 2 x double> %1)
ret <vscale x 4 x double> %interleaved.vec
}
@@ -62,23 +62,23 @@ define <vscale x 8 x double> @complex_add_v8f64(<vscale x 8 x double> %a, <vscal
; CHECK-NEXT: mov z3.d, z7.d
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <vscale x 4 x double>, <vscale x 4 x double> } @llvm.experimental.vector.deinterleave2.nxv8f64(<vscale x 8 x double> %a)
+ %a.deinterleaved = tail call { <vscale x 4 x double>, <vscale x 4 x double> } @llvm.vector.deinterleave2.nxv8f64(<vscale x 8 x double> %a)
%a.real = extractvalue { <vscale x 4 x double>, <vscale x 4 x double> } %a.deinterleaved, 0
%a.imag = extractvalue { <vscale x 4 x double>, <vscale x 4 x double> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <vscale x 4 x double>, <vscale x 4 x double> } @llvm.experimental.vector.deinterleave2.nxv8f64(<vscale x 8 x double> %b)
+ %b.deinterleaved = tail call { <vscale x 4 x double>, <vscale x 4 x double> } @llvm.vector.deinterleave2.nxv8f64(<vscale x 8 x double> %b)
%b.real = extractvalue { <vscale x 4 x double>, <vscale x 4 x double> } %b.deinterleaved, 0
%b.imag = extractvalue { <vscale x 4 x double>, <vscale x 4 x double> } %b.deinterleaved, 1
%0 = fsub fast <vscale x 4 x double> %b.real, %a.imag
%1 = fadd fast <vscale x 4 x double> %b.imag, %a.real
- %interleaved.vec = tail call <vscale x 8 x double> @llvm.experimental.vector.interleave2.nxv8f64(<vscale x 4 x double> %0, <vscale x 4 x double> %1)
+ %interleaved.vec = tail call <vscale x 8 x double> @llvm.vector.interleave2.nxv8f64(<vscale x 4 x double> %0, <vscale x 4 x double> %1)
ret <vscale x 8 x double> %interleaved.vec
}
-declare { <vscale x 1 x double>, <vscale x 1 x double> } @llvm.experimental.vector.deinterleave2.nxv2f64(<vscale x 2 x double>)
-declare <vscale x 2 x double> @llvm.experimental.vector.interleave2.nxv2f64(<vscale x 1 x double>, <vscale x 1 x double>)
+declare { <vscale x 1 x double>, <vscale x 1 x double> } @llvm.vector.deinterleave2.nxv2f64(<vscale x 2 x double>)
+declare <vscale x 2 x double> @llvm.vector.interleave2.nxv2f64(<vscale x 1 x double>, <vscale x 1 x double>)
-declare { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
-declare <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double>, <vscale x 2 x double>)
+declare { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
+declare <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double>, <vscale x 2 x double>)
-declare { <vscale x 4 x double>, <vscale x 4 x double> } @llvm.experimental.vector.deinterleave2.nxv8f64(<vscale x 8 x double>)
-declare <vscale x 8 x double> @llvm.experimental.vector.interleave2.nxv8f64(<vscale x 4 x double>, <vscale x 4 x double>)
+declare { <vscale x 4 x double>, <vscale x 4 x double> } @llvm.vector.deinterleave2.nxv8f64(<vscale x 8 x double>)
+declare <vscale x 8 x double> @llvm.vector.interleave2.nxv8f64(<vscale x 4 x double>, <vscale x 4 x double>)
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-mul-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-mul-scalable.ll
index 17a239a09a03..db28fa3997cb 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-mul-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-mul-scalable.ll
@@ -14,10 +14,10 @@ define <vscale x 2 x double> @complex_mul_v2f64(<vscale x 2 x double> %a, <vscal
; CHECK-NEXT: mov z0.d, z2.d
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <vscale x 1 x double>, <vscale x 1 x double> } @llvm.experimental.vector.deinterleave2.nxv2f64(<vscale x 2 x double> %a)
+ %a.deinterleaved = tail call { <vscale x 1 x double>, <vscale x 1 x double> } @llvm.vector.deinterleave2.nxv2f64(<vscale x 2 x double> %a)
%a.real = extractvalue { <vscale x 1 x double>, <vscale x 1 x double> } %a.deinterleaved, 0
%a.imag = extractvalue { <vscale x 1 x double>, <vscale x 1 x double> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <vscale x 1 x double>, <vscale x 1 x double> } @llvm.experimental.vector.deinterleave2.nxv2f64(<vscale x 2 x double> %b)
+ %b.deinterleaved = tail call { <vscale x 1 x double>, <vscale x 1 x double> } @llvm.vector.deinterleave2.nxv2f64(<vscale x 2 x double> %b)
%b.real = extractvalue { <vscale x 1 x double>, <vscale x 1 x double> } %b.deinterleaved, 0
%b.imag = extractvalue { <vscale x 1 x double>, <vscale x 1 x double> } %b.deinterleaved, 1
%0 = fmul fast <vscale x 1 x double> %b.imag, %a.real
@@ -26,7 +26,7 @@ entry:
%3 = fmul fast <vscale x 1 x double> %b.real, %a.real
%4 = fmul fast <vscale x 1 x double> %a.imag, %b.imag
%5 = fsub fast <vscale x 1 x double> %3, %4
- %interleaved.vec = tail call <vscale x 2 x double> @llvm.experimental.vector.interleave2.nxv2f64(<vscale x 1 x double> %5, <vscale x 1 x double> %2)
+ %interleaved.vec = tail call <vscale x 2 x double> @llvm.vector.interleave2.nxv2f64(<vscale x 1 x double> %5, <vscale x 1 x double> %2)
ret <vscale x 2 x double> %interleaved.vec
}
@@ -45,10 +45,10 @@ define <vscale x 4 x double> @complex_mul_v4f64(<vscale x 4 x double> %a, <vscal
; CHECK-NEXT: mov z0.d, z5.d
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
+ %a.deinterleaved = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
%a.real = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %a.deinterleaved, 0
%a.imag = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
+ %b.deinterleaved = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
%b.real = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %b.deinterleaved, 0
%b.imag = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %b.deinterleaved, 1
%0 = fmul fast <vscale x 2 x double> %b.imag, %a.real
@@ -57,7 +57,7 @@ entry:
%3 = fmul fast <vscale x 2 x double> %b.real, %a.real
%4 = fmul fast <vscale x 2 x double> %a.imag, %b.imag
%5 = fsub fast <vscale x 2 x double> %3, %4
- %interleaved.vec = tail call <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double> %5, <vscale x 2 x double> %2)
+ %interleaved.vec = tail call <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double> %5, <vscale x 2 x double> %2)
ret <vscale x 4 x double> %interleaved.vec
}
@@ -84,10 +84,10 @@ define <vscale x 8 x double> @complex_mul_v8f64(<vscale x 8 x double> %a, <vscal
; CHECK-NEXT: mov z2.d, z27.d
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <vscale x 4 x double>, <vscale x 4 x double> } @llvm.experimental.vector.deinterleave2.nxv8f64(<vscale x 8 x double> %a)
+ %a.deinterleaved = tail call { <vscale x 4 x double>, <vscale x 4 x double> } @llvm.vector.deinterleave2.nxv8f64(<vscale x 8 x double> %a)
%a.real = extractvalue { <vscale x 4 x double>, <vscale x 4 x double> } %a.deinterleaved, 0
%a.imag = extractvalue { <vscale x 4 x double>, <vscale x 4 x double> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <vscale x 4 x double>, <vscale x 4 x double> } @llvm.experimental.vector.deinterleave2.nxv8f64(<vscale x 8 x double> %b)
+ %b.deinterleaved = tail call { <vscale x 4 x double>, <vscale x 4 x double> } @llvm.vector.deinterleave2.nxv8f64(<vscale x 8 x double> %b)
%b.real = extractvalue { <vscale x 4 x double>, <vscale x 4 x double> } %b.deinterleaved, 0
%b.imag = extractvalue { <vscale x 4 x double>, <vscale x 4 x double> } %b.deinterleaved, 1
%0 = fmul fast <vscale x 4 x double> %b.imag, %a.real
@@ -96,15 +96,15 @@ entry:
%3 = fmul fast <vscale x 4 x double> %b.real, %a.real
%4 = fmul fast <vscale x 4 x double> %a.imag, %b.imag
%5 = fsub fast <vscale x 4 x double> %3, %4
- %interleaved.vec = tail call <vscale x 8 x double> @llvm.experimental.vector.interleave2.nxv8f64(<vscale x 4 x double> %5, <vscale x 4 x double> %2)
+ %interleaved.vec = tail call <vscale x 8 x double> @llvm.vector.interleave2.nxv8f64(<vscale x 4 x double> %5, <vscale x 4 x double> %2)
ret <vscale x 8 x double> %interleaved.vec
}
-declare { <vscale x 1 x double>, <vscale x 1 x double> } @llvm.experimental.vector.deinterleave2.nxv2f64(<vscale x 2 x double>)
-declare <vscale x 2 x double> @llvm.experimental.vector.interleave2.nxv2f64(<vscale x 1 x double>, <vscale x 1 x double>)
+declare { <vscale x 1 x double>, <vscale x 1 x double> } @llvm.vector.deinterleave2.nxv2f64(<vscale x 2 x double>)
+declare <vscale x 2 x double> @llvm.vector.interleave2.nxv2f64(<vscale x 1 x double>, <vscale x 1 x double>)
-declare { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
-declare <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double>, <vscale x 2 x double>)
+declare { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
+declare <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double>, <vscale x 2 x double>)
-declare { <vscale x 4 x double>, <vscale x 4 x double> } @llvm.experimental.vector.deinterleave2.nxv8f64(<vscale x 8 x double>)
-declare <vscale x 8 x double> @llvm.experimental.vector.interleave2.nxv8f64(<vscale x 4 x double>, <vscale x 4 x double>)
+declare { <vscale x 4 x double>, <vscale x 4 x double> } @llvm.vector.deinterleave2.nxv8f64(<vscale x 8 x double>)
+declare <vscale x 8 x double> @llvm.vector.interleave2.nxv8f64(<vscale x 4 x double>, <vscale x 4 x double>)
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i16-add-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i16-add-scalable.ll
index 001046f8f397..f0569674c651 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i16-add-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i16-add-scalable.ll
@@ -22,15 +22,15 @@ define <vscale x 4 x i16> @complex_add_v4i16(<vscale x 4 x i16> %a, <vscale x 4
; CHECK-NEXT: uzp1 z0.s, z0.s, z2.s
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <vscale x 2 x i16>, <vscale x 2 x i16> } @llvm.experimental.vector.deinterleave2.nxv4i16(<vscale x 4 x i16> %a)
+ %a.deinterleaved = tail call { <vscale x 2 x i16>, <vscale x 2 x i16> } @llvm.vector.deinterleave2.nxv4i16(<vscale x 4 x i16> %a)
%a.real = extractvalue { <vscale x 2 x i16>, <vscale x 2 x i16> } %a.deinterleaved, 0
%a.imag = extractvalue { <vscale x 2 x i16>, <vscale x 2 x i16> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <vscale x 2 x i16>, <vscale x 2 x i16> } @llvm.experimental.vector.deinterleave2.nxv4i16(<vscale x 4 x i16> %b)
+ %b.deinterleaved = tail call { <vscale x 2 x i16>, <vscale x 2 x i16> } @llvm.vector.deinterleave2.nxv4i16(<vscale x 4 x i16> %b)
%b.real = extractvalue { <vscale x 2 x i16>, <vscale x 2 x i16> } %b.deinterleaved, 0
%b.imag = extractvalue { <vscale x 2 x i16>, <vscale x 2 x i16> } %b.deinterleaved, 1
%0 = sub <vscale x 2 x i16> %b.real, %a.imag
%1 = add <vscale x 2 x i16> %b.imag, %a.real
- %interleaved.vec = tail call <vscale x 4 x i16> @llvm.experimental.vector.interleave2.nxv4i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1)
+ %interleaved.vec = tail call <vscale x 4 x i16> @llvm.vector.interleave2.nxv4i16(<vscale x 2 x i16> %0, <vscale x 2 x i16> %1)
ret <vscale x 4 x i16> %interleaved.vec
}
@@ -42,15 +42,15 @@ define <vscale x 8 x i16> @complex_add_v8i16(<vscale x 8 x i16> %a, <vscale x 8
; CHECK-NEXT: mov z0.d, z1.d
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <vscale x 4 x i16>, <vscale x 4 x i16> } @llvm.experimental.vector.deinterleave2.nxv8i16(<vscale x 8 x i16> %a)
+ %a.deinterleaved = tail call { <vscale x 4 x i16>, <vscale x 4 x i16> } @llvm.vector.deinterleave2.nxv8i16(<vscale x 8 x i16> %a)
%a.real = extractvalue { <vscale x 4 x i16>, <vscale x 4 x i16> } %a.deinterleaved, 0
%a.imag = extractvalue { <vscale x 4 x i16>, <vscale x 4 x i16> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <vscale x 4 x i16>, <vscale x 4 x i16> } @llvm.experimental.vector.deinterleave2.nxv8i16(<vscale x 8 x i16> %b)
+ %b.deinterleaved = tail call { <vscale x 4 x i16>, <vscale x 4 x i16> } @llvm.vector.deinterleave2.nxv8i16(<vscale x 8 x i16> %b)
%b.real = extractvalue { <vscale x 4 x i16>, <vscale x 4 x i16> } %b.deinterleaved, 0
%b.imag = extractvalue { <vscale x 4 x i16>, <vscale x 4 x i16> } %b.deinterleaved, 1
%0 = sub <vscale x 4 x i16> %b.real, %a.imag
%1 = add <vscale x 4 x i16> %b.imag, %a.real
- %interleaved.vec = tail call <vscale x 8 x i16> @llvm.experimental.vector.interleave2.nxv8i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1)
+ %interleaved.vec = tail call <vscale x 8 x i16> @llvm.vector.interleave2.nxv8i16(<vscale x 4 x i16> %0, <vscale x 4 x i16> %1)
ret <vscale x 8 x i16> %interleaved.vec
}
@@ -64,15 +64,15 @@ define <vscale x 16 x i16> @complex_add_v16i16(<vscale x 16 x i16> %a, <vscale x
; CHECK-NEXT: mov z1.d, z3.d
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.experimental.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> %a)
+ %a.deinterleaved = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> %a)
%a.real = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %a.deinterleaved, 0
%a.imag = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.experimental.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> %b)
+ %b.deinterleaved = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> %b)
%b.real = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %b.deinterleaved, 0
%b.imag = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %b.deinterleaved, 1
%0 = sub <vscale x 8 x i16> %b.real, %a.imag
%1 = add <vscale x 8 x i16> %b.imag, %a.real
- %interleaved.vec = tail call <vscale x 16 x i16> @llvm.experimental.vector.interleave2.nxv16i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1)
+ %interleaved.vec = tail call <vscale x 16 x i16> @llvm.vector.interleave2.nxv16i16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1)
ret <vscale x 16 x i16> %interleaved.vec
}
@@ -90,26 +90,26 @@ define <vscale x 32 x i16> @complex_add_v32i16(<vscale x 32 x i16> %a, <vscale x
; CHECK-NEXT: mov z3.d, z7.d
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <vscale x 16 x i16>, <vscale x 16 x i16> } @llvm.experimental.vector.deinterleave2.nxv32i16(<vscale x 32 x i16> %a)
+ %a.deinterleaved = tail call { <vscale x 16 x i16>, <vscale x 16 x i16> } @llvm.vector.deinterleave2.nxv32i16(<vscale x 32 x i16> %a)
%a.real = extractvalue { <vscale x 16 x i16>, <vscale x 16 x i16> } %a.deinterleaved, 0
%a.imag = extractvalue { <vscale x 16 x i16>, <vscale x 16 x i16> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <vscale x 16 x i16>, <vscale x 16 x i16> } @llvm.experimental.vector.deinterleave2.nxv32i16(<vscale x 32 x i16> %b)
+ %b.deinterleaved = tail call { <vscale x 16 x i16>, <vscale x 16 x i16> } @llvm.vector.deinterleave2.nxv32i16(<vscale x 32 x i16> %b)
%b.real = extractvalue { <vscale x 16 x i16>, <vscale x 16 x i16> } %b.deinterleaved, 0
%b.imag = extractvalue { <vscale x 16 x i16>, <vscale x 16 x i16> } %b.deinterleaved, 1
%0 = sub <vscale x 16 x i16> %b.real, %a.imag
%1 = add <vscale x 16 x i16> %b.imag, %a.real
- %interleaved.vec = tail call <vscale x 32 x i16> @llvm.experimental.vector.interleave2.nxv32i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1)
+ %interleaved.vec = tail call <vscale x 32 x i16> @llvm.vector.interleave2.nxv32i16(<vscale x 16 x i16> %0, <vscale x 16 x i16> %1)
ret <vscale x 32 x i16> %interleaved.vec
}
-declare { <vscale x 2 x i16>, <vscale x 2 x i16> } @llvm.experimental.vector.deinterleave2.nxv4i16(<vscale x 4 x i16>)
-declare <vscale x 4 x i16> @llvm.experimental.vector.interleave2.nxv4i16(<vscale x 2 x i16>, <vscale x 2 x i16>)
+declare { <vscale x 2 x i16>, <vscale x 2 x i16> } @llvm.vector.deinterleave2.nxv4i16(<vscale x 4 x i16>)
+declare <vscale x 4 x i16> @llvm.vector.interleave2.nxv4i16(<vscale x 2 x i16>, <vscale x 2 x i16>)
-declare { <vscale x 4 x i16>, <vscale x 4 x i16> } @llvm.experimental.vector.deinterleave2.nxv8i16(<vscale x 8 x i16>)
-declare <vscale x 8 x i16> @llvm.experimental.vector.interleave2.nxv8i16(<vscale x 4 x i16>, <vscale x 4 x i16>)
+declare { <vscale x 4 x i16>, <vscale x 4 x i16> } @llvm.vector.deinterleave2.nxv8i16(<vscale x 8 x i16>)
+declare <vscale x 8 x i16> @llvm.vector.interleave2.nxv8i16(<vscale x 4 x i16>, <vscale x 4 x i16>)
-declare { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.experimental.vector.deinterleave2.nxv16i16(<vscale x 16 x i16>)
-declare <vscale x 16 x i16> @llvm.experimental.vector.interleave2.nxv16i16(<vscale x 8 x i16>, <vscale x 8 x i16>)
+declare { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16>)
+declare <vscale x 16 x i16> @llvm.vector.interleave2.nxv16i16(<vscale x 8 x i16>, <vscale x 8 x i16>)
-declare { <vscale x 16 x i16>, <vscale x 16 x i16> } @llvm.experimental.vector.deinterleave2.nxv32i16(<vscale x 32 x i16>)
-declare <vscale x 32 x i16> @llvm.experimental.vector.interleave2.nxv32i16(<vscale x 16 x i16>, <vscale x 16 x i16>)
+declare { <vscale x 16 x i16>, <vscale x 16 x i16> } @llvm.vector.deinterleave2.nxv32i16(<vscale x 32 x i16>)
+declare <vscale x 32 x i16> @llvm.vector.interleave2.nxv32i16(<vscale x 16 x i16>, <vscale x 16 x i16>)
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i16-mul-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i16-mul-scalable.ll
index 07488b623b98..b4cb548f6308 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i16-mul-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i16-mul-scalable.ll
@@ -26,10 +26,10 @@ define <vscale x 4 x i16> @complex_mul_v4i16(<vscale x 4 x i16> %a, <vscale x 4
; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <vscale x 2 x i16>, <vscale x 2 x i16> } @llvm.experimental.vector.deinterleave2.nxv4i16(<vscale x 4 x i16> %a)
+ %a.deinterleaved = tail call { <vscale x 2 x i16>, <vscale x 2 x i16> } @llvm.vector.deinterleave2.nxv4i16(<vscale x 4 x i16> %a)
%a.real = extractvalue { <vscale x 2 x i16>, <vscale x 2 x i16> } %a.deinterleaved, 0
%a.imag = extractvalue { <vscale x 2 x i16>, <vscale x 2 x i16> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <vscale x 2 x i16>, <vscale x 2 x i16> } @llvm.experimental.vector.deinterleave2.nxv4i16(<vscale x 4 x i16> %b)
+ %b.deinterleaved = tail call { <vscale x 2 x i16>, <vscale x 2 x i16> } @llvm.vector.deinterleave2.nxv4i16(<vscale x 4 x i16> %b)
%b.real = extractvalue { <vscale x 2 x i16>, <vscale x 2 x i16> } %b.deinterleaved, 0
%b.imag = extractvalue { <vscale x 2 x i16>, <vscale x 2 x i16> } %b.deinterleaved, 1
%0 = mul <vscale x 2 x i16> %b.imag, %a.real
@@ -38,7 +38,7 @@ entry:
%3 = mul <vscale x 2 x i16> %b.real, %a.real
%4 = mul <vscale x 2 x i16> %a.imag, %b.imag
%5 = sub <vscale x 2 x i16> %3, %4
- %interleaved.vec = tail call <vscale x 4 x i16> @llvm.experimental.vector.interleave2.nxv4i16(<vscale x 2 x i16> %5, <vscale x 2 x i16> %2)
+ %interleaved.vec = tail call <vscale x 4 x i16> @llvm.vector.interleave2.nxv4i16(<vscale x 2 x i16> %5, <vscale x 2 x i16> %2)
ret <vscale x 4 x i16> %interleaved.vec
}
@@ -52,10 +52,10 @@ define <vscale x 8 x i16> @complex_mul_v8i16(<vscale x 8 x i16> %a, <vscale x 8
; CHECK-NEXT: mov z0.d, z2.d
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <vscale x 4 x i16>, <vscale x 4 x i16> } @llvm.experimental.vector.deinterleave2.nxv8i16(<vscale x 8 x i16> %a)
+ %a.deinterleaved = tail call { <vscale x 4 x i16>, <vscale x 4 x i16> } @llvm.vector.deinterleave2.nxv8i16(<vscale x 8 x i16> %a)
%a.real = extractvalue { <vscale x 4 x i16>, <vscale x 4 x i16> } %a.deinterleaved, 0
%a.imag = extractvalue { <vscale x 4 x i16>, <vscale x 4 x i16> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <vscale x 4 x i16>, <vscale x 4 x i16> } @llvm.experimental.vector.deinterleave2.nxv8i16(<vscale x 8 x i16> %b)
+ %b.deinterleaved = tail call { <vscale x 4 x i16>, <vscale x 4 x i16> } @llvm.vector.deinterleave2.nxv8i16(<vscale x 8 x i16> %b)
%b.real = extractvalue { <vscale x 4 x i16>, <vscale x 4 x i16> } %b.deinterleaved, 0
%b.imag = extractvalue { <vscale x 4 x i16>, <vscale x 4 x i16> } %b.deinterleaved, 1
%0 = mul <vscale x 4 x i16> %b.imag, %a.real
@@ -64,7 +64,7 @@ entry:
%3 = mul <vscale x 4 x i16> %b.real, %a.real
%4 = mul <vscale x 4 x i16> %a.imag, %b.imag
%5 = sub <vscale x 4 x i16> %3, %4
- %interleaved.vec = tail call <vscale x 8 x i16> @llvm.experimental.vector.interleave2.nxv8i16(<vscale x 4 x i16> %5, <vscale x 4 x i16> %2)
+ %interleaved.vec = tail call <vscale x 8 x i16> @llvm.vector.interleave2.nxv8i16(<vscale x 4 x i16> %5, <vscale x 4 x i16> %2)
ret <vscale x 8 x i16> %interleaved.vec
}
; Expected to transform
@@ -81,10 +81,10 @@ define <vscale x 16 x i16> @complex_mul_v16i16(<vscale x 16 x i16> %a, <vscale x
; CHECK-NEXT: mov z0.d, z5.d
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.experimental.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> %a)
+ %a.deinterleaved = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> %a)
%a.real = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %a.deinterleaved, 0
%a.imag = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.experimental.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> %b)
+ %b.deinterleaved = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> %b)
%b.real = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %b.deinterleaved, 0
%b.imag = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %b.deinterleaved, 1
%0 = mul <vscale x 8 x i16> %b.imag, %a.real
@@ -93,7 +93,7 @@ entry:
%3 = mul <vscale x 8 x i16> %b.real, %a.real
%4 = mul <vscale x 8 x i16> %a.imag, %b.imag
%5 = sub <vscale x 8 x i16> %3, %4
- %interleaved.vec = tail call <vscale x 16 x i16> @llvm.experimental.vector.interleave2.nxv16i16(<vscale x 8 x i16> %5, <vscale x 8 x i16> %2)
+ %interleaved.vec = tail call <vscale x 16 x i16> @llvm.vector.interleave2.nxv16i16(<vscale x 8 x i16> %5, <vscale x 8 x i16> %2)
ret <vscale x 16 x i16> %interleaved.vec
}
@@ -119,10 +119,10 @@ define <vscale x 32 x i16> @complex_mul_v32i16(<vscale x 32 x i16> %a, <vscale x
; CHECK-NEXT: mov z2.d, z27.d
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <vscale x 16 x i16>, <vscale x 16 x i16> } @llvm.experimental.vector.deinterleave2.nxv32i16(<vscale x 32 x i16> %a)
+ %a.deinterleaved = tail call { <vscale x 16 x i16>, <vscale x 16 x i16> } @llvm.vector.deinterleave2.nxv32i16(<vscale x 32 x i16> %a)
%a.real = extractvalue { <vscale x 16 x i16>, <vscale x 16 x i16> } %a.deinterleaved, 0
%a.imag = extractvalue { <vscale x 16 x i16>, <vscale x 16 x i16> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <vscale x 16 x i16>, <vscale x 16 x i16> } @llvm.experimental.vector.deinterleave2.nxv32i16(<vscale x 32 x i16> %b)
+ %b.deinterleaved = tail call { <vscale x 16 x i16>, <vscale x 16 x i16> } @llvm.vector.deinterleave2.nxv32i16(<vscale x 32 x i16> %b)
%b.real = extractvalue { <vscale x 16 x i16>, <vscale x 16 x i16> } %b.deinterleaved, 0
%b.imag = extractvalue { <vscale x 16 x i16>, <vscale x 16 x i16> } %b.deinterleaved, 1
%0 = mul <vscale x 16 x i16> %b.imag, %a.real
@@ -131,20 +131,20 @@ entry:
%3 = mul <vscale x 16 x i16> %b.real, %a.real
%4 = mul <vscale x 16 x i16> %a.imag, %b.imag
%5 = sub <vscale x 16 x i16> %3, %4
- %interleaved.vec = tail call <vscale x 32 x i16> @llvm.experimental.vector.interleave2.nxv32i16(<vscale x 16 x i16> %5, <vscale x 16 x i16> %2)
+ %interleaved.vec = tail call <vscale x 32 x i16> @llvm.vector.interleave2.nxv32i16(<vscale x 16 x i16> %5, <vscale x 16 x i16> %2)
ret <vscale x 32 x i16> %interleaved.vec
}
-declare { <vscale x 2 x i16>, <vscale x 2 x i16> } @llvm.experimental.vector.deinterleave2.nxv4i16(<vscale x 4 x i16>)
-declare <vscale x 4 x i16> @llvm.experimental.vector.interleave2.nxv4i16(<vscale x 2 x i16>, <vscale x 2 x i16>)
+declare { <vscale x 2 x i16>, <vscale x 2 x i16> } @llvm.vector.deinterleave2.nxv4i16(<vscale x 4 x i16>)
+declare <vscale x 4 x i16> @llvm.vector.interleave2.nxv4i16(<vscale x 2 x i16>, <vscale x 2 x i16>)
-declare { <vscale x 4 x i16>, <vscale x 4 x i16> } @llvm.experimental.vector.deinterleave2.nxv8i16(<vscale x 8 x i16>)
-declare <vscale x 8 x i16> @llvm.experimental.vector.interleave2.nxv8i16(<vscale x 4 x i16>, <vscale x 4 x i16>)
+declare { <vscale x 4 x i16>, <vscale x 4 x i16> } @llvm.vector.deinterleave2.nxv8i16(<vscale x 8 x i16>)
+declare <vscale x 8 x i16> @llvm.vector.interleave2.nxv8i16(<vscale x 4 x i16>, <vscale x 4 x i16>)
-declare { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.experimental.vector.deinterleave2.nxv16i16(<vscale x 16 x i16>)
-declare <vscale x 16 x i16> @llvm.experimental.vector.interleave2.nxv16i16(<vscale x 8 x i16>, <vscale x 8 x i16>)
+declare { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16>)
+declare <vscale x 16 x i16> @llvm.vector.interleave2.nxv16i16(<vscale x 8 x i16>, <vscale x 8 x i16>)
-declare { <vscale x 16 x i16>, <vscale x 16 x i16> } @llvm.experimental.vector.deinterleave2.nxv32i16(<vscale x 32 x i16>)
-declare <vscale x 32 x i16> @llvm.experimental.vector.interleave2.nxv32i16(<vscale x 16 x i16>, <vscale x 16 x i16>)
+declare { <vscale x 16 x i16>, <vscale x 16 x i16> } @llvm.vector.deinterleave2.nxv32i16(<vscale x 32 x i16>)
+declare <vscale x 32 x i16> @llvm.vector.interleave2.nxv32i16(<vscale x 16 x i16>, <vscale x 16 x i16>)
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i32-add-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i32-add-scalable.ll
index 1ce480bbf3d8..458cd62269f8 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i32-add-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i32-add-scalable.ll
@@ -11,15 +11,15 @@ define <vscale x 4 x i32> @complex_add_v4i32(<vscale x 4 x i32> %a, <vscale x 4
; CHECK-NEXT: mov z0.d, z1.d
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.experimental.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %a)
+ %a.deinterleaved = tail call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %a)
%a.real = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %a.deinterleaved, 0
%a.imag = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.experimental.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %b)
+ %b.deinterleaved = tail call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %b)
%b.real = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %b.deinterleaved, 0
%b.imag = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %b.deinterleaved, 1
%0 = sub <vscale x 2 x i32> %b.real, %a.imag
%1 = add <vscale x 2 x i32> %b.imag, %a.real
- %interleaved.vec = tail call <vscale x 4 x i32> @llvm.experimental.vector.interleave2.nxv4i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1)
+ %interleaved.vec = tail call <vscale x 4 x i32> @llvm.vector.interleave2.nxv4i32(<vscale x 2 x i32> %0, <vscale x 2 x i32> %1)
ret <vscale x 4 x i32> %interleaved.vec
}
@@ -33,15 +33,15 @@ define <vscale x 8 x i32> @complex_add_v8i32(<vscale x 8 x i32> %a, <vscale x 8
; CHECK-NEXT: mov z1.d, z3.d
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %a)
+ %a.deinterleaved = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %a)
%a.real = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %a.deinterleaved, 0
%a.imag = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %b)
+ %b.deinterleaved = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %b)
%b.real = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %b.deinterleaved, 0
%b.imag = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %b.deinterleaved, 1
%0 = sub <vscale x 4 x i32> %b.real, %a.imag
%1 = add <vscale x 4 x i32> %b.imag, %a.real
- %interleaved.vec = tail call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1)
+ %interleaved.vec = tail call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1)
ret <vscale x 8 x i32> %interleaved.vec
}
@@ -59,23 +59,23 @@ define <vscale x 16 x i32> @complex_add_v16i32(<vscale x 16 x i32> %a, <vscale x
; CHECK-NEXT: mov z3.d, z7.d
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.experimental.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %a)
+ %a.deinterleaved = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %a)
%a.real = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %a.deinterleaved, 0
%a.imag = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.experimental.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %b)
+ %b.deinterleaved = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %b)
%b.real = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %b.deinterleaved, 0
%b.imag = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %b.deinterleaved, 1
%0 = sub <vscale x 8 x i32> %b.real, %a.imag
%1 = add <vscale x 8 x i32> %b.imag, %a.real
- %interleaved.vec = tail call <vscale x 16 x i32> @llvm.experimental.vector.interleave2.nxv16i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1)
+ %interleaved.vec = tail call <vscale x 16 x i32> @llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32> %0, <vscale x 8 x i32> %1)
ret <vscale x 16 x i32> %interleaved.vec
}
-declare { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.experimental.vector.deinterleave2.nxv4i32(<vscale x 4 x i32>)
-declare <vscale x 4 x i32> @llvm.experimental.vector.interleave2.nxv4i32(<vscale x 2 x i32>, <vscale x 2 x i32>)
+declare { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32>)
+declare <vscale x 4 x i32> @llvm.vector.interleave2.nxv4i32(<vscale x 2 x i32>, <vscale x 2 x i32>)
-declare { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32>)
-declare <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
+declare { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32>)
+declare <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
-declare { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.experimental.vector.deinterleave2.nxv16i32(<vscale x 16 x i32>)
-declare <vscale x 16 x i32> @llvm.experimental.vector.interleave2.nxv16i32(<vscale x 8 x i32>, <vscale x 8 x i32>)
+declare { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32>)
+declare <vscale x 16 x i32> @llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32>, <vscale x 8 x i32>)
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i32-mul-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i32-mul-scalable.ll
index d88eef9800d7..4cfe4707b9a9 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i32-mul-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i32-mul-scalable.ll
@@ -13,10 +13,10 @@ define <vscale x 4 x i32> @complex_mul_v4i32(<vscale x 4 x i32> %a, <vscale x 4
; CHECK-NEXT: mov z0.d, z2.d
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.experimental.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %a)
+ %a.deinterleaved = tail call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %a)
%a.real = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %a.deinterleaved, 0
%a.imag = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.experimental.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %b)
+ %b.deinterleaved = tail call { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %b)
%b.real = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %b.deinterleaved, 0
%b.imag = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } %b.deinterleaved, 1
%0 = mul <vscale x 2 x i32> %b.imag, %a.real
@@ -25,7 +25,7 @@ entry:
%3 = mul <vscale x 2 x i32> %b.real, %a.real
%4 = mul <vscale x 2 x i32> %a.imag, %b.imag
%5 = sub <vscale x 2 x i32> %3, %4
- %interleaved.vec = tail call <vscale x 4 x i32> @llvm.experimental.vector.interleave2.nxv4i32(<vscale x 2 x i32> %5, <vscale x 2 x i32> %2)
+ %interleaved.vec = tail call <vscale x 4 x i32> @llvm.vector.interleave2.nxv4i32(<vscale x 2 x i32> %5, <vscale x 2 x i32> %2)
ret <vscale x 4 x i32> %interleaved.vec
}
@@ -43,10 +43,10 @@ define <vscale x 8 x i32> @complex_mul_v8i32(<vscale x 8 x i32> %a, <vscale x 8
; CHECK-NEXT: mov z0.d, z5.d
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %a)
+ %a.deinterleaved = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %a)
%a.real = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %a.deinterleaved, 0
%a.imag = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %b)
+ %b.deinterleaved = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %b)
%b.real = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %b.deinterleaved, 0
%b.imag = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %b.deinterleaved, 1
%0 = mul <vscale x 4 x i32> %b.imag, %a.real
@@ -55,7 +55,7 @@ entry:
%3 = mul <vscale x 4 x i32> %b.real, %a.real
%4 = mul <vscale x 4 x i32> %a.imag, %b.imag
%5 = sub <vscale x 4 x i32> %3, %4
- %interleaved.vec = tail call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> %5, <vscale x 4 x i32> %2)
+ %interleaved.vec = tail call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %5, <vscale x 4 x i32> %2)
ret <vscale x 8 x i32> %interleaved.vec
}
@@ -81,10 +81,10 @@ define <vscale x 16 x i32> @complex_mul_v16i32(<vscale x 16 x i32> %a, <vscale x
; CHECK-NEXT: mov z2.d, z27.d
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.experimental.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %a)
+ %a.deinterleaved = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %a)
%a.real = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %a.deinterleaved, 0
%a.imag = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.experimental.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %b)
+ %b.deinterleaved = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %b)
%b.real = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %b.deinterleaved, 0
%b.imag = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %b.deinterleaved, 1
%0 = mul <vscale x 8 x i32> %b.imag, %a.real
@@ -93,16 +93,16 @@ entry:
%3 = mul <vscale x 8 x i32> %b.real, %a.real
%4 = mul <vscale x 8 x i32> %a.imag, %b.imag
%5 = sub <vscale x 8 x i32> %3, %4
- %interleaved.vec = tail call <vscale x 16 x i32> @llvm.experimental.vector.interleave2.nxv16i32(<vscale x 8 x i32> %5, <vscale x 8 x i32> %2)
+ %interleaved.vec = tail call <vscale x 16 x i32> @llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32> %5, <vscale x 8 x i32> %2)
ret <vscale x 16 x i32> %interleaved.vec
}
-declare { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.experimental.vector.deinterleave2.nxv4i32(<vscale x 4 x i32>)
-declare <vscale x 4 x i32> @llvm.experimental.vector.interleave2.nxv4i32(<vscale x 2 x i32>, <vscale x 2 x i32>)
+declare { <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32>)
+declare <vscale x 4 x i32> @llvm.vector.interleave2.nxv4i32(<vscale x 2 x i32>, <vscale x 2 x i32>)
-declare { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32>)
-declare <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
+declare { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32>)
+declare <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
-declare { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.experimental.vector.deinterleave2.nxv16i32(<vscale x 16 x i32>)
-declare <vscale x 16 x i32> @llvm.experimental.vector.interleave2.nxv16i32(<vscale x 8 x i32>, <vscale x 8 x i32>)
+declare { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32>)
+declare <vscale x 16 x i32> @llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32>, <vscale x 8 x i32>)
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i64-add-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i64-add-scalable.ll
index 0b59be9414fa..f06b55c68b7e 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i64-add-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i64-add-scalable.ll
@@ -11,15 +11,15 @@ define <vscale x 2 x i64> @complex_add_v2i64(<vscale x 2 x i64> %a, <vscale x 2
; CHECK-NEXT: mov z0.d, z1.d
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.experimental.vector.deinterleave2.nxv2i64(<vscale x 2 x i64> %a)
+ %a.deinterleaved = tail call { <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave2.nxv2i64(<vscale x 2 x i64> %a)
%a.real = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } %a.deinterleaved, 0
%a.imag = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.experimental.vector.deinterleave2.nxv2i64(<vscale x 2 x i64> %b)
+ %b.deinterleaved = tail call { <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave2.nxv2i64(<vscale x 2 x i64> %b)
%b.real = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } %b.deinterleaved, 0
%b.imag = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } %b.deinterleaved, 1
%0 = sub <vscale x 1 x i64> %b.real, %a.imag
%1 = add <vscale x 1 x i64> %b.imag, %a.real
- %interleaved.vec = tail call <vscale x 2 x i64> @llvm.experimental.vector.interleave2.nxv2i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1)
+ %interleaved.vec = tail call <vscale x 2 x i64> @llvm.vector.interleave2.nxv2i64(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1)
ret <vscale x 2 x i64> %interleaved.vec
}
@@ -33,15 +33,15 @@ define <vscale x 4 x i64> @complex_add_v4i64(<vscale x 4 x i64> %a, <vscale x 4
; CHECK-NEXT: mov z1.d, z3.d
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.experimental.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> %a)
+ %a.deinterleaved = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> %a)
%a.real = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } %a.deinterleaved, 0
%a.imag = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.experimental.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> %b)
+ %b.deinterleaved = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> %b)
%b.real = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } %b.deinterleaved, 0
%b.imag = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } %b.deinterleaved, 1
%0 = sub <vscale x 2 x i64> %b.real, %a.imag
%1 = add <vscale x 2 x i64> %b.imag, %a.real
- %interleaved.vec = tail call <vscale x 4 x i64> @llvm.experimental.vector.interleave2.nxv4i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1)
+ %interleaved.vec = tail call <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1)
ret <vscale x 4 x i64> %interleaved.vec
}
@@ -59,23 +59,23 @@ define <vscale x 8 x i64> @complex_add_v8i64(<vscale x 8 x i64> %a, <vscale x 8
; CHECK-NEXT: mov z3.d, z7.d
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <vscale x 4 x i64>, <vscale x 4 x i64> } @llvm.experimental.vector.deinterleave2.nxv8i64(<vscale x 8 x i64> %a)
+ %a.deinterleaved = tail call { <vscale x 4 x i64>, <vscale x 4 x i64> } @llvm.vector.deinterleave2.nxv8i64(<vscale x 8 x i64> %a)
%a.real = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64> } %a.deinterleaved, 0
%a.imag = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <vscale x 4 x i64>, <vscale x 4 x i64> } @llvm.experimental.vector.deinterleave2.nxv8i64(<vscale x 8 x i64> %b)
+ %b.deinterleaved = tail call { <vscale x 4 x i64>, <vscale x 4 x i64> } @llvm.vector.deinterleave2.nxv8i64(<vscale x 8 x i64> %b)
%b.real = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64> } %b.deinterleaved, 0
%b.imag = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64> } %b.deinterleaved, 1
%0 = sub <vscale x 4 x i64> %b.real, %a.imag
%1 = add <vscale x 4 x i64> %b.imag, %a.real
- %interleaved.vec = tail call <vscale x 8 x i64> @llvm.experimental.vector.interleave2.nxv8i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1)
+ %interleaved.vec = tail call <vscale x 8 x i64> @llvm.vector.interleave2.nxv8i64(<vscale x 4 x i64> %0, <vscale x 4 x i64> %1)
ret <vscale x 8 x i64> %interleaved.vec
}
-declare { <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.experimental.vector.deinterleave2.nxv2i64(<vscale x 2 x i64>)
-declare <vscale x 2 x i64> @llvm.experimental.vector.interleave2.nxv2i64(<vscale x 1 x i64>, <vscale x 1 x i64>)
+declare { <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave2.nxv2i64(<vscale x 2 x i64>)
+declare <vscale x 2 x i64> @llvm.vector.interleave2.nxv2i64(<vscale x 1 x i64>, <vscale x 1 x i64>)
-declare { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.experimental.vector.deinterleave2.nxv4i64(<vscale x 4 x i64>)
-declare <vscale x 4 x i64> @llvm.experimental.vector.interleave2.nxv4i64(<vscale x 2 x i64>, <vscale x 2 x i64>)
+declare { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64>)
+declare <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64>, <vscale x 2 x i64>)
-declare { <vscale x 4 x i64>, <vscale x 4 x i64> } @llvm.experimental.vector.deinterleave2.nxv8i64(<vscale x 8 x i64>)
-declare <vscale x 8 x i64> @llvm.experimental.vector.interleave2.nxv8i64(<vscale x 4 x i64>, <vscale x 4 x i64>)
+declare { <vscale x 4 x i64>, <vscale x 4 x i64> } @llvm.vector.deinterleave2.nxv8i64(<vscale x 8 x i64>)
+declare <vscale x 8 x i64> @llvm.vector.interleave2.nxv8i64(<vscale x 4 x i64>, <vscale x 4 x i64>)
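The complex_add tests above all reduce to the same rotated addition: with each operand split into real/imaginary halves, they compute real' = b.real - a.imag and imag' = b.imag + a.real, i.e. b + i*a, which is the shape the AArch64 complex-deinterleaving pass is built to recognize. A reduced sketch of that pattern under the new intrinsic names (hypothetical function name; operand ordering as in the tests):

define <vscale x 2 x i64> @cadd90_sketch(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) {
  %ad = call { <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave2.nxv2i64(<vscale x 2 x i64> %a)
  %ar = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } %ad, 0
  %ai = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } %ad, 1
  %bd = call { <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave2.nxv2i64(<vscale x 2 x i64> %b)
  %br = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } %bd, 0
  %bi = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } %bd, 1
  %re = sub <vscale x 1 x i64> %br, %ai  ; real part: b.real - a.imag
  %im = add <vscale x 1 x i64> %bi, %ar  ; imag part: b.imag + a.real
  %res = call <vscale x 2 x i64> @llvm.vector.interleave2.nxv2i64(<vscale x 1 x i64> %re, <vscale x 1 x i64> %im)
  ret <vscale x 2 x i64> %res
}
declare { <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave2.nxv2i64(<vscale x 2 x i64>)
declare <vscale x 2 x i64> @llvm.vector.interleave2.nxv2i64(<vscale x 1 x i64>, <vscale x 1 x i64>)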
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i64-mul-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i64-mul-scalable.ll
index 16e1f3e63dce..5975f3b491d4 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i64-mul-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i64-mul-scalable.ll
@@ -13,10 +13,10 @@ define <vscale x 2 x i64> @complex_mul_v2i64(<vscale x 2 x i64> %a, <vscale x 2
; CHECK-NEXT: mov z0.d, z2.d
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.experimental.vector.deinterleave2.nxv2i64(<vscale x 2 x i64> %a)
+ %a.deinterleaved = tail call { <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave2.nxv2i64(<vscale x 2 x i64> %a)
%a.real = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } %a.deinterleaved, 0
%a.imag = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.experimental.vector.deinterleave2.nxv2i64(<vscale x 2 x i64> %b)
+ %b.deinterleaved = tail call { <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave2.nxv2i64(<vscale x 2 x i64> %b)
%b.real = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } %b.deinterleaved, 0
%b.imag = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } %b.deinterleaved, 1
%0 = mul <vscale x 1 x i64> %b.imag, %a.real
@@ -25,7 +25,7 @@ entry:
%3 = mul <vscale x 1 x i64> %b.real, %a.real
%4 = mul <vscale x 1 x i64> %a.imag, %b.imag
%5 = sub <vscale x 1 x i64> %3, %4
- %interleaved.vec = tail call <vscale x 2 x i64> @llvm.experimental.vector.interleave2.nxv2i64(<vscale x 1 x i64> %5, <vscale x 1 x i64> %2)
+ %interleaved.vec = tail call <vscale x 2 x i64> @llvm.vector.interleave2.nxv2i64(<vscale x 1 x i64> %5, <vscale x 1 x i64> %2)
ret <vscale x 2 x i64> %interleaved.vec
}
@@ -43,10 +43,10 @@ define <vscale x 4 x i64> @complex_mul_v4i64(<vscale x 4 x i64> %a, <vscale x 4
; CHECK-NEXT: mov z0.d, z5.d
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.experimental.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> %a)
+ %a.deinterleaved = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> %a)
%a.real = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } %a.deinterleaved, 0
%a.imag = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.experimental.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> %b)
+ %b.deinterleaved = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> %b)
%b.real = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } %b.deinterleaved, 0
%b.imag = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } %b.deinterleaved, 1
%0 = mul <vscale x 2 x i64> %b.imag, %a.real
@@ -55,7 +55,7 @@ entry:
%3 = mul <vscale x 2 x i64> %b.real, %a.real
%4 = mul <vscale x 2 x i64> %a.imag, %b.imag
%5 = sub <vscale x 2 x i64> %3, %4
- %interleaved.vec = tail call <vscale x 4 x i64> @llvm.experimental.vector.interleave2.nxv4i64(<vscale x 2 x i64> %5, <vscale x 2 x i64> %2)
+ %interleaved.vec = tail call <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64> %5, <vscale x 2 x i64> %2)
ret <vscale x 4 x i64> %interleaved.vec
}
@@ -81,10 +81,10 @@ define <vscale x 8 x i64> @complex_mul_v8i64(<vscale x 8 x i64> %a, <vscale x 8
; CHECK-NEXT: mov z2.d, z27.d
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <vscale x 4 x i64>, <vscale x 4 x i64> } @llvm.experimental.vector.deinterleave2.nxv8i64(<vscale x 8 x i64> %a)
+ %a.deinterleaved = tail call { <vscale x 4 x i64>, <vscale x 4 x i64> } @llvm.vector.deinterleave2.nxv8i64(<vscale x 8 x i64> %a)
%a.real = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64> } %a.deinterleaved, 0
%a.imag = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <vscale x 4 x i64>, <vscale x 4 x i64> } @llvm.experimental.vector.deinterleave2.nxv8i64(<vscale x 8 x i64> %b)
+ %b.deinterleaved = tail call { <vscale x 4 x i64>, <vscale x 4 x i64> } @llvm.vector.deinterleave2.nxv8i64(<vscale x 8 x i64> %b)
%b.real = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64> } %b.deinterleaved, 0
%b.imag = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64> } %b.deinterleaved, 1
%0 = mul <vscale x 4 x i64> %b.imag, %a.real
@@ -93,7 +93,7 @@ entry:
%3 = mul <vscale x 4 x i64> %b.real, %a.real
%4 = mul <vscale x 4 x i64> %a.imag, %b.imag
%5 = sub <vscale x 4 x i64> %3, %4
- %interleaved.vec = tail call <vscale x 8 x i64> @llvm.experimental.vector.interleave2.nxv8i64(<vscale x 4 x i64> %5, <vscale x 4 x i64> %2)
+ %interleaved.vec = tail call <vscale x 8 x i64> @llvm.vector.interleave2.nxv8i64(<vscale x 4 x i64> %5, <vscale x 4 x i64> %2)
ret <vscale x 8 x i64> %interleaved.vec
}
@@ -119,11 +119,11 @@ define <vscale x 8 x i64> @complex_minus_mul_v8i64(<vscale x 8 x i64> %a, <vscal
; CHECK-NEXT: mov z2.d, z27.d
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <vscale x 4 x i64>, <vscale x 4 x i64> } @llvm.experimental.vector.deinterleave2.nxv8i64(<vscale x 8 x i64> %a)
+ %a.deinterleaved = tail call { <vscale x 4 x i64>, <vscale x 4 x i64> } @llvm.vector.deinterleave2.nxv8i64(<vscale x 8 x i64> %a)
%a.real = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64> } %a.deinterleaved, 0
%a.imag = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64> } %a.deinterleaved, 1
%0 = sub <vscale x 4 x i64> zeroinitializer, %a.real
- %b.deinterleaved = tail call { <vscale x 4 x i64>, <vscale x 4 x i64> } @llvm.experimental.vector.deinterleave2.nxv8i64(<vscale x 8 x i64> %b)
+ %b.deinterleaved = tail call { <vscale x 4 x i64>, <vscale x 4 x i64> } @llvm.vector.deinterleave2.nxv8i64(<vscale x 8 x i64> %b)
%b.real = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64> } %b.deinterleaved, 0
%b.imag = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64> } %b.deinterleaved, 1
%1 = mul <vscale x 4 x i64> %b.real, %0
@@ -132,15 +132,15 @@ entry:
%4 = mul <vscale x 4 x i64> %b.real, %a.imag
%5 = mul <vscale x 4 x i64> %b.imag, %0
%6 = sub <vscale x 4 x i64> %5, %4
- %interleaved.vec = tail call <vscale x 8 x i64> @llvm.experimental.vector.interleave2.nxv8i64(<vscale x 4 x i64> %3, <vscale x 4 x i64> %6)
+ %interleaved.vec = tail call <vscale x 8 x i64> @llvm.vector.interleave2.nxv8i64(<vscale x 4 x i64> %3, <vscale x 4 x i64> %6)
ret <vscale x 8 x i64> %interleaved.vec
}
-declare { <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.experimental.vector.deinterleave2.nxv2i64(<vscale x 2 x i64>)
-declare <vscale x 2 x i64> @llvm.experimental.vector.interleave2.nxv2i64(<vscale x 1 x i64>, <vscale x 1 x i64>)
+declare { <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave2.nxv2i64(<vscale x 2 x i64>)
+declare <vscale x 2 x i64> @llvm.vector.interleave2.nxv2i64(<vscale x 1 x i64>, <vscale x 1 x i64>)
-declare { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.experimental.vector.deinterleave2.nxv4i64(<vscale x 4 x i64>)
-declare <vscale x 4 x i64> @llvm.experimental.vector.interleave2.nxv4i64(<vscale x 2 x i64>, <vscale x 2 x i64>)
+declare { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64>)
+declare <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64>, <vscale x 2 x i64>)
-declare { <vscale x 4 x i64>, <vscale x 4 x i64> } @llvm.experimental.vector.deinterleave2.nxv8i64(<vscale x 8 x i64>)
-declare <vscale x 8 x i64> @llvm.experimental.vector.interleave2.nxv8i64(<vscale x 4 x i64>, <vscale x 4 x i64>)
+declare { <vscale x 4 x i64>, <vscale x 4 x i64> } @llvm.vector.deinterleave2.nxv8i64(<vscale x 8 x i64>)
+declare <vscale x 8 x i64> @llvm.vector.interleave2.nxv8i64(<vscale x 4 x i64>, <vscale x 4 x i64>)
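Likewise, the complex_mul tests encode the textbook product (ar + i*ai)(br + i*bi): real = ar*br - ai*bi and imag = ar*bi + ai*br, computed on deinterleaved halves and re-interleaved at the end. A fixed-width sketch of the same recipe (illustrative only; hypothetical function name):

define <4 x i32> @cmul_sketch(<4 x i32> %a, <4 x i32> %b) {
  %ad = call { <2 x i32>, <2 x i32> } @llvm.vector.deinterleave2.v4i32(<4 x i32> %a)
  %ar = extractvalue { <2 x i32>, <2 x i32> } %ad, 0
  %ai = extractvalue { <2 x i32>, <2 x i32> } %ad, 1
  %bd = call { <2 x i32>, <2 x i32> } @llvm.vector.deinterleave2.v4i32(<4 x i32> %b)
  %br = extractvalue { <2 x i32>, <2 x i32> } %bd, 0
  %bi = extractvalue { <2 x i32>, <2 x i32> } %bd, 1
  %rr = mul <2 x i32> %ar, %br  ; ar*br
  %ii = mul <2 x i32> %ai, %bi  ; ai*bi
  %re = sub <2 x i32> %rr, %ii  ; real part
  %ri = mul <2 x i32> %ar, %bi  ; ar*bi
  %ir = mul <2 x i32> %ai, %br  ; ai*br
  %im = add <2 x i32> %ri, %ir  ; imaginary part
  %res = call <4 x i32> @llvm.vector.interleave2.v4i32(<2 x i32> %re, <2 x i32> %im)
  ret <4 x i32> %res
}
declare { <2 x i32>, <2 x i32> } @llvm.vector.deinterleave2.v4i32(<4 x i32>)
declare <4 x i32> @llvm.vector.interleave2.v4i32(<2 x i32>, <2 x i32>)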
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i8-add-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i8-add-scalable.ll
index b631486137e6..81872c1723f2 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i8-add-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i8-add-scalable.ll
@@ -22,15 +22,15 @@ define <vscale x 8 x i8> @complex_add_v8i8(<vscale x 8 x i8> %a, <vscale x 8 x i
; CHECK-NEXT: uzp1 z0.h, z0.h, z2.h
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <vscale x 4 x i8>, <vscale x 4 x i8> } @llvm.experimental.vector.deinterleave2.nxv8i8(<vscale x 8 x i8> %a)
+ %a.deinterleaved = tail call { <vscale x 4 x i8>, <vscale x 4 x i8> } @llvm.vector.deinterleave2.nxv8i8(<vscale x 8 x i8> %a)
%a.real = extractvalue { <vscale x 4 x i8>, <vscale x 4 x i8> } %a.deinterleaved, 0
%a.imag = extractvalue { <vscale x 4 x i8>, <vscale x 4 x i8> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <vscale x 4 x i8>, <vscale x 4 x i8> } @llvm.experimental.vector.deinterleave2.nxv8i8(<vscale x 8 x i8> %b)
+ %b.deinterleaved = tail call { <vscale x 4 x i8>, <vscale x 4 x i8> } @llvm.vector.deinterleave2.nxv8i8(<vscale x 8 x i8> %b)
%b.real = extractvalue { <vscale x 4 x i8>, <vscale x 4 x i8> } %b.deinterleaved, 0
%b.imag = extractvalue { <vscale x 4 x i8>, <vscale x 4 x i8> } %b.deinterleaved, 1
%0 = sub <vscale x 4 x i8> %b.real, %a.imag
%1 = add <vscale x 4 x i8> %b.imag, %a.real
- %interleaved.vec = tail call <vscale x 8 x i8> @llvm.experimental.vector.interleave2.nxv8i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1)
+ %interleaved.vec = tail call <vscale x 8 x i8> @llvm.vector.interleave2.nxv8i8(<vscale x 4 x i8> %0, <vscale x 4 x i8> %1)
ret <vscale x 8 x i8> %interleaved.vec
}
@@ -42,15 +42,15 @@ define <vscale x 16 x i8> @complex_add_v16i8(<vscale x 16 x i8> %a, <vscale x 16
; CHECK-NEXT: mov z0.d, z1.d
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <vscale x 8 x i8>, <vscale x 8 x i8> } @llvm.experimental.vector.deinterleave2.nxv16i8(<vscale x 16 x i8> %a)
+ %a.deinterleaved = tail call { <vscale x 8 x i8>, <vscale x 8 x i8> } @llvm.vector.deinterleave2.nxv16i8(<vscale x 16 x i8> %a)
%a.real = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8> } %a.deinterleaved, 0
%a.imag = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <vscale x 8 x i8>, <vscale x 8 x i8> } @llvm.experimental.vector.deinterleave2.nxv16i8(<vscale x 16 x i8> %b)
+ %b.deinterleaved = tail call { <vscale x 8 x i8>, <vscale x 8 x i8> } @llvm.vector.deinterleave2.nxv16i8(<vscale x 16 x i8> %b)
%b.real = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8> } %b.deinterleaved, 0
%b.imag = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8> } %b.deinterleaved, 1
%0 = sub <vscale x 8 x i8> %b.real, %a.imag
%1 = add <vscale x 8 x i8> %b.imag, %a.real
- %interleaved.vec = tail call <vscale x 16 x i8> @llvm.experimental.vector.interleave2.nxv16i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1)
+ %interleaved.vec = tail call <vscale x 16 x i8> @llvm.vector.interleave2.nxv16i8(<vscale x 8 x i8> %0, <vscale x 8 x i8> %1)
ret <vscale x 16 x i8> %interleaved.vec
}
@@ -64,23 +64,23 @@ define <vscale x 32 x i8> @complex_add_v32i8(<vscale x 32 x i8> %a, <vscale x 32
; CHECK-NEXT: mov z1.d, z3.d
; CHECK-NEXT: ret
entry:
- %a.deinterleaved = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.experimental.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %a)
+ %a.deinterleaved = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %a)
%a.real = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %a.deinterleaved, 0
%a.imag = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %a.deinterleaved, 1
- %b.deinterleaved = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.experimental.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %b)
+ %b.deinterleaved = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %b)
%b.real = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %b.deinterleaved, 0
%b.imag = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %b.deinterleaved, 1
%0 = sub <vscale x 16 x i8> %b.real, %a.imag
%1 = add <vscale x 16 x i8> %b.imag, %a.real
- %interleaved.vec = tail call <vscale x 32 x i8> @llvm.experimental.vector.interleave2.nxv32i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1)
+ %interleaved.vec = tail call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> %0, <vscale x 16 x i8> %1)
ret <vscale x 32 x i8> %interleaved.vec
}
-declare { <vscale x 4 x i8>, <vscale x 4 x i8> } @llvm.experimental.vector.deinterleave2.nxv8i8(<vscale x 8 x i8>)
-declare <vscale x 8 x i8> @llvm.experimental.vector.interleave2.nxv8i8(<vscale x 4 x i8>, <vscale x 4 x i8>)
+declare { <vscale x 4 x i8>, <vscale x 4 x i8> } @llvm.vector.deinterleave2.nxv8i8(<vscale x 8 x i8>)
+declare <vscale x 8 x i8> @llvm.vector.interleave2.nxv8i8(<vscale x 4 x i8>, <vscale x 4 x i8>)
-declare { <vscale x 8 x i8>, <vscale x 8 x i8> } @llvm.experimental.vector.deinterleave2.nxv16i8(<vscale x 16 x i8>)
-declare <vscale x 16 x i8> @llvm.experimental.vector.interleave2.nxv16i8(<vscale x 8 x i8>, <vscale x 8 x i8>)
+declare { <vscale x 8 x i8>, <vscale x 8 x i8> } @llvm.vector.deinterleave2.nxv16i8(<vscale x 16 x i8>)
+declare <vscale x 16 x i8> @llvm.vector.interleave2.nxv16i8(<vscale x 8 x i8>, <vscale x 8 x i8>)
-declare { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.experimental.vector.deinterleave2.nxv32i8(<vscale x 32 x i8>)
-declare <vscale x 32 x i8> @llvm.experimental.vector.interleave2.nxv32i8(<vscale x 16 x i8>, <vscale x 16 x i8>)
+declare { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8>)
+declare <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8>, <vscale x 16 x i8>)
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll
index 19318fdeeca7..ac2b21af29ab 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll
@@ -69,14 +69,14 @@ vector.body: ; preds = %vector.body, %entry
%vec.phi27 = phi <vscale x 2 x double> [ zeroinitializer, %entry ], [ %16, %vector.body ]
%scevgep = getelementptr i8, ptr %a, i64 %lsr.iv
%scevgep34 = getelementptr i8, ptr %b, i64 %lsr.iv
- %interleaved.mask = tail call <vscale x 4 x i1> @llvm.experimental.vector.interleave2.nxv4i1(<vscale x 2 x i1> %active.lane.mask, <vscale x 2 x i1> %active.lane.mask)
+ %interleaved.mask = tail call <vscale x 4 x i1> @llvm.vector.interleave2.nxv4i1(<vscale x 2 x i1> %active.lane.mask, <vscale x 2 x i1> %active.lane.mask)
%wide.masked.vec = tail call <vscale x 4 x double> @llvm.masked.load.nxv4f64.p0(ptr %scevgep, i32 8, <vscale x 4 x i1> %interleaved.mask, <vscale x 4 x double> poison)
- %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.masked.vec)
+ %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.masked.vec)
%3 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 0
%4 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 1
- %interleaved.mask28 = tail call <vscale x 4 x i1> @llvm.experimental.vector.interleave2.nxv4i1(<vscale x 2 x i1> %active.lane.mask, <vscale x 2 x i1> %active.lane.mask)
+ %interleaved.mask28 = tail call <vscale x 4 x i1> @llvm.vector.interleave2.nxv4i1(<vscale x 2 x i1> %active.lane.mask, <vscale x 2 x i1> %active.lane.mask)
%wide.masked.vec29 = tail call <vscale x 4 x double> @llvm.masked.load.nxv4f64.p0(ptr %scevgep34, i32 8, <vscale x 4 x i1> %interleaved.mask28, <vscale x 4 x double> poison)
- %strided.vec30 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.masked.vec29)
+ %strided.vec30 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.masked.vec29)
%5 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec30, 0
%6 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec30, 1
%7 = fmul fast <vscale x 2 x double> %6, %3
@@ -175,13 +175,13 @@ vector.body: ; preds = %vector.body, %entry
%4 = icmp ne <vscale x 2 x i32> %wide.load, zeroinitializer
%scevgep49 = getelementptr i8, ptr %a, i64 %lsr.iv48
%scevgep50 = getelementptr i8, ptr %b, i64 %lsr.iv48
- %interleaved.mask = tail call <vscale x 4 x i1> @llvm.experimental.vector.interleave2.nxv4i1(<vscale x 2 x i1> %4, <vscale x 2 x i1> %4)
+ %interleaved.mask = tail call <vscale x 4 x i1> @llvm.vector.interleave2.nxv4i1(<vscale x 2 x i1> %4, <vscale x 2 x i1> %4)
%wide.masked.vec = tail call <vscale x 4 x double> @llvm.masked.load.nxv4f64.p0(ptr %scevgep49, i32 8, <vscale x 4 x i1> %interleaved.mask, <vscale x 4 x double> poison)
- %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.masked.vec)
+ %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.masked.vec)
%5 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 0
%6 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 1
%wide.masked.vec32 = tail call <vscale x 4 x double> @llvm.masked.load.nxv4f64.p0(ptr %scevgep50, i32 8, <vscale x 4 x i1> %interleaved.mask, <vscale x 4 x double> poison)
- %strided.vec33 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.masked.vec32)
+ %strided.vec33 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.masked.vec32)
%7 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec33, 0
%8 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec33, 1
%9 = fmul fast <vscale x 2 x double> %8, %5
@@ -279,14 +279,14 @@ vector.body: ; preds = %vector.body, %entry
%scevgep38 = getelementptr i8, ptr %a, i64 %lsr.iv
%scevgep39 = getelementptr i8, ptr %b, i64 %lsr.iv
%5 = select <vscale x 2 x i1> %active.lane.mask, <vscale x 2 x i1> %4, <vscale x 2 x i1> zeroinitializer
- %interleaved.mask = tail call <vscale x 4 x i1> @llvm.experimental.vector.interleave2.nxv4i1(<vscale x 2 x i1> %5, <vscale x 2 x i1> %5)
+ %interleaved.mask = tail call <vscale x 4 x i1> @llvm.vector.interleave2.nxv4i1(<vscale x 2 x i1> %5, <vscale x 2 x i1> %5)
%wide.masked.vec = tail call <vscale x 4 x double> @llvm.masked.load.nxv4f64.p0(ptr %scevgep38, i32 8, <vscale x 4 x i1> %interleaved.mask, <vscale x 4 x double> poison)
- %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.masked.vec)
+ %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.masked.vec)
%6 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 0
%7 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 1
- %interleaved.mask31 = tail call <vscale x 4 x i1> @llvm.experimental.vector.interleave2.nxv4i1(<vscale x 2 x i1> %5, <vscale x 2 x i1> %5)
+ %interleaved.mask31 = tail call <vscale x 4 x i1> @llvm.vector.interleave2.nxv4i1(<vscale x 2 x i1> %5, <vscale x 2 x i1> %5)
%wide.masked.vec32 = tail call <vscale x 4 x double> @llvm.masked.load.nxv4f64.p0(ptr %scevgep39, i32 8, <vscale x 4 x i1> %interleaved.mask31, <vscale x 4 x double> poison)
- %strided.vec33 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.masked.vec32)
+ %strided.vec33 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.masked.vec32)
%8 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec33, 0
%9 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec33, 1
%10 = fmul fast <vscale x 2 x double> %9, %6
@@ -320,6 +320,6 @@ declare i64 @llvm.vscale.i64()
declare <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64, i64)
declare <vscale x 2 x i32> @llvm.masked.load.nxv2i32.p0(ptr nocapture, i32 immarg, <vscale x 2 x i1>, <vscale x 2 x i32>)
declare <vscale x 4 x double> @llvm.masked.load.nxv4f64.p0(ptr nocapture, i32 immarg, <vscale x 4 x i1>, <vscale x 4 x double>)
-declare <vscale x 4 x i1> @llvm.experimental.vector.interleave2.nxv4i1(<vscale x 2 x i1>, <vscale x 2 x i1>)
-declare { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
+declare <vscale x 4 x i1> @llvm.vector.interleave2.nxv4i1(<vscale x 2 x i1>, <vscale x 2 x i1>)
+declare { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
declare double @llvm.vector.reduce.fadd.nxv2f64(double, <vscale x 2 x double>)
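In the predicated loops above, the per-pair lane mask must be widened before the interleaved complex data can be loaded: interleaving the predicate with itself duplicates each mask lane across a real/imaginary pair. A self-contained sketch using the same declarations (hypothetical function name, not part of the patch):

define { <vscale x 2 x double>, <vscale x 2 x double> } @masked_pair_load(ptr %p, <vscale x 2 x i1> %m) {
  ; each i1 in %m guards one complex element, i.e. two adjacent doubles
  %mask = call <vscale x 4 x i1> @llvm.vector.interleave2.nxv4i1(<vscale x 2 x i1> %m, <vscale x 2 x i1> %m)
  %wide = call <vscale x 4 x double> @llvm.masked.load.nxv4f64.p0(ptr %p, i32 8, <vscale x 4 x i1> %mask, <vscale x 4 x double> poison)
  %split = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide)
  ret { <vscale x 2 x double>, <vscale x 2 x double> } %split  ; {reals, imags}
}
declare <vscale x 4 x i1> @llvm.vector.interleave2.nxv4i1(<vscale x 2 x i1>, <vscale x 2 x i1>)
declare <vscale x 4 x double> @llvm.masked.load.nxv4f64.p0(ptr, i32 immarg, <vscale x 4 x i1>, <vscale x 4 x double>)
declare { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)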
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll
index 5bef95910d90..af07519ad53d 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll
@@ -64,11 +64,11 @@ vector.body: ; preds = %vector.body, %entry
%scevgep46 = getelementptr i8, ptr %a, i64 %lsr.iv27
%scevgep47 = getelementptr i8, ptr %b, i64 %lsr.iv27
%wide.vec = load <vscale x 4 x double>, ptr %scevgep46, align 8
- %3 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.vec)
+ %3 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.vec)
%4 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %3, 0
%5 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %3, 1
%wide.vec30 = load <vscale x 4 x double>, ptr %scevgep47, align 8
- %6 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.vec30)
+ %6 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.vec30)
%7 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %6, 0
%8 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %6, 1
%9 = fmul fast <vscale x 2 x double> %8, %4
@@ -156,11 +156,11 @@ vector.body: ; preds = %vector.body, %entry
%scevgep46 = getelementptr i8, ptr %a, i64 %lsr.iv27
%scevgep47 = getelementptr i8, ptr %b, i64 %lsr.iv27
%wide.vec = load <vscale x 4 x double>, ptr %scevgep46, align 8
- %3 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.vec)
+ %3 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.vec)
%4 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %3, 0
%5 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %3, 1
%wide.vec30 = load <vscale x 4 x double>, ptr %scevgep47, align 8
- %6 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.vec30)
+ %6 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.vec30)
%7 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %6, 0
%8 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %6, 1
%9 = fmul fast <vscale x 2 x double> %8, %4
@@ -266,16 +266,16 @@ vector.body: ; preds = %vector.body, %entry
%scevgep62 = getelementptr i8, ptr %scevgep61, i64 %lsr.iv34
%wide.vec = load <vscale x 4 x double>, ptr %scevgep57, align 8
%wide.vec32 = load <vscale x 4 x double>, ptr %scevgep64, align 8
- %4 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.vec)
- %5 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.vec32)
+ %4 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.vec)
+ %5 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.vec32)
%6 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %4, 0
%7 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %5, 0
%8 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %4, 1
%9 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %5, 1
%wide.vec34 = load <vscale x 4 x double>, ptr %scevgep58, align 8
%wide.vec35 = load <vscale x 4 x double>, ptr %scevgep62, align 8
- %10 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.vec34)
- %11 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.vec35)
+ %10 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.vec34)
+ %11 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.vec35)
%12 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %10, 0
%13 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %11, 0
%14 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %10, 1
@@ -375,7 +375,7 @@ vector.body: ; preds = %vector.body, %entry
%5 = add <vscale x 2 x i32> %wide.load, %vec.phi
%6 = getelementptr inbounds %"class.std::complex", ptr %a, i64 %index
%wide.vec = load <vscale x 4 x double>, ptr %6, align 8
- %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.vec)
+ %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.vec)
%7 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 0
%8 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 1
%9 = fadd fast <vscale x 2 x double> %7, %vec.phi13
@@ -396,6 +396,6 @@ middle.block: ; preds = %vector.body
declare i64 @llvm.vscale.i64()
-declare { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
+declare { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
declare double @llvm.vector.reduce.fadd.nxv2f64(double, <vscale x 2 x double>)
declare i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32>)
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-splat-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-splat-scalable.ll
index 17bf5ba6eb48..b4425c0c01e1 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-splat-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-splat-scalable.ll
@@ -28,10 +28,10 @@ define <vscale x 4 x double> @complex_mul_const(<vscale x 4 x double> %a, <vscal
; CHECK-NEXT: mov z1.d, z4.d
; CHECK-NEXT: ret
entry:
- %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
+ %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
%0 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 0
%1 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 1
- %strided.vec48 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
+ %strided.vec48 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
%2 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec48, 0
%3 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec48, 1
%4 = fmul fast <vscale x 2 x double> %3, %0
@@ -46,7 +46,7 @@ entry:
%13 = fmul fast <vscale x 2 x double> %9, splat (double 1.100000e+01)
%14 = fmul fast <vscale x 2 x double> %6, splat (double 3.000000e+00)
%15 = fsub fast <vscale x 2 x double> %13, %14
- %interleaved.vec = tail call <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double> %15, <vscale x 2 x double> %12)
+ %interleaved.vec = tail call <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double> %15, <vscale x 2 x double> %12)
ret <vscale x 4 x double> %interleaved.vec
}
@@ -83,10 +83,10 @@ entry:
%broadcast.splat = shufflevector <vscale x 2 x double> %broadcast.splatinsert, <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer
%broadcast.splatinsert49 = insertelement <vscale x 2 x double> poison, double %c.coerce.fca.0.extract, i64 0
%broadcast.splat50 = shufflevector <vscale x 2 x double> %broadcast.splatinsert49, <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer
- %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
+ %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
%0 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 0
%1 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 1
- %strided.vec48 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
+ %strided.vec48 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
%2 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec48, 0
%3 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec48, 1
%4 = fmul fast <vscale x 2 x double> %3, %0
@@ -101,9 +101,9 @@ entry:
%13 = fmul fast <vscale x 2 x double> %9, %broadcast.splat50
%14 = fmul fast <vscale x 2 x double> %6, %broadcast.splat
%15 = fsub fast <vscale x 2 x double> %13, %14
- %interleaved.vec = tail call <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double> %15, <vscale x 2 x double> %12)
+ %interleaved.vec = tail call <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double> %15, <vscale x 2 x double> %12)
ret <vscale x 4 x double> %interleaved.vec
}
-declare { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
-declare <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double>, <vscale x 2 x double>)
+declare { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
+declare <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double>, <vscale x 2 x double>)
diff --git a/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll b/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll
index 2ad5623b6551..c58db8290c87 100644
--- a/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll
+++ b/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll
@@ -25,7 +25,7 @@ define {<2 x half>, <2 x half>} @vector_deinterleave_v2f16_v4f16(<4 x half> %vec
; CHECK-GI-NEXT: // kill: def $d1 killed $d1 killed $q1
; CHECK-GI-NEXT: fmov d0, d2
; CHECK-GI-NEXT: ret
- %retval = call {<2 x half>, <2 x half>} @llvm.experimental.vector.deinterleave2.v4f16(<4 x half> %vec)
+ %retval = call {<2 x half>, <2 x half>} @llvm.vector.deinterleave2.v4f16(<4 x half> %vec)
ret {<2 x half>, <2 x half>} %retval
}
@@ -45,7 +45,7 @@ define {<4 x half>, <4 x half>} @vector_deinterleave_v4f16_v8f16(<8 x half> %vec
; CHECK-GI-NEXT: // kill: def $d1 killed $d1 killed $q1
; CHECK-GI-NEXT: fmov d0, d2
; CHECK-GI-NEXT: ret
- %retval = call {<4 x half>, <4 x half>} @llvm.experimental.vector.deinterleave2.v8f16(<8 x half> %vec)
+ %retval = call {<4 x half>, <4 x half>} @llvm.vector.deinterleave2.v8f16(<8 x half> %vec)
ret {<4 x half>, <4 x half>} %retval
}
@@ -56,7 +56,7 @@ define {<8 x half>, <8 x half>} @vector_deinterleave_v8f16_v16f16(<16 x half> %v
; CHECK-NEXT: uzp2 v1.8h, v0.8h, v1.8h
; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: ret
- %retval = call {<8 x half>, <8 x half>} @llvm.experimental.vector.deinterleave2.v16f16(<16 x half> %vec)
+ %retval = call {<8 x half>, <8 x half>} @llvm.vector.deinterleave2.v16f16(<16 x half> %vec)
ret {<8 x half>, <8 x half>} %retval
}
@@ -76,7 +76,7 @@ define {<2 x float>, <2 x float>} @vector_deinterleave_v2f32_v4f32(<4 x float> %
; CHECK-GI-NEXT: // kill: def $d1 killed $d1 killed $q1
; CHECK-GI-NEXT: fmov d0, d2
; CHECK-GI-NEXT: ret
- %retval = call {<2 x float>, <2 x float>} @llvm.experimental.vector.deinterleave2.v4f32(<4 x float> %vec)
+ %retval = call {<2 x float>, <2 x float>} @llvm.vector.deinterleave2.v4f32(<4 x float> %vec)
ret {<2 x float>, <2 x float>} %retval
}
@@ -87,7 +87,7 @@ define {<4 x float>, <4 x float>} @vector_deinterleave_v4f32_v8f32(<8 x float> %
; CHECK-NEXT: uzp2 v1.4s, v0.4s, v1.4s
; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: ret
- %retval = call {<4 x float>, <4 x float>} @llvm.experimental.vector.deinterleave2.v8f32(<8 x float> %vec)
+ %retval = call {<4 x float>, <4 x float>} @llvm.vector.deinterleave2.v8f32(<8 x float> %vec)
ret {<4 x float>, <4 x float>} %retval
}
@@ -98,7 +98,7 @@ define {<2 x double>, <2 x double>} @vector_deinterleave_v2f64_v4f64(<4 x double
; CHECK-NEXT: zip2 v1.2d, v0.2d, v1.2d
; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: ret
- %retval = call {<2 x double>, <2 x double>} @llvm.experimental.vector.deinterleave2.v4f64(<4 x double> %vec)
+ %retval = call {<2 x double>, <2 x double>} @llvm.vector.deinterleave2.v4f64(<4 x double> %vec)
ret {<2 x double>, <2 x double>} %retval
}
@@ -111,7 +111,7 @@ define {<16 x i8>, <16 x i8>} @vector_deinterleave_v16i8_v32i8(<32 x i8> %vec) {
; CHECK-NEXT: uzp2 v1.16b, v0.16b, v1.16b
; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: ret
- %retval = call {<16 x i8>, <16 x i8>} @llvm.experimental.vector.deinterleave2.v32i8(<32 x i8> %vec)
+ %retval = call {<16 x i8>, <16 x i8>} @llvm.vector.deinterleave2.v32i8(<32 x i8> %vec)
ret {<16 x i8>, <16 x i8>} %retval
}
@@ -122,7 +122,7 @@ define {<8 x i16>, <8 x i16>} @vector_deinterleave_v8i16_v16i16(<16 x i16> %vec)
; CHECK-NEXT: uzp2 v1.8h, v0.8h, v1.8h
; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: ret
- %retval = call {<8 x i16>, <8 x i16>} @llvm.experimental.vector.deinterleave2.v16i16(<16 x i16> %vec)
+ %retval = call {<8 x i16>, <8 x i16>} @llvm.vector.deinterleave2.v16i16(<16 x i16> %vec)
ret {<8 x i16>, <8 x i16>} %retval
}
@@ -133,7 +133,7 @@ define {<4 x i32>, <4 x i32>} @vector_deinterleave_v4i32_v8i32(<8 x i32> %vec) {
; CHECK-NEXT: uzp2 v1.4s, v0.4s, v1.4s
; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: ret
- %retval = call {<4 x i32>, <4 x i32>} @llvm.experimental.vector.deinterleave2.v8i32(<8 x i32> %vec)
+ %retval = call {<4 x i32>, <4 x i32>} @llvm.vector.deinterleave2.v8i32(<8 x i32> %vec)
ret {<4 x i32>, <4 x i32>} %retval
}
@@ -144,22 +144,22 @@ define {<2 x i64>, <2 x i64>} @vector_deinterleave_v2i64_v4i64(<4 x i64> %vec) {
; CHECK-NEXT: zip2 v1.2d, v0.2d, v1.2d
; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: ret
- %retval = call {<2 x i64>, <2 x i64>} @llvm.experimental.vector.deinterleave2.v4i64(<4 x i64> %vec)
+ %retval = call {<2 x i64>, <2 x i64>} @llvm.vector.deinterleave2.v4i64(<4 x i64> %vec)
ret {<2 x i64>, <2 x i64>} %retval
}
; Floating declarations
-declare {<2 x half>,<2 x half>} @llvm.experimental.vector.deinterleave2.v4f16(<4 x half>)
-declare {<4 x half>, <4 x half>} @llvm.experimental.vector.deinterleave2.v8f16(<8 x half>)
-declare {<2 x float>, <2 x float>} @llvm.experimental.vector.deinterleave2.v4f32(<4 x float>)
-declare {<8 x half>, <8 x half>} @llvm.experimental.vector.deinterleave2.v16f16(<16 x half>)
-declare {<4 x float>, <4 x float>} @llvm.experimental.vector.deinterleave2.v8f32(<8 x float>)
-declare {<2 x double>, <2 x double>} @llvm.experimental.vector.deinterleave2.v4f64(<4 x double>)
+declare {<2 x half>,<2 x half>} @llvm.vector.deinterleave2.v4f16(<4 x half>)
+declare {<4 x half>, <4 x half>} @llvm.vector.deinterleave2.v8f16(<8 x half>)
+declare {<2 x float>, <2 x float>} @llvm.vector.deinterleave2.v4f32(<4 x float>)
+declare {<8 x half>, <8 x half>} @llvm.vector.deinterleave2.v16f16(<16 x half>)
+declare {<4 x float>, <4 x float>} @llvm.vector.deinterleave2.v8f32(<8 x float>)
+declare {<2 x double>, <2 x double>} @llvm.vector.deinterleave2.v4f64(<4 x double>)
; Integer declarations
-declare {<16 x i8>, <16 x i8>} @llvm.experimental.vector.deinterleave2.v32i8(<32 x i8>)
-declare {<8 x i16>, <8 x i16>} @llvm.experimental.vector.deinterleave2.v16i16(<16 x i16>)
-declare {<4 x i32>, <4 x i32>} @llvm.experimental.vector.deinterleave2.v8i32(<8 x i32>)
-declare {<2 x i64>, <2 x i64>} @llvm.experimental.vector.deinterleave2.v4i64(<4 x i64>)
+declare {<16 x i8>, <16 x i8>} @llvm.vector.deinterleave2.v32i8(<32 x i8>)
+declare {<8 x i16>, <8 x i16>} @llvm.vector.deinterleave2.v16i16(<16 x i16>)
+declare {<4 x i32>, <4 x i32>} @llvm.vector.deinterleave2.v8i32(<8 x i32>)
+declare {<2 x i64>, <2 x i64>} @llvm.vector.deinterleave2.v4i64(<4 x i64>)
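For fixed-width vectors the same intrinsic selects to plain permutes, as the CHECK lines above show: uzp1/uzp2 gather the even and odd lanes for most element counts, while zip2 covers the two-element 64-bit cases. For instance (sketch, not part of the patch):

define { <4 x i32>, <4 x i32> } @split_evens_odds(<8 x i32> %v) {
  ; first result: lanes 0,2,4,6 of %v; second result: lanes 1,3,5,7
  %r = call { <4 x i32>, <4 x i32> } @llvm.vector.deinterleave2.v8i32(<8 x i32> %v)
  ret { <4 x i32>, <4 x i32> } %r
}
declare { <4 x i32>, <4 x i32> } @llvm.vector.deinterleave2.v8i32(<8 x i32>)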
diff --git a/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll b/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll
index eb81aff33e49..2e992964f598 100644
--- a/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll
+++ b/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll
@@ -7,7 +7,7 @@ define <4 x half> @interleave2_v4f16(<2 x half> %vec0, <2 x half> %vec1) {
; CHECK: // %bb.0:
; CHECK-NEXT: zip1 v0.4h, v0.4h, v1.4h
; CHECK-NEXT: ret
- %retval = call <4 x half> @llvm.experimental.vector.interleave2.v4f16(<2 x half> %vec0, <2 x half> %vec1)
+ %retval = call <4 x half> @llvm.vector.interleave2.v4f16(<2 x half> %vec0, <2 x half> %vec1)
ret <4 x half> %retval
}
@@ -28,7 +28,7 @@ define <8 x half> @interleave2_v8f16(<4 x half> %vec0, <4 x half> %vec1) {
; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-GI-NEXT: zip1 v0.8h, v0.8h, v1.8h
; CHECK-GI-NEXT: ret
- %retval = call <8 x half> @llvm.experimental.vector.interleave2.v8f16(<4 x half> %vec0, <4 x half> %vec1)
+ %retval = call <8 x half> @llvm.vector.interleave2.v8f16(<4 x half> %vec0, <4 x half> %vec1)
ret <8 x half> %retval
}
@@ -39,7 +39,7 @@ define <16 x half> @interleave2_v16f16(<8 x half> %vec0, <8 x half> %vec1) {
; CHECK-NEXT: zip2 v1.8h, v0.8h, v1.8h
; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: ret
- %retval = call <16 x half> @llvm.experimental.vector.interleave2.v16f16(<8 x half> %vec0, <8 x half> %vec1)
+ %retval = call <16 x half> @llvm.vector.interleave2.v16f16(<8 x half> %vec0, <8 x half> %vec1)
ret <16 x half> %retval
}
@@ -59,7 +59,7 @@ define <4 x float> @interleave2_v4f32(<2 x float> %vec0, <2 x float> %vec1) {
; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-GI-NEXT: zip1 v0.4s, v0.4s, v1.4s
; CHECK-GI-NEXT: ret
- %retval = call <4 x float> @llvm.experimental.vector.interleave2.v4f32(<2 x float> %vec0, <2 x float> %vec1)
+ %retval = call <4 x float> @llvm.vector.interleave2.v4f32(<2 x float> %vec0, <2 x float> %vec1)
ret <4 x float> %retval
}
@@ -70,7 +70,7 @@ define <8 x float> @interleave2_v8f32(<4 x float> %vec0, <4 x float> %vec1) {
; CHECK-NEXT: zip2 v1.4s, v0.4s, v1.4s
; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: ret
- %retval = call <8 x float> @llvm.experimental.vector.interleave2.v8f32(<4 x float> %vec0, <4 x float> %vec1)
+ %retval = call <8 x float> @llvm.vector.interleave2.v8f32(<4 x float> %vec0, <4 x float> %vec1)
ret <8 x float> %retval
}
@@ -81,7 +81,7 @@ define <4 x double> @interleave2_v4f64(<2 x double> %vec0, <2 x double> %vec1) {
; CHECK-NEXT: zip2 v1.2d, v0.2d, v1.2d
; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: ret
- %retval = call <4 x double>@llvm.experimental.vector.interleave2.v4f64(<2 x double> %vec0, <2 x double> %vec1)
+ %retval = call <4 x double>@llvm.vector.interleave2.v4f64(<2 x double> %vec0, <2 x double> %vec1)
ret <4 x double> %retval
}
@@ -94,7 +94,7 @@ define <32 x i8> @interleave2_v32i8(<16 x i8> %vec0, <16 x i8> %vec1) {
; CHECK-NEXT: zip2 v1.16b, v0.16b, v1.16b
; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: ret
- %retval = call <32 x i8> @llvm.experimental.vector.interleave2.v32i8(<16 x i8> %vec0, <16 x i8> %vec1)
+ %retval = call <32 x i8> @llvm.vector.interleave2.v32i8(<16 x i8> %vec0, <16 x i8> %vec1)
ret <32 x i8> %retval
}
@@ -105,7 +105,7 @@ define <16 x i16> @interleave2_v16i16(<8 x i16> %vec0, <8 x i16> %vec1) {
; CHECK-NEXT: zip2 v1.8h, v0.8h, v1.8h
; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: ret
- %retval = call <16 x i16> @llvm.experimental.vector.interleave2.v16i16(<8 x i16> %vec0, <8 x i16> %vec1)
+ %retval = call <16 x i16> @llvm.vector.interleave2.v16i16(<8 x i16> %vec0, <8 x i16> %vec1)
ret <16 x i16> %retval
}
@@ -116,7 +116,7 @@ define <8 x i32> @interleave2_v8i32(<4 x i32> %vec0, <4 x i32> %vec1) {
; CHECK-NEXT: zip2 v1.4s, v0.4s, v1.4s
; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: ret
- %retval = call <8 x i32> @llvm.experimental.vector.interleave2.v8i32(<4 x i32> %vec0, <4 x i32> %vec1)
+ %retval = call <8 x i32> @llvm.vector.interleave2.v8i32(<4 x i32> %vec0, <4 x i32> %vec1)
ret <8 x i32> %retval
}
@@ -127,22 +127,22 @@ define <4 x i64> @interleave2_v4i64(<2 x i64> %vec0, <2 x i64> %vec1) {
; CHECK-NEXT: zip2 v1.2d, v0.2d, v1.2d
; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: ret
- %retval = call <4 x i64> @llvm.experimental.vector.interleave2.v4i64(<2 x i64> %vec0, <2 x i64> %vec1)
+ %retval = call <4 x i64> @llvm.vector.interleave2.v4i64(<2 x i64> %vec0, <2 x i64> %vec1)
ret <4 x i64> %retval
}
; Float declarations
-declare <4 x half> @llvm.experimental.vector.interleave2.v4f16(<2 x half>, <2 x half>)
-declare <8 x half> @llvm.experimental.vector.interleave2.v8f16(<4 x half>, <4 x half>)
-declare <16 x half> @llvm.experimental.vector.interleave2.v16f16(<8 x half>, <8 x half>)
-declare <4 x float> @llvm.experimental.vector.interleave2.v4f32(<2 x float>, <2 x float>)
-declare <8 x float> @llvm.experimental.vector.interleave2.v8f32(<4 x float>, <4 x float>)
-declare <4 x double> @llvm.experimental.vector.interleave2.v4f64(<2 x double>, <2 x double>)
+declare <4 x half> @llvm.vector.interleave2.v4f16(<2 x half>, <2 x half>)
+declare <8 x half> @llvm.vector.interleave2.v8f16(<4 x half>, <4 x half>)
+declare <16 x half> @llvm.vector.interleave2.v16f16(<8 x half>, <8 x half>)
+declare <4 x float> @llvm.vector.interleave2.v4f32(<2 x float>, <2 x float>)
+declare <8 x float> @llvm.vector.interleave2.v8f32(<4 x float>, <4 x float>)
+declare <4 x double> @llvm.vector.interleave2.v4f64(<2 x double>, <2 x double>)
; Integer declarations
-declare <32 x i8> @llvm.experimental.vector.interleave2.v32i8(<16 x i8>, <16 x i8>)
-declare <16 x i16> @llvm.experimental.vector.interleave2.v16i16(<8 x i16>, <8 x i16>)
-declare <8 x i32> @llvm.experimental.vector.interleave2.v8i32(<4 x i32>, <4 x i32>)
-declare <4 x i64> @llvm.experimental.vector.interleave2.v4i64(<2 x i64>, <2 x i64>)
+declare <32 x i8> @llvm.vector.interleave2.v32i8(<16 x i8>, <16 x i8>)
+declare <16 x i16> @llvm.vector.interleave2.v16i16(<8 x i16>, <8 x i16>)
+declare <8 x i32> @llvm.vector.interleave2.v8i32(<4 x i32>, <4 x i32>)
+declare <4 x i64> @llvm.vector.interleave2.v4i64(<2 x i64>, <2 x i64>)
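The interleave direction is symmetric: interleave2(%vec0, %vec1) alternates lanes from its two inputs, so the AArch64 lowering uses zip1 for the low half and zip2 for the high half of the result, per the CHECK lines above. A sketch with an overload not spelled out in this file (the intrinsic is generic over the vector type):

define <8 x i16> @zip_pairs(<4 x i16> %x, <4 x i16> %y) {
  ; result lanes: x0, y0, x1, y1, x2, y2, x3, y3
  %r = call <8 x i16> @llvm.vector.interleave2.v8i16(<4 x i16> %x, <4 x i16> %y)
  ret <8 x i16> %r
}
declare <8 x i16> @llvm.vector.interleave2.v8i16(<4 x i16>, <4 x i16>)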
diff --git a/llvm/test/CodeGen/AArch64/fpmode.ll b/llvm/test/CodeGen/AArch64/fpmode.ll
index ebfb0696a95a..b185d9e04941 100644
--- a/llvm/test/CodeGen/AArch64/fpmode.ll
+++ b/llvm/test/CodeGen/AArch64/fpmode.ll
@@ -6,17 +6,14 @@ declare i32 @llvm.get.fpmode.i32()
declare void @llvm.set.fpmode.i32(i32 %fpmode)
declare void @llvm.reset.fpmode()
-define i32 @func_get_fpmode_soft() #0 {
-; DAG-LABEL: func_get_fpmode_soft:
+define i32 @func_get_fpmode() #0 {
+; DAG-LABEL: func_get_fpmode:
; DAG: // %bb.0: // %entry
-; DAG-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; DAG-NEXT: add x0, sp, #12
-; DAG-NEXT: bl fegetmode
-; DAG-NEXT: ldr w0, [sp, #12]
-; DAG-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; DAG-NEXT: mrs x0, FPCR
+; DAG-NEXT: // kill: def $w0 killed $w0 killed $x0
; DAG-NEXT: ret
;
-; GIS-LABEL: func_get_fpmode_soft:
+; GIS-LABEL: func_get_fpmode:
; GIS: // %bb.0: // %entry
; GIS-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
; GIS-NEXT: add x0, sp, #12
@@ -29,17 +26,14 @@ entry:
ret i32 %fpmode
}
-define void @func_set_fpmode_soft(i32 %fpmode) #0 {
-; DAG-LABEL: func_set_fpmode_soft:
+define void @func_set_fpmode(i32 %fpmode) #0 {
+; DAG-LABEL: func_set_fpmode:
; DAG: // %bb.0: // %entry
-; DAG-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; DAG-NEXT: str w0, [sp, #12]
-; DAG-NEXT: add x0, sp, #12
-; DAG-NEXT: bl fesetmode
-; DAG-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; DAG-NEXT: mov w8, w0
+; DAG-NEXT: msr FPCR, x8
; DAG-NEXT: ret
;
-; GIS-LABEL: func_set_fpmode_soft:
+; GIS-LABEL: func_set_fpmode:
; GIS: // %bb.0: // %entry
; GIS-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
; GIS-NEXT: str w0, [sp, #12]
@@ -52,16 +46,17 @@ entry:
ret void
}
-define void @func_reset_fpmode_soft() #0 {
-; DAG-LABEL: func_reset_fpmode_soft:
+define void @func_reset_fpmode() #0 {
+; DAG-LABEL: func_reset_fpmode:
; DAG: // %bb.0: // %entry
-; DAG-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; DAG-NEXT: mov x0, #-1 // =0xffffffffffffffff
-; DAG-NEXT: bl fesetmode
-; DAG-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; DAG-NEXT: mov x9, #-48904 // =0xffffffffffff40f8
+; DAG-NEXT: mrs x8, FPCR
+; DAG-NEXT: movk x9, #63488, lsl #16
+; DAG-NEXT: and x8, x8, x9
+; DAG-NEXT: msr FPCR, x8
; DAG-NEXT: ret
;
-; GIS-LABEL: func_reset_fpmode_soft:
+; GIS-LABEL: func_reset_fpmode:
; GIS: // %bb.0: // %entry
; GIS-NEXT: mov x0, #-1 // =0xffffffffffffffff
; GIS-NEXT: b fesetmode
@@ -70,4 +65,4 @@ entry:
ret void
}
-attributes #0 = { nounwind "use-soft-float"="true" }
+attributes #0 = { nounwind }
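The rewritten fpmode test reflects that these functions are no longer soft-float: with hardware FP available, SelectionDAG lowers the fpmode intrinsics to direct FPCR system-register accesses rather than fegetmode/fesetmode libcalls, while GlobalISel (the GIS lines) still falls back to the libcalls. The three intrinsics exercised, in one combined sketch (hypothetical function name; declarations as in the test file):

define void @copy_fpmode() {
  %mode = call i32 @llvm.get.fpmode.i32()    ; DAG ISel: mrs x0, FPCR
  call void @llvm.set.fpmode.i32(i32 %mode)  ; DAG ISel: msr FPCR, x8
  call void @llvm.reset.fpmode()             ; restore the default dynamic mode
  ret void
}
declare i32 @llvm.get.fpmode.i32()
declare void @llvm.set.fpmode.i32(i32)
declare void @llvm.reset.fpmode()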
diff --git a/llvm/test/CodeGen/AArch64/machine-combiner-subadd2.mir b/llvm/test/CodeGen/AArch64/machine-combiner-subadd2.mir
index d1770bb25fae..0b09e8a4b5cd 100644
--- a/llvm/test/CodeGen/AArch64/machine-combiner-subadd2.mir
+++ b/llvm/test/CodeGen/AArch64/machine-combiner-subadd2.mir
@@ -237,3 +237,30 @@ body: |
RET_ReallyLR implicit $w0
...
+---
+# Check that nowrap flags are dropped when the machine combiner splits the SUB
+
+# CHECK-LABEL: name: test8
+# CHECK: %7:gpr64 = SUBXrr %1, %0
+# CHECK-NEXT: %4:gpr64common = SUBXrr killed %7, killed %2
+
+name: test8
+registers:
+ - { id: 0, class: gpr64 }
+ - { id: 1, class: gpr64 }
+ - { id: 2, class: gpr64common }
+ - { id: 3, class: gpr64 }
+ - { id: 4, class: gpr64common }
+ - { id: 5, class: gpr64 }
+body: |
+ bb.0:
+ %1:gpr64 = COPY $x1
+ %0:gpr64 = COPY $x0
+ %2:gpr64common = ORRXri %0:gpr64, 4096
+ %3:gpr64 = ADDXrr killed %2:gpr64common, %0:gpr64
+ %4:gpr64common = nsw SUBSXrr %1:gpr64, killed %3:gpr64, implicit-def dead $nzcv
+ %5:gpr64 = SUBSXri %4:gpr64common, 0, 0, implicit-def $nzcv
+ $x0 = COPY %5:gpr64
+ RET_ReallyLR implicit $x0
+
+...
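
Note: the new test above checks that when the machine combiner reassociates sub(a, add(b, c)) into sub(sub(a, b), c), the nsw flag from the original subtract is not carried over to the rewritten instructions. For intuition at the IR level (a hand-written sketch, not part of the patch): the intermediate difference may overflow even when the original expression does not, so keeping the flag would be unsound.

define i64 @reassoc_drops_nsw(i64 %a, i64 %b, i64 %c) {
  %t = add i64 %b, %c
  ; Original form: a - (b + c), known not to wrap.
  %r = sub nsw i64 %a, %t
  ; Reassociated form: (a - b) - c. Here a - b may wrap even when
  ; a - (b + c) does not, so neither rewritten sub may keep nsw.
  ret i64 %r
}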
diff --git a/llvm/test/CodeGen/AArch64/named-vector-shuffle-reverse-neon.ll b/llvm/test/CodeGen/AArch64/named-vector-shuffle-reverse-neon.ll
index 0eee19ad2adb..cff7759c72c9 100644
--- a/llvm/test/CodeGen/AArch64/named-vector-shuffle-reverse-neon.ll
+++ b/llvm/test/CodeGen/AArch64/named-vector-shuffle-reverse-neon.ll
@@ -15,7 +15,7 @@ define <16 x i8> @reverse_v16i8(<16 x i8> %a) #0 {
; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-NEXT: ret
- %res = call <16 x i8> @llvm.experimental.vector.reverse.v16i8(<16 x i8> %a)
+ %res = call <16 x i8> @llvm.vector.reverse.v16i8(<16 x i8> %a)
ret <16 x i8> %res
}
@@ -26,7 +26,7 @@ define <8 x i16> @reverse_v8i16(<8 x i16> %a) #0 {
; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-NEXT: ret
- %res = call <8 x i16> @llvm.experimental.vector.reverse.v8i16(<8 x i16> %a)
+ %res = call <8 x i16> @llvm.vector.reverse.v8i16(<8 x i16> %a)
ret <8 x i16> %res
}
@@ -35,7 +35,7 @@ define <2 x i16> @reverse_v2i16(<2 x i16> %a) #0 {
; CHECK: // %bb.0:
; CHECK-NEXT: rev64 v0.2s, v0.2s
; CHECK-NEXT: ret
- %res = call <2 x i16> @llvm.experimental.vector.reverse.v2i16(<2 x i16> %a)
+ %res = call <2 x i16> @llvm.vector.reverse.v2i16(<2 x i16> %a)
ret <2 x i16> %res
}
@@ -44,7 +44,7 @@ define <2 x i32> @reverse_v2i32(<2 x i32> %a) #0 {
; CHECK: // %bb.0:
; CHECK-NEXT: rev64 v0.2s, v0.2s
; CHECK-NEXT: ret
- %res = call <2 x i32> @llvm.experimental.vector.reverse.v2i32(<2 x i32> %a)
+ %res = call <2 x i32> @llvm.vector.reverse.v2i32(<2 x i32> %a)
ret <2 x i32> %res
}
@@ -55,7 +55,7 @@ define <4 x i32> @reverse_v4i32(<4 x i32> %a) #0 {
; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-NEXT: ret
- %res = call <4 x i32> @llvm.experimental.vector.reverse.v4i32(<4 x i32> %a)
+ %res = call <4 x i32> @llvm.vector.reverse.v4i32(<4 x i32> %a)
ret <4 x i32> %res
}
@@ -65,7 +65,7 @@ define <2 x i64> @reverse_v2i64(<2 x i64> %a) #0 {
; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-NEXT: ret
- %res = call <2 x i64> @llvm.experimental.vector.reverse.v2i64(<2 x i64> %a)
+ %res = call <2 x i64> @llvm.vector.reverse.v2i64(<2 x i64> %a)
ret <2 x i64> %res
}
@@ -76,7 +76,7 @@ define <8 x half> @reverse_v8f16(<8 x half> %a) #0 {
; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-NEXT: ret
- %res = call <8 x half> @llvm.experimental.vector.reverse.v8f16(<8 x half> %a)
+ %res = call <8 x half> @llvm.vector.reverse.v8f16(<8 x half> %a)
ret <8 x half> %res
}
@@ -85,7 +85,7 @@ define <2 x float> @reverse_v2f32(<2 x float> %a) #0 {
; CHECK: // %bb.0:
; CHECK-NEXT: rev64 v0.2s, v0.2s
; CHECK-NEXT: ret
- %res = call <2 x float> @llvm.experimental.vector.reverse.v2f32(<2 x float> %a)
+ %res = call <2 x float> @llvm.vector.reverse.v2f32(<2 x float> %a)
ret <2 x float> %res
}
@@ -96,7 +96,7 @@ define <4 x float> @reverse_v4f32(<4 x float> %a) #0 {
; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-NEXT: ret
- %res = call <4 x float> @llvm.experimental.vector.reverse.v4f32(<4 x float> %a)
+ %res = call <4 x float> @llvm.vector.reverse.v4f32(<4 x float> %a)
ret <4 x float> %res
}
@@ -106,7 +106,7 @@ define <2 x double> @reverse_v2f64(<2 x double> %a) #0 {
; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-NEXT: ret
- %res = call <2 x double> @llvm.experimental.vector.reverse.v2f64(<2 x double> %a)
+ %res = call <2 x double> @llvm.vector.reverse.v2f64(<2 x double> %a)
ret <2 x double> %res
}
@@ -117,7 +117,7 @@ define <2 x i8> @reverse_v2i8(<2 x i8> %a) #0 {
; CHECK-NEXT: rev64 v0.2s, v0.2s
; CHECK-NEXT: ret
- %res = call <2 x i8> @llvm.experimental.vector.reverse.v2i8(<2 x i8> %a)
+ %res = call <2 x i8> @llvm.vector.reverse.v2i8(<2 x i8> %a)
ret <2 x i8> %res
}
@@ -144,7 +144,7 @@ define <8 x i32> @reverse_v8i32(<8 x i32> %a) #0 {
; CHECK-FASTISEL-NEXT: add sp, sp, #16
; CHECK-FASTISEL-NEXT: ret
- %res = call <8 x i32> @llvm.experimental.vector.reverse.v8i32(<8 x i32> %a)
+ %res = call <8 x i32> @llvm.vector.reverse.v8i32(<8 x i32> %a)
ret <8 x i32> %res
}
@@ -182,23 +182,23 @@ define <16 x float> @reverse_v16f32(<16 x float> %a) #0 {
; CHECK-FASTISEL-NEXT: add sp, sp, #32
; CHECK-FASTISEL-NEXT: ret
- %res = call <16 x float> @llvm.experimental.vector.reverse.v16f32(<16 x float> %a)
+ %res = call <16 x float> @llvm.vector.reverse.v16f32(<16 x float> %a)
ret <16 x float> %res
}
-declare <2 x i8> @llvm.experimental.vector.reverse.v2i8(<2 x i8>)
-declare <16 x i8> @llvm.experimental.vector.reverse.v16i8(<16 x i8>)
-declare <8 x i16> @llvm.experimental.vector.reverse.v8i16(<8 x i16>)
-declare <2 x i16> @llvm.experimental.vector.reverse.v2i16(<2 x i16>)
-declare <2 x i32> @llvm.experimental.vector.reverse.v2i32(<2 x i32>)
-declare <4 x i32> @llvm.experimental.vector.reverse.v4i32(<4 x i32>)
-declare <8 x i32> @llvm.experimental.vector.reverse.v8i32(<8 x i32>)
-declare <2 x i64> @llvm.experimental.vector.reverse.v2i64(<2 x i64>)
-declare <8 x half> @llvm.experimental.vector.reverse.v8f16(<8 x half>)
-declare <2 x float> @llvm.experimental.vector.reverse.v2f32(<2 x float>)
-declare <4 x float> @llvm.experimental.vector.reverse.v4f32(<4 x float>)
-declare <16 x float> @llvm.experimental.vector.reverse.v16f32(<16 x float>)
-declare <2 x double> @llvm.experimental.vector.reverse.v2f64(<2 x double>)
+declare <2 x i8> @llvm.vector.reverse.v2i8(<2 x i8>)
+declare <16 x i8> @llvm.vector.reverse.v16i8(<16 x i8>)
+declare <8 x i16> @llvm.vector.reverse.v8i16(<8 x i16>)
+declare <2 x i16> @llvm.vector.reverse.v2i16(<2 x i16>)
+declare <2 x i32> @llvm.vector.reverse.v2i32(<2 x i32>)
+declare <4 x i32> @llvm.vector.reverse.v4i32(<4 x i32>)
+declare <8 x i32> @llvm.vector.reverse.v8i32(<8 x i32>)
+declare <2 x i64> @llvm.vector.reverse.v2i64(<2 x i64>)
+declare <8 x half> @llvm.vector.reverse.v8f16(<8 x half>)
+declare <2 x float> @llvm.vector.reverse.v2f32(<2 x float>)
+declare <4 x float> @llvm.vector.reverse.v4f32(<4 x float>)
+declare <16 x float> @llvm.vector.reverse.v16f32(<16 x float>)
+declare <2 x double> @llvm.vector.reverse.v2f64(<2 x double>)
attributes #0 = { nounwind "target-features"="+neon" }
diff --git a/llvm/test/CodeGen/AArch64/named-vector-shuffle-reverse-sve.ll b/llvm/test/CodeGen/AArch64/named-vector-shuffle-reverse-sve.ll
index 4d5045feca08..a84e6e7bcae8 100644
--- a/llvm/test/CodeGen/AArch64/named-vector-shuffle-reverse-sve.ll
+++ b/llvm/test/CodeGen/AArch64/named-vector-shuffle-reverse-sve.ll
@@ -14,7 +14,7 @@ define <vscale x 2 x i1> @reverse_nxv2i1(<vscale x 2 x i1> %a) #0 {
; CHECK-NEXT: rev p0.d, p0.d
; CHECK-NEXT: ret
- %res = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> %a)
+ %res = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> %a)
ret <vscale x 2 x i1> %res
}
@@ -24,7 +24,7 @@ define <vscale x 4 x i1> @reverse_nxv4i1(<vscale x 4 x i1> %a) #0 {
; CHECK-NEXT: rev p0.s, p0.s
; CHECK-NEXT: ret
- %res = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
+ %res = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
ret <vscale x 4 x i1> %res
}
@@ -34,7 +34,7 @@ define <vscale x 8 x i1> @reverse_nxv8i1(<vscale x 8 x i1> %a) #0 {
; CHECK-NEXT: rev p0.h, p0.h
; CHECK-NEXT: ret
- %res = call <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1> %a)
+ %res = call <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1> %a)
ret <vscale x 8 x i1> %res
}
@@ -44,7 +44,7 @@ define <vscale x 16 x i1> @reverse_nxv16i1(<vscale x 16 x i1> %a) #0 {
; CHECK-NEXT: rev p0.b, p0.b
; CHECK-NEXT: ret
- %res = call <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1> %a)
+ %res = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> %a)
ret <vscale x 16 x i1> %res
}
@@ -70,7 +70,7 @@ define <vscale x 32 x i1> @reverse_nxv32i1(<vscale x 32 x i1> %a) #0 {
; CHECK-FASTISEL-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-FASTISEL-NEXT: ret
- %res = call <vscale x 32 x i1> @llvm.experimental.vector.reverse.nxv32i1(<vscale x 32 x i1> %a)
+ %res = call <vscale x 32 x i1> @llvm.vector.reverse.nxv32i1(<vscale x 32 x i1> %a)
ret <vscale x 32 x i1> %res
}
@@ -84,7 +84,7 @@ define <vscale x 16 x i8> @reverse_nxv16i8(<vscale x 16 x i8> %a) #0 {
; CHECK-NEXT: rev z0.b, z0.b
; CHECK-NEXT: ret
- %res = call <vscale x 16 x i8> @llvm.experimental.vector.reverse.nxv16i8(<vscale x 16 x i8> %a)
+ %res = call <vscale x 16 x i8> @llvm.vector.reverse.nxv16i8(<vscale x 16 x i8> %a)
ret <vscale x 16 x i8> %res
}
@@ -94,7 +94,7 @@ define <vscale x 8 x i16> @reverse_nxv8i16(<vscale x 8 x i16> %a) #0 {
; CHECK-NEXT: rev z0.h, z0.h
; CHECK-NEXT: ret
- %res = call <vscale x 8 x i16> @llvm.experimental.vector.reverse.nxv8i16(<vscale x 8 x i16> %a)
+ %res = call <vscale x 8 x i16> @llvm.vector.reverse.nxv8i16(<vscale x 8 x i16> %a)
ret <vscale x 8 x i16> %res
}
@@ -104,7 +104,7 @@ define <vscale x 4 x i32> @reverse_nxv4i32(<vscale x 4 x i32> %a) #0 {
; CHECK-NEXT: rev z0.s, z0.s
; CHECK-NEXT: ret
- %res = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
+ %res = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
ret <vscale x 4 x i32> %res
}
@@ -114,7 +114,7 @@ define <vscale x 2 x i64> @reverse_nxv2i64(<vscale x 2 x i64> %a) #0 {
; CHECK-NEXT: rev z0.d, z0.d
; CHECK-NEXT: ret
- %res = call <vscale x 2 x i64> @llvm.experimental.vector.reverse.nxv2i64(<vscale x 2 x i64> %a)
+ %res = call <vscale x 2 x i64> @llvm.vector.reverse.nxv2i64(<vscale x 2 x i64> %a)
ret <vscale x 2 x i64> %res
}
@@ -124,7 +124,7 @@ define <vscale x 2 x half> @reverse_nxv2f16(<vscale x 2 x half> %a) #0 {
; CHECK-NEXT: rev z0.d, z0.d
; CHECK-NEXT: ret
- %res = call <vscale x 2 x half> @llvm.experimental.vector.reverse.nxv2f16(<vscale x 2 x half> %a)
+ %res = call <vscale x 2 x half> @llvm.vector.reverse.nxv2f16(<vscale x 2 x half> %a)
ret <vscale x 2 x half> %res
}
@@ -134,7 +134,7 @@ define <vscale x 4 x half> @reverse_nxv4f16(<vscale x 4 x half> %a) #0 {
; CHECK-NEXT: rev z0.s, z0.s
; CHECK-NEXT: ret
- %res = call <vscale x 4 x half> @llvm.experimental.vector.reverse.nxv4f16(<vscale x 4 x half> %a)
+ %res = call <vscale x 4 x half> @llvm.vector.reverse.nxv4f16(<vscale x 4 x half> %a)
ret <vscale x 4 x half> %res
}
@@ -144,7 +144,7 @@ define <vscale x 8 x half> @reverse_nxv8f16(<vscale x 8 x half> %a) #0 {
; CHECK-NEXT: rev z0.h, z0.h
; CHECK-NEXT: ret
- %res = call <vscale x 8 x half> @llvm.experimental.vector.reverse.nxv8f16(<vscale x 8 x half> %a)
+ %res = call <vscale x 8 x half> @llvm.vector.reverse.nxv8f16(<vscale x 8 x half> %a)
ret <vscale x 8 x half> %res
}
@@ -154,7 +154,7 @@ define <vscale x 2 x bfloat> @reverse_nxv2bf16(<vscale x 2 x bfloat> %a) #1 {
; CHECK-NEXT: rev z0.d, z0.d
; CHECK-NEXT: ret
- %res = call <vscale x 2 x bfloat> @llvm.experimental.vector.reverse.nxv2bf16(<vscale x 2 x bfloat> %a)
+ %res = call <vscale x 2 x bfloat> @llvm.vector.reverse.nxv2bf16(<vscale x 2 x bfloat> %a)
ret <vscale x 2 x bfloat> %res
}
@@ -164,7 +164,7 @@ define <vscale x 4 x bfloat> @reverse_nxv4bf16(<vscale x 4 x bfloat> %a) #1 {
; CHECK-NEXT: rev z0.s, z0.s
; CHECK-NEXT: ret
- %res = call <vscale x 4 x bfloat> @llvm.experimental.vector.reverse.nxv4bf16(<vscale x 4 x bfloat> %a)
+ %res = call <vscale x 4 x bfloat> @llvm.vector.reverse.nxv4bf16(<vscale x 4 x bfloat> %a)
ret <vscale x 4 x bfloat> %res
}
@@ -174,7 +174,7 @@ define <vscale x 8 x bfloat> @reverse_nxv8bf16(<vscale x 8 x bfloat> %a) #1 {
; CHECK-NEXT: rev z0.h, z0.h
; CHECK-NEXT: ret
- %res = call <vscale x 8 x bfloat> @llvm.experimental.vector.reverse.nxv8bf16(<vscale x 8 x bfloat> %a)
+ %res = call <vscale x 8 x bfloat> @llvm.vector.reverse.nxv8bf16(<vscale x 8 x bfloat> %a)
ret <vscale x 8 x bfloat> %res
}
@@ -184,7 +184,7 @@ define <vscale x 2 x float> @reverse_nxv2f32(<vscale x 2 x float> %a) #0 {
; CHECK-NEXT: rev z0.d, z0.d
; CHECK-NEXT: ret
- %res = call <vscale x 2 x float> @llvm.experimental.vector.reverse.nxv2f32(<vscale x 2 x float> %a)
+ %res = call <vscale x 2 x float> @llvm.vector.reverse.nxv2f32(<vscale x 2 x float> %a)
ret <vscale x 2 x float> %res
}
define <vscale x 4 x float> @reverse_nxv4f32(<vscale x 4 x float> %a) #0 {
@@ -193,7 +193,7 @@ define <vscale x 4 x float> @reverse_nxv4f32(<vscale x 4 x float> %a) #0 {
; CHECK-NEXT: rev z0.s, z0.s
; CHECK-NEXT: ret
- %res = call <vscale x 4 x float> @llvm.experimental.vector.reverse.nxv4f32(<vscale x 4 x float> %a)
+ %res = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %a)
ret <vscale x 4 x float> %res
}
define <vscale x 2 x double> @reverse_nxv2f64(<vscale x 2 x double> %a) #0 {
@@ -202,7 +202,7 @@ define <vscale x 2 x double> @reverse_nxv2f64(<vscale x 2 x double> %a) #0 {
; CHECK-NEXT: rev z0.d, z0.d
; CHECK-NEXT: ret
- %res = call <vscale x 2 x double> @llvm.experimental.vector.reverse.nxv2f64(<vscale x 2 x double> %a)
+ %res = call <vscale x 2 x double> @llvm.vector.reverse.nxv2f64(<vscale x 2 x double> %a)
ret <vscale x 2 x double> %res
}
@@ -213,7 +213,7 @@ define <vscale x 2 x i8> @reverse_nxv2i8(<vscale x 2 x i8> %a) #0 {
; CHECK-NEXT: rev z0.d, z0.d
; CHECK-NEXT: ret
- %res = call <vscale x 2 x i8> @llvm.experimental.vector.reverse.nxv2i8(<vscale x 2 x i8> %a)
+ %res = call <vscale x 2 x i8> @llvm.vector.reverse.nxv2i8(<vscale x 2 x i8> %a)
ret <vscale x 2 x i8> %res
}
@@ -239,7 +239,7 @@ define <vscale x 8 x i32> @reverse_nxv8i32(<vscale x 8 x i32> %a) #0 {
; CHECK-FASTISEL-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-FASTISEL-NEXT: ret
- %res = call <vscale x 8 x i32> @llvm.experimental.vector.reverse.nxv8i32(<vscale x 8 x i32> %a)
+ %res = call <vscale x 8 x i32> @llvm.vector.reverse.nxv8i32(<vscale x 8 x i32> %a)
ret <vscale x 8 x i32> %res
}
@@ -273,32 +273,32 @@ define <vscale x 16 x float> @reverse_nxv16f32(<vscale x 16 x float> %a) #0 {
; CHECK-FASTISEL-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-FASTISEL-NEXT: ret
- %res = call <vscale x 16 x float> @llvm.experimental.vector.reverse.nxv16f32(<vscale x 16 x float> %a)
+ %res = call <vscale x 16 x float> @llvm.vector.reverse.nxv16f32(<vscale x 16 x float> %a)
ret <vscale x 16 x float> %res
}
-declare <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1>)
-declare <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1>)
-declare <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1>)
-declare <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1>)
-declare <vscale x 32 x i1> @llvm.experimental.vector.reverse.nxv32i1(<vscale x 32 x i1>)
-declare <vscale x 2 x i8> @llvm.experimental.vector.reverse.nxv2i8(<vscale x 2 x i8>)
-declare <vscale x 16 x i8> @llvm.experimental.vector.reverse.nxv16i8(<vscale x 16 x i8>)
-declare <vscale x 8 x i16> @llvm.experimental.vector.reverse.nxv8i16(<vscale x 8 x i16>)
-declare <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32>)
-declare <vscale x 8 x i32> @llvm.experimental.vector.reverse.nxv8i32(<vscale x 8 x i32>)
-declare <vscale x 2 x i64> @llvm.experimental.vector.reverse.nxv2i64(<vscale x 2 x i64>)
-declare <vscale x 2 x half> @llvm.experimental.vector.reverse.nxv2f16(<vscale x 2 x half>)
-declare <vscale x 4 x half> @llvm.experimental.vector.reverse.nxv4f16(<vscale x 4 x half>)
-declare <vscale x 8 x half> @llvm.experimental.vector.reverse.nxv8f16(<vscale x 8 x half>)
-declare <vscale x 2 x bfloat> @llvm.experimental.vector.reverse.nxv2bf16(<vscale x 2 x bfloat>)
-declare <vscale x 4 x bfloat> @llvm.experimental.vector.reverse.nxv4bf16(<vscale x 4 x bfloat>)
-declare <vscale x 8 x bfloat> @llvm.experimental.vector.reverse.nxv8bf16(<vscale x 8 x bfloat>)
-declare <vscale x 2 x float> @llvm.experimental.vector.reverse.nxv2f32(<vscale x 2 x float>)
-declare <vscale x 4 x float> @llvm.experimental.vector.reverse.nxv4f32(<vscale x 4 x float>)
-declare <vscale x 16 x float> @llvm.experimental.vector.reverse.nxv16f32(<vscale x 16 x float>)
-declare <vscale x 2 x double> @llvm.experimental.vector.reverse.nxv2f64(<vscale x 2 x double>)
+declare <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1>)
+declare <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1>)
+declare <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1>)
+declare <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1>)
+declare <vscale x 32 x i1> @llvm.vector.reverse.nxv32i1(<vscale x 32 x i1>)
+declare <vscale x 2 x i8> @llvm.vector.reverse.nxv2i8(<vscale x 2 x i8>)
+declare <vscale x 16 x i8> @llvm.vector.reverse.nxv16i8(<vscale x 16 x i8>)
+declare <vscale x 8 x i16> @llvm.vector.reverse.nxv8i16(<vscale x 8 x i16>)
+declare <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32>)
+declare <vscale x 8 x i32> @llvm.vector.reverse.nxv8i32(<vscale x 8 x i32>)
+declare <vscale x 2 x i64> @llvm.vector.reverse.nxv2i64(<vscale x 2 x i64>)
+declare <vscale x 2 x half> @llvm.vector.reverse.nxv2f16(<vscale x 2 x half>)
+declare <vscale x 4 x half> @llvm.vector.reverse.nxv4f16(<vscale x 4 x half>)
+declare <vscale x 8 x half> @llvm.vector.reverse.nxv8f16(<vscale x 8 x half>)
+declare <vscale x 2 x bfloat> @llvm.vector.reverse.nxv2bf16(<vscale x 2 x bfloat>)
+declare <vscale x 4 x bfloat> @llvm.vector.reverse.nxv4bf16(<vscale x 4 x bfloat>)
+declare <vscale x 8 x bfloat> @llvm.vector.reverse.nxv8bf16(<vscale x 8 x bfloat>)
+declare <vscale x 2 x float> @llvm.vector.reverse.nxv2f32(<vscale x 2 x float>)
+declare <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float>)
+declare <vscale x 16 x float> @llvm.vector.reverse.nxv16f32(<vscale x 16 x float>)
+declare <vscale x 2 x double> @llvm.vector.reverse.nxv2f64(<vscale x 2 x double>)
attributes #0 = { nounwind "target-features"="+sve" }
diff --git a/llvm/test/CodeGen/AArch64/named-vector-shuffles-neon.ll b/llvm/test/CodeGen/AArch64/named-vector-shuffles-neon.ll
index 9210a5ec1c8b..f2e62bc4f3c8 100644
--- a/llvm/test/CodeGen/AArch64/named-vector-shuffles-neon.ll
+++ b/llvm/test/CodeGen/AArch64/named-vector-shuffles-neon.ll
@@ -12,7 +12,7 @@ define <16 x i8> @splice_v16i8_idx(<16 x i8> %a, <16 x i8> %b) #0 {
; CHECK: // %bb.0:
; CHECK-NEXT: ext v0.16b, v0.16b, v1.16b, #1
; CHECK-NEXT: ret
- %res = call <16 x i8> @llvm.experimental.vector.splice.v16i8(<16 x i8> %a, <16 x i8> %b, i32 1)
+ %res = call <16 x i8> @llvm.vector.splice.v16i8(<16 x i8> %a, <16 x i8> %b, i32 1)
ret <16 x i8> %res
}
@@ -21,7 +21,7 @@ define <2 x double> @splice_v2f64_idx(<2 x double> %a, <2 x double> %b) #0 {
; CHECK: // %bb.0:
; CHECK-NEXT: ext v0.16b, v0.16b, v1.16b, #8
; CHECK-NEXT: ret
- %res = call <2 x double> @llvm.experimental.vector.splice.v2f64(<2 x double> %a, <2 x double> %b, i32 1)
+ %res = call <2 x double> @llvm.vector.splice.v2f64(<2 x double> %a, <2 x double> %b, i32 1)
ret <2 x double> %res
}
@@ -31,7 +31,7 @@ define <2 x i8> @splice_v2i8_idx(<2 x i8> %a, <2 x i8> %b) #0 {
; CHECK: // %bb.0:
; CHECK-NEXT: ext v0.8b, v0.8b, v1.8b, #4
; CHECK-NEXT: ret
- %res = call <2 x i8> @llvm.experimental.vector.splice.v2i8(<2 x i8> %a, <2 x i8> %b, i32 1)
+ %res = call <2 x i8> @llvm.vector.splice.v2i8(<2 x i8> %a, <2 x i8> %b, i32 1)
ret <2 x i8> %res
}
@@ -42,7 +42,7 @@ define <8 x i32> @splice_v8i32_idx(<8 x i32> %a, <8 x i32> %b) #0 {
; CHECK-NEXT: ext v0.16b, v1.16b, v2.16b, #4
; CHECK-NEXT: ext v1.16b, v2.16b, v3.16b, #4
; CHECK-NEXT: ret
- %res = call <8 x i32> @llvm.experimental.vector.splice.v8i32(<8 x i32> %a, <8 x i32> %b, i32 5)
+ %res = call <8 x i32> @llvm.vector.splice.v8i32(<8 x i32> %a, <8 x i32> %b, i32 5)
ret <8 x i32> %res
}
@@ -56,7 +56,7 @@ define <16 x float> @splice_v16f32_idx(<16 x float> %a, <16 x float> %b) #0 {
; CHECK-NEXT: ext v3.16b, v4.16b, v5.16b, #12
; CHECK-NEXT: mov v2.16b, v6.16b
; CHECK-NEXT: ret
- %res = call <16 x float> @llvm.experimental.vector.splice.v16f32(<16 x float> %a, <16 x float> %b, i32 7)
+ %res = call <16 x float> @llvm.vector.splice.v16f32(<16 x float> %a, <16 x float> %b, i32 7)
ret <16 x float> %res
}
@@ -69,7 +69,7 @@ define <16 x i8> @splice_v16i8(<16 x i8> %a, <16 x i8> %b) #0 {
; CHECK: // %bb.0:
; CHECK-NEXT: ext v0.16b, v0.16b, v1.16b, #1
; CHECK-NEXT: ret
- %res = call <16 x i8> @llvm.experimental.vector.splice.v16i8(<16 x i8> %a, <16 x i8> %b, i32 -15)
+ %res = call <16 x i8> @llvm.vector.splice.v16i8(<16 x i8> %a, <16 x i8> %b, i32 -15)
ret <16 x i8> %res
}
@@ -78,7 +78,7 @@ define <2 x double> @splice_v2f64(<2 x double> %a, <2 x double> %b) #0 {
; CHECK: // %bb.0:
; CHECK-NEXT: ext v0.16b, v0.16b, v1.16b, #8
; CHECK-NEXT: ret
- %res = call <2 x double> @llvm.experimental.vector.splice.v2f64(<2 x double> %a, <2 x double> %b, i32 -1)
+ %res = call <2 x double> @llvm.vector.splice.v2f64(<2 x double> %a, <2 x double> %b, i32 -1)
ret <2 x double> %res
}
@@ -88,7 +88,7 @@ define <2 x i8> @splice_v2i8(<2 x i8> %a, <2 x i8> %b) #0 {
; CHECK: // %bb.0:
; CHECK-NEXT: ext v0.8b, v0.8b, v1.8b, #4
; CHECK-NEXT: ret
- %res = call <2 x i8> @llvm.experimental.vector.splice.v2i8(<2 x i8> %a, <2 x i8> %b, i32 -1)
+ %res = call <2 x i8> @llvm.vector.splice.v2i8(<2 x i8> %a, <2 x i8> %b, i32 -1)
ret <2 x i8> %res
}
@@ -99,7 +99,7 @@ define <8 x i32> @splice_v8i32(<8 x i32> %a, <8 x i32> %b) #0 {
; CHECK-NEXT: ext v0.16b, v1.16b, v2.16b, #4
; CHECK-NEXT: ext v1.16b, v2.16b, v3.16b, #4
; CHECK-NEXT: ret
- %res = call <8 x i32> @llvm.experimental.vector.splice.v8i32(<8 x i32> %a, <8 x i32> %b, i32 -3)
+ %res = call <8 x i32> @llvm.vector.splice.v8i32(<8 x i32> %a, <8 x i32> %b, i32 -3)
ret <8 x i32> %res
}
@@ -113,14 +113,14 @@ define <16 x float> @splice_v16f32(<16 x float> %a, <16 x float> %b) #0 {
; CHECK-NEXT: ext v3.16b, v4.16b, v5.16b, #12
; CHECK-NEXT: mov v2.16b, v6.16b
; CHECK-NEXT: ret
- %res = call <16 x float> @llvm.experimental.vector.splice.v16f32(<16 x float> %a, <16 x float> %b, i32 -9)
+ %res = call <16 x float> @llvm.vector.splice.v16f32(<16 x float> %a, <16 x float> %b, i32 -9)
ret <16 x float> %res
}
-declare <2 x i8> @llvm.experimental.vector.splice.v2i8(<2 x i8>, <2 x i8>, i32)
-declare <16 x i8> @llvm.experimental.vector.splice.v16i8(<16 x i8>, <16 x i8>, i32)
-declare <8 x i32> @llvm.experimental.vector.splice.v8i32(<8 x i32>, <8 x i32>, i32)
-declare <16 x float> @llvm.experimental.vector.splice.v16f32(<16 x float>, <16 x float>, i32)
-declare <2 x double> @llvm.experimental.vector.splice.v2f64(<2 x double>, <2 x double>, i32)
+declare <2 x i8> @llvm.vector.splice.v2i8(<2 x i8>, <2 x i8>, i32)
+declare <16 x i8> @llvm.vector.splice.v16i8(<16 x i8>, <16 x i8>, i32)
+declare <8 x i32> @llvm.vector.splice.v8i32(<8 x i32>, <8 x i32>, i32)
+declare <16 x float> @llvm.vector.splice.v16f32(<16 x float>, <16 x float>, i32)
+declare <2 x double> @llvm.vector.splice.v2f64(<2 x double>, <2 x double>, i32)
attributes #0 = { nounwind "target-features"="+neon" }
diff --git a/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll b/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll
index fac96e07de54..f5763cd61033 100644
--- a/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll
+++ b/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll
@@ -11,7 +11,7 @@ define <vscale x 16 x i8> @splice_nxv16i8_zero_idx(<vscale x 16 x i8> %a, <vscal
; CHECK-LABEL: splice_nxv16i8_zero_idx:
; CHECK: // %bb.0:
; CHECK-NEXT: ret
- %res = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 0)
+ %res = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 0)
ret <vscale x 16 x i8> %res
}
@@ -20,7 +20,7 @@ define <vscale x 16 x i8> @splice_nxv16i8_first_idx(<vscale x 16 x i8> %a, <vsca
; CHECK: // %bb.0:
; CHECK-NEXT: ext z0.b, z0.b, z1.b, #1
; CHECK-NEXT: ret
- %res = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 1)
+ %res = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 1)
ret <vscale x 16 x i8> %res
}
@@ -29,7 +29,7 @@ define <vscale x 16 x i8> @splice_nxv16i8_last_idx(<vscale x 16 x i8> %a, <vscal
; CHECK: // %bb.0:
; CHECK-NEXT: ext z0.b, z0.b, z1.b, #255
; CHECK-NEXT: ret
- %res = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 255)
+ %res = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 255)
ret <vscale x 16 x i8> %res
}
@@ -38,7 +38,7 @@ define <vscale x 8 x i16> @splice_nxv8i16_first_idx(<vscale x 8 x i16> %a, <vsca
; CHECK: // %bb.0:
; CHECK-NEXT: ext z0.b, z0.b, z1.b, #2
; CHECK-NEXT: ret
- %res = call <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, i32 1)
+ %res = call <vscale x 8 x i16> @llvm.vector.splice.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, i32 1)
ret <vscale x 8 x i16> %res
}
@@ -47,7 +47,7 @@ define <vscale x 4 x i32> @splice_nxv4i32_first_idx(<vscale x 4 x i32> %a, <vsca
; CHECK: // %bb.0:
; CHECK-NEXT: ext z0.b, z0.b, z1.b, #4
; CHECK-NEXT: ret
- %res = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 1)
+ %res = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 1)
ret <vscale x 4 x i32> %res
}
@@ -56,7 +56,7 @@ define <vscale x 4 x i32> @splice_nxv4i32_last_idx(<vscale x 4 x i32> %a, <vscal
; CHECK: // %bb.0:
; CHECK-NEXT: ext z0.b, z0.b, z1.b, #252
; CHECK-NEXT: ret
- %res = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 63)
+ %res = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 63)
ret <vscale x 4 x i32> %res
}
@@ -65,7 +65,7 @@ define <vscale x 2 x i64> @splice_nxv2i64_first_idx(<vscale x 2 x i64> %a, <vsca
; CHECK: // %bb.0:
; CHECK-NEXT: ext z0.b, z0.b, z1.b, #8
; CHECK-NEXT: ret
- %res = call <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, i32 1)
+ %res = call <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, i32 1)
ret <vscale x 2 x i64> %res
}
@@ -74,7 +74,7 @@ define <vscale x 2 x i64> @splice_nxv2i64_last_idx(<vscale x 2 x i64> %a, <vscal
; CHECK: // %bb.0:
; CHECK-NEXT: ext z0.b, z0.b, z1.b, #248
; CHECK-NEXT: ret
- %res = call <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, i32 31)
+ %res = call <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, i32 31)
ret <vscale x 2 x i64> %res
}
@@ -85,7 +85,7 @@ define <vscale x 2 x half> @splice_nxv2f16_neg_idx(<vscale x 2 x half> %a, <vsca
; CHECK-NEXT: rev p0.d, p0.d
; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d
; CHECK-NEXT: ret
- %res = call <vscale x 2 x half> @llvm.experimental.vector.splice.nxv2f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b, i32 -1)
+ %res = call <vscale x 2 x half> @llvm.vector.splice.nxv2f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b, i32 -1)
ret <vscale x 2 x half> %res
}
@@ -96,7 +96,7 @@ define <vscale x 2 x half> @splice_nxv2f16_neg2_idx(<vscale x 2 x half> %a, <vsc
; CHECK-NEXT: rev p0.d, p0.d
; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d
; CHECK-NEXT: ret
- %res = call <vscale x 2 x half> @llvm.experimental.vector.splice.nxv2f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b, i32 -2)
+ %res = call <vscale x 2 x half> @llvm.vector.splice.nxv2f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b, i32 -2)
ret <vscale x 2 x half> %res
}
@@ -105,7 +105,7 @@ define <vscale x 2 x half> @splice_nxv2f16_first_idx(<vscale x 2 x half> %a, <vs
; CHECK: // %bb.0:
; CHECK-NEXT: ext z0.b, z0.b, z1.b, #8
; CHECK-NEXT: ret
- %res = call <vscale x 2 x half> @llvm.experimental.vector.splice.nxv2f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b, i32 1)
+ %res = call <vscale x 2 x half> @llvm.vector.splice.nxv2f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b, i32 1)
ret <vscale x 2 x half> %res
}
@@ -114,7 +114,7 @@ define <vscale x 2 x half> @splice_nxv2f16_last_idx(<vscale x 2 x half> %a, <vsc
; CHECK: // %bb.0:
; CHECK-NEXT: ext z0.b, z0.b, z1.b, #248
; CHECK-NEXT: ret
- %res = call <vscale x 2 x half> @llvm.experimental.vector.splice.nxv2f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b, i32 31)
+ %res = call <vscale x 2 x half> @llvm.vector.splice.nxv2f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b, i32 31)
ret <vscale x 2 x half> %res
}
@@ -125,7 +125,7 @@ define <vscale x 4 x half> @splice_nxv4f16_neg_idx(<vscale x 4 x half> %a, <vsca
; CHECK-NEXT: rev p0.s, p0.s
; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s
; CHECK-NEXT: ret
- %res = call <vscale x 4 x half> @llvm.experimental.vector.splice.nxv4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b, i32 -1)
+ %res = call <vscale x 4 x half> @llvm.vector.splice.nxv4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b, i32 -1)
ret <vscale x 4 x half> %res
}
@@ -136,7 +136,7 @@ define <vscale x 4 x half> @splice_nxv4f16_neg3_idx(<vscale x 4 x half> %a, <vsc
; CHECK-NEXT: rev p0.s, p0.s
; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s
; CHECK-NEXT: ret
- %res = call <vscale x 4 x half> @llvm.experimental.vector.splice.nxv4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b, i32 -3)
+ %res = call <vscale x 4 x half> @llvm.vector.splice.nxv4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b, i32 -3)
ret <vscale x 4 x half> %res
}
@@ -145,7 +145,7 @@ define <vscale x 4 x half> @splice_nxv4f16_first_idx(<vscale x 4 x half> %a, <vs
; CHECK: // %bb.0:
; CHECK-NEXT: ext z0.b, z0.b, z1.b, #4
; CHECK-NEXT: ret
- %res = call <vscale x 4 x half> @llvm.experimental.vector.splice.nxv4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b, i32 1)
+ %res = call <vscale x 4 x half> @llvm.vector.splice.nxv4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b, i32 1)
ret <vscale x 4 x half> %res
}
@@ -154,7 +154,7 @@ define <vscale x 4 x half> @splice_nxv4f16_last_idx(<vscale x 4 x half> %a, <vsc
; CHECK: // %bb.0:
; CHECK-NEXT: ext z0.b, z0.b, z1.b, #252
; CHECK-NEXT: ret
- %res = call <vscale x 4 x half> @llvm.experimental.vector.splice.nxv4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b, i32 63)
+ %res = call <vscale x 4 x half> @llvm.vector.splice.nxv4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b, i32 63)
ret <vscale x 4 x half> %res
}
@@ -163,7 +163,7 @@ define <vscale x 8 x half> @splice_nxv8f16_first_idx(<vscale x 8 x half> %a, <vs
; CHECK: // %bb.0:
; CHECK-NEXT: ext z0.b, z0.b, z1.b, #2
; CHECK-NEXT: ret
- %res = call <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, i32 1)
+ %res = call <vscale x 8 x half> @llvm.vector.splice.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, i32 1)
ret <vscale x 8 x half> %res
}
@@ -172,7 +172,7 @@ define <vscale x 8 x half> @splice_nxv8f16_last_idx(<vscale x 8 x half> %a, <vsc
; CHECK: // %bb.0:
; CHECK-NEXT: ext z0.b, z0.b, z1.b, #254
; CHECK-NEXT: ret
- %res = call <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, i32 127)
+ %res = call <vscale x 8 x half> @llvm.vector.splice.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, i32 127)
ret <vscale x 8 x half> %res
}
@@ -183,7 +183,7 @@ define <vscale x 2 x float> @splice_nxv2f32_neg_idx(<vscale x 2 x float> %a, <vs
; CHECK-NEXT: rev p0.d, p0.d
; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d
; CHECK-NEXT: ret
- %res = call <vscale x 2 x float> @llvm.experimental.vector.splice.nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b, i32 -1)
+ %res = call <vscale x 2 x float> @llvm.vector.splice.nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b, i32 -1)
ret <vscale x 2 x float> %res
}
@@ -194,7 +194,7 @@ define <vscale x 2 x float> @splice_nxv2f32_neg2_idx(<vscale x 2 x float> %a, <v
; CHECK-NEXT: rev p0.d, p0.d
; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d
; CHECK-NEXT: ret
- %res = call <vscale x 2 x float> @llvm.experimental.vector.splice.nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b, i32 -2)
+ %res = call <vscale x 2 x float> @llvm.vector.splice.nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b, i32 -2)
ret <vscale x 2 x float> %res
}
@@ -203,7 +203,7 @@ define <vscale x 2 x float> @splice_nxv2f32_first_idx(<vscale x 2 x float> %a, <
; CHECK: // %bb.0:
; CHECK-NEXT: ext z0.b, z0.b, z1.b, #8
; CHECK-NEXT: ret
- %res = call <vscale x 2 x float> @llvm.experimental.vector.splice.nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b, i32 1)
+ %res = call <vscale x 2 x float> @llvm.vector.splice.nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b, i32 1)
ret <vscale x 2 x float> %res
}
@@ -212,7 +212,7 @@ define <vscale x 2 x float> @splice_nxv2f32_last_idx(<vscale x 2 x float> %a, <v
; CHECK: // %bb.0:
; CHECK-NEXT: ext z0.b, z0.b, z1.b, #248
; CHECK-NEXT: ret
- %res = call <vscale x 2 x float> @llvm.experimental.vector.splice.nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b, i32 31)
+ %res = call <vscale x 2 x float> @llvm.vector.splice.nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b, i32 31)
ret <vscale x 2 x float> %res
}
@@ -221,7 +221,7 @@ define <vscale x 4 x float> @splice_nxv4f32_first_idx(<vscale x 4 x float> %a, <
; CHECK: // %bb.0:
; CHECK-NEXT: ext z0.b, z0.b, z1.b, #4
; CHECK-NEXT: ret
- %res = call <vscale x 4 x float> @llvm.experimental.vector.splice.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, i32 1)
+ %res = call <vscale x 4 x float> @llvm.vector.splice.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, i32 1)
ret <vscale x 4 x float> %res
}
@@ -230,7 +230,7 @@ define <vscale x 4 x float> @splice_nxv4f32_last_idx(<vscale x 4 x float> %a, <v
; CHECK: // %bb.0:
; CHECK-NEXT: ext z0.b, z0.b, z1.b, #252
; CHECK-NEXT: ret
- %res = call <vscale x 4 x float> @llvm.experimental.vector.splice.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, i32 63)
+ %res = call <vscale x 4 x float> @llvm.vector.splice.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, i32 63)
ret <vscale x 4 x float> %res
}
@@ -239,7 +239,7 @@ define <vscale x 2 x double> @splice_nxv2f64_first_idx(<vscale x 2 x double> %a,
; CHECK: // %bb.0:
; CHECK-NEXT: ext z0.b, z0.b, z1.b, #8
; CHECK-NEXT: ret
- %res = call <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, i32 1)
+ %res = call <vscale x 2 x double> @llvm.vector.splice.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, i32 1)
ret <vscale x 2 x double> %res
}
@@ -248,7 +248,7 @@ define <vscale x 2 x double> @splice_nxv2f64_last_idx(<vscale x 2 x double> %a,
; CHECK: // %bb.0:
; CHECK-NEXT: ext z0.b, z0.b, z1.b, #248
; CHECK-NEXT: ret
- %res = call <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, i32 31)
+ %res = call <vscale x 2 x double> @llvm.vector.splice.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, i32 31)
ret <vscale x 2 x double> %res
}
@@ -263,7 +263,7 @@ define <vscale x 2 x i1> @splice_nxv2i1_idx(<vscale x 2 x i1> %a, <vscale x 2 x
; CHECK-NEXT: and z1.d, z1.d, #0x1
; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0
; CHECK-NEXT: ret
- %res = call <vscale x 2 x i1> @llvm.experimental.vector.splice.nxv2i1(<vscale x 2 x i1> %a, <vscale x 2 x i1> %b, i32 1)
+ %res = call <vscale x 2 x i1> @llvm.vector.splice.nxv2i1(<vscale x 2 x i1> %a, <vscale x 2 x i1> %b, i32 1)
ret <vscale x 2 x i1> %res
}
@@ -278,7 +278,7 @@ define <vscale x 4 x i1> @splice_nxv4i1_idx(<vscale x 4 x i1> %a, <vscale x 4 x
; CHECK-NEXT: and z1.s, z1.s, #0x1
; CHECK-NEXT: cmpne p0.s, p0/z, z1.s, #0
; CHECK-NEXT: ret
- %res = call <vscale x 4 x i1> @llvm.experimental.vector.splice.nxv4i1(<vscale x 4 x i1> %a, <vscale x 4 x i1> %b, i32 2)
+ %res = call <vscale x 4 x i1> @llvm.vector.splice.nxv4i1(<vscale x 4 x i1> %a, <vscale x 4 x i1> %b, i32 2)
ret <vscale x 4 x i1> %res
}
@@ -293,7 +293,7 @@ define <vscale x 8 x i1> @splice_nxv8i1_idx(<vscale x 8 x i1> %a, <vscale x 8 x
; CHECK-NEXT: and z1.h, z1.h, #0x1
; CHECK-NEXT: cmpne p0.h, p0/z, z1.h, #0
; CHECK-NEXT: ret
- %res = call <vscale x 8 x i1> @llvm.experimental.vector.splice.nxv8i1(<vscale x 8 x i1> %a, <vscale x 8 x i1> %b, i32 4)
+ %res = call <vscale x 8 x i1> @llvm.vector.splice.nxv8i1(<vscale x 8 x i1> %a, <vscale x 8 x i1> %b, i32 4)
ret <vscale x 8 x i1> %res
}
@@ -308,7 +308,7 @@ define <vscale x 16 x i1> @splice_nxv16i1_idx(<vscale x 16 x i1> %a, <vscale x 1
; CHECK-NEXT: and z1.b, z1.b, #0x1
; CHECK-NEXT: cmpne p0.b, p0/z, z1.b, #0
; CHECK-NEXT: ret
- %res = call <vscale x 16 x i1> @llvm.experimental.vector.splice.nxv16i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b, i32 8)
+ %res = call <vscale x 16 x i1> @llvm.vector.splice.nxv16i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b, i32 8)
ret <vscale x 16 x i1> %res
}
@@ -318,7 +318,7 @@ define <vscale x 2 x i8> @splice_nxv2i8_idx(<vscale x 2 x i8> %a, <vscale x 2 x
; CHECK: // %bb.0:
; CHECK-NEXT: ext z0.b, z0.b, z1.b, #8
; CHECK-NEXT: ret
- %res = call <vscale x 2 x i8> @llvm.experimental.vector.splice.nxv2i8(<vscale x 2 x i8> %a, <vscale x 2 x i8> %b, i32 1)
+ %res = call <vscale x 2 x i8> @llvm.vector.splice.nxv2i8(<vscale x 2 x i8> %a, <vscale x 2 x i8> %b, i32 1)
ret <vscale x 2 x i8> %res
}
@@ -340,7 +340,7 @@ define <vscale x 8 x i32> @splice_nxv8i32_idx(<vscale x 8 x i32> %a, <vscale x 8
; CHECK-NEXT: addvl sp, sp, #4
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
- %res = call <vscale x 8 x i32> @llvm.experimental.vector.splice.nxv8i32(<vscale x 8 x i32> %a, <vscale x 8 x i32> %b, i32 2)
+ %res = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> %a, <vscale x 8 x i32> %b, i32 2)
ret <vscale x 8 x i32> %res
}
@@ -373,7 +373,7 @@ define <vscale x 16 x float> @splice_nxv16f32_16(<vscale x 16 x float> %a, <vsca
; CHECK-NEXT: addvl sp, sp, #8
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
- %res = call <vscale x 16 x float> @llvm.experimental.vector.splice.nxv16f32(<vscale x 16 x float> %a, <vscale x 16 x float> %b, i32 16)
+ %res = call <vscale x 16 x float> @llvm.vector.splice.nxv16f32(<vscale x 16 x float> %a, <vscale x 16 x float> %b, i32 16)
ret <vscale x 16 x float> %res
}
@@ -388,7 +388,7 @@ define <vscale x 16 x i8> @splice_nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x
; CHECK-NEXT: rev p0.b, p0.b
; CHECK-NEXT: splice z0.b, p0, z0.b, z1.b
; CHECK-NEXT: ret
- %res = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 -16)
+ %res = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 -16)
ret <vscale x 16 x i8> %res
}
@@ -399,7 +399,7 @@ define <vscale x 16 x i8> @splice_nxv16i8_neg32(<vscale x 16 x i8> %a, <vscale x
; CHECK-NEXT: rev p0.b, p0.b
; CHECK-NEXT: splice z0.b, p0, z0.b, z1.b
; CHECK-NEXT: ret
- %res = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 -32)
+ %res = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 -32)
ret <vscale x 16 x i8> %res
}
@@ -410,7 +410,7 @@ define <vscale x 16 x i8> @splice_nxv16i8_neg64(<vscale x 16 x i8> %a, <vscale x
; CHECK-NEXT: rev p0.b, p0.b
; CHECK-NEXT: splice z0.b, p0, z0.b, z1.b
; CHECK-NEXT: ret
- %res = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 -64)
+ %res = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 -64)
ret <vscale x 16 x i8> %res
}
@@ -421,7 +421,7 @@ define <vscale x 16 x i8> @splice_nxv16i8_neg128(<vscale x 16 x i8> %a, <vscale
; CHECK-NEXT: rev p0.b, p0.b
; CHECK-NEXT: splice z0.b, p0, z0.b, z1.b
; CHECK-NEXT: ret
- %res = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 -128)
+ %res = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 -128)
ret <vscale x 16 x i8> %res
}
@@ -432,7 +432,7 @@ define <vscale x 16 x i8> @splice_nxv16i8_neg256(<vscale x 16 x i8> %a, <vscale
; CHECK-NEXT: rev p0.b, p0.b
; CHECK-NEXT: splice z0.b, p0, z0.b, z1.b
; CHECK-NEXT: ret
- %res = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 -256)
+ %res = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 -256)
ret <vscale x 16 x i8> %res
}
@@ -443,7 +443,7 @@ define <vscale x 16 x i8> @splice_nxv16i8_1(<vscale x 16 x i8> %a, <vscale x 16
; CHECK-NEXT: rev p0.b, p0.b
; CHECK-NEXT: splice z0.b, p0, z0.b, z1.b
; CHECK-NEXT: ret
- %res = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 -1)
+ %res = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 -1)
ret <vscale x 16 x i8> %res
}
@@ -466,7 +466,7 @@ define <vscale x 16 x i8> @splice_nxv16i8_neg17(<vscale x 16 x i8> %a, <vscale x
; CHECK-NEXT: addvl sp, sp, #2
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
- %res = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 -17)
+ %res = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 -17)
ret <vscale x 16 x i8> %res
}
@@ -477,7 +477,7 @@ define <vscale x 8 x i16> @splice_nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i
; CHECK-NEXT: rev p0.h, p0.h
; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h
; CHECK-NEXT: ret
- %res = call <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, i32 -8)
+ %res = call <vscale x 8 x i16> @llvm.vector.splice.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, i32 -8)
ret <vscale x 8 x i16> %res
}
@@ -488,7 +488,7 @@ define <vscale x 8 x i16> @splice_nxv8i16_1(<vscale x 8 x i16> %a, <vscale x 8 x
; CHECK-NEXT: rev p0.h, p0.h
; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h
; CHECK-NEXT: ret
- %res = call <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, i32 -1)
+ %res = call <vscale x 8 x i16> @llvm.vector.splice.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, i32 -1)
ret <vscale x 8 x i16> %res
}
@@ -511,7 +511,7 @@ define <vscale x 8 x i16> @splice_nxv8i16_neg9(<vscale x 8 x i16> %a, <vscale x
; CHECK-NEXT: addvl sp, sp, #2
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
- %res = call <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, i32 -9)
+ %res = call <vscale x 8 x i16> @llvm.vector.splice.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, i32 -9)
ret <vscale x 8 x i16> %res
}
@@ -522,7 +522,7 @@ define <vscale x 4 x i32> @splice_nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i
; CHECK-NEXT: rev p0.s, p0.s
; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s
; CHECK-NEXT: ret
- %res = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 -4)
+ %res = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 -4)
ret <vscale x 4 x i32> %res
}
@@ -533,7 +533,7 @@ define <vscale x 4 x i32> @splice_nxv4i32_1(<vscale x 4 x i32> %a, <vscale x 4 x
; CHECK-NEXT: rev p0.s, p0.s
; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s
; CHECK-NEXT: ret
- %res = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 -1)
+ %res = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 -1)
ret <vscale x 4 x i32> %res
}
@@ -544,7 +544,7 @@ define <vscale x 4 x i32> @splice_nxv4i32_neg5(<vscale x 4 x i32> %a, <vscale x
; CHECK-NEXT: rev p0.s, p0.s
; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s
; CHECK-NEXT: ret
- %res = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 -5)
+ %res = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 -5)
ret <vscale x 4 x i32> %res
}
@@ -555,7 +555,7 @@ define <vscale x 2 x i64> @splice_nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i
; CHECK-NEXT: rev p0.d, p0.d
; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d
; CHECK-NEXT: ret
- %res = call <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, i32 -2)
+ %res = call <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, i32 -2)
ret <vscale x 2 x i64> %res
}
@@ -566,7 +566,7 @@ define <vscale x 2 x i64> @splice_nxv2i64_1(<vscale x 2 x i64> %a, <vscale x 2 x
; CHECK-NEXT: rev p0.d, p0.d
; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d
; CHECK-NEXT: ret
- %res = call <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, i32 -1)
+ %res = call <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, i32 -1)
ret <vscale x 2 x i64> %res
}
@@ -577,7 +577,7 @@ define <vscale x 2 x i64> @splice_nxv2i64_neg3(<vscale x 2 x i64> %a, <vscale x
; CHECK-NEXT: rev p0.d, p0.d
; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d
; CHECK-NEXT: ret
- %res = call <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, i32 -3)
+ %res = call <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, i32 -3)
ret <vscale x 2 x i64> %res
}
@@ -588,7 +588,7 @@ define <vscale x 8 x half> @splice_nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x
; CHECK-NEXT: rev p0.h, p0.h
; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h
; CHECK-NEXT: ret
- %res = call <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, i32 -8)
+ %res = call <vscale x 8 x half> @llvm.vector.splice.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, i32 -8)
ret <vscale x 8 x half> %res
}
@@ -599,7 +599,7 @@ define <vscale x 8 x half> @splice_nxv8f16_1(<vscale x 8 x half> %a, <vscale x 8
; CHECK-NEXT: rev p0.h, p0.h
; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h
; CHECK-NEXT: ret
- %res = call <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, i32 -1)
+ %res = call <vscale x 8 x half> @llvm.vector.splice.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, i32 -1)
ret <vscale x 8 x half> %res
}
@@ -622,7 +622,7 @@ define <vscale x 8 x half> @splice_nxv8f16_neg9(<vscale x 8 x half> %a, <vscale
; CHECK-NEXT: addvl sp, sp, #2
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
- %res = call <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, i32 -9)
+ %res = call <vscale x 8 x half> @llvm.vector.splice.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, i32 -9)
ret <vscale x 8 x half> %res
}
@@ -633,7 +633,7 @@ define <vscale x 4 x float> @splice_nxv4f32(<vscale x 4 x float> %a, <vscale x 4
; CHECK-NEXT: rev p0.s, p0.s
; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s
; CHECK-NEXT: ret
- %res = call <vscale x 4 x float> @llvm.experimental.vector.splice.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, i32 -4)
+ %res = call <vscale x 4 x float> @llvm.vector.splice.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, i32 -4)
ret <vscale x 4 x float> %res
}
@@ -644,7 +644,7 @@ define <vscale x 4 x float> @splice_nxv4f32_1(<vscale x 4 x float> %a, <vscale x
; CHECK-NEXT: rev p0.s, p0.s
; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s
; CHECK-NEXT: ret
- %res = call <vscale x 4 x float> @llvm.experimental.vector.splice.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, i32 -1)
+ %res = call <vscale x 4 x float> @llvm.vector.splice.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, i32 -1)
ret <vscale x 4 x float> %res
}
@@ -655,7 +655,7 @@ define <vscale x 4 x float> @splice_nxv4f32_neg5(<vscale x 4 x float> %a, <vscal
; CHECK-NEXT: rev p0.s, p0.s
; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s
; CHECK-NEXT: ret
- %res = call <vscale x 4 x float> @llvm.experimental.vector.splice.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, i32 -5)
+ %res = call <vscale x 4 x float> @llvm.vector.splice.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, i32 -5)
ret <vscale x 4 x float> %res
}
@@ -666,7 +666,7 @@ define <vscale x 2 x double> @splice_nxv2f64(<vscale x 2 x double> %a, <vscale x
; CHECK-NEXT: rev p0.d, p0.d
; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d
; CHECK-NEXT: ret
- %res = call <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, i32 -2)
+ %res = call <vscale x 2 x double> @llvm.vector.splice.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, i32 -2)
ret <vscale x 2 x double> %res
}
@@ -677,7 +677,7 @@ define <vscale x 2 x double> @splice_nxv2f64_1(<vscale x 2 x double> %a, <vscale
; CHECK-NEXT: rev p0.d, p0.d
; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d
; CHECK-NEXT: ret
- %res = call <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, i32 -1)
+ %res = call <vscale x 2 x double> @llvm.vector.splice.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, i32 -1)
ret <vscale x 2 x double> %res
}
@@ -688,7 +688,7 @@ define <vscale x 2 x double> @splice_nxv2f64_neg3(<vscale x 2 x double> %a, <vsc
; CHECK-NEXT: rev p0.d, p0.d
; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d
; CHECK-NEXT: ret
- %res = call <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, i32 -3)
+ %res = call <vscale x 2 x double> @llvm.vector.splice.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, i32 -3)
ret <vscale x 2 x double> %res
}
@@ -705,7 +705,7 @@ define <vscale x 2 x i1> @splice_nxv2i1(<vscale x 2 x i1> %a, <vscale x 2 x i1>
; CHECK-NEXT: and z1.d, z1.d, #0x1
; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0
; CHECK-NEXT: ret
- %res = call <vscale x 2 x i1> @llvm.experimental.vector.splice.nxv2i1(<vscale x 2 x i1> %a, <vscale x 2 x i1> %b, i32 -1)
+ %res = call <vscale x 2 x i1> @llvm.vector.splice.nxv2i1(<vscale x 2 x i1> %a, <vscale x 2 x i1> %b, i32 -1)
ret <vscale x 2 x i1> %res
}
@@ -722,7 +722,7 @@ define <vscale x 4 x i1> @splice_nxv4i1(<vscale x 4 x i1> %a, <vscale x 4 x i1>
; CHECK-NEXT: and z1.s, z1.s, #0x1
; CHECK-NEXT: cmpne p0.s, p0/z, z1.s, #0
; CHECK-NEXT: ret
- %res = call <vscale x 4 x i1> @llvm.experimental.vector.splice.nxv4i1(<vscale x 4 x i1> %a, <vscale x 4 x i1> %b, i32 -1)
+ %res = call <vscale x 4 x i1> @llvm.vector.splice.nxv4i1(<vscale x 4 x i1> %a, <vscale x 4 x i1> %b, i32 -1)
ret <vscale x 4 x i1> %res
}
@@ -739,7 +739,7 @@ define <vscale x 8 x i1> @splice_nxv8i1(<vscale x 8 x i1> %a, <vscale x 8 x i1>
; CHECK-NEXT: and z1.h, z1.h, #0x1
; CHECK-NEXT: cmpne p0.h, p0/z, z1.h, #0
; CHECK-NEXT: ret
- %res = call <vscale x 8 x i1> @llvm.experimental.vector.splice.nxv8i1(<vscale x 8 x i1> %a, <vscale x 8 x i1> %b, i32 -1)
+ %res = call <vscale x 8 x i1> @llvm.vector.splice.nxv8i1(<vscale x 8 x i1> %a, <vscale x 8 x i1> %b, i32 -1)
ret <vscale x 8 x i1> %res
}
@@ -756,7 +756,7 @@ define <vscale x 16 x i1> @splice_nxv16i1(<vscale x 16 x i1> %a, <vscale x 16 x
; CHECK-NEXT: and z1.b, z1.b, #0x1
; CHECK-NEXT: cmpne p0.b, p0/z, z1.b, #0
; CHECK-NEXT: ret
- %res = call <vscale x 16 x i1> @llvm.experimental.vector.splice.nxv16i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b, i32 -1)
+ %res = call <vscale x 16 x i1> @llvm.vector.splice.nxv16i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b, i32 -1)
ret <vscale x 16 x i1> %res
}
@@ -768,7 +768,7 @@ define <vscale x 2 x i8> @splice_nxv2i8(<vscale x 2 x i8> %a, <vscale x 2 x i8>
; CHECK-NEXT: rev p0.d, p0.d
; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d
; CHECK-NEXT: ret
- %res = call <vscale x 2 x i8> @llvm.experimental.vector.splice.nxv2i8(<vscale x 2 x i8> %a, <vscale x 2 x i8> %b, i32 -2)
+ %res = call <vscale x 2 x i8> @llvm.vector.splice.nxv2i8(<vscale x 2 x i8> %a, <vscale x 2 x i8> %b, i32 -2)
ret <vscale x 2 x i8> %res
}
@@ -793,7 +793,7 @@ define <vscale x 8 x i32> @splice_nxv8i32(<vscale x 8 x i32> %a, <vscale x 8 x i
; CHECK-NEXT: addvl sp, sp, #4
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
- %res = call <vscale x 8 x i32> @llvm.experimental.vector.splice.nxv8i32(<vscale x 8 x i32> %a, <vscale x 8 x i32> %b, i32 -8)
+ %res = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> %a, <vscale x 8 x i32> %b, i32 -8)
ret <vscale x 8 x i32> %res
}
@@ -826,26 +826,26 @@ define <vscale x 16 x float> @splice_nxv16f32_neg17(<vscale x 16 x float> %a, <v
; CHECK-NEXT: addvl sp, sp, #8
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
- %res = call <vscale x 16 x float> @llvm.experimental.vector.splice.nxv16f32(<vscale x 16 x float> %a, <vscale x 16 x float> %b, i32 -17)
+ %res = call <vscale x 16 x float> @llvm.vector.splice.nxv16f32(<vscale x 16 x float> %a, <vscale x 16 x float> %b, i32 -17)
ret <vscale x 16 x float> %res
}
-declare <vscale x 2 x i1> @llvm.experimental.vector.splice.nxv2i1(<vscale x 2 x i1>, <vscale x 2 x i1>, i32)
-declare <vscale x 4 x i1> @llvm.experimental.vector.splice.nxv4i1(<vscale x 4 x i1>, <vscale x 4 x i1>, i32)
-declare <vscale x 8 x i1> @llvm.experimental.vector.splice.nxv8i1(<vscale x 8 x i1>, <vscale x 8 x i1>, i32)
-declare <vscale x 16 x i1> @llvm.experimental.vector.splice.nxv16i1(<vscale x 16 x i1>, <vscale x 16 x i1>, i32)
-declare <vscale x 2 x i8> @llvm.experimental.vector.splice.nxv2i8(<vscale x 2 x i8>, <vscale x 2 x i8>, i32)
-declare <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, i32)
-declare <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, i32)
-declare <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, i32)
-declare <vscale x 8 x i32> @llvm.experimental.vector.splice.nxv8i32(<vscale x 8 x i32>, <vscale x 8 x i32>, i32)
-declare <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, i32)
-declare <vscale x 2 x half> @llvm.experimental.vector.splice.nxv2f16(<vscale x 2 x half>, <vscale x 2 x half>, i32)
-declare <vscale x 4 x half> @llvm.experimental.vector.splice.nxv4f16(<vscale x 4 x half>, <vscale x 4 x half>, i32)
-declare <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, i32)
-declare <vscale x 2 x float> @llvm.experimental.vector.splice.nxv2f32(<vscale x 2 x float>, <vscale x 2 x float>, i32)
-declare <vscale x 4 x float> @llvm.experimental.vector.splice.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, i32)
-declare <vscale x 16 x float> @llvm.experimental.vector.splice.nxv16f32(<vscale x 16 x float>, <vscale x 16 x float>, i32)
-declare <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, i32)
+declare <vscale x 2 x i1> @llvm.vector.splice.nxv2i1(<vscale x 2 x i1>, <vscale x 2 x i1>, i32)
+declare <vscale x 4 x i1> @llvm.vector.splice.nxv4i1(<vscale x 4 x i1>, <vscale x 4 x i1>, i32)
+declare <vscale x 8 x i1> @llvm.vector.splice.nxv8i1(<vscale x 8 x i1>, <vscale x 8 x i1>, i32)
+declare <vscale x 16 x i1> @llvm.vector.splice.nxv16i1(<vscale x 16 x i1>, <vscale x 16 x i1>, i32)
+declare <vscale x 2 x i8> @llvm.vector.splice.nxv2i8(<vscale x 2 x i8>, <vscale x 2 x i8>, i32)
+declare <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, i32)
+declare <vscale x 8 x i16> @llvm.vector.splice.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, i32)
+declare <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, i32)
+declare <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32>, <vscale x 8 x i32>, i32)
+declare <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, i32)
+declare <vscale x 2 x half> @llvm.vector.splice.nxv2f16(<vscale x 2 x half>, <vscale x 2 x half>, i32)
+declare <vscale x 4 x half> @llvm.vector.splice.nxv4f16(<vscale x 4 x half>, <vscale x 4 x half>, i32)
+declare <vscale x 8 x half> @llvm.vector.splice.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, i32)
+declare <vscale x 2 x float> @llvm.vector.splice.nxv2f32(<vscale x 2 x float>, <vscale x 2 x float>, i32)
+declare <vscale x 4 x float> @llvm.vector.splice.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, i32)
+declare <vscale x 16 x float> @llvm.vector.splice.nxv16f32(<vscale x 16 x float>, <vscale x 16 x float>, i32)
+declare <vscale x 2 x double> @llvm.vector.splice.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, i32)
attributes #0 = { nounwind "target-features"="+sve" }
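For reference, llvm.vector.splice (renamed above from its experimental spelling; the semantics carry over) concatenates its two operands and extracts one vector length, with a negative immediate counting trailing elements of the first operand. A minimal fixed-width sketch, with an illustrative function name not taken from this patch:

; splice(%a, %b, -1) on <4 x i32> yields <a3, b0, b1, b2>:
; the last element of %a followed by the first three elements of %b.
define <4 x i32> @splice_example(<4 x i32> %a, <4 x i32> %b) {
  %res = call <4 x i32> @llvm.vector.splice.v4i32(<4 x i32> %a, <4 x i32> %b, i32 -1)
  ret <4 x i32> %res
}
declare <4 x i32> @llvm.vector.splice.v4i32(<4 x i32>, <4 x i32>, i32)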
diff --git a/llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll b/llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll
index 9920bc6048e8..478f4a689d3c 100644
--- a/llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll
+++ b/llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll
@@ -9,7 +9,7 @@ define {<vscale x 2 x half>, <vscale x 2 x half>} @vector_deinterleave_nxv2f16_n
; CHECK-NEXT: uzp1 z0.d, z2.d, z1.d
; CHECK-NEXT: uzp2 z1.d, z2.d, z1.d
; CHECK-NEXT: ret
- %retval = call {<vscale x 2 x half>, <vscale x 2 x half>} @llvm.experimental.vector.deinterleave2.nxv4f16(<vscale x 4 x half> %vec)
+ %retval = call {<vscale x 2 x half>, <vscale x 2 x half>} @llvm.vector.deinterleave2.nxv4f16(<vscale x 4 x half> %vec)
ret {<vscale x 2 x half>, <vscale x 2 x half>} %retval
}
@@ -21,7 +21,7 @@ define {<vscale x 4 x half>, <vscale x 4 x half>} @vector_deinterleave_nxv4f16_n
; CHECK-NEXT: uzp1 z0.s, z2.s, z1.s
; CHECK-NEXT: uzp2 z1.s, z2.s, z1.s
; CHECK-NEXT: ret
- %retval = call {<vscale x 4 x half>, <vscale x 4 x half>} @llvm.experimental.vector.deinterleave2.nxv8f16(<vscale x 8 x half> %vec)
+ %retval = call {<vscale x 4 x half>, <vscale x 4 x half>} @llvm.vector.deinterleave2.nxv8f16(<vscale x 8 x half> %vec)
ret {<vscale x 4 x half>, <vscale x 4 x half>} %retval
}
@@ -32,7 +32,7 @@ define {<vscale x 8 x half>, <vscale x 8 x half>} @vector_deinterleave_nxv8f16_n
; CHECK-NEXT: uzp2 z1.h, z0.h, z1.h
; CHECK-NEXT: mov z0.d, z2.d
; CHECK-NEXT: ret
- %retval = call {<vscale x 8 x half>, <vscale x 8 x half>} @llvm.experimental.vector.deinterleave2.nxv16f16(<vscale x 16 x half> %vec)
+ %retval = call {<vscale x 8 x half>, <vscale x 8 x half>} @llvm.vector.deinterleave2.nxv16f16(<vscale x 16 x half> %vec)
ret {<vscale x 8 x half>, <vscale x 8 x half>} %retval
}
@@ -44,7 +44,7 @@ define {<vscale x 2 x float>, <vscale x 2 x float>} @vector_deinterleave_nxv2f32
; CHECK-NEXT: uzp1 z0.d, z2.d, z1.d
; CHECK-NEXT: uzp2 z1.d, z2.d, z1.d
; CHECK-NEXT: ret
- %retval = call {<vscale x 2 x float>, <vscale x 2 x float>} @llvm.experimental.vector.deinterleave2.nxv4f32(<vscale x 4 x float> %vec)
+ %retval = call {<vscale x 2 x float>, <vscale x 2 x float>} @llvm.vector.deinterleave2.nxv4f32(<vscale x 4 x float> %vec)
ret {<vscale x 2 x float>, <vscale x 2 x float>} %retval
}
@@ -55,7 +55,7 @@ define {<vscale x 4 x float>, <vscale x 4 x float>} @vector_deinterleave_nxv4f32
; CHECK-NEXT: uzp2 z1.s, z0.s, z1.s
; CHECK-NEXT: mov z0.d, z2.d
; CHECK-NEXT: ret
- %retval = call {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.experimental.vector.deinterleave2.nxv8f32(<vscale x 8 x float> %vec)
+ %retval = call {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.vector.deinterleave2.nxv8f32(<vscale x 8 x float> %vec)
ret {<vscale x 4 x float>, <vscale x 4 x float>} %retval
}
@@ -66,7 +66,7 @@ define {<vscale x 2 x double>, <vscale x 2 x double>} @vector_deinterleave_nxv2f
; CHECK-NEXT: uzp2 z1.d, z0.d, z1.d
; CHECK-NEXT: mov z0.d, z2.d
; CHECK-NEXT: ret
- %retval = call {<vscale x 2 x double>, <vscale x 2 x double>} @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %vec)
+ %retval = call {<vscale x 2 x double>, <vscale x 2 x double>} @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %vec)
ret {<vscale x 2 x double>, <vscale x 2 x double>} %retval
}
@@ -79,7 +79,7 @@ define {<vscale x 16 x i8>, <vscale x 16 x i8>} @vector_deinterleave_nxv16i8_nxv
; CHECK-NEXT: uzp2 z1.b, z0.b, z1.b
; CHECK-NEXT: mov z0.d, z2.d
; CHECK-NEXT: ret
- %retval = call {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.experimental.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %vec)
+ %retval = call {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %vec)
ret {<vscale x 16 x i8>, <vscale x 16 x i8>} %retval
}
@@ -90,7 +90,7 @@ define {<vscale x 8 x i16>, <vscale x 8 x i16>} @vector_deinterleave_nxv8i16_nxv
; CHECK-NEXT: uzp2 z1.h, z0.h, z1.h
; CHECK-NEXT: mov z0.d, z2.d
; CHECK-NEXT: ret
- %retval = call {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.experimental.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> %vec)
+ %retval = call {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> %vec)
ret {<vscale x 8 x i16>, <vscale x 8 x i16>} %retval
}
@@ -101,7 +101,7 @@ define {<vscale x 4 x i32>, <vscale x 4 x i32>} @vector_deinterleave_nxv4i32_nxv
; CHECK-NEXT: uzp2 z1.s, z0.s, z1.s
; CHECK-NEXT: mov z0.d, z2.d
; CHECK-NEXT: ret
- %retval = call {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %vec)
+ %retval = call {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %vec)
ret {<vscale x 4 x i32>, <vscale x 4 x i32>} %retval
}
@@ -112,7 +112,7 @@ define {<vscale x 2 x i64>, <vscale x 2 x i64>} @vector_deinterleave_nxv2i64_nxv
; CHECK-NEXT: uzp2 z1.d, z0.d, z1.d
; CHECK-NEXT: mov z0.d, z2.d
; CHECK-NEXT: ret
- %retval = call {<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.experimental.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> %vec)
+ %retval = call {<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> %vec)
ret {<vscale x 2 x i64>, <vscale x 2 x i64>} %retval
}
@@ -124,7 +124,7 @@ define {<vscale x 16 x i1>, <vscale x 16 x i1>} @vector_deinterleave_nxv16i1_nxv
; CHECK-NEXT: uzp2 p1.b, p0.b, p1.b
; CHECK-NEXT: mov p0.b, p2.b
; CHECK-NEXT: ret
- %retval = call {<vscale x 16 x i1>, <vscale x 16 x i1>} @llvm.experimental.vector.deinterleave2.nxv32i1(<vscale x 32 x i1> %vec)
+ %retval = call {<vscale x 16 x i1>, <vscale x 16 x i1>} @llvm.vector.deinterleave2.nxv32i1(<vscale x 32 x i1> %vec)
ret {<vscale x 16 x i1>, <vscale x 16 x i1>} %retval
}
@@ -136,7 +136,7 @@ define {<vscale x 8 x i1>, <vscale x 8 x i1>} @vector_deinterleave_nxv8i1_nxv16i
; CHECK-NEXT: uzp1 p0.h, p2.h, p1.h
; CHECK-NEXT: uzp2 p1.h, p2.h, p1.h
; CHECK-NEXT: ret
- %retval = call {<vscale x 8 x i1>, <vscale x 8 x i1>} @llvm.experimental.vector.deinterleave2.nxv16i1(<vscale x 16 x i1> %vec)
+ %retval = call {<vscale x 8 x i1>, <vscale x 8 x i1>} @llvm.vector.deinterleave2.nxv16i1(<vscale x 16 x i1> %vec)
ret {<vscale x 8 x i1>, <vscale x 8 x i1>} %retval
}
@@ -148,7 +148,7 @@ define {<vscale x 4 x i1>, <vscale x 4 x i1>} @vector_deinterleave_nxv4i1_nxv8i1
; CHECK-NEXT: uzp1 p0.s, p2.s, p1.s
; CHECK-NEXT: uzp2 p1.s, p2.s, p1.s
; CHECK-NEXT: ret
- %retval = call {<vscale x 4 x i1>, <vscale x 4 x i1>} @llvm.experimental.vector.deinterleave2.nxv8i1(<vscale x 8 x i1> %vec)
+ %retval = call {<vscale x 4 x i1>, <vscale x 4 x i1>} @llvm.vector.deinterleave2.nxv8i1(<vscale x 8 x i1> %vec)
ret {<vscale x 4 x i1>, <vscale x 4 x i1>} %retval
}
@@ -160,7 +160,7 @@ define {<vscale x 2 x i1>, <vscale x 2 x i1>} @vector_deinterleave_nxv2i1_nxv4i1
; CHECK-NEXT: uzp1 p0.d, p2.d, p1.d
; CHECK-NEXT: uzp2 p1.d, p2.d, p1.d
; CHECK-NEXT: ret
- %retval = call {<vscale x 2 x i1>, <vscale x 2 x i1>} @llvm.experimental.vector.deinterleave2.nxv4i1(<vscale x 4 x i1> %vec)
+ %retval = call {<vscale x 2 x i1>, <vscale x 2 x i1>} @llvm.vector.deinterleave2.nxv4i1(<vscale x 4 x i1> %vec)
ret {<vscale x 2 x i1>, <vscale x 2 x i1>} %retval
}
@@ -178,7 +178,7 @@ define {<vscale x 4 x i64>, <vscale x 4 x i64>} @vector_deinterleave_nxv4i64_nxv
; CHECK-NEXT: mov z1.d, z4.d
; CHECK-NEXT: mov z2.d, z6.d
; CHECK-NEXT: ret
-%retval = call {<vscale x 4 x i64>, <vscale x 4 x i64>} @llvm.experimental.vector.deinterleave2.nxv8i64(<vscale x 8 x i64> %vec)
+%retval = call {<vscale x 4 x i64>, <vscale x 4 x i64>} @llvm.vector.deinterleave2.nxv8i64(<vscale x 8 x i64> %vec)
ret {<vscale x 4 x i64>, <vscale x 4 x i64>} %retval
}
@@ -201,7 +201,7 @@ define {<vscale x 8 x i64>, <vscale x 8 x i64>} @vector_deinterleave_nxv8i64_nx
; CHECK-NEXT: mov z5.d, z29.d
; CHECK-NEXT: mov z6.d, z30.d
; CHECK-NEXT: ret
-%retval = call {<vscale x 8 x i64>, <vscale x 8 x i64>} @llvm.experimental.vector.deinterleave2.nxv16i64(<vscale x 16 x i64> %vec)
+%retval = call {<vscale x 8 x i64>, <vscale x 8 x i64>} @llvm.vector.deinterleave2.nxv16i64(<vscale x 16 x i64> %vec)
ret {<vscale x 8 x i64>, <vscale x 8 x i64>} %retval
}
@@ -216,7 +216,7 @@ define {<vscale x 8 x i8>, <vscale x 8 x i8>} @vector_deinterleave_nxv8i8_nxv16i
; CHECK-NEXT: uzp1 z0.h, z2.h, z1.h
; CHECK-NEXT: uzp2 z1.h, z2.h, z1.h
; CHECK-NEXT: ret
-%retval = call {<vscale x 8 x i8>, <vscale x 8 x i8>} @llvm.experimental.vector.deinterleave2.nxv16i8(<vscale x 16 x i8> %vec)
+%retval = call {<vscale x 8 x i8>, <vscale x 8 x i8>} @llvm.vector.deinterleave2.nxv16i8(<vscale x 16 x i8> %vec)
ret {<vscale x 8 x i8>, <vscale x 8 x i8>} %retval
}
@@ -228,7 +228,7 @@ define {<vscale x 4 x i16>, <vscale x 4 x i16>} @vector_deinterleave_nxv4i16_nxv
; CHECK-NEXT: uzp1 z0.s, z2.s, z1.s
; CHECK-NEXT: uzp2 z1.s, z2.s, z1.s
; CHECK-NEXT: ret
-%retval = call {<vscale x 4 x i16>, <vscale x 4 x i16>} @llvm.experimental.vector.deinterleave2.nxv8i16(<vscale x 8 x i16> %vec)
+%retval = call {<vscale x 4 x i16>, <vscale x 4 x i16>} @llvm.vector.deinterleave2.nxv8i16(<vscale x 8 x i16> %vec)
ret {<vscale x 4 x i16>, <vscale x 4 x i16>} %retval
}
@@ -240,35 +240,35 @@ define {<vscale x 2 x i32>, <vscale x 2 x i32>} @vector_deinterleave_nxv2i32_nxv
; CHECK-NEXT: uzp1 z0.d, z2.d, z1.d
; CHECK-NEXT: uzp2 z1.d, z2.d, z1.d
; CHECK-NEXT: ret
-%retval = call {<vscale x 2 x i32>,<vscale x 2 x i32>} @llvm.experimental.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %vec)
+%retval = call {<vscale x 2 x i32>,<vscale x 2 x i32>} @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %vec)
ret {<vscale x 2 x i32>, <vscale x 2 x i32>} %retval
}
; Floating declarations
-declare {<vscale x 2 x half>,<vscale x 2 x half>} @llvm.experimental.vector.deinterleave2.nxv4f16(<vscale x 4 x half>)
-declare {<vscale x 4 x half>, <vscale x 4 x half>} @llvm.experimental.vector.deinterleave2.nxv8f16(<vscale x 8 x half>)
-declare {<vscale x 2 x float>, <vscale x 2 x float>} @llvm.experimental.vector.deinterleave2.nxv4f32(<vscale x 4 x float>)
-declare {<vscale x 8 x half>, <vscale x 8 x half>} @llvm.experimental.vector.deinterleave2.nxv16f16(<vscale x 16 x half>)
-declare {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.experimental.vector.deinterleave2.nxv8f32(<vscale x 8 x float>)
-declare {<vscale x 2 x double>, <vscale x 2 x double>} @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
+declare {<vscale x 2 x half>,<vscale x 2 x half>} @llvm.vector.deinterleave2.nxv4f16(<vscale x 4 x half>)
+declare {<vscale x 4 x half>, <vscale x 4 x half>} @llvm.vector.deinterleave2.nxv8f16(<vscale x 8 x half>)
+declare {<vscale x 2 x float>, <vscale x 2 x float>} @llvm.vector.deinterleave2.nxv4f32(<vscale x 4 x float>)
+declare {<vscale x 8 x half>, <vscale x 8 x half>} @llvm.vector.deinterleave2.nxv16f16(<vscale x 16 x half>)
+declare {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.vector.deinterleave2.nxv8f32(<vscale x 8 x float>)
+declare {<vscale x 2 x double>, <vscale x 2 x double>} @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
; Integer declarations
-declare {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.experimental.vector.deinterleave2.nxv32i8(<vscale x 32 x i8>)
-declare {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.experimental.vector.deinterleave2.nxv16i16(<vscale x 16 x i16>)
-declare {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32>)
-declare {<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.experimental.vector.deinterleave2.nxv4i64(<vscale x 4 x i64>)
+declare {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8>)
+declare {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16>)
+declare {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32>)
+declare {<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64>)
; Predicated declarations
-declare {<vscale x 16 x i1>, <vscale x 16 x i1>} @llvm.experimental.vector.deinterleave2.nxv32i1(<vscale x 32 x i1>)
-declare {<vscale x 8 x i1>, <vscale x 8 x i1>} @llvm.experimental.vector.deinterleave2.nxv16i1(<vscale x 16 x i1>)
-declare {<vscale x 4 x i1>, <vscale x 4 x i1>} @llvm.experimental.vector.deinterleave2.nxv8i1(<vscale x 8 x i1>)
-declare {<vscale x 2 x i1>, <vscale x 2 x i1>} @llvm.experimental.vector.deinterleave2.nxv4i1(<vscale x 4 x i1>)
+declare {<vscale x 16 x i1>, <vscale x 16 x i1>} @llvm.vector.deinterleave2.nxv32i1(<vscale x 32 x i1>)
+declare {<vscale x 8 x i1>, <vscale x 8 x i1>} @llvm.vector.deinterleave2.nxv16i1(<vscale x 16 x i1>)
+declare {<vscale x 4 x i1>, <vscale x 4 x i1>} @llvm.vector.deinterleave2.nxv8i1(<vscale x 8 x i1>)
+declare {<vscale x 2 x i1>, <vscale x 2 x i1>} @llvm.vector.deinterleave2.nxv4i1(<vscale x 4 x i1>)
; Illegal size type
-declare {<vscale x 4 x i64>, <vscale x 4 x i64>} @llvm.experimental.vector.deinterleave2.nxv8i64(<vscale x 8 x i64>)
-declare {<vscale x 8 x i64>, <vscale x 8 x i64>} @llvm.experimental.vector.deinterleave2.nxv16i64(<vscale x 16 x i64>)
+declare {<vscale x 4 x i64>, <vscale x 4 x i64>} @llvm.vector.deinterleave2.nxv8i64(<vscale x 8 x i64>)
+declare {<vscale x 8 x i64>, <vscale x 8 x i64>} @llvm.vector.deinterleave2.nxv16i64(<vscale x 16 x i64>)
-declare {<vscale x 8 x i8>, <vscale x 8 x i8>} @llvm.experimental.vector.deinterleave2.nxv16i8(<vscale x 16 x i8>)
-declare {<vscale x 4 x i16>, <vscale x 4 x i16>} @llvm.experimental.vector.deinterleave2.nxv8i16(<vscale x 8 x i16>)
-declare {<vscale x 2 x i32>, <vscale x 2 x i32>} @llvm.experimental.vector.deinterleave2.nxv4i32(<vscale x 4 x i32>)
+declare {<vscale x 8 x i8>, <vscale x 8 x i8>} @llvm.vector.deinterleave2.nxv16i8(<vscale x 16 x i8>)
+declare {<vscale x 4 x i16>, <vscale x 4 x i16>} @llvm.vector.deinterleave2.nxv8i16(<vscale x 8 x i16>)
+declare {<vscale x 2 x i32>, <vscale x 2 x i32>} @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32>)
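llvm.vector.deinterleave2 splits one vector into its even- and odd-indexed elements, returned as a two-element struct; the type suffix names the input vector, as in the declarations above. A minimal fixed-width sketch (illustrative function name):

; deinterleave2(<a0, a1, a2, a3>) yields {<a0, a2>, <a1, a3>}.
define {<2 x i32>, <2 x i32>} @deinterleave2_example(<4 x i32> %v) {
  %res = call {<2 x i32>, <2 x i32>} @llvm.vector.deinterleave2.v4i32(<4 x i32> %v)
  ret {<2 x i32>, <2 x i32>} %res
}
declare {<2 x i32>, <2 x i32>} @llvm.vector.deinterleave2.v4i32(<4 x i32>)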
diff --git a/llvm/test/CodeGen/AArch64/sve-vector-interleave.ll b/llvm/test/CodeGen/AArch64/sve-vector-interleave.ll
index 23bf5065286e..e2c3b0abe21a 100644
--- a/llvm/test/CodeGen/AArch64/sve-vector-interleave.ll
+++ b/llvm/test/CodeGen/AArch64/sve-vector-interleave.ll
@@ -8,7 +8,7 @@ define <vscale x 4 x half> @interleave2_nxv4f16(<vscale x 2 x half> %vec0, <vsca
; CHECK-NEXT: zip1 z0.d, z0.d, z1.d
; CHECK-NEXT: uzp1 z0.s, z0.s, z2.s
; CHECK-NEXT: ret
- %retval = call <vscale x 4 x half> @llvm.experimental.vector.interleave2.nxv4f16(<vscale x 2 x half> %vec0, <vscale x 2 x half> %vec1)
+ %retval = call <vscale x 4 x half> @llvm.vector.interleave2.nxv4f16(<vscale x 2 x half> %vec0, <vscale x 2 x half> %vec1)
ret <vscale x 4 x half> %retval
}
@@ -19,7 +19,7 @@ define <vscale x 8 x half> @interleave2_nxv8f16(<vscale x 4 x half> %vec0, <vsca
; CHECK-NEXT: zip1 z0.s, z0.s, z1.s
; CHECK-NEXT: uzp1 z0.h, z0.h, z2.h
; CHECK-NEXT: ret
- %retval = call <vscale x 8 x half> @llvm.experimental.vector.interleave2.nxv8f16(<vscale x 4 x half> %vec0, <vscale x 4 x half> %vec1)
+ %retval = call <vscale x 8 x half> @llvm.vector.interleave2.nxv8f16(<vscale x 4 x half> %vec0, <vscale x 4 x half> %vec1)
ret <vscale x 8 x half> %retval
}
@@ -30,7 +30,7 @@ define <vscale x 16 x half> @interleave2_nxv16f16(<vscale x 8 x half> %vec0, <vs
; CHECK-NEXT: zip2 z1.h, z0.h, z1.h
; CHECK-NEXT: mov z0.d, z2.d
; CHECK-NEXT: ret
- %retval = call <vscale x 16 x half> @llvm.experimental.vector.interleave2.nxv16f16(<vscale x 8 x half> %vec0, <vscale x 8 x half> %vec1)
+ %retval = call <vscale x 16 x half> @llvm.vector.interleave2.nxv16f16(<vscale x 8 x half> %vec0, <vscale x 8 x half> %vec1)
ret <vscale x 16 x half> %retval
}
@@ -41,7 +41,7 @@ define <vscale x 4 x float> @interleave2_nxv4f32(<vscale x 2 x float> %vec0, <vs
; CHECK-NEXT: zip1 z0.d, z0.d, z1.d
; CHECK-NEXT: uzp1 z0.s, z0.s, z2.s
; CHECK-NEXT: ret
- %retval = call <vscale x 4 x float> @llvm.experimental.vector.interleave2.nxv4f32(<vscale x 2 x float> %vec0, <vscale x 2 x float> %vec1)
+ %retval = call <vscale x 4 x float> @llvm.vector.interleave2.nxv4f32(<vscale x 2 x float> %vec0, <vscale x 2 x float> %vec1)
ret <vscale x 4 x float> %retval
}
@@ -52,7 +52,7 @@ define <vscale x 8 x float> @interleave2_nxv8f32(<vscale x 4 x float> %vec0, <vs
; CHECK-NEXT: zip2 z1.s, z0.s, z1.s
; CHECK-NEXT: mov z0.d, z2.d
; CHECK-NEXT: ret
- %retval = call <vscale x 8 x float> @llvm.experimental.vector.interleave2.nxv8f32(<vscale x 4 x float> %vec0, <vscale x 4 x float> %vec1)
+ %retval = call <vscale x 8 x float> @llvm.vector.interleave2.nxv8f32(<vscale x 4 x float> %vec0, <vscale x 4 x float> %vec1)
ret <vscale x 8 x float> %retval
}
@@ -63,7 +63,7 @@ define <vscale x 4 x double> @interleave2_nxv4f64(<vscale x 2 x double> %vec0, <
; CHECK-NEXT: zip2 z1.d, z0.d, z1.d
; CHECK-NEXT: mov z0.d, z2.d
; CHECK-NEXT: ret
- %retval = call <vscale x 4 x double>@llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double> %vec0, <vscale x 2 x double> %vec1)
+ %retval = call <vscale x 4 x double>@llvm.vector.interleave2.nxv4f64(<vscale x 2 x double> %vec0, <vscale x 2 x double> %vec1)
ret <vscale x 4 x double> %retval
}
@@ -76,7 +76,7 @@ define <vscale x 32 x i8> @interleave2_nxv32i8(<vscale x 16 x i8> %vec0, <vscale
; CHECK-NEXT: zip2 z1.b, z0.b, z1.b
; CHECK-NEXT: mov z0.d, z2.d
; CHECK-NEXT: ret
- %retval = call <vscale x 32 x i8> @llvm.experimental.vector.interleave2.nxv32i8(<vscale x 16 x i8> %vec0, <vscale x 16 x i8> %vec1)
+ %retval = call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> %vec0, <vscale x 16 x i8> %vec1)
ret <vscale x 32 x i8> %retval
}
@@ -87,7 +87,7 @@ define <vscale x 16 x i16> @interleave2_nxv16i16(<vscale x 8 x i16> %vec0, <vsca
; CHECK-NEXT: zip2 z1.h, z0.h, z1.h
; CHECK-NEXT: mov z0.d, z2.d
; CHECK-NEXT: ret
- %retval = call <vscale x 16 x i16> @llvm.experimental.vector.interleave2.nxv16i16(<vscale x 8 x i16> %vec0, <vscale x 8 x i16> %vec1)
+ %retval = call <vscale x 16 x i16> @llvm.vector.interleave2.nxv16i16(<vscale x 8 x i16> %vec0, <vscale x 8 x i16> %vec1)
ret <vscale x 16 x i16> %retval
}
@@ -98,7 +98,7 @@ define <vscale x 8 x i32> @interleave2_nxv8i32(<vscale x 4 x i32> %vec0, <vscale
; CHECK-NEXT: zip2 z1.s, z0.s, z1.s
; CHECK-NEXT: mov z0.d, z2.d
; CHECK-NEXT: ret
- %retval = call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> %vec0, <vscale x 4 x i32> %vec1)
+ %retval = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %vec0, <vscale x 4 x i32> %vec1)
ret <vscale x 8 x i32> %retval
}
@@ -109,7 +109,7 @@ define <vscale x 4 x i64> @interleave2_nxv4i64(<vscale x 2 x i64> %vec0, <vscale
; CHECK-NEXT: zip2 z1.d, z0.d, z1.d
; CHECK-NEXT: mov z0.d, z2.d
; CHECK-NEXT: ret
- %retval = call <vscale x 4 x i64> @llvm.experimental.vector.interleave2.nxv4i64(<vscale x 2 x i64> %vec0, <vscale x 2 x i64> %vec1)
+ %retval = call <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64> %vec0, <vscale x 2 x i64> %vec1)
ret <vscale x 4 x i64> %retval
}
@@ -122,7 +122,7 @@ define <vscale x 32 x i1> @interleave2_nxv32i1(<vscale x 16 x i1> %vec0, <vscale
; CHECK-NEXT: zip2 p1.b, p0.b, p1.b
; CHECK-NEXT: mov p0.b, p2.b
; CHECK-NEXT: ret
- %retval = call <vscale x 32 x i1> @llvm.experimental.vector.interleave2.nxv32i1(<vscale x 16 x i1> %vec0, <vscale x 16 x i1> %vec1)
+ %retval = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> %vec0, <vscale x 16 x i1> %vec1)
ret <vscale x 32 x i1> %retval
}
@@ -133,7 +133,7 @@ define <vscale x 16 x i1> @interleave2_nxv16i1(<vscale x 8 x i1> %vec0, <vscale
; CHECK-NEXT: zip1 p0.h, p0.h, p1.h
; CHECK-NEXT: uzp1 p0.b, p0.b, p2.b
; CHECK-NEXT: ret
- %retval = call <vscale x 16 x i1> @llvm.experimental.vector.interleave2.nxv16i1(<vscale x 8 x i1> %vec0, <vscale x 8 x i1> %vec1)
+ %retval = call <vscale x 16 x i1> @llvm.vector.interleave2.nxv16i1(<vscale x 8 x i1> %vec0, <vscale x 8 x i1> %vec1)
ret <vscale x 16 x i1> %retval
}
@@ -144,7 +144,7 @@ define <vscale x 8 x i1> @interleave2_nxv8i1(<vscale x 4 x i1> %vec0, <vscale x
; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
; CHECK-NEXT: uzp1 p0.h, p0.h, p2.h
; CHECK-NEXT: ret
- %retval = call <vscale x 8 x i1> @llvm.experimental.vector.interleave2.nxv8i1(<vscale x 4 x i1> %vec0, <vscale x 4 x i1> %vec1)
+ %retval = call <vscale x 8 x i1> @llvm.vector.interleave2.nxv8i1(<vscale x 4 x i1> %vec0, <vscale x 4 x i1> %vec1)
ret <vscale x 8 x i1> %retval
}
@@ -155,7 +155,7 @@ define <vscale x 4 x i1> @interleave2_nxv4i1(<vscale x 2 x i1> %vec0, <vscale x
; CHECK-NEXT: zip1 p0.d, p0.d, p1.d
; CHECK-NEXT: uzp1 p0.s, p0.s, p2.s
; CHECK-NEXT: ret
- %retval = call <vscale x 4 x i1> @llvm.experimental.vector.interleave2.nxv4i1(<vscale x 2 x i1> %vec0, <vscale x 2 x i1> %vec1)
+ %retval = call <vscale x 4 x i1> @llvm.vector.interleave2.nxv4i1(<vscale x 2 x i1> %vec0, <vscale x 2 x i1> %vec1)
ret <vscale x 4 x i1> %retval
}
@@ -172,7 +172,7 @@ define <vscale x 16 x i32> @interleave2_nxv16i32(<vscale x 8 x i32> %vec0, <vsca
; CHECK-NEXT: mov z1.d, z2.d
; CHECK-NEXT: mov z2.d, z4.d
; CHECK-NEXT: ret
- %retval = call <vscale x 16 x i32>@llvm.experimental.vector.interleave2.nxv16i32(<vscale x 8 x i32> %vec0, <vscale x 8 x i32> %vec1)
+ %retval = call <vscale x 16 x i32>@llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32> %vec0, <vscale x 8 x i32> %vec1)
ret <vscale x 16 x i32> %retval
}
@@ -187,7 +187,7 @@ define <vscale x 8 x i64> @interleave2_nxv8i64(<vscale x 4 x i64> %vec0, <vscale
; CHECK-NEXT: mov z1.d, z2.d
; CHECK-NEXT: mov z2.d, z4.d
; CHECK-NEXT: ret
- %retval = call <vscale x 8 x i64> @llvm.experimental.vector.interleave2.nxv8i64(<vscale x 4 x i64> %vec0, <vscale x 4 x i64> %vec1)
+ %retval = call <vscale x 8 x i64> @llvm.vector.interleave2.nxv8i64(<vscale x 4 x i64> %vec0, <vscale x 4 x i64> %vec1)
ret <vscale x 8 x i64> %retval
}
@@ -200,7 +200,7 @@ define <vscale x 16 x i8> @interleave2_nxv8i8(<vscale x 8 x i8> %vec0, <vscale x
; CHECK-NEXT: zip1 z0.h, z0.h, z1.h
; CHECK-NEXT: uzp1 z0.b, z0.b, z2.b
; CHECK-NEXT: ret
- %retval = call <vscale x 16 x i8> @llvm.experimental.vector.interleave2.nxv16i8(<vscale x 8 x i8> %vec0, <vscale x 8 x i8> %vec1)
+ %retval = call <vscale x 16 x i8> @llvm.vector.interleave2.nxv16i8(<vscale x 8 x i8> %vec0, <vscale x 8 x i8> %vec1)
ret <vscale x 16 x i8> %retval
}
@@ -211,7 +211,7 @@ define <vscale x 8 x i16> @interleave2_nxv4i16(<vscale x 4 x i16> %vec0, <vscale
; CHECK-NEXT: zip1 z0.s, z0.s, z1.s
; CHECK-NEXT: uzp1 z0.h, z0.h, z2.h
; CHECK-NEXT: ret
- %retval = call <vscale x 8 x i16> @llvm.experimental.vector.interleave2.nxv8i16(<vscale x 4 x i16> %vec0, <vscale x 4 x i16> %vec1)
+ %retval = call <vscale x 8 x i16> @llvm.vector.interleave2.nxv8i16(<vscale x 4 x i16> %vec0, <vscale x 4 x i16> %vec1)
ret <vscale x 8 x i16> %retval
}
@@ -222,34 +222,34 @@ define <vscale x 4 x i32> @interleave2_nxv2i32(<vscale x 2 x i32> %vec0, <vscale
; CHECK-NEXT: zip1 z0.d, z0.d, z1.d
; CHECK-NEXT: uzp1 z0.s, z0.s, z2.s
; CHECK-NEXT: ret
- %retval = call <vscale x 4 x i32> @llvm.experimental.vector.interleave2.nxv4i32(<vscale x 2 x i32> %vec0, <vscale x 2 x i32> %vec1)
+ %retval = call <vscale x 4 x i32> @llvm.vector.interleave2.nxv4i32(<vscale x 2 x i32> %vec0, <vscale x 2 x i32> %vec1)
ret <vscale x 4 x i32> %retval
}
; Float declarations
-declare <vscale x 4 x half> @llvm.experimental.vector.interleave2.nxv4f16(<vscale x 2 x half>, <vscale x 2 x half>)
-declare <vscale x 8 x half> @llvm.experimental.vector.interleave2.nxv8f16(<vscale x 4 x half>, <vscale x 4 x half>)
-declare <vscale x 16 x half> @llvm.experimental.vector.interleave2.nxv16f16(<vscale x 8 x half>, <vscale x 8 x half>)
-declare <vscale x 4 x float> @llvm.experimental.vector.interleave2.nxv4f32(<vscale x 2 x float>, <vscale x 2 x float>)
-declare <vscale x 8 x float> @llvm.experimental.vector.interleave2.nxv8f32(<vscale x 4 x float>, <vscale x 4 x float>)
-declare <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double>, <vscale x 2 x double>)
+declare <vscale x 4 x half> @llvm.vector.interleave2.nxv4f16(<vscale x 2 x half>, <vscale x 2 x half>)
+declare <vscale x 8 x half> @llvm.vector.interleave2.nxv8f16(<vscale x 4 x half>, <vscale x 4 x half>)
+declare <vscale x 16 x half> @llvm.vector.interleave2.nxv16f16(<vscale x 8 x half>, <vscale x 8 x half>)
+declare <vscale x 4 x float> @llvm.vector.interleave2.nxv4f32(<vscale x 2 x float>, <vscale x 2 x float>)
+declare <vscale x 8 x float> @llvm.vector.interleave2.nxv8f32(<vscale x 4 x float>, <vscale x 4 x float>)
+declare <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double>, <vscale x 2 x double>)
; Integer declarations
-declare <vscale x 32 x i8> @llvm.experimental.vector.interleave2.nxv32i8(<vscale x 16 x i8>, <vscale x 16 x i8>)
-declare <vscale x 16 x i16> @llvm.experimental.vector.interleave2.nxv16i16(<vscale x 8 x i16>, <vscale x 8 x i16>)
-declare <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
-declare <vscale x 4 x i64> @llvm.experimental.vector.interleave2.nxv4i64(<vscale x 2 x i64>, <vscale x 2 x i64>)
+declare <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8>, <vscale x 16 x i8>)
+declare <vscale x 16 x i16> @llvm.vector.interleave2.nxv16i16(<vscale x 8 x i16>, <vscale x 8 x i16>)
+declare <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64>, <vscale x 2 x i64>)
; Predicated
-declare <vscale x 32 x i1> @llvm.experimental.vector.interleave2.nxv32i1(<vscale x 16 x i1>, <vscale x 16 x i1>)
-declare <vscale x 16 x i1> @llvm.experimental.vector.interleave2.nxv16i1(<vscale x 8 x i1>, <vscale x 8 x i1>)
-declare <vscale x 8 x i1> @llvm.experimental.vector.interleave2.nxv8i1(<vscale x 4 x i1>, <vscale x 4 x i1>)
-declare <vscale x 4 x i1> @llvm.experimental.vector.interleave2.nxv4i1(<vscale x 2 x i1>, <vscale x 2 x i1>)
+declare <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1>, <vscale x 16 x i1>)
+declare <vscale x 16 x i1> @llvm.vector.interleave2.nxv16i1(<vscale x 8 x i1>, <vscale x 8 x i1>)
+declare <vscale x 8 x i1> @llvm.vector.interleave2.nxv8i1(<vscale x 4 x i1>, <vscale x 4 x i1>)
+declare <vscale x 4 x i1> @llvm.vector.interleave2.nxv4i1(<vscale x 2 x i1>, <vscale x 2 x i1>)
; Illegal type size
-declare <vscale x 16 x i32> @llvm.experimental.vector.interleave2.nxv16i32(<vscale x 8 x i32>, <vscale x 8 x i32>)
-declare <vscale x 8 x i64> @llvm.experimental.vector.interleave2.nxv8i64(<vscale x 4 x i64>, <vscale x 4 x i64>)
+declare <vscale x 16 x i32> @llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32>, <vscale x 8 x i32>)
+declare <vscale x 8 x i64> @llvm.vector.interleave2.nxv8i64(<vscale x 4 x i64>, <vscale x 4 x i64>)
-declare <vscale x 16 x i8> @llvm.experimental.vector.interleave2.nxv16i8(<vscale x 8 x i8>, <vscale x 8 x i8>)
-declare <vscale x 8 x i16> @llvm.experimental.vector.interleave2.nxv8i16(<vscale x 4 x i16>, <vscale x 4 x i16>)
-declare <vscale x 4 x i32> @llvm.experimental.vector.interleave2.nxv4i32(<vscale x 2 x i32>, <vscale x 2 x i32>)
+declare <vscale x 16 x i8> @llvm.vector.interleave2.nxv16i8(<vscale x 8 x i8>, <vscale x 8 x i8>)
+declare <vscale x 8 x i16> @llvm.vector.interleave2.nxv8i16(<vscale x 4 x i16>, <vscale x 4 x i16>)
+declare <vscale x 4 x i32> @llvm.vector.interleave2.nxv4i32(<vscale x 2 x i32>, <vscale x 2 x i32>)
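llvm.vector.interleave2 is the inverse operation: it zips two vectors element by element into one vector of twice the length, and its type suffix names the result type, matching the declarations above. A minimal fixed-width sketch (illustrative function name):

; interleave2(<a0, a1>, <b0, b1>) yields <a0, b0, a1, b1>.
define <4 x i32> @interleave2_example(<2 x i32> %a, <2 x i32> %b) {
  %res = call <4 x i32> @llvm.vector.interleave2.v4i32(<2 x i32> %a, <2 x i32> %b)
  ret <4 x i32> %res
}
declare <4 x i32> @llvm.vector.interleave2.v4i32(<2 x i32>, <2 x i32>)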
diff --git a/llvm/test/CodeGen/AArch64/sve2-bsl.ll b/llvm/test/CodeGen/AArch64/sve2-bsl.ll
index 23b2622f5f58..ef7d4abe5c5f 100644
--- a/llvm/test/CodeGen/AArch64/sve2-bsl.ll
+++ b/llvm/test/CodeGen/AArch64/sve2-bsl.ll
@@ -41,3 +41,55 @@ define <vscale x 4 x i32> @no_bsl_fold(<vscale x 4 x i32> %a, <vscale x 4 x i32>
%c = or <vscale x 4 x i32> %1, %2
ret <vscale x 4 x i32> %c
}
+
+define <vscale x 16 x i8> @nbsl_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
+; CHECK-LABEL: nbsl_i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z2.b, #127 // =0x7f
+; CHECK-NEXT: nbsl z0.d, z0.d, z1.d, z2.d
+; CHECK-NEXT: ret
+ %1 = and <vscale x 16 x i8> %a, splat(i8 127)
+ %2 = and <vscale x 16 x i8> %b, splat(i8 -128)
+ %3 = or <vscale x 16 x i8> %1, %2
+ %4 = xor <vscale x 16 x i8> %3, splat(i8 -1)
+ ret <vscale x 16 x i8> %4
+}
+
+define <vscale x 8 x i16> @nbsl_i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: nbsl_i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z2.h, #32767 // =0x7fff
+; CHECK-NEXT: nbsl z0.d, z0.d, z1.d, z2.d
+; CHECK-NEXT: ret
+ %1 = and <vscale x 8 x i16> %a, splat(i16 32767)
+ %2 = and <vscale x 8 x i16> %b, splat(i16 -32768)
+ %3 = or <vscale x 8 x i16> %1, %2
+ %4 = xor <vscale x 8 x i16> %3, splat(i16 -1)
+ ret <vscale x 8 x i16> %4
+}
+
+define <vscale x 4 x i32> @nbsl_i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: nbsl_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z2.s, #0x7fffffff
+; CHECK-NEXT: nbsl z0.d, z0.d, z1.d, z2.d
+; CHECK-NEXT: ret
+ %1 = and <vscale x 4 x i32> %a, splat(i32 2147483647)
+ %2 = and <vscale x 4 x i32> %b, splat(i32 -2147483648)
+ %3 = or <vscale x 4 x i32> %1, %2
+ %4 = xor <vscale x 4 x i32> %3, splat(i32 -1)
+ ret <vscale x 4 x i32> %4
+}
+
+define <vscale x 2 x i64> @nbsl_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: nbsl_i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z2.d, #0x7fffffffffffffff
+; CHECK-NEXT: nbsl z0.d, z0.d, z1.d, z2.d
+; CHECK-NEXT: ret
+ %1 = and <vscale x 2 x i64> %a, splat(i64 9223372036854775807)
+ %2 = and <vscale x 2 x i64> %b, splat(i64 -9223372036854775808)
+ %3 = or <vscale x 2 x i64> %1, %2
+ %4 = xor <vscale x 2 x i64> %3, splat(i64 -1)
+ ret <vscale x 2 x i64> %4
+}
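The four nbsl tests added above all exercise one fold: the IR pattern xor(or(and(%a, C), and(%b, ~C)), -1) is a negated bitwise select, which SVE2 provides as a single NBSL instruction, nbsl(a, b, m) = ~((a & m) | (b & ~m)). Worked at i8 with m = 0x7f and illustrative inputs a = 0x55, b = 0x80: (0x55 & 0x7f) | (0x80 & 0x80) = 0xd5, and ~0xd5 = 0x2a. Splitting the mask at the sign bit keeps each test to a single constant register (the one mov into z2 in the CHECK lines) per element width.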
diff --git a/llvm/test/CodeGen/AArch64/sve2-intrinsics-while-reversed.ll b/llvm/test/CodeGen/AArch64/sve2-intrinsics-while-reversed.ll
index cb74cd8032ab..5f7476397891 100644
--- a/llvm/test/CodeGen/AArch64/sve2-intrinsics-while-reversed.ll
+++ b/llvm/test/CodeGen/AArch64/sve2-intrinsics-while-reversed.ll
@@ -16,7 +16,7 @@ define <vscale x 16 x i1> @whilege_b_ww(i32 %a, i32 %b) {
; CHECK-NEXT: rev p0.b, p0.b
; CHECK-NEXT: ret
%while = call <vscale x 16 x i1> @llvm.aarch64.sve.whilele.nxv16i1.i32(i32 %b, i32 %a)
- %while.rev = call <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
+ %while.rev = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
ret <vscale x 16 x i1> %while.rev
}
@@ -28,7 +28,7 @@ define <vscale x 16 x i1> @whilege_b_xx(i64 %a, i64 %b) {
; CHECK-NEXT: rev p0.b, p0.b
; CHECK-NEXT: ret
%while = call <vscale x 16 x i1> @llvm.aarch64.sve.whilele.nxv16i1.i64(i64 %b, i64 %a)
- %while.rev = call <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
+ %while.rev = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
ret <vscale x 16 x i1> %while.rev
}
@@ -40,7 +40,7 @@ define <vscale x 8 x i1> @whilege_h_ww(i32 %a, i32 %b) {
; CHECK-NEXT: rev p0.h, p0.h
; CHECK-NEXT: ret
%while = call <vscale x 8 x i1> @llvm.aarch64.sve.whilele.nxv8i1.i32(i32 %b, i32 %a)
- %while.rev = call <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
+ %while.rev = call <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
ret <vscale x 8 x i1> %while.rev
}
@@ -52,7 +52,7 @@ define <vscale x 8 x i1> @whilege_h_xx(i64 %a, i64 %b) {
; CHECK-NEXT: rev p0.h, p0.h
; CHECK-NEXT: ret
%while = call <vscale x 8 x i1> @llvm.aarch64.sve.whilele.nxv8i1.i64(i64 %b, i64 %a)
- %while.rev = call <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
+ %while.rev = call <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
ret <vscale x 8 x i1> %while.rev
}
@@ -64,7 +64,7 @@ define <vscale x 4 x i1> @whilege_s_ww(i32 %a, i32 %b) {
; CHECK-NEXT: rev p0.s, p0.s
; CHECK-NEXT: ret
%while = call <vscale x 4 x i1> @llvm.aarch64.sve.whilele.nxv4i1.i32(i32 %b, i32 %a)
- %while.rev = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
+ %while.rev = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
ret <vscale x 4 x i1> %while.rev
}
@@ -76,7 +76,7 @@ define <vscale x 4 x i1> @whilege_s_xx(i64 %a, i64 %b) {
; CHECK-NEXT: rev p0.s, p0.s
; CHECK-NEXT: ret
%while = call <vscale x 4 x i1> @llvm.aarch64.sve.whilele.nxv4i1.i64(i64 %b, i64 %a)
- %while.rev = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
+ %while.rev = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
ret <vscale x 4 x i1> %while.rev
}
@@ -88,7 +88,7 @@ define <vscale x 2 x i1> @whilege_d_ww(i32 %a, i32 %b) {
; CHECK-NEXT: rev p0.d, p0.d
; CHECK-NEXT: ret
%while = call <vscale x 2 x i1> @llvm.aarch64.sve.whilele.nxv2i1.i32(i32 %b, i32 %a)
- %while.rev = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
+ %while.rev = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
ret <vscale x 2 x i1> %while.rev
}
@@ -100,7 +100,7 @@ define <vscale x 2 x i1> @whilege_d_xx(i64 %a, i64 %b) {
; CHECK-NEXT: rev p0.d, p0.d
; CHECK-NEXT: ret
%while = call <vscale x 2 x i1> @llvm.aarch64.sve.whilele.nxv2i1.i64(i64 %b, i64 %a)
- %while.rev = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
+ %while.rev = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
ret <vscale x 2 x i1> %while.rev
}
@@ -116,7 +116,7 @@ define <vscale x 16 x i1> @whilehs_b_ww(i32 %a, i32 %b) {
; CHECK-NEXT: rev p0.b, p0.b
; CHECK-NEXT: ret
%while = call <vscale x 16 x i1> @llvm.aarch64.sve.whilels.nxv16i1.i32(i32 %b, i32 %a)
- %while.rev = call <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
+ %while.rev = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
ret <vscale x 16 x i1> %while.rev
}
@@ -128,7 +128,7 @@ define <vscale x 16 x i1> @whilehs_b_xx(i64 %a, i64 %b) {
; CHECK-NEXT: rev p0.b, p0.b
; CHECK-NEXT: ret
%while = call <vscale x 16 x i1> @llvm.aarch64.sve.whilels.nxv16i1.i64(i64 %b, i64 %a)
- %while.rev = call <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
+ %while.rev = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
ret <vscale x 16 x i1> %while.rev
}
@@ -140,7 +140,7 @@ define <vscale x 8 x i1> @whilehs_h_ww(i32 %a, i32 %b) {
; CHECK-NEXT: rev p0.h, p0.h
; CHECK-NEXT: ret
%while = call <vscale x 8 x i1> @llvm.aarch64.sve.whilels.nxv8i1.i32(i32 %b, i32 %a)
- %while.rev = call <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
+ %while.rev = call <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
ret <vscale x 8 x i1> %while.rev
}
@@ -152,7 +152,7 @@ define <vscale x 8 x i1> @whilehs_h_xx(i64 %a, i64 %b) {
; CHECK-NEXT: rev p0.h, p0.h
; CHECK-NEXT: ret
%while = call <vscale x 8 x i1> @llvm.aarch64.sve.whilels.nxv8i1.i64(i64 %b, i64 %a)
- %while.rev = call <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
+ %while.rev = call <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
ret <vscale x 8 x i1> %while.rev
}
@@ -164,7 +164,7 @@ define <vscale x 4 x i1> @whilehs_s_ww(i32 %a, i32 %b) {
; CHECK-NEXT: rev p0.s, p0.s
; CHECK-NEXT: ret
%while = call <vscale x 4 x i1> @llvm.aarch64.sve.whilels.nxv4i1.i32(i32 %b, i32 %a)
- %while.rev = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
+ %while.rev = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
ret <vscale x 4 x i1> %while.rev
}
@@ -176,7 +176,7 @@ define <vscale x 4 x i1> @whilehs_s_xx(i64 %a, i64 %b) {
; CHECK-NEXT: rev p0.s, p0.s
; CHECK-NEXT: ret
%while = call <vscale x 4 x i1> @llvm.aarch64.sve.whilels.nxv4i1.i64(i64 %b, i64 %a)
- %while.rev = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
+ %while.rev = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
ret <vscale x 4 x i1> %while.rev
}
@@ -188,7 +188,7 @@ define <vscale x 2 x i1> @whilehs_d_ww(i32 %a, i32 %b) {
; CHECK-NEXT: rev p0.d, p0.d
; CHECK-NEXT: ret
%while = call <vscale x 2 x i1> @llvm.aarch64.sve.whilels.nxv2i1.i32(i32 %b, i32 %a)
- %while.rev = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
+ %while.rev = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
ret <vscale x 2 x i1> %while.rev
}
@@ -200,7 +200,7 @@ define <vscale x 2 x i1> @whilehs_d_xx(i64 %a, i64 %b) {
; CHECK-NEXT: rev p0.d, p0.d
; CHECK-NEXT: ret
%while = call <vscale x 2 x i1> @llvm.aarch64.sve.whilels.nxv2i1.i64(i64 %b, i64 %a)
- %while.rev = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
+ %while.rev = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
ret <vscale x 2 x i1> %while.rev
}
@@ -214,7 +214,7 @@ define <vscale x 16 x i1> @whilegt_b_ww(i32 %a, i32 %b) {
; CHECK-NEXT: whilegt p0.b, w0, w1
; CHECK-NEXT: ret
%while = call <vscale x 16 x i1> @llvm.aarch64.sve.whilelt.nxv16i1.i32(i32 %b, i32 %a)
- %while.rev = call <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
+ %while.rev = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
ret <vscale x 16 x i1> %while.rev
}
@@ -224,7 +224,7 @@ define <vscale x 16 x i1> @whilegt_b_xx(i64 %a, i64 %b) {
; CHECK-NEXT: whilegt p0.b, x0, x1
; CHECK-NEXT: ret
%while = call <vscale x 16 x i1> @llvm.aarch64.sve.whilelt.nxv16i1.i64(i64 %b, i64 %a)
- %while.rev = call <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
+ %while.rev = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
ret <vscale x 16 x i1> %while.rev
}
@@ -234,7 +234,7 @@ define <vscale x 8 x i1> @whilegt_h_ww(i32 %a, i32 %b) {
; CHECK-NEXT: whilegt p0.h, w0, w1
; CHECK-NEXT: ret
%while = call <vscale x 8 x i1> @llvm.aarch64.sve.whilelt.nxv8i1.i32(i32 %b, i32 %a)
- %while.rev = call <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
+ %while.rev = call <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
ret <vscale x 8 x i1> %while.rev
}
@@ -244,7 +244,7 @@ define <vscale x 8 x i1> @whilegt_h_xx(i64 %a, i64 %b) {
; CHECK-NEXT: whilegt p0.h, x0, x1
; CHECK-NEXT: ret
%while = call <vscale x 8 x i1> @llvm.aarch64.sve.whilelt.nxv8i1.i64(i64 %b, i64 %a)
- %while.rev = call <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
+ %while.rev = call <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
ret <vscale x 8 x i1> %while.rev
}
@@ -254,7 +254,7 @@ define <vscale x 4 x i1> @whilegt_s_ww(i32 %a, i32 %b) {
; CHECK-NEXT: whilegt p0.s, w0, w1
; CHECK-NEXT: ret
%while = call <vscale x 4 x i1> @llvm.aarch64.sve.whilelt.nxv4i1.i32(i32 %b, i32 %a)
- %while.rev = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
+ %while.rev = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
ret <vscale x 4 x i1> %while.rev
}
@@ -264,7 +264,7 @@ define <vscale x 4 x i1> @whilegt_s_xx(i64 %a, i64 %b) {
; CHECK-NEXT: whilegt p0.s, x0, x1
; CHECK-NEXT: ret
%while = call <vscale x 4 x i1> @llvm.aarch64.sve.whilelt.nxv4i1.i64(i64 %b, i64 %a)
- %while.rev = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
+ %while.rev = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
ret <vscale x 4 x i1> %while.rev
}
@@ -274,7 +274,7 @@ define <vscale x 2 x i1> @whilegt_d_ww(i32 %a, i32 %b) {
; CHECK-NEXT: whilegt p0.d, w0, w1
; CHECK-NEXT: ret
%while = call <vscale x 2 x i1> @llvm.aarch64.sve.whilelt.nxv2i1.i32(i32 %b, i32 %a)
- %while.rev = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
+ %while.rev = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
ret <vscale x 2 x i1> %while.rev
}
@@ -284,7 +284,7 @@ define <vscale x 2 x i1> @whilegt_d_xx(i64 %a, i64 %b) {
; CHECK-NEXT: whilegt p0.d, x0, x1
; CHECK-NEXT: ret
%while = call <vscale x 2 x i1> @llvm.aarch64.sve.whilelt.nxv2i1.i64(i64 %b, i64 %a)
- %while.rev = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
+ %while.rev = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
ret <vscale x 2 x i1> %while.rev
}
@@ -298,7 +298,7 @@ define <vscale x 16 x i1> @whilehi_b_ww(i32 %a, i32 %b) {
; CHECK-NEXT: whilehi p0.b, w0, w1
; CHECK-NEXT: ret
%while = call <vscale x 16 x i1> @llvm.aarch64.sve.whilelo.nxv16i1.i32(i32 %b, i32 %a)
- %while.rev = call <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
+ %while.rev = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
ret <vscale x 16 x i1> %while.rev
}
@@ -308,7 +308,7 @@ define <vscale x 16 x i1> @whilehi_b_xx(i64 %a, i64 %b) {
; CHECK-NEXT: whilehi p0.b, x0, x1
; CHECK-NEXT: ret
%while = call <vscale x 16 x i1> @llvm.aarch64.sve.whilelo.nxv16i1.i64(i64 %b, i64 %a)
- %while.rev = call <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
+ %while.rev = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
ret <vscale x 16 x i1> %while.rev
}
@@ -318,7 +318,7 @@ define <vscale x 8 x i1> @whilehi_h_ww(i32 %a, i32 %b) {
; CHECK-NEXT: whilehi p0.h, w0, w1
; CHECK-NEXT: ret
%while = call <vscale x 8 x i1> @llvm.aarch64.sve.whilelo.nxv8i1.i32(i32 %b, i32 %a)
- %while.rev = call <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
+ %while.rev = call <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
ret <vscale x 8 x i1> %while.rev
}
@@ -328,7 +328,7 @@ define <vscale x 8 x i1> @whilehi_h_xx(i64 %a, i64 %b) {
; CHECK-NEXT: whilehi p0.h, x0, x1
; CHECK-NEXT: ret
%while = call <vscale x 8 x i1> @llvm.aarch64.sve.whilelo.nxv8i1.i64(i64 %b, i64 %a)
- %while.rev = call <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
+ %while.rev = call <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
ret <vscale x 8 x i1> %while.rev
}
@@ -338,7 +338,7 @@ define <vscale x 4 x i1> @whilehi_s_ww(i32 %a, i32 %b) {
; CHECK-NEXT: whilehi p0.s, w0, w1
; CHECK-NEXT: ret
%while = call <vscale x 4 x i1> @llvm.aarch64.sve.whilelo.nxv4i1.i32(i32 %b, i32 %a)
- %while.rev = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
+ %while.rev = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
ret <vscale x 4 x i1> %while.rev
}
@@ -348,7 +348,7 @@ define <vscale x 4 x i1> @whilehi_s_xx(i64 %a, i64 %b) {
; CHECK-NEXT: whilehi p0.s, x0, x1
; CHECK-NEXT: ret
%while = call <vscale x 4 x i1> @llvm.aarch64.sve.whilelo.nxv4i1.i64(i64 %b, i64 %a)
- %while.rev = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
+ %while.rev = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
ret <vscale x 4 x i1> %while.rev
}
@@ -358,7 +358,7 @@ define <vscale x 2 x i1> @whilehi_d_ww(i32 %a, i32 %b) {
; CHECK-NEXT: whilehi p0.d, w0, w1
; CHECK-NEXT: ret
%while = call <vscale x 2 x i1> @llvm.aarch64.sve.whilelo.nxv2i1.i32(i32 %b, i32 %a)
- %while.rev = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
+ %while.rev = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
ret <vscale x 2 x i1> %while.rev
}
@@ -368,7 +368,7 @@ define <vscale x 2 x i1> @whilehi_d_xx(i64 %a, i64 %b) {
; CHECK-NEXT: whilehi p0.d, x0, x1
; CHECK-NEXT: ret
%while = call <vscale x 2 x i1> @llvm.aarch64.sve.whilelo.nxv2i1.i64(i64 %b, i64 %a)
- %while.rev = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
+ %while.rev = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
ret <vscale x 2 x i1> %while.rev
}
@@ -384,7 +384,7 @@ define <vscale x 16 x i1> @whilele_b_ww(i32 %a, i32 %b) {
; CHECK-NEXT: rev p0.b, p0.b
; CHECK-NEXT: ret
%while = call <vscale x 16 x i1> @llvm.aarch64.sve.whilege.nxv16i1.i32(i32 %b, i32 %a)
- %while.rev = call <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
+ %while.rev = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
ret <vscale x 16 x i1> %while.rev
}
@@ -396,7 +396,7 @@ define <vscale x 16 x i1> @whilele_b_xx(i64 %a, i64 %b) {
; CHECK-NEXT: rev p0.b, p0.b
; CHECK-NEXT: ret
%while = call <vscale x 16 x i1> @llvm.aarch64.sve.whilege.nxv16i1.i64(i64 %b, i64 %a)
- %while.rev = call <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
+ %while.rev = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
ret <vscale x 16 x i1> %while.rev
}
@@ -408,7 +408,7 @@ define <vscale x 8 x i1> @whilele_h_ww(i32 %a, i32 %b) {
; CHECK-NEXT: rev p0.h, p0.h
; CHECK-NEXT: ret
%while = call <vscale x 8 x i1> @llvm.aarch64.sve.whilege.nxv8i1.i32(i32 %b, i32 %a)
- %while.rev = call <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
+ %while.rev = call <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
ret <vscale x 8 x i1> %while.rev
}
@@ -420,7 +420,7 @@ define <vscale x 8 x i1> @whilele_h_xx(i64 %a, i64 %b) {
; CHECK-NEXT: rev p0.h, p0.h
; CHECK-NEXT: ret
%while = call <vscale x 8 x i1> @llvm.aarch64.sve.whilege.nxv8i1.i64(i64 %b, i64 %a)
- %while.rev = call <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
+ %while.rev = call <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
ret <vscale x 8 x i1> %while.rev
}
@@ -432,7 +432,7 @@ define <vscale x 4 x i1> @whilele_s_ww(i32 %a, i32 %b) {
; CHECK-NEXT: rev p0.s, p0.s
; CHECK-NEXT: ret
%while = call <vscale x 4 x i1> @llvm.aarch64.sve.whilege.nxv4i1.i32(i32 %b, i32 %a)
- %while.rev = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
+ %while.rev = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
ret <vscale x 4 x i1> %while.rev
}
@@ -444,7 +444,7 @@ define <vscale x 4 x i1> @whilele_s_xx(i64 %a, i64 %b) {
; CHECK-NEXT: rev p0.s, p0.s
; CHECK-NEXT: ret
%while = call <vscale x 4 x i1> @llvm.aarch64.sve.whilege.nxv4i1.i64(i64 %b, i64 %a)
- %while.rev = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
+ %while.rev = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
ret <vscale x 4 x i1> %while.rev
}
@@ -456,7 +456,7 @@ define <vscale x 2 x i1> @whilele_d_ww(i32 %a, i32 %b) {
; CHECK-NEXT: rev p0.d, p0.d
; CHECK-NEXT: ret
%while = call <vscale x 2 x i1> @llvm.aarch64.sve.whilege.nxv2i1.i32(i32 %b, i32 %a)
- %while.rev = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
+ %while.rev = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
ret <vscale x 2 x i1> %while.rev
}
@@ -468,7 +468,7 @@ define <vscale x 2 x i1> @whilele_d_xx(i64 %a, i64 %b) {
; CHECK-NEXT: rev p0.d, p0.d
; CHECK-NEXT: ret
%while = call <vscale x 2 x i1> @llvm.aarch64.sve.whilege.nxv2i1.i64(i64 %b, i64 %a)
- %while.rev = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
+ %while.rev = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
ret <vscale x 2 x i1> %while.rev
}
@@ -482,7 +482,7 @@ define <vscale x 16 x i1> @whilelo_b_ww(i32 %a, i32 %b) {
; CHECK-NEXT: whilelo p0.b, w0, w1
; CHECK-NEXT: ret
%while = call <vscale x 16 x i1> @llvm.aarch64.sve.whilehi.nxv16i1.i32(i32 %b, i32 %a)
- %while.rev = call <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
+ %while.rev = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
ret <vscale x 16 x i1> %while.rev
}
@@ -492,7 +492,7 @@ define <vscale x 16 x i1> @whilelo_b_xx(i64 %a, i64 %b) {
; CHECK-NEXT: whilelo p0.b, x0, x1
; CHECK-NEXT: ret
%while = call <vscale x 16 x i1> @llvm.aarch64.sve.whilehi.nxv16i1.i64(i64 %b, i64 %a)
- %while.rev = call <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
+ %while.rev = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
ret <vscale x 16 x i1> %while.rev
}
@@ -502,7 +502,7 @@ define <vscale x 8 x i1> @whilelo_h_ww(i32 %a, i32 %b) {
; CHECK-NEXT: whilelo p0.h, w0, w1
; CHECK-NEXT: ret
%while = call <vscale x 8 x i1> @llvm.aarch64.sve.whilehi.nxv8i1.i32(i32 %b, i32 %a)
- %while.rev = call <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
+ %while.rev = call <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
ret <vscale x 8 x i1> %while.rev
}
@@ -512,7 +512,7 @@ define <vscale x 8 x i1> @whilelo_h_xx(i64 %a, i64 %b) {
; CHECK-NEXT: whilelo p0.h, x0, x1
; CHECK-NEXT: ret
%while = call <vscale x 8 x i1> @llvm.aarch64.sve.whilehi.nxv8i1.i64(i64 %b, i64 %a)
- %while.rev = call <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
+ %while.rev = call <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
ret <vscale x 8 x i1> %while.rev
}
@@ -522,7 +522,7 @@ define <vscale x 4 x i1> @whilelo_s_ww(i32 %a, i32 %b) {
; CHECK-NEXT: whilelo p0.s, w0, w1
; CHECK-NEXT: ret
%while = call <vscale x 4 x i1> @llvm.aarch64.sve.whilehi.nxv4i1.i32(i32 %b, i32 %a)
- %while.rev = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
+ %while.rev = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
ret <vscale x 4 x i1> %while.rev
}
@@ -532,7 +532,7 @@ define <vscale x 4 x i1> @whilelo_s_xx(i64 %a, i64 %b) {
; CHECK-NEXT: whilelo p0.s, x0, x1
; CHECK-NEXT: ret
%while = call <vscale x 4 x i1> @llvm.aarch64.sve.whilehi.nxv4i1.i64(i64 %b, i64 %a)
- %while.rev = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
+ %while.rev = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
ret <vscale x 4 x i1> %while.rev
}
@@ -542,7 +542,7 @@ define <vscale x 2 x i1> @whilelo_d_ww(i32 %a, i32 %b) {
; CHECK-NEXT: whilelo p0.d, w0, w1
; CHECK-NEXT: ret
%while = call <vscale x 2 x i1> @llvm.aarch64.sve.whilehi.nxv2i1.i32(i32 %b, i32 %a)
- %while.rev = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
+ %while.rev = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
ret <vscale x 2 x i1> %while.rev
}
@@ -552,7 +552,7 @@ define <vscale x 2 x i1> @whilelo_d_xx(i64 %a, i64 %b) {
; CHECK-NEXT: whilelo p0.d, x0, x1
; CHECK-NEXT: ret
%while = call <vscale x 2 x i1> @llvm.aarch64.sve.whilehi.nxv2i1.i64(i64 %b, i64 %a)
- %while.rev = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
+ %while.rev = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
ret <vscale x 2 x i1> %while.rev
}
@@ -568,7 +568,7 @@ define <vscale x 16 x i1> @whilels_b_ww(i32 %a, i32 %b) {
; CHECK-NEXT: rev p0.b, p0.b
; CHECK-NEXT: ret
%while = call <vscale x 16 x i1> @llvm.aarch64.sve.whilehs.nxv16i1.i32(i32 %b, i32 %a)
- %while.rev = call <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
+ %while.rev = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
ret <vscale x 16 x i1> %while.rev
}
@@ -580,7 +580,7 @@ define <vscale x 16 x i1> @whilels_b_xx(i64 %a, i64 %b) {
; CHECK-NEXT: rev p0.b, p0.b
; CHECK-NEXT: ret
%while = call <vscale x 16 x i1> @llvm.aarch64.sve.whilehs.nxv16i1.i64(i64 %b, i64 %a)
- %while.rev = call <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
+ %while.rev = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
ret <vscale x 16 x i1> %while.rev
}
@@ -592,7 +592,7 @@ define <vscale x 8 x i1> @whilels_h_ww(i32 %a, i32 %b) {
; CHECK-NEXT: rev p0.h, p0.h
; CHECK-NEXT: ret
%while = call <vscale x 8 x i1> @llvm.aarch64.sve.whilehs.nxv8i1.i32(i32 %b, i32 %a)
- %while.rev = call <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
+ %while.rev = call <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
ret <vscale x 8 x i1> %while.rev
}
@@ -604,7 +604,7 @@ define <vscale x 8 x i1> @whilels_h_xx(i64 %a, i64 %b) {
; CHECK-NEXT: rev p0.h, p0.h
; CHECK-NEXT: ret
%while = call <vscale x 8 x i1> @llvm.aarch64.sve.whilehs.nxv8i1.i64(i64 %b, i64 %a)
- %while.rev = call <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
+ %while.rev = call <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
ret <vscale x 8 x i1> %while.rev
}
@@ -616,7 +616,7 @@ define <vscale x 4 x i1> @whilels_s_ww(i32 %a, i32 %b) {
; CHECK-NEXT: rev p0.s, p0.s
; CHECK-NEXT: ret
%while = call <vscale x 4 x i1> @llvm.aarch64.sve.whilehs.nxv4i1.i32(i32 %b, i32 %a)
- %while.rev = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
+ %while.rev = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
ret <vscale x 4 x i1> %while.rev
}
@@ -628,7 +628,7 @@ define <vscale x 4 x i1> @whilels_s_xx(i64 %a, i64 %b) {
; CHECK-NEXT: rev p0.s, p0.s
; CHECK-NEXT: ret
%while = call <vscale x 4 x i1> @llvm.aarch64.sve.whilehs.nxv4i1.i64(i64 %b, i64 %a)
- %while.rev = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
+ %while.rev = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
ret <vscale x 4 x i1> %while.rev
}
@@ -640,7 +640,7 @@ define <vscale x 2 x i1> @whilels_d_ww(i32 %a, i32 %b) {
; CHECK-NEXT: rev p0.d, p0.d
; CHECK-NEXT: ret
%while = call <vscale x 2 x i1> @llvm.aarch64.sve.whilehs.nxv2i1.i32(i32 %b, i32 %a)
- %while.rev = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
+ %while.rev = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
ret <vscale x 2 x i1> %while.rev
}
@@ -652,7 +652,7 @@ define <vscale x 2 x i1> @whilels_d_xx(i64 %a, i64 %b) {
; CHECK-NEXT: rev p0.d, p0.d
; CHECK-NEXT: ret
%while = call <vscale x 2 x i1> @llvm.aarch64.sve.whilehs.nxv2i1.i64(i64 %b, i64 %a)
- %while.rev = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
+ %while.rev = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
ret <vscale x 2 x i1> %while.rev
}
@@ -666,7 +666,7 @@ define <vscale x 16 x i1> @whilelt_b_ww(i32 %a, i32 %b) {
; CHECK-NEXT: whilelt p0.b, w0, w1
; CHECK-NEXT: ret
%while = call <vscale x 16 x i1> @llvm.aarch64.sve.whilegt.nxv16i1.i32(i32 %b, i32 %a)
- %while.rev = call <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
+ %while.rev = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
ret <vscale x 16 x i1> %while.rev
}
@@ -676,7 +676,7 @@ define <vscale x 16 x i1> @whilelt_b_xx(i64 %a, i64 %b) {
; CHECK-NEXT: whilelt p0.b, x0, x1
; CHECK-NEXT: ret
%while = call <vscale x 16 x i1> @llvm.aarch64.sve.whilegt.nxv16i1.i64(i64 %b, i64 %a)
- %while.rev = call <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
+ %while.rev = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> %while)
ret <vscale x 16 x i1> %while.rev
}
@@ -686,7 +686,7 @@ define <vscale x 8 x i1> @whilelt_h_ww(i32 %a, i32 %b) {
; CHECK-NEXT: whilelt p0.h, w0, w1
; CHECK-NEXT: ret
%while = call <vscale x 8 x i1> @llvm.aarch64.sve.whilegt.nxv8i1.i32(i32 %b, i32 %a)
- %while.rev = call <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
+ %while.rev = call <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
ret <vscale x 8 x i1> %while.rev
}
@@ -696,7 +696,7 @@ define <vscale x 8 x i1> @whilelt_h_xx(i64 %a, i64 %b) {
; CHECK-NEXT: whilelt p0.h, x0, x1
; CHECK-NEXT: ret
%while = call <vscale x 8 x i1> @llvm.aarch64.sve.whilegt.nxv8i1.i64(i64 %b, i64 %a)
- %while.rev = call <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
+ %while.rev = call <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1> %while)
ret <vscale x 8 x i1> %while.rev
}
@@ -706,7 +706,7 @@ define <vscale x 4 x i1> @whilelt_s_ww(i32 %a, i32 %b) {
; CHECK-NEXT: whilelt p0.s, w0, w1
; CHECK-NEXT: ret
%while = call <vscale x 4 x i1> @llvm.aarch64.sve.whilegt.nxv4i1.i32(i32 %b, i32 %a)
- %while.rev = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
+ %while.rev = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
ret <vscale x 4 x i1> %while.rev
}
@@ -716,7 +716,7 @@ define <vscale x 4 x i1> @whilelt_s_xx(i64 %a, i64 %b) {
; CHECK-NEXT: whilelt p0.s, x0, x1
; CHECK-NEXT: ret
%while = call <vscale x 4 x i1> @llvm.aarch64.sve.whilegt.nxv4i1.i64(i64 %b, i64 %a)
- %while.rev = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
+ %while.rev = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %while)
ret <vscale x 4 x i1> %while.rev
}
@@ -726,7 +726,7 @@ define <vscale x 2 x i1> @whilelt_d_ww(i32 %a, i32 %b) {
; CHECK-NEXT: whilelt p0.d, w0, w1
; CHECK-NEXT: ret
%while = call <vscale x 2 x i1> @llvm.aarch64.sve.whilegt.nxv2i1.i32(i32 %b, i32 %a)
- %while.rev = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
+ %while.rev = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
ret <vscale x 2 x i1> %while.rev
}
@@ -736,6 +736,6 @@ define <vscale x 2 x i1> @whilelt_d_xx(i64 %a, i64 %b) {
; CHECK-NEXT: whilelt p0.d, x0, x1
; CHECK-NEXT: ret
%while = call <vscale x 2 x i1> @llvm.aarch64.sve.whilegt.nxv2i1.i64(i64 %b, i64 %a)
- %while.rev = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
+ %while.rev = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> %while)
ret <vscale x 2 x i1> %while.rev
}
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
index b7b2cb22c1b6..9d4f9434aa31 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
@@ -142,8 +142,8 @@ attributes #0 = { nounwind }
; GCN: amdpal.pipelines:
; GCN-NEXT: - .registers:
-; GCN-NEXT: 0x2e12 (COMPUTE_PGM_RSRC1): 0xaf01ca{{$}}
-; GCN-NEXT: 0x2e13 (COMPUTE_PGM_RSRC2): 0x8001{{$}}
+; GCN-NEXT: '0x2e12 (COMPUTE_PGM_RSRC1)': 0xaf01ca{{$}}
+; GCN-NEXT: '0x2e13 (COMPUTE_PGM_RSRC2)': 0x8001{{$}}
; GCN-NEXT: .shader_functions:
; GCN-NEXT: dynamic_stack:
; GCN-NEXT: .backend_stack_size: 0x10{{$}}
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-cs.ll b/llvm/test/CodeGen/AMDGPU/amdpal-cs.ll
index 98aa04f6d26e..a3fd2a942bc2 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-cs.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-cs.ll
@@ -11,8 +11,8 @@
; GCN-NEXT: .entry_point: cs_amdpal
; GCN-NEXT: .scratch_memory_size: 0
; GCN: .registers:
-; GCN-NEXT: 0x2e12 (COMPUTE_PGM_RSRC1):
-; GCN-NEXT: 0x2e13 (COMPUTE_PGM_RSRC2):
+; GCN-NEXT: '0x2e12 (COMPUTE_PGM_RSRC1)':
+; GCN-NEXT: '0x2e13 (COMPUTE_PGM_RSRC2)':
; GCN-NEXT: ...
; GCN-NEXT: .end_amdgpu_pal_metadata
define amdgpu_cs half @cs_amdpal(half %arg0) {
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-es.ll b/llvm/test/CodeGen/AMDGPU/amdpal-es.ll
index 012b2061756b..679e0858819e 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-es.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-es.ll
@@ -10,7 +10,7 @@
; GCN-NEXT: .entry_point: es_amdpal
; GCN-NEXT: .scratch_memory_size: 0
; GCN: .registers:
-; GCN-NEXT: 0x2cca (SPI_SHADER_PGM_RSRC1_ES): 0
+; GCN-NEXT: '0x2cca (SPI_SHADER_PGM_RSRC1_ES)': 0
; GCN-NEXT: ...
; GCN-NEXT: .end_amdgpu_pal_metadata
define amdgpu_es half @es_amdpal(half %arg0) {
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-gs.ll b/llvm/test/CodeGen/AMDGPU/amdpal-gs.ll
index e2f67398d18a..75f7a1dc266d 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-gs.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-gs.ll
@@ -11,7 +11,7 @@
; GCN-NEXT: .entry_point: gs_amdpal
; GCN-NEXT: .scratch_memory_size: 0
; GCN: .registers:
-; GCN-NEXT: 0x2c8a (SPI_SHADER_PGM_RSRC1_GS): 0
+; GCN-NEXT: '0x2c8a (SPI_SHADER_PGM_RSRC1_GS)': 0
; GCN-NEXT: ...
; GCN-NEXT: .end_amdgpu_pal_metadata
define amdgpu_gs half @gs_amdpal(half %arg0) {
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-hs.ll b/llvm/test/CodeGen/AMDGPU/amdpal-hs.ll
index 9ad47c1d604f..c61578a967b6 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-hs.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-hs.ll
@@ -11,7 +11,7 @@
; GCN-NEXT: .entry_point: hs_amdpal
; GCN-NEXT: .scratch_memory_size: 0
; GCN: .registers:
-; GCN-NEXT: 0x2d0a (SPI_SHADER_PGM_RSRC1_HS): 0
+; GCN-NEXT: '0x2d0a (SPI_SHADER_PGM_RSRC1_HS)': 0
; GCN-NEXT: ...
; GCN-NEXT: .end_amdgpu_pal_metadata
define amdgpu_hs half @hs_amdpal(half %arg0) {
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-ls.ll b/llvm/test/CodeGen/AMDGPU/amdpal-ls.ll
index 8ee6f7283ce7..8162c824dc2c 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-ls.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-ls.ll
@@ -10,7 +10,7 @@
; GCN-NEXT: .entry_point: ls_amdpal
; GCN-NEXT: .scratch_memory_size: 0
; GCN: .registers:
-; GCN-NEXT: 0x2d4a (SPI_SHADER_PGM_RSRC1_LS): 0
+; GCN-NEXT: '0x2d4a (SPI_SHADER_PGM_RSRC1_LS)': 0
; GCN-NEXT: ...
; GCN-NEXT: .end_amdgpu_pal_metadata
define amdgpu_ls half @ls_amdpal(half %arg0) {
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-cs.ll b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-cs.ll
index 0d0c70c38ace..5e21ba494df1 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-cs.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-cs.ll
@@ -5,7 +5,7 @@
; amdpal compute shader: check for 0x2e12 (COMPUTE_PGM_RSRC1) in pal metadata
; GCN-LABEL: {{^}}cs_amdpal:
; GCN: .amdgpu_pal_metadata
-; GCN: 0x2e12 (COMPUTE_PGM_RSRC1)
+; GCN: '0x2e12 (COMPUTE_PGM_RSRC1)'
define amdgpu_cs half @cs_amdpal(half %arg0) {
%add = fadd half %arg0, 1.0
ret half %add
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-default.ll b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-default.ll
index b82e3ebdde4b..dc9a33ac0141 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-default.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-default.ll
@@ -3,45 +3,45 @@
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -enable-var-scope %s
; amdpal compute shader: check for 0x2e12 (COMPUTE_PGM_RSRC1) in pal metadata
-; SI-DAG: 0x2e12 (COMPUTE_PGM_RSRC1): 0x2f0000{{$}}
-; VI-DAG: 0x2e12 (COMPUTE_PGM_RSRC1): 0x2f02c0{{$}}
-; GFX9-DAG: 0x2e12 (COMPUTE_PGM_RSRC1): 0x2f0000{{$}}
+; SI-DAG: '0x2e12 (COMPUTE_PGM_RSRC1)': 0x2f0000{{$}}
+; VI-DAG: '0x2e12 (COMPUTE_PGM_RSRC1)': 0x2f02c0{{$}}
+; GFX9-DAG: '0x2e12 (COMPUTE_PGM_RSRC1)': 0x2f0000{{$}}
define amdgpu_cs half @cs_amdpal(half %arg0) {
%add = fadd half %arg0, 1.0
ret half %add
}
; amdpal evaluation shader: check for 0x2cca (SPI_SHADER_PGM_RSRC1_ES) in pal metadata
-; SI-DAG: 0x2cca (SPI_SHADER_PGM_RSRC1_ES): 0x2f0000{{$}}
-; VI-DAG: 0x2cca (SPI_SHADER_PGM_RSRC1_ES): 0x2f02c0{{$}}
-; GFX9-DAG: 0x2cca (SPI_SHADER_PGM_RSRC1_ES): 0x2f0000{{$}}
+; SI-DAG: '0x2cca (SPI_SHADER_PGM_RSRC1_ES)': 0x2f0000{{$}}
+; VI-DAG: '0x2cca (SPI_SHADER_PGM_RSRC1_ES)': 0x2f02c0{{$}}
+; GFX9-DAG: '0x2cca (SPI_SHADER_PGM_RSRC1_ES)': 0x2f0000{{$}}
define amdgpu_es half @es_amdpal(half %arg0) {
%add = fadd half %arg0, 1.0
ret half %add
}
; amdpal geometry shader: check for 0x2c8a (SPI_SHADER_PGM_RSRC1_GS) in pal metadata
-; SI-DAG: 0x2c8a (SPI_SHADER_PGM_RSRC1_GS): 0x2f0000{{$}}
-; VI-DAG: 0x2c8a (SPI_SHADER_PGM_RSRC1_GS): 0x2f02c0{{$}}
-; GFX9-DAG: 0x2c8a (SPI_SHADER_PGM_RSRC1_GS): 0x2f0000{{$}}
+; SI-DAG: '0x2c8a (SPI_SHADER_PGM_RSRC1_GS)': 0x2f0000{{$}}
+; VI-DAG: '0x2c8a (SPI_SHADER_PGM_RSRC1_GS)': 0x2f02c0{{$}}
+; GFX9-DAG: '0x2c8a (SPI_SHADER_PGM_RSRC1_GS)': 0x2f0000{{$}}
define amdgpu_gs half @gs_amdpal(half %arg0) {
%add = fadd half %arg0, 1.0
ret half %add
}
; amdpal hull shader: check for 0x2d0a (SPI_SHADER_PGM_RSRC1_HS) in pal metadata
-; SI-DAG: 0x2d0a (SPI_SHADER_PGM_RSRC1_HS): 0x2f0000{{$}}
-; VI-DAG: 0x2d0a (SPI_SHADER_PGM_RSRC1_HS): 0x2f02c0{{$}}
-; GFX9-DAG: 0x2d0a (SPI_SHADER_PGM_RSRC1_HS): 0x2f0000{{$}}
+; SI-DAG: '0x2d0a (SPI_SHADER_PGM_RSRC1_HS)': 0x2f0000{{$}}
+; VI-DAG: '0x2d0a (SPI_SHADER_PGM_RSRC1_HS)': 0x2f02c0{{$}}
+; GFX9-DAG: '0x2d0a (SPI_SHADER_PGM_RSRC1_HS)': 0x2f0000{{$}}
define amdgpu_hs half @hs_amdpal(half %arg0) {
%add = fadd half %arg0, 1.0
ret half %add
}
; amdpal load shader: check for 0x2d4a (SPI_SHADER_PGM_RSRC1_LS) in pal metadata
-; SI-DAG: 0x2d4a (SPI_SHADER_PGM_RSRC1_LS): 0x2f0000{{$}}
-; VI-DAG: 0x2d4a (SPI_SHADER_PGM_RSRC1_LS): 0x2f02c0{{$}}
-; GFX9-DAG: 0x2d4a (SPI_SHADER_PGM_RSRC1_LS): 0x2f0000{{$}}
+; SI-DAG: '0x2d4a (SPI_SHADER_PGM_RSRC1_LS)': 0x2f0000{{$}}
+; VI-DAG: '0x2d4a (SPI_SHADER_PGM_RSRC1_LS)': 0x2f02c0{{$}}
+; GFX9-DAG: '0x2d4a (SPI_SHADER_PGM_RSRC1_LS)': 0x2f0000{{$}}
define amdgpu_ls half @ls_amdpal(half %arg0) {
%add = fadd half %arg0, 1.0
ret half %add
@@ -49,18 +49,18 @@ define amdgpu_ls half @ls_amdpal(half %arg0) {
; amdpal pixel shader: check for 0x2c0a (SPI_SHADER_PGM_RSRC1_PS) in pal metadata
; below.
-; SI-DAG: 0x2c0a (SPI_SHADER_PGM_RSRC1_PS): 0x2f0000{{$}}
-; VI-DAG: 0x2c0a (SPI_SHADER_PGM_RSRC1_PS): 0x2f02c0{{$}}
-; GFX9-DAG: 0x2c0a (SPI_SHADER_PGM_RSRC1_PS): 0x2f0000{{$}}
+; SI-DAG: '0x2c0a (SPI_SHADER_PGM_RSRC1_PS)': 0x2f0000{{$}}
+; VI-DAG: '0x2c0a (SPI_SHADER_PGM_RSRC1_PS)': 0x2f02c0{{$}}
+; GFX9-DAG: '0x2c0a (SPI_SHADER_PGM_RSRC1_PS)': 0x2f0000{{$}}
define amdgpu_ps half @ps_amdpal(half %arg0) {
%add = fadd half %arg0, 1.0
ret half %add
}
; amdpal vertex shader: check for 45352 (SPI_SHADER_PGM_RSRC1_VS) in pal metadata
-; SI-DAG: 0x2c4a (SPI_SHADER_PGM_RSRC1_VS): 0x2f0000{{$}}
-; VI-DAG: 0x2c4a (SPI_SHADER_PGM_RSRC1_VS): 0x2f02c0{{$}}
-; GFX9-DAG: 0x2c4a (SPI_SHADER_PGM_RSRC1_VS): 0x2f0000{{$}}
+; SI-DAG: '0x2c4a (SPI_SHADER_PGM_RSRC1_VS)': 0x2f0000{{$}}
+; VI-DAG: '0x2c4a (SPI_SHADER_PGM_RSRC1_VS)': 0x2f02c0{{$}}
+; GFX9-DAG: '0x2c4a (SPI_SHADER_PGM_RSRC1_VS)': 0x2f0000{{$}}
define amdgpu_vs half @vs_amdpal(half %arg0) {
%add = fadd half %arg0, 1.0
ret half %add
@@ -75,7 +75,7 @@ define amdgpu_vs half @vs_amdpal(half %arg0) {
; - 0x123456789abcdef0
; - 0xfedcba9876543210
; .registers:
-; 0x2c0b (SPI_SHADER_PGM_RSRC2_PS): 0x42000000
+; '0x2c0b (SPI_SHADER_PGM_RSRC2_PS)': 0x42000000
; ...
; .end_amdgpu_pal_metadata
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-denormal.ll b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-denormal.ll
index b86b42868005..ffce3ed08509 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-denormal.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-denormal.ll
@@ -3,45 +3,45 @@
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -enable-var-scope %s
; amdpal compute shader: check for 0x2e12 (COMPUTE_PGM_RSRC1) in pal metadata
-; SI-DAG: 0x2e12 (COMPUTE_PGM_RSRC1): 0x2c0000{{$}}
-; VI-DAG: 0x2e12 (COMPUTE_PGM_RSRC1): 0x2c02c0{{$}}
-; GFX9-DAG: 0x2e12 (COMPUTE_PGM_RSRC1): 0x2c0000{{$}}
+; SI-DAG: '0x2e12 (COMPUTE_PGM_RSRC1)': 0x2c0000{{$}}
+; VI-DAG: '0x2e12 (COMPUTE_PGM_RSRC1)': 0x2c02c0{{$}}
+; GFX9-DAG: '0x2e12 (COMPUTE_PGM_RSRC1)': 0x2c0000{{$}}
define amdgpu_cs half @cs_amdpal(half %arg0) #0 {
%add = fadd half %arg0, 1.0
ret half %add
}
; amdpal evaluation shader: check for 0x2cca (SPI_SHADER_PGM_RSRC1_ES) in pal metadata
-; SI-DAG: 0x2cca (SPI_SHADER_PGM_RSRC1_ES): 0x2c0000{{$}}
-; VI-DAG: 0x2cca (SPI_SHADER_PGM_RSRC1_ES): 0x2c02c0{{$}}
-; GFX9-DAG: 0x2cca (SPI_SHADER_PGM_RSRC1_ES): 0x2c0000{{$}}
+; SI-DAG: '0x2cca (SPI_SHADER_PGM_RSRC1_ES)': 0x2c0000{{$}}
+; VI-DAG: '0x2cca (SPI_SHADER_PGM_RSRC1_ES)': 0x2c02c0{{$}}
+; GFX9-DAG: '0x2cca (SPI_SHADER_PGM_RSRC1_ES)': 0x2c0000{{$}}
define amdgpu_es half @es_amdpal(half %arg0) #0 {
%add = fadd half %arg0, 1.0
ret half %add
}
; amdpal geometry shader: check for 0x2c8a (SPI_SHADER_PGM_RSRC1_GS) in pal metadata
-; SI-DAG: 0x2c8a (SPI_SHADER_PGM_RSRC1_GS): 0x2c0000{{$}}
-; VI-DAG: 0x2c8a (SPI_SHADER_PGM_RSRC1_GS): 0x2c02c0{{$}}
-; GFX9-DAG: 0x2c8a (SPI_SHADER_PGM_RSRC1_GS): 0x2c0000{{$}}
+; SI-DAG: '0x2c8a (SPI_SHADER_PGM_RSRC1_GS)': 0x2c0000{{$}}
+; VI-DAG: '0x2c8a (SPI_SHADER_PGM_RSRC1_GS)': 0x2c02c0{{$}}
+; GFX9-DAG: '0x2c8a (SPI_SHADER_PGM_RSRC1_GS)': 0x2c0000{{$}}
define amdgpu_gs half @gs_amdpal(half %arg0) #0 {
%add = fadd half %arg0, 1.0
ret half %add
}
; amdpal hull shader: check for 0x2d0a (SPI_SHADER_PGM_RSRC1_HS) in pal metadata
-; SI-DAG: 0x2d0a (SPI_SHADER_PGM_RSRC1_HS): 0x2c0000{{$}}
-; VI-DAG: 0x2d0a (SPI_SHADER_PGM_RSRC1_HS): 0x2c02c0{{$}}
-; GFX9-DAG: 0x2d0a (SPI_SHADER_PGM_RSRC1_HS): 0x2c0000{{$}}
+; SI-DAG: '0x2d0a (SPI_SHADER_PGM_RSRC1_HS)': 0x2c0000{{$}}
+; VI-DAG: '0x2d0a (SPI_SHADER_PGM_RSRC1_HS)': 0x2c02c0{{$}}
+; GFX9-DAG: '0x2d0a (SPI_SHADER_PGM_RSRC1_HS)': 0x2c0000{{$}}
define amdgpu_hs half @hs_amdpal(half %arg0) #0 {
%add = fadd half %arg0, 1.0
ret half %add
}
; amdpal load shader: check for 0x2d4a (SPI_SHADER_PGM_RSRC1_LS) in pal metadata
-; SI-DAG: 0x2d4a (SPI_SHADER_PGM_RSRC1_LS): 0x2c0000{{$}}
-; VI-DAG: 0x2d4a (SPI_SHADER_PGM_RSRC1_LS): 0x2c02c0{{$}}
-; GFX9-DAG: 0x2d4a (SPI_SHADER_PGM_RSRC1_LS): 0x2c0000{{$}}
+; SI-DAG: '0x2d4a (SPI_SHADER_PGM_RSRC1_LS)': 0x2c0000{{$}}
+; VI-DAG: '0x2d4a (SPI_SHADER_PGM_RSRC1_LS)': 0x2c02c0{{$}}
+; GFX9-DAG: '0x2d4a (SPI_SHADER_PGM_RSRC1_LS)': 0x2c0000{{$}}
define amdgpu_ls half @ls_amdpal(half %arg0) #0 {
%add = fadd half %arg0, 1.0
ret half %add
@@ -49,18 +49,18 @@ define amdgpu_ls half @ls_amdpal(half %arg0) #0 {
; amdpal pixel shader: check for 0x2c0a (SPI_SHADER_PGM_RSRC1_PS) in pal metadata
; below.
-; SI-DAG: 0x2c0a (SPI_SHADER_PGM_RSRC1_PS): 0x2c0000{{$}}
-; VI-DAG: 0x2c0a (SPI_SHADER_PGM_RSRC1_PS): 0x2c02c0{{$}}
-; GFX9-DAG: 0x2c0a (SPI_SHADER_PGM_RSRC1_PS): 0x2c0000{{$}}
+; SI-DAG: '0x2c0a (SPI_SHADER_PGM_RSRC1_PS)': 0x2c0000{{$}}
+; VI-DAG: '0x2c0a (SPI_SHADER_PGM_RSRC1_PS)': 0x2c02c0{{$}}
+; GFX9-DAG: '0x2c0a (SPI_SHADER_PGM_RSRC1_PS)': 0x2c0000{{$}}
define amdgpu_ps half @ps_amdpal(half %arg0) #0 {
%add = fadd half %arg0, 1.0
ret half %add
}
; amdpal vertex shader: check for 45352 (SPI_SHADER_PGM_RSRC1_VS) in pal metadata
-; SI-DAG: 0x2c4a (SPI_SHADER_PGM_RSRC1_VS): 0x2c0000{{$}}
-; VI-DAG: 0x2c4a (SPI_SHADER_PGM_RSRC1_VS): 0x2c02c0{{$}}
-; GFX9-DAG: 0x2c4a (SPI_SHADER_PGM_RSRC1_VS): 0x2c0000{{$}}
+; SI-DAG: '0x2c4a (SPI_SHADER_PGM_RSRC1_VS)': 0x2c0000{{$}}
+; VI-DAG: '0x2c4a (SPI_SHADER_PGM_RSRC1_VS)': 0x2c02c0{{$}}
+; GFX9-DAG: '0x2c4a (SPI_SHADER_PGM_RSRC1_VS)': 0x2c0000{{$}}
define amdgpu_vs half @vs_amdpal(half %arg0) #0 {
%add = fadd half %arg0, 1.0
ret half %add
@@ -77,7 +77,7 @@ attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
; - 0x123456789abcdef0
; - 0xfedcba9876543210
; .registers:
-; 0x2c0b (SPI_SHADER_PGM_RSRC2_PS): 0x42000000
+; '0x2c0b (SPI_SHADER_PGM_RSRC2_PS)': 0x42000000
; ...
; .end_amdgpu_pal_metadata
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-dx10-clamp.ll b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-dx10-clamp.ll
index b1db7aafacab..3ea3064fa743 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-dx10-clamp.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-dx10-clamp.ll
@@ -3,45 +3,45 @@
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -enable-var-scope %s
; amdpal compute shader: check for 0x2e12 (COMPUTE_PGM_RSRC1) in pal metadata
-; SI-DAG: 0x2e12 (COMPUTE_PGM_RSRC1): 0xf0000{{$}}
-; VI-DAG: 0x2e12 (COMPUTE_PGM_RSRC1): 0xf02c0{{$}}
-; GFX9-DAG: 0x2e12 (COMPUTE_PGM_RSRC1): 0xf0000{{$}}
+; SI-DAG: '0x2e12 (COMPUTE_PGM_RSRC1)': 0xf0000{{$}}
+; VI-DAG: '0x2e12 (COMPUTE_PGM_RSRC1)': 0xf02c0{{$}}
+; GFX9-DAG: '0x2e12 (COMPUTE_PGM_RSRC1)': 0xf0000{{$}}
define amdgpu_cs half @cs_amdpal(half %arg0) #0 {
%add = fadd half %arg0, 1.0
ret half %add
}
; amdpal evaluation shader: check for 0x2cca (SPI_SHADER_PGM_RSRC1_ES) in pal metadata
-; SI-DAG: 0x2cca (SPI_SHADER_PGM_RSRC1_ES): 0xf0000{{$}}
-; VI-DAG: 0x2cca (SPI_SHADER_PGM_RSRC1_ES): 0xf02c0{{$}}
-; GFX9-DAG: 0x2cca (SPI_SHADER_PGM_RSRC1_ES): 0xf0000{{$}}
+; SI-DAG: '0x2cca (SPI_SHADER_PGM_RSRC1_ES)': 0xf0000{{$}}
+; VI-DAG: '0x2cca (SPI_SHADER_PGM_RSRC1_ES)': 0xf02c0{{$}}
+; GFX9-DAG: '0x2cca (SPI_SHADER_PGM_RSRC1_ES)': 0xf0000{{$}}
define amdgpu_es half @es_amdpal(half %arg0) #0 {
%add = fadd half %arg0, 1.0
ret half %add
}
; amdpal geometry shader: check for 0x2c8a (SPI_SHADER_PGM_RSRC1_GS) in pal metadata
-; SI-DAG: 0x2c8a (SPI_SHADER_PGM_RSRC1_GS): 0xf0000{{$}}
-; VI-DAG: 0x2c8a (SPI_SHADER_PGM_RSRC1_GS): 0xf02c0{{$}}
-; GFX9-DAG: 0x2c8a (SPI_SHADER_PGM_RSRC1_GS): 0xf0000{{$}}
+; SI-DAG: '0x2c8a (SPI_SHADER_PGM_RSRC1_GS)': 0xf0000{{$}}
+; VI-DAG: '0x2c8a (SPI_SHADER_PGM_RSRC1_GS)': 0xf02c0{{$}}
+; GFX9-DAG: '0x2c8a (SPI_SHADER_PGM_RSRC1_GS)': 0xf0000{{$}}
define amdgpu_gs half @gs_amdpal(half %arg0) #0 {
%add = fadd half %arg0, 1.0
ret half %add
}
; amdpal hull shader: check for 0x2d0a (SPI_SHADER_PGM_RSRC1_HS) in pal metadata
-; SI-DAG: 0x2d0a (SPI_SHADER_PGM_RSRC1_HS): 0xf0000{{$}}
-; VI-DAG: 0x2d0a (SPI_SHADER_PGM_RSRC1_HS): 0xf02c0{{$}}
-; GFX9-DAG: 0x2d0a (SPI_SHADER_PGM_RSRC1_HS): 0xf0000{{$}}
+; SI-DAG: '0x2d0a (SPI_SHADER_PGM_RSRC1_HS)': 0xf0000{{$}}
+; VI-DAG: '0x2d0a (SPI_SHADER_PGM_RSRC1_HS)': 0xf02c0{{$}}
+; GFX9-DAG: '0x2d0a (SPI_SHADER_PGM_RSRC1_HS)': 0xf0000{{$}}
define amdgpu_hs half @hs_amdpal(half %arg0) #0 {
%add = fadd half %arg0, 1.0
ret half %add
}
; amdpal load shader: check for 0x2d4a (SPI_SHADER_PGM_RSRC1_LS) in pal metadata
-; SI-DAG: 0x2d4a (SPI_SHADER_PGM_RSRC1_LS): 0xf0000{{$}}
-; VI-DAG: 0x2d4a (SPI_SHADER_PGM_RSRC1_LS): 0xf02c0{{$}}
-; GFX9-DAG: 0x2d4a (SPI_SHADER_PGM_RSRC1_LS): 0xf0000{{$}}
+; SI-DAG: '0x2d4a (SPI_SHADER_PGM_RSRC1_LS)': 0xf0000{{$}}
+; VI-DAG: '0x2d4a (SPI_SHADER_PGM_RSRC1_LS)': 0xf02c0{{$}}
+; GFX9-DAG: '0x2d4a (SPI_SHADER_PGM_RSRC1_LS)': 0xf0000{{$}}
define amdgpu_ls half @ls_amdpal(half %arg0) #0 {
%add = fadd half %arg0, 1.0
ret half %add
@@ -49,18 +49,18 @@ define amdgpu_ls half @ls_amdpal(half %arg0) #0 {
; amdpal pixel shader: check for 0x2c0a (SPI_SHADER_PGM_RSRC1_PS) in pal metadata
; below.
-; SI-DAG: 0x2c0a (SPI_SHADER_PGM_RSRC1_PS): 0xf0000{{$}}
-; VI-DAG: 0x2c0a (SPI_SHADER_PGM_RSRC1_PS): 0xf02c0{{$}}
-; GFX9-DAG: 0x2c0a (SPI_SHADER_PGM_RSRC1_PS): 0xf0000{{$}}
+; SI-DAG: '0x2c0a (SPI_SHADER_PGM_RSRC1_PS)': 0xf0000{{$}}
+; VI-DAG: '0x2c0a (SPI_SHADER_PGM_RSRC1_PS)': 0xf02c0{{$}}
+; GFX9-DAG: '0x2c0a (SPI_SHADER_PGM_RSRC1_PS)': 0xf0000{{$}}
define amdgpu_ps half @ps_amdpal(half %arg0) #0 {
%add = fadd half %arg0, 1.0
ret half %add
}
; amdpal vertex shader: check for 45352 (SPI_SHADER_PGM_RSRC1_VS) in pal metadata
-; SI-DAG: 0x2c4a (SPI_SHADER_PGM_RSRC1_VS): 0xf0000{{$}}
-; VI-DAG: 0x2c4a (SPI_SHADER_PGM_RSRC1_VS): 0xf02c0{{$}}
-; GFX9-DAG: 0x2c4a (SPI_SHADER_PGM_RSRC1_VS): 0xf0000{{$}}
+; SI-DAG: '0x2c4a (SPI_SHADER_PGM_RSRC1_VS)': 0xf0000{{$}}
+; VI-DAG: '0x2c4a (SPI_SHADER_PGM_RSRC1_VS)': 0xf02c0{{$}}
+; GFX9-DAG: '0x2c4a (SPI_SHADER_PGM_RSRC1_VS)': 0xf0000{{$}}
define amdgpu_vs half @vs_amdpal(half %arg0) #0 {
%add = fadd half %arg0, 1.0
ret half %add
@@ -77,7 +77,7 @@ attributes #0 = { "amdgpu-dx10-clamp"="false" }
; - 0x123456789abcdef0
; - 0xfedcba9876543210
; .registers:
-; 0x2c0b (SPI_SHADER_PGM_RSRC2_PS): 0x42000000
+; '0x2c0b (SPI_SHADER_PGM_RSRC2_PS)': 0x42000000
; ...
; .end_amdgpu_pal_metadata
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-es.ll b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-es.ll
index f97117f3d909..bcc8da6e1bf4 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-es.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-es.ll
@@ -4,7 +4,7 @@
; amdpal evaluation shader: check for 0x2cca (SPI_SHADER_PGM_RSRC1_ES) in pal metadata
; GCN-LABEL: {{^}}es_amdpal:
; GCN: .amdgpu_pal_metadata
-; GCN: 0x2cca (SPI_SHADER_PGM_RSRC1_ES)
+; GCN: '0x2cca (SPI_SHADER_PGM_RSRC1_ES)'
define amdgpu_es half @es_amdpal(half %arg0) {
%add = fadd half %arg0, 1.0
ret half %add
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-gs.ll b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-gs.ll
index a32d10390b98..ef4c9cbd5006 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-gs.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-gs.ll
@@ -5,7 +5,7 @@
; amdpal geometry shader: check for 0x2c8a (SPI_SHADER_PGM_RSRC1_GS) in pal metadata
; GCN-LABEL: {{^}}gs_amdpal:
; GCN: .amdgpu_pal_metadata
-; GCN: 0x2c8a (SPI_SHADER_PGM_RSRC1_GS)
+; GCN: '0x2c8a (SPI_SHADER_PGM_RSRC1_GS)'
define amdgpu_gs half @gs_amdpal(half %arg0) {
%add = fadd half %arg0, 1.0
ret half %add
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-hs.ll b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-hs.ll
index be08c93cdb31..eb814c11bceb 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-hs.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-hs.ll
@@ -5,7 +5,7 @@
; amdpal hull shader: check for 0x2d0a (SPI_SHADER_PGM_RSRC1_HS) in pal metadata
; GCN-LABEL: {{^}}hs_amdpal:
; GCN: .amdgpu_pal_metadata
-; GCN: 0x2d0a (SPI_SHADER_PGM_RSRC1_HS)
+; GCN: '0x2d0a (SPI_SHADER_PGM_RSRC1_HS)'
define amdgpu_hs half @hs_amdpal(half %arg0) {
%add = fadd half %arg0, 1.0
ret half %add
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-ieee.ll b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-ieee.ll
index 95d533544c30..d4826a22db79 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-ieee.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-ieee.ll
@@ -4,50 +4,50 @@
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 -enable-var-scope %s
; amdpal compute shader: check for 0x2e12 (COMPUTE_PGM_RSRC1) in pal metadata
-; SI-DAG: 0x2e12 (COMPUTE_PGM_RSRC1): 0xaf0000{{$}}
-; VI-DAG: 0x2e12 (COMPUTE_PGM_RSRC1): 0xaf02c0{{$}}
-; GFX9-DAG: 0x2e12 (COMPUTE_PGM_RSRC1): 0xaf0000{{$}}
-; GFX12-DAG: 0x2e12 (COMPUTE_PGM_RSRC1): 0x600f0000{{$}}
+; SI-DAG: '0x2e12 (COMPUTE_PGM_RSRC1)': 0xaf0000{{$}}
+; VI-DAG: '0x2e12 (COMPUTE_PGM_RSRC1)': 0xaf02c0{{$}}
+; GFX9-DAG: '0x2e12 (COMPUTE_PGM_RSRC1)': 0xaf0000{{$}}
+; GFX12-DAG: '0x2e12 (COMPUTE_PGM_RSRC1)': 0x600f0000{{$}}
define amdgpu_cs half @cs_amdpal(half %arg0) #0 {
%add = fadd half %arg0, 1.0
ret half %add
}
; amdpal evaluation shader: check for 0x2cca (SPI_SHADER_PGM_RSRC1_ES) in pal metadata
-; SI-DAG: 0x2cca (SPI_SHADER_PGM_RSRC1_ES): 0xaf0000{{$}}
-; VI-DAG: 0x2cca (SPI_SHADER_PGM_RSRC1_ES): 0xaf02c0{{$}}
-; GFX9-DAG: 0x2cca (SPI_SHADER_PGM_RSRC1_ES): 0xaf0000{{$}}
-; GFX12-DAG: 0x2cca (SPI_SHADER_PGM_RSRC1_ES): 0xf0000{{$}}
+; SI-DAG: '0x2cca (SPI_SHADER_PGM_RSRC1_ES)': 0xaf0000{{$}}
+; VI-DAG: '0x2cca (SPI_SHADER_PGM_RSRC1_ES)': 0xaf02c0{{$}}
+; GFX9-DAG: '0x2cca (SPI_SHADER_PGM_RSRC1_ES)': 0xaf0000{{$}}
+; GFX12-DAG: '0x2cca (SPI_SHADER_PGM_RSRC1_ES)': 0xf0000{{$}}
define amdgpu_es half @es_amdpal(half %arg0) #0 {
%add = fadd half %arg0, 1.0
ret half %add
}
; amdpal geometry shader: check for 0x2c8a (SPI_SHADER_PGM_RSRC1_GS) in pal metadata
-; SI-DAG: 0x2c8a (SPI_SHADER_PGM_RSRC1_GS): 0xaf0000{{$}}
-; VI-DAG: 0x2c8a (SPI_SHADER_PGM_RSRC1_GS): 0xaf02c0{{$}}
-; GFX9-DAG: 0x2c8a (SPI_SHADER_PGM_RSRC1_GS): 0xaf0000{{$}}
-; GFX12-DAG: 0x2c8a (SPI_SHADER_PGM_RSRC1_GS): 0xa0f0000{{$}}
+; SI-DAG: '0x2c8a (SPI_SHADER_PGM_RSRC1_GS)': 0xaf0000{{$}}
+; VI-DAG: '0x2c8a (SPI_SHADER_PGM_RSRC1_GS)': 0xaf02c0{{$}}
+; GFX9-DAG: '0x2c8a (SPI_SHADER_PGM_RSRC1_GS)': 0xaf0000{{$}}
+; GFX12-DAG: '0x2c8a (SPI_SHADER_PGM_RSRC1_GS)': 0xa0f0000{{$}}
define amdgpu_gs half @gs_amdpal(half %arg0) #0 {
%add = fadd half %arg0, 1.0
ret half %add
}
; amdpal hull shader: check for 0x2d0a (SPI_SHADER_PGM_RSRC1_HS) in pal metadata
-; SI-DAG: 0x2d0a (SPI_SHADER_PGM_RSRC1_HS): 0xaf0000{{$}}
-; VI-DAG: 0x2d0a (SPI_SHADER_PGM_RSRC1_HS): 0xaf02c0{{$}}
-; GFX9-DAG: 0x2d0a (SPI_SHADER_PGM_RSRC1_HS): 0xaf0000{{$}}
-; GFX12-DAG: 0x2d0a (SPI_SHADER_PGM_RSRC1_HS): 0x50f0000{{$}}
+; SI-DAG: '0x2d0a (SPI_SHADER_PGM_RSRC1_HS)': 0xaf0000{{$}}
+; VI-DAG: '0x2d0a (SPI_SHADER_PGM_RSRC1_HS)': 0xaf02c0{{$}}
+; GFX9-DAG: '0x2d0a (SPI_SHADER_PGM_RSRC1_HS)': 0xaf0000{{$}}
+; GFX12-DAG: '0x2d0a (SPI_SHADER_PGM_RSRC1_HS)': 0x50f0000{{$}}
define amdgpu_hs half @hs_amdpal(half %arg0) #0 {
%add = fadd half %arg0, 1.0
ret half %add
}
; amdpal load shader: check for 0x2d4a (SPI_SHADER_PGM_RSRC1_LS) in pal metadata
-; SI-DAG: 0x2d4a (SPI_SHADER_PGM_RSRC1_LS): 0xaf0000{{$}}
-; VI-DAG: 0x2d4a (SPI_SHADER_PGM_RSRC1_LS): 0xaf02c0{{$}}
-; GFX9-DAG: 0x2d4a (SPI_SHADER_PGM_RSRC1_LS): 0xaf0000{{$}}
-; GFX12-DAG: 0x2d4a (SPI_SHADER_PGM_RSRC1_LS): 0xf0000{{$}}
+; SI-DAG: '0x2d4a (SPI_SHADER_PGM_RSRC1_LS)': 0xaf0000{{$}}
+; VI-DAG: '0x2d4a (SPI_SHADER_PGM_RSRC1_LS)': 0xaf02c0{{$}}
+; GFX9-DAG: '0x2d4a (SPI_SHADER_PGM_RSRC1_LS)': 0xaf0000{{$}}
+; GFX12-DAG: '0x2d4a (SPI_SHADER_PGM_RSRC1_LS)': 0xf0000{{$}}
define amdgpu_ls half @ls_amdpal(half %arg0) #0 {
%add = fadd half %arg0, 1.0
ret half %add
@@ -55,20 +55,20 @@ define amdgpu_ls half @ls_amdpal(half %arg0) #0 {
; amdpal pixel shader: check for 0x2c0a (SPI_SHADER_PGM_RSRC1_PS) in pal metadata
; below.
-; SI-DAG: 0x2c0a (SPI_SHADER_PGM_RSRC1_PS): 0xaf0000{{$}}
-; VI-DAG: 0x2c0a (SPI_SHADER_PGM_RSRC1_PS): 0xaf02c0{{$}}
-; GFX9-DAG: 0x2c0a (SPI_SHADER_PGM_RSRC1_PS): 0xaf0000{{$}}
-; GFX12-DAG: 0x2c0a (SPI_SHADER_PGM_RSRC1_PS): 0x20f0000{{$}}
+; SI-DAG: '0x2c0a (SPI_SHADER_PGM_RSRC1_PS)': 0xaf0000{{$}}
+; VI-DAG: '0x2c0a (SPI_SHADER_PGM_RSRC1_PS)': 0xaf02c0{{$}}
+; GFX9-DAG: '0x2c0a (SPI_SHADER_PGM_RSRC1_PS)': 0xaf0000{{$}}
+; GFX12-DAG: '0x2c0a (SPI_SHADER_PGM_RSRC1_PS)': 0x20f0000{{$}}
define amdgpu_ps half @ps_amdpal(half %arg0) #0 {
%add = fadd half %arg0, 1.0
ret half %add
}
; amdpal vertex shader: check for 45352 (SPI_SHADER_PGM_RSRC1_VS) in pal metadata
-; SI-DAG: 0x2c4a (SPI_SHADER_PGM_RSRC1_VS): 0xaf0000{{$}}
-; VI-DAG: 0x2c4a (SPI_SHADER_PGM_RSRC1_VS): 0xaf02c0{{$}}
-; GFX9-DAG: 0x2c4a (SPI_SHADER_PGM_RSRC1_VS): 0xaf0000{{$}}
-; GFX12-DAG: 0x2c4a (SPI_SHADER_PGM_RSRC1_VS): 0x80f0000{{$}}
+; SI-DAG: '0x2c4a (SPI_SHADER_PGM_RSRC1_VS)': 0xaf0000{{$}}
+; VI-DAG: '0x2c4a (SPI_SHADER_PGM_RSRC1_VS)': 0xaf02c0{{$}}
+; GFX9-DAG: '0x2c4a (SPI_SHADER_PGM_RSRC1_VS)': 0xaf0000{{$}}
+; GFX12-DAG: '0x2c4a (SPI_SHADER_PGM_RSRC1_VS)': 0x80f0000{{$}}
define amdgpu_vs half @vs_amdpal(half %arg0) #0 {
%add = fadd half %arg0, 1.0
ret half %add
@@ -85,7 +85,7 @@ attributes #0 = { "amdgpu-ieee"="true" }
; - 0x123456789abcdef0
; - 0xfedcba9876543210
; .registers:
-; 0x2c0b (SPI_SHADER_PGM_RSRC2_PS): 0x42000000
+; '0x2c0b (SPI_SHADER_PGM_RSRC2_PS)': 0x42000000
; ...
; .end_amdgpu_pal_metadata
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-ls.ll b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-ls.ll
index 46097fa20608..0d81e70b2e4f 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-ls.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-ls.ll
@@ -4,7 +4,7 @@
; amdpal load shader: check for 0x2d4a (SPI_SHADER_PGM_RSRC1_LS) in pal metadata
; GCN-LABEL: {{^}}ls_amdpal:
; GCN: .amdgpu_pal_metadata
-; GCN: 0x2d4a (SPI_SHADER_PGM_RSRC1_LS)
+; GCN: '0x2d4a (SPI_SHADER_PGM_RSRC1_LS)'
define amdgpu_ls half @ls_amdpal(half %arg0) {
%add = fadd half %arg0, 1.0
ret half %add
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-ps.ll b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-ps.ll
index 9169c651f129..d31732f995b1 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-ps.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-ps.ll
@@ -12,8 +12,8 @@
; GCN-NEXT: - 0x123456789abcdef0
; GCN-NEXT: - 0xfedcba9876543210
; GCN: .registers:
-; GCN: 0x2c0a (SPI_SHADER_PGM_RSRC1_PS):
-; GCN: 0x2c0b (SPI_SHADER_PGM_RSRC2_PS): 0x42
+; GCN: '0x2c0a (SPI_SHADER_PGM_RSRC1_PS)':
+; GCN: '0x2c0b (SPI_SHADER_PGM_RSRC2_PS)': 0x42
define amdgpu_ps half @ps_amdpal(half %arg0) {
%add = fadd half %arg0, 1.0
ret half %add
@@ -23,12 +23,12 @@ define amdgpu_ps half @ps_amdpal(half %arg0) {
;
; .amdgpu_pal_metadata
; ---
-; amdpal.pipelines:
+; amdpal.pipelines:
; - .internal_pipeline_hash:
; - 0x123456789abcdef0
; - 0xfedcba9876543210
; .registers:
-; 0x2c0b (SPI_SHADER_PGM_RSRC2_PS): 0x42000000
+; '0x2c0b (SPI_SHADER_PGM_RSRC2_PS)': 0x42000000
; ...
; .end_amdgpu_pal_metadata
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-psenable.ll b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-psenable.ll
index d6322e2b4d3e..15b1a652077e 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-psenable.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-psenable.ll
@@ -7,8 +7,8 @@
; the workaround that ensures that an interpolation mode is also set in PSEnable.
; GCN-LABEL: {{^}}amdpal_psenable:
; GCN: .amdgpu_pal_metadata
-; GCN: 0xa1b3 (SPI_PS_INPUT_ENA): 0x2
-; GCN: 0xa1b4 (SPI_PS_INPUT_ADDR): 0x2
+; GCN: '0xa1b3 (SPI_PS_INPUT_ENA)': 0x2
+; GCN: '0xa1b4 (SPI_PS_INPUT_ADDR)': 0x2
define amdgpu_ps void @amdpal_psenable(i32 inreg, i32 inreg, i32 inreg, i32 inreg %m0, <2 x float> %pos) #6 {
%inst23 = extractelement <2 x float> %pos, i32 0
%inst24 = extractelement <2 x float> %pos, i32 1
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-vs.ll b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-vs.ll
index 7c47129c28ce..42de6007f7e2 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-vs.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-vs.ll
@@ -5,7 +5,7 @@
; amdpal vertex shader: check for 45352 (SPI_SHADER_PGM_RSRC1_VS) in pal metadata
; GCN-LABEL: {{^}}vs_amdpal:
; GCN: .amdgpu_pal_metadata
-; GCN: 0x2c4a (SPI_SHADER_PGM_RSRC1_VS)
+; GCN: '0x2c4a (SPI_SHADER_PGM_RSRC1_VS)'
define amdgpu_vs half @vs_amdpal(half %arg0) {
%add = fadd half %arg0, 1.0
ret half %add
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-psenable.ll b/llvm/test/CodeGen/AMDGPU/amdpal-psenable.ll
index 13d2050c491f..ace21207a7eb 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-psenable.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-psenable.ll
@@ -14,10 +14,10 @@
; GCN-NEXT: .entry_point: amdpal_psenable
; GCN-NEXT: .scratch_memory_size: 0
; GCN: .registers:
-; GCN-NEXT: 0x2c0a (SPI_SHADER_PGM_RSRC1_PS):
-; GCN-NEXT: 0x2c0b (SPI_SHADER_PGM_RSRC2_PS):
-; GCN-NEXT: 0xa1b3 (SPI_PS_INPUT_ENA): 0x2
-; GCN-NEXT: 0xa1b4 (SPI_PS_INPUT_ADDR): 0x2
+; GCN-NEXT: '0x2c0a (SPI_SHADER_PGM_RSRC1_PS)':
+; GCN-NEXT: '0x2c0b (SPI_SHADER_PGM_RSRC2_PS)':
+; GCN-NEXT: '0xa1b3 (SPI_PS_INPUT_ENA)': 0x2
+; GCN-NEXT: '0xa1b4 (SPI_PS_INPUT_ADDR)': 0x2
; GCN-NEXT: ...
; GCN-NEXT: .end_amdgpu_pal_metadata
define amdgpu_ps void @amdpal_psenable(i32 inreg, i32 inreg, i32 inreg, i32 inreg %m0, <2 x float> %pos) #6 {
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-usersgpr-init.ll b/llvm/test/CodeGen/AMDGPU/amdpal-usersgpr-init.ll
index 52a9d57244c2..086a126b1ddc 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-usersgpr-init.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-usersgpr-init.ll
@@ -1,7 +1,7 @@
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -enable-var-scope %s
; We want to make sure that RSRC2 is left untouched
-; GCN: 0x2e13 (COMPUTE_PGM_RSRC2): 0x78a
+; GCN: '0x2e13 (COMPUTE_PGM_RSRC2)': 0x78a
define amdgpu_cs half @cs_amdpal(half %arg0, half inreg %arg1) {
%add = fadd half %arg0, 1.0
ret half %add
@@ -9,4 +9,4 @@ define amdgpu_cs half @cs_amdpal(half %arg0, half inreg %arg1) {
!amdgpu.pal.metadata.msgpack = !{!0}
-!0 = !{!"\82\B0amdpal.pipelines\91\89\A4.api\A6Vulkan\B0.hardware_stages\81\A3.cs\83\AB.sgpr_limith\AB.vgpr_limit\CD\01\00\AF.wavefront_size@\B7.internal_pipeline_hash\92\CF\E83\B3\C2\D1)\7FG\CF[\8A\DF\EE[\7FD,\AA.registers\8A\CD.\07\01\CD.\08\01\CD.\09\01\CD.\12\CE@,\00\00\CD.\13\CD\07\8A\CD.(\00\CD.*\CE\16\0B\22Y\CD.@\CE\10\00\00\00\CD.B\CE\10\00\00\06\CD.D\00\A8.shaders\81\A8.compute\82\B0.api_shader_hash\92\CF\D3s\A6\8D\C5x\84\D4\00\B1.hardware_mapping\91\A3.cs\B0.spill_threshold\CE\FF\FF\FF\FF\A5.type\A2Cs\B0.user_data_limit\01\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\E5\A0\EB\F9}\C6\C1\13\CF\1A_\E7\F7\F2.mR\AD.llpc_version\A454.5\AEamdpal.version\92\02\03"} \ No newline at end of file
+!0 = !{!"\82\B0amdpal.pipelines\91\89\A4.api\A6Vulkan\B0.hardware_stages\81\A3.cs\83\AB.sgpr_limith\AB.vgpr_limit\CD\01\00\AF.wavefront_size@\B7.internal_pipeline_hash\92\CF\E83\B3\C2\D1)\7FG\CF[\8A\DF\EE[\7FD,\AA.registers\8A\CD.\07\01\CD.\08\01\CD.\09\01\CD.\12\CE@,\00\00\CD.\13\CD\07\8A\CD.(\00\CD.*\CE\16\0B\22Y\CD.@\CE\10\00\00\00\CD.B\CE\10\00\00\06\CD.D\00\A8.shaders\81\A8.compute\82\B0.api_shader_hash\92\CF\D3s\A6\8D\C5x\84\D4\00\B1.hardware_mapping\91\A3.cs\B0.spill_threshold\CE\FF\FF\FF\FF\A5.type\A2Cs\B0.user_data_limit\01\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\E5\A0\EB\F9}\C6\C1\13\CF\1A_\E7\F7\F2.mR\AD.llpc_version\A454.5\AEamdpal.version\92\02\03"}
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-vs.ll b/llvm/test/CodeGen/AMDGPU/amdpal-vs.ll
index ec8f698d69c2..c300ba187740 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-vs.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-vs.ll
@@ -11,7 +11,7 @@
; GCN-NEXT: .entry_point: vs_amdpal
; GCN-NEXT: .scratch_memory_size: 0
; GCN: .registers:
-; GCN-NEXT: 0x2c4a (SPI_SHADER_PGM_RSRC1_VS): 0
+; GCN-NEXT: '0x2c4a (SPI_SHADER_PGM_RSRC1_VS)': 0
; GCN-NEXT: ...
; GCN-NEXT: .end_amdgpu_pal_metadata
define amdgpu_vs half @vs_amdpal(half %arg0) {
diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll
index cf99b5d80e13..b2f9bf89d9ec 100644
--- a/llvm/test/CodeGen/AMDGPU/div_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll
@@ -282,21 +282,21 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v16, v1
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v15
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v16
+; GFX9-O0-NEXT: v_mov_b32_e32 v5, v16
; GFX9-O0-NEXT: v_mov_b32_e32 v10, v13
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v14
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v14
; GFX9-O0-NEXT: v_sub_co_u32_e32 v9, vcc, v9, v4
-; GFX9-O0-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v6, vcc
-; GFX9-O0-NEXT: v_subb_co_u32_e32 v13, vcc, v10, v4, vcc
; GFX9-O0-NEXT: v_subb_co_u32_e32 v5, vcc, v5, v6, vcc
+; GFX9-O0-NEXT: v_subb_co_u32_e32 v13, vcc, v10, v4, vcc
+; GFX9-O0-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v6, vcc
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v14, v5
+; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v10, v1
+; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v14, v1
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12
; GFX9-O0-NEXT: v_xor_b32_e64 v1, v5, v1
@@ -312,21 +312,21 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v8, v1
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v7
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v8
-; GFX9-O0-NEXT: v_mov_b32_e32 v8, v11
+; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 killed $vgpr7_vgpr8 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, v11
; GFX9-O0-NEXT: v_mov_b32_e32 v2, v12
; GFX9-O0-NEXT: v_sub_co_u32_e32 v1, vcc, v1, v3
-; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v5, vcc
-; GFX9-O0-NEXT: v_subb_co_u32_e32 v11, vcc, v8, v3, vcc
-; GFX9-O0-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v5, vcc
+; GFX9-O0-NEXT: v_subb_co_u32_e32 v8, vcc, v8, v5, vcc
+; GFX9-O0-NEXT: v_subb_co_u32_e32 v11, vcc, v7, v3, vcc
+; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v2, v5, vcc
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v12, v2
+; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, v8
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, v7
+; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v12, v7
; GFX9-O0-NEXT: v_xor_b32_e64 v5, v5, v6
; GFX9-O0-NEXT: v_xor_b32_e64 v3, v3, v4
; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
@@ -339,18 +339,26 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11
+; GFX9-O0-NEXT: v_mov_b32_e32 v4, v12
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v13
+; GFX9-O0-NEXT: v_mov_b32_e32 v4, v14
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v9
+; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v12
; GFX9-O0-NEXT: v_mov_b32_e32 v8, v2
; GFX9-O0-NEXT: v_or_b32_e64 v3, v8, v7
@@ -403,7 +411,8 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v5
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9
-; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[12:13], v[11:12], s[6:7]
+; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[6:7]
+; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[12:13], v[11:12], s[12:13]
; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v10, s[12:13]
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8
@@ -439,7 +448,8 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v12, v5
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12
-; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[13:14], s[6:7]
+; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7]
+; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[13:14], s[8:9]
; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v8, s[8:9]
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11
@@ -690,10 +700,10 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
@@ -903,14 +913,14 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19]
-; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(9)
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
@@ -1028,10 +1038,10 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19]
-; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/AMDGPU/extra-lds-size.ll b/llvm/test/CodeGen/AMDGPU/extra-lds-size.ll
index e376c3df1ac9..96ec90b1f4d0 100644
--- a/llvm/test/CodeGen/AMDGPU/extra-lds-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/extra-lds-size.ll
@@ -5,12 +5,12 @@
; Check EXTRA_LDS_SIZE in SPI_SHADER_PGM_RSRC2_PS.
-; GFX10-PAL: 0x2c0b (SPI_SHADER_PGM_RSRC2_PS): 0x800
+; GFX10-PAL: '0x2c0b (SPI_SHADER_PGM_RSRC2_PS)': 0x800
; GFX10-MESA: .long 45100
; GFX10-MESA-NEXT: .long 2048
-; GFX11-PAL: 0x2c0b (SPI_SHADER_PGM_RSRC2_PS): 0x400
+; GFX11-PAL: '0x2c0b (SPI_SHADER_PGM_RSRC2_PS)': 0x400
; GFX11-MESA: .long 45100
; GFX11-MESA-NEXT: .long 1024
diff --git a/llvm/test/CodeGen/AMDGPU/fp_trunc_store_bf16.ll b/llvm/test/CodeGen/AMDGPU/fp_trunc_store_bf16.ll
new file mode 100644
index 000000000000..5aaff773689f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fp_trunc_store_bf16.ll
@@ -0,0 +1,426 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a %s -o - | FileCheck %s
+
+define void @v2(<2 x float> %num, ptr addrspace(1) %p) {
+; CHECK-LABEL: v2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_bfe_u32 v4, v0, 16, 1
+; CHECK-NEXT: s_movk_i32 s4, 0x7fff
+; CHECK-NEXT: v_add3_u32 v4, v4, v0, s4
+; CHECK-NEXT: v_or_b32_e32 v5, 0x400000, v0
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
+; CHECK-NEXT: v_bfe_u32 v4, v1, 16, 1
+; CHECK-NEXT: v_add3_u32 v4, v4, v1, s4
+; CHECK-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc
+; CHECK-NEXT: s_mov_b32 s4, 0x7060302
+; CHECK-NEXT: v_perm_b32 v0, v1, v0, s4
+; CHECK-NEXT: global_store_dword v[2:3], v0, off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %conv = fptrunc <2 x float> %num to <2 x bfloat>
+ store <2 x bfloat> %conv, ptr addrspace(1) %p, align 8
+ ret void
+}
+
+define void @v4(<4 x float> %num, ptr addrspace(1) %p) {
+; CHECK-LABEL: v4:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_bfe_u32 v6, v2, 16, 1
+; CHECK-NEXT: s_movk_i32 s4, 0x7fff
+; CHECK-NEXT: v_add3_u32 v6, v6, v2, s4
+; CHECK-NEXT: v_or_b32_e32 v7, 0x400000, v2
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; CHECK-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc
+; CHECK-NEXT: v_bfe_u32 v6, v3, 16, 1
+; CHECK-NEXT: v_add3_u32 v6, v6, v3, s4
+; CHECK-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; CHECK-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc
+; CHECK-NEXT: s_mov_b32 s5, 0x7060302
+; CHECK-NEXT: v_perm_b32 v3, v3, v2, s5
+; CHECK-NEXT: v_bfe_u32 v2, v0, 16, 1
+; CHECK-NEXT: v_add3_u32 v2, v2, v0, s4
+; CHECK-NEXT: v_or_b32_e32 v6, 0x400000, v0
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v2, v6, vcc
+; CHECK-NEXT: v_bfe_u32 v2, v1, 16, 1
+; CHECK-NEXT: v_add3_u32 v2, v2, v1, s4
+; CHECK-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v2, v6, vcc
+; CHECK-NEXT: v_perm_b32 v2, v1, v0, s5
+; CHECK-NEXT: global_store_dwordx2 v[4:5], v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %conv = fptrunc <4 x float> %num to <4 x bfloat>
+ store <4 x bfloat> %conv, ptr addrspace(1) %p, align 8
+ ret void
+}
+
+define void @v8(<8 x float> %num, ptr addrspace(1) %p) {
+; CHECK-LABEL: v8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_bfe_u32 v10, v6, 16, 1
+; CHECK-NEXT: s_movk_i32 s4, 0x7fff
+; CHECK-NEXT: v_add3_u32 v10, v10, v6, s4
+; CHECK-NEXT: v_or_b32_e32 v11, 0x400000, v6
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; CHECK-NEXT: v_cndmask_b32_e32 v6, v10, v11, vcc
+; CHECK-NEXT: v_bfe_u32 v10, v7, 16, 1
+; CHECK-NEXT: v_add3_u32 v10, v10, v7, s4
+; CHECK-NEXT: v_or_b32_e32 v11, 0x400000, v7
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; CHECK-NEXT: v_cndmask_b32_e32 v7, v10, v11, vcc
+; CHECK-NEXT: s_mov_b32 s5, 0x7060302
+; CHECK-NEXT: v_perm_b32 v7, v7, v6, s5
+; CHECK-NEXT: v_bfe_u32 v6, v4, 16, 1
+; CHECK-NEXT: v_add3_u32 v6, v6, v4, s4
+; CHECK-NEXT: v_or_b32_e32 v10, 0x400000, v4
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; CHECK-NEXT: v_cndmask_b32_e32 v4, v6, v10, vcc
+; CHECK-NEXT: v_bfe_u32 v6, v5, 16, 1
+; CHECK-NEXT: v_add3_u32 v6, v6, v5, s4
+; CHECK-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; CHECK-NEXT: v_cndmask_b32_e32 v5, v6, v10, vcc
+; CHECK-NEXT: v_perm_b32 v6, v5, v4, s5
+; CHECK-NEXT: v_bfe_u32 v4, v2, 16, 1
+; CHECK-NEXT: v_add3_u32 v4, v4, v2, s4
+; CHECK-NEXT: v_or_b32_e32 v5, 0x400000, v2
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; CHECK-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc
+; CHECK-NEXT: v_bfe_u32 v4, v3, 16, 1
+; CHECK-NEXT: v_add3_u32 v4, v4, v3, s4
+; CHECK-NEXT: v_or_b32_e32 v5, 0x400000, v3
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; CHECK-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; CHECK-NEXT: v_perm_b32 v5, v3, v2, s5
+; CHECK-NEXT: v_bfe_u32 v2, v0, 16, 1
+; CHECK-NEXT: v_add3_u32 v2, v2, v0, s4
+; CHECK-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; CHECK-NEXT: v_bfe_u32 v2, v1, 16, 1
+; CHECK-NEXT: v_add3_u32 v2, v2, v1, s4
+; CHECK-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
+; CHECK-NEXT: v_perm_b32 v4, v1, v0, s5
+; CHECK-NEXT: global_store_dwordx4 v[8:9], v[4:7], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %conv = fptrunc <8 x float> %num to <8 x bfloat>
+ store <8 x bfloat> %conv, ptr addrspace(1) %p, align 8
+ ret void
+}
+
+define void @v16(<16 x float> %num, ptr addrspace(1) %p) {
+; CHECK-LABEL: v16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_bfe_u32 v18, v6, 16, 1
+; CHECK-NEXT: s_movk_i32 s4, 0x7fff
+; CHECK-NEXT: v_add3_u32 v18, v18, v6, s4
+; CHECK-NEXT: v_or_b32_e32 v19, 0x400000, v6
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; CHECK-NEXT: v_cndmask_b32_e32 v6, v18, v19, vcc
+; CHECK-NEXT: v_bfe_u32 v18, v7, 16, 1
+; CHECK-NEXT: v_add3_u32 v18, v18, v7, s4
+; CHECK-NEXT: v_or_b32_e32 v19, 0x400000, v7
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; CHECK-NEXT: v_cndmask_b32_e32 v7, v18, v19, vcc
+; CHECK-NEXT: s_mov_b32 s5, 0x7060302
+; CHECK-NEXT: v_perm_b32 v7, v7, v6, s5
+; CHECK-NEXT: v_bfe_u32 v6, v4, 16, 1
+; CHECK-NEXT: v_add3_u32 v6, v6, v4, s4
+; CHECK-NEXT: v_or_b32_e32 v18, 0x400000, v4
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; CHECK-NEXT: v_cndmask_b32_e32 v4, v6, v18, vcc
+; CHECK-NEXT: v_bfe_u32 v6, v5, 16, 1
+; CHECK-NEXT: v_add3_u32 v6, v6, v5, s4
+; CHECK-NEXT: v_or_b32_e32 v18, 0x400000, v5
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; CHECK-NEXT: v_cndmask_b32_e32 v5, v6, v18, vcc
+; CHECK-NEXT: v_perm_b32 v6, v5, v4, s5
+; CHECK-NEXT: v_bfe_u32 v4, v2, 16, 1
+; CHECK-NEXT: v_add3_u32 v4, v4, v2, s4
+; CHECK-NEXT: v_or_b32_e32 v5, 0x400000, v2
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; CHECK-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc
+; CHECK-NEXT: v_bfe_u32 v4, v3, 16, 1
+; CHECK-NEXT: v_add3_u32 v4, v4, v3, s4
+; CHECK-NEXT: v_or_b32_e32 v5, 0x400000, v3
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; CHECK-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; CHECK-NEXT: v_perm_b32 v5, v3, v2, s5
+; CHECK-NEXT: v_bfe_u32 v2, v0, 16, 1
+; CHECK-NEXT: v_add3_u32 v2, v2, v0, s4
+; CHECK-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; CHECK-NEXT: v_bfe_u32 v2, v1, 16, 1
+; CHECK-NEXT: v_add3_u32 v2, v2, v1, s4
+; CHECK-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
+; CHECK-NEXT: v_perm_b32 v4, v1, v0, s5
+; CHECK-NEXT: v_bfe_u32 v0, v14, 16, 1
+; CHECK-NEXT: v_add3_u32 v0, v0, v14, s4
+; CHECK-NEXT: v_or_b32_e32 v1, 0x400000, v14
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; CHECK-NEXT: v_bfe_u32 v1, v15, 16, 1
+; CHECK-NEXT: v_add3_u32 v1, v1, v15, s4
+; CHECK-NEXT: v_or_b32_e32 v2, 0x400000, v15
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; CHECK-NEXT: v_perm_b32 v3, v1, v0, s5
+; CHECK-NEXT: v_bfe_u32 v0, v12, 16, 1
+; CHECK-NEXT: v_add3_u32 v0, v0, v12, s4
+; CHECK-NEXT: v_or_b32_e32 v1, 0x400000, v12
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; CHECK-NEXT: v_bfe_u32 v1, v13, 16, 1
+; CHECK-NEXT: v_add3_u32 v1, v1, v13, s4
+; CHECK-NEXT: v_or_b32_e32 v2, 0x400000, v13
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; CHECK-NEXT: v_perm_b32 v2, v1, v0, s5
+; CHECK-NEXT: v_bfe_u32 v0, v10, 16, 1
+; CHECK-NEXT: v_add3_u32 v0, v0, v10, s4
+; CHECK-NEXT: v_or_b32_e32 v1, 0x400000, v10
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; CHECK-NEXT: v_bfe_u32 v1, v11, 16, 1
+; CHECK-NEXT: v_add3_u32 v1, v1, v11, s4
+; CHECK-NEXT: v_or_b32_e32 v10, 0x400000, v11
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc
+; CHECK-NEXT: v_perm_b32 v1, v1, v0, s5
+; CHECK-NEXT: v_bfe_u32 v0, v8, 16, 1
+; CHECK-NEXT: v_add3_u32 v0, v0, v8, s4
+; CHECK-NEXT: v_or_b32_e32 v10, 0x400000, v8
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; CHECK-NEXT: v_bfe_u32 v8, v9, 16, 1
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc
+; CHECK-NEXT: v_add3_u32 v8, v8, v9, s4
+; CHECK-NEXT: v_or_b32_e32 v10, 0x400000, v9
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; CHECK-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc
+; CHECK-NEXT: v_perm_b32 v0, v8, v0, s5
+; CHECK-NEXT: global_store_dwordx4 v[16:17], v[0:3], off offset:16
+; CHECK-NEXT: global_store_dwordx4 v[16:17], v[4:7], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %conv = fptrunc <16 x float> %num to <16 x bfloat>
+ store <16 x bfloat> %conv, ptr addrspace(1) %p, align 8
+ ret void
+}
+
+define void @v32(<32 x float> %num, ptr addrspace(1) %p) {
+; CHECK-LABEL: v32:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; CHECK-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; CHECK-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; CHECK-NEXT: v_bfe_u32 v34, v6, 16, 1
+; CHECK-NEXT: s_movk_i32 s4, 0x7fff
+; CHECK-NEXT: v_add3_u32 v34, v34, v6, s4
+; CHECK-NEXT: v_or_b32_e32 v35, 0x400000, v6
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; CHECK-NEXT: v_cndmask_b32_e32 v6, v34, v35, vcc
+; CHECK-NEXT: v_bfe_u32 v34, v7, 16, 1
+; CHECK-NEXT: v_add3_u32 v34, v34, v7, s4
+; CHECK-NEXT: v_or_b32_e32 v35, 0x400000, v7
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; CHECK-NEXT: v_cndmask_b32_e32 v7, v34, v35, vcc
+; CHECK-NEXT: s_mov_b32 s5, 0x7060302
+; CHECK-NEXT: v_perm_b32 v7, v7, v6, s5
+; CHECK-NEXT: v_bfe_u32 v6, v4, 16, 1
+; CHECK-NEXT: v_add3_u32 v6, v6, v4, s4
+; CHECK-NEXT: v_or_b32_e32 v34, 0x400000, v4
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; CHECK-NEXT: v_cndmask_b32_e32 v4, v6, v34, vcc
+; CHECK-NEXT: v_bfe_u32 v6, v5, 16, 1
+; CHECK-NEXT: v_add3_u32 v6, v6, v5, s4
+; CHECK-NEXT: v_or_b32_e32 v34, 0x400000, v5
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; CHECK-NEXT: v_cndmask_b32_e32 v5, v6, v34, vcc
+; CHECK-NEXT: v_perm_b32 v6, v5, v4, s5
+; CHECK-NEXT: v_bfe_u32 v4, v2, 16, 1
+; CHECK-NEXT: v_add3_u32 v4, v4, v2, s4
+; CHECK-NEXT: v_or_b32_e32 v5, 0x400000, v2
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; CHECK-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc
+; CHECK-NEXT: v_bfe_u32 v4, v3, 16, 1
+; CHECK-NEXT: v_add3_u32 v4, v4, v3, s4
+; CHECK-NEXT: v_or_b32_e32 v5, 0x400000, v3
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; CHECK-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; CHECK-NEXT: v_perm_b32 v5, v3, v2, s5
+; CHECK-NEXT: v_bfe_u32 v2, v0, 16, 1
+; CHECK-NEXT: v_add3_u32 v2, v2, v0, s4
+; CHECK-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; CHECK-NEXT: v_bfe_u32 v2, v1, 16, 1
+; CHECK-NEXT: v_add3_u32 v2, v2, v1, s4
+; CHECK-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
+; CHECK-NEXT: v_perm_b32 v4, v1, v0, s5
+; CHECK-NEXT: v_bfe_u32 v0, v14, 16, 1
+; CHECK-NEXT: v_add3_u32 v0, v0, v14, s4
+; CHECK-NEXT: v_or_b32_e32 v1, 0x400000, v14
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; CHECK-NEXT: v_bfe_u32 v1, v15, 16, 1
+; CHECK-NEXT: v_add3_u32 v1, v1, v15, s4
+; CHECK-NEXT: v_or_b32_e32 v2, 0x400000, v15
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; CHECK-NEXT: v_perm_b32 v3, v1, v0, s5
+; CHECK-NEXT: v_bfe_u32 v0, v12, 16, 1
+; CHECK-NEXT: v_add3_u32 v0, v0, v12, s4
+; CHECK-NEXT: v_or_b32_e32 v1, 0x400000, v12
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; CHECK-NEXT: v_bfe_u32 v1, v13, 16, 1
+; CHECK-NEXT: v_add3_u32 v1, v1, v13, s4
+; CHECK-NEXT: v_or_b32_e32 v2, 0x400000, v13
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; CHECK-NEXT: v_perm_b32 v2, v1, v0, s5
+; CHECK-NEXT: v_bfe_u32 v0, v10, 16, 1
+; CHECK-NEXT: v_add3_u32 v0, v0, v10, s4
+; CHECK-NEXT: v_or_b32_e32 v1, 0x400000, v10
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; CHECK-NEXT: v_bfe_u32 v1, v11, 16, 1
+; CHECK-NEXT: v_add3_u32 v1, v1, v11, s4
+; CHECK-NEXT: v_or_b32_e32 v10, 0x400000, v11
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc
+; CHECK-NEXT: v_perm_b32 v1, v1, v0, s5
+; CHECK-NEXT: v_bfe_u32 v0, v8, 16, 1
+; CHECK-NEXT: v_add3_u32 v0, v0, v8, s4
+; CHECK-NEXT: v_or_b32_e32 v10, 0x400000, v8
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; CHECK-NEXT: v_bfe_u32 v8, v9, 16, 1
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc
+; CHECK-NEXT: v_add3_u32 v8, v8, v9, s4
+; CHECK-NEXT: v_or_b32_e32 v10, 0x400000, v9
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; CHECK-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc
+; CHECK-NEXT: v_perm_b32 v0, v8, v0, s5
+; CHECK-NEXT: v_bfe_u32 v8, v22, 16, 1
+; CHECK-NEXT: v_add3_u32 v8, v8, v22, s4
+; CHECK-NEXT: v_or_b32_e32 v9, 0x400000, v22
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
+; CHECK-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc
+; CHECK-NEXT: v_bfe_u32 v9, v23, 16, 1
+; CHECK-NEXT: v_add3_u32 v9, v9, v23, s4
+; CHECK-NEXT: v_or_b32_e32 v10, 0x400000, v23
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
+; CHECK-NEXT: v_cndmask_b32_e32 v9, v9, v10, vcc
+; CHECK-NEXT: v_perm_b32 v11, v9, v8, s5
+; CHECK-NEXT: v_bfe_u32 v8, v20, 16, 1
+; CHECK-NEXT: v_add3_u32 v8, v8, v20, s4
+; CHECK-NEXT: v_or_b32_e32 v9, 0x400000, v20
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
+; CHECK-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc
+; CHECK-NEXT: v_bfe_u32 v9, v21, 16, 1
+; CHECK-NEXT: v_add3_u32 v9, v9, v21, s4
+; CHECK-NEXT: v_or_b32_e32 v10, 0x400000, v21
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
+; CHECK-NEXT: v_cndmask_b32_e32 v9, v9, v10, vcc
+; CHECK-NEXT: v_perm_b32 v10, v9, v8, s5
+; CHECK-NEXT: v_bfe_u32 v8, v18, 16, 1
+; CHECK-NEXT: v_add3_u32 v8, v8, v18, s4
+; CHECK-NEXT: v_or_b32_e32 v9, 0x400000, v18
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
+; CHECK-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc
+; CHECK-NEXT: v_bfe_u32 v9, v19, 16, 1
+; CHECK-NEXT: v_add3_u32 v9, v9, v19, s4
+; CHECK-NEXT: v_or_b32_e32 v12, 0x400000, v19
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
+; CHECK-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc
+; CHECK-NEXT: v_perm_b32 v9, v9, v8, s5
+; CHECK-NEXT: v_bfe_u32 v8, v16, 16, 1
+; CHECK-NEXT: v_add3_u32 v8, v8, v16, s4
+; CHECK-NEXT: v_or_b32_e32 v12, 0x400000, v16
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
+; CHECK-NEXT: v_cndmask_b32_e32 v8, v8, v12, vcc
+; CHECK-NEXT: v_bfe_u32 v12, v17, 16, 1
+; CHECK-NEXT: v_add3_u32 v12, v12, v17, s4
+; CHECK-NEXT: v_or_b32_e32 v13, 0x400000, v17
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; CHECK-NEXT: v_cndmask_b32_e32 v12, v12, v13, vcc
+; CHECK-NEXT: v_perm_b32 v8, v12, v8, s5
+; CHECK-NEXT: v_bfe_u32 v12, v30, 16, 1
+; CHECK-NEXT: v_add3_u32 v12, v12, v30, s4
+; CHECK-NEXT: v_or_b32_e32 v13, 0x400000, v30
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
+; CHECK-NEXT: v_cndmask_b32_e32 v12, v12, v13, vcc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_bfe_u32 v13, v31, 16, 1
+; CHECK-NEXT: v_add3_u32 v13, v13, v31, s4
+; CHECK-NEXT: v_or_b32_e32 v14, 0x400000, v31
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
+; CHECK-NEXT: v_cndmask_b32_e32 v13, v13, v14, vcc
+; CHECK-NEXT: v_perm_b32 v15, v13, v12, s5
+; CHECK-NEXT: v_bfe_u32 v12, v28, 16, 1
+; CHECK-NEXT: v_add3_u32 v12, v12, v28, s4
+; CHECK-NEXT: v_or_b32_e32 v13, 0x400000, v28
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v28, v28
+; CHECK-NEXT: v_cndmask_b32_e32 v12, v12, v13, vcc
+; CHECK-NEXT: v_bfe_u32 v13, v29, 16, 1
+; CHECK-NEXT: v_add3_u32 v13, v13, v29, s4
+; CHECK-NEXT: v_or_b32_e32 v14, 0x400000, v29
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v29, v29
+; CHECK-NEXT: v_cndmask_b32_e32 v13, v13, v14, vcc
+; CHECK-NEXT: v_perm_b32 v14, v13, v12, s5
+; CHECK-NEXT: v_bfe_u32 v12, v26, 16, 1
+; CHECK-NEXT: v_add3_u32 v12, v12, v26, s4
+; CHECK-NEXT: v_or_b32_e32 v13, 0x400000, v26
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v26, v26
+; CHECK-NEXT: v_cndmask_b32_e32 v12, v12, v13, vcc
+; CHECK-NEXT: v_bfe_u32 v13, v27, 16, 1
+; CHECK-NEXT: v_add3_u32 v13, v13, v27, s4
+; CHECK-NEXT: v_or_b32_e32 v16, 0x400000, v27
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v27, v27
+; CHECK-NEXT: v_cndmask_b32_e32 v13, v13, v16, vcc
+; CHECK-NEXT: v_perm_b32 v13, v13, v12, s5
+; CHECK-NEXT: v_bfe_u32 v12, v24, 16, 1
+; CHECK-NEXT: v_add3_u32 v12, v12, v24, s4
+; CHECK-NEXT: v_or_b32_e32 v16, 0x400000, v24
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
+; CHECK-NEXT: v_cndmask_b32_e32 v12, v12, v16, vcc
+; CHECK-NEXT: v_bfe_u32 v16, v25, 16, 1
+; CHECK-NEXT: v_add3_u32 v16, v16, v25, s4
+; CHECK-NEXT: v_or_b32_e32 v17, 0x400000, v25
+; CHECK-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
+; CHECK-NEXT: v_cndmask_b32_e32 v16, v16, v17, vcc
+; CHECK-NEXT: v_perm_b32 v12, v16, v12, s5
+; CHECK-NEXT: global_store_dwordx4 v[32:33], v[12:15], off offset:48
+; CHECK-NEXT: global_store_dwordx4 v[32:33], v[8:11], off offset:32
+; CHECK-NEXT: global_store_dwordx4 v[32:33], v[0:3], off offset:16
+; CHECK-NEXT: global_store_dwordx4 v[32:33], v[4:7], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %conv = fptrunc <32 x float> %num to <32 x bfloat>
+ store <32 x bfloat> %conv, ptr addrspace(1) %p, align 8
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx10.mir b/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx10.mir
new file mode 100644
index 000000000000..50eea4aebd5e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx10.mir
@@ -0,0 +1,35 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass si-insert-hard-clauses %s -o - | FileCheck %s
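+# On gfx10 the si-insert-hard-clauses pass leaves these NSA image-sample loads unbundled, so no S_CLAUSE is expected.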
+
+---
+name: mimg_nsa
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
+ ; CHECK-LABEL: name: mimg_nsa
+ ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_SAMPLE_D_V4_V9_nsa_gfx10 $vgpr3, $vgpr8, $vgpr7, $vgpr5, $vgpr4, $vgpr6, $vgpr0, $vgpr2, $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
+ ; CHECK-NEXT: $vgpr20_vgpr21_vgpr22_vgpr23 = IMAGE_SAMPLE_D_V4_V9_nsa_gfx10 $vgpr3, $vgpr8, $vgpr7, $vgpr5, $vgpr4, $vgpr6, $vgpr0, $vgpr2, $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
+ $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_SAMPLE_D_V4_V9_nsa_gfx10 $vgpr3, $vgpr8, $vgpr7, $vgpr5, $vgpr4, $vgpr6, $vgpr0, $vgpr2, $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
+ $vgpr20_vgpr21_vgpr22_vgpr23 = IMAGE_SAMPLE_D_V4_V9_nsa_gfx10 $vgpr3, $vgpr8, $vgpr7, $vgpr5, $vgpr4, $vgpr6, $vgpr0, $vgpr2, $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
+...
+
+---
+name: mimg_nsa_mixed
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
+ ; CHECK-LABEL: name: mimg_nsa_mixed
+ ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_SAMPLE_D_V4_V9_nsa_gfx10 $vgpr3, $vgpr8, $vgpr7, $vgpr5, $vgpr4, $vgpr6, $vgpr0, $vgpr2, $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
+ ; CHECK-NEXT: $vgpr14 = IMAGE_SAMPLE_LZ_V1_V2_gfx10 $vgpr5_vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 7)
+ ; CHECK-NEXT: $vgpr20_vgpr21_vgpr22_vgpr23 = IMAGE_SAMPLE_D_V4_V9_nsa_gfx10 $vgpr3, $vgpr8, $vgpr7, $vgpr5, $vgpr4, $vgpr6, $vgpr0, $vgpr2, $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
+ $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_SAMPLE_D_V4_V9_nsa_gfx10 $vgpr3, $vgpr8, $vgpr7, $vgpr5, $vgpr4, $vgpr6, $vgpr0, $vgpr2, $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
+ $vgpr14 = IMAGE_SAMPLE_LZ_V1_V2_gfx10 $vgpr5_vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 7)
+ $vgpr20_vgpr21_vgpr22_vgpr23 = IMAGE_SAMPLE_D_V4_V9_nsa_gfx10 $vgpr3, $vgpr8, $vgpr7, $vgpr5, $vgpr4, $vgpr6, $vgpr0, $vgpr2, $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
+...
diff --git a/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx11.mir b/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx11.mir
new file mode 100644
index 000000000000..b22de06e68a7
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx11.mir
@@ -0,0 +1,41 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -run-pass si-insert-hard-clauses %s -o - | FileCheck %s
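+# On gfx11 the pass groups adjacent image-sample loads into hard clauses, emitting S_CLAUSE bundles.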
+
+---
+name: mimg_nsa
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
+ ; CHECK-LABEL: name: mimg_nsa
+ ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: BUNDLE implicit-def $vgpr10_vgpr11_vgpr12, implicit-def $vgpr10, implicit-def $vgpr10_lo16, implicit-def $vgpr10_hi16, implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit-def $vgpr10_vgpr11, implicit-def $vgpr11_vgpr12, implicit-def $vgpr20_vgpr21_vgpr22, implicit-def $vgpr20, implicit-def $vgpr20_lo16, implicit-def $vgpr20_hi16, implicit-def $vgpr21, implicit-def $vgpr21_lo16, implicit-def $vgpr21_hi16, implicit-def $vgpr22, implicit-def $vgpr22_lo16, implicit-def $vgpr22_hi16, implicit-def $vgpr20_vgpr21, implicit-def $vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr8, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec {
+ ; CHECK-NEXT: S_CLAUSE 1
+ ; CHECK-NEXT: $vgpr10_vgpr11_vgpr12 = IMAGE_SAMPLE_LZ_V3_V2_nsa_gfx11 $vgpr3, $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
+ ; CHECK-NEXT: $vgpr20_vgpr21_vgpr22 = IMAGE_SAMPLE_LZ_V3_V2_nsa_gfx11 $vgpr3, $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 14, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
+ ; CHECK-NEXT: }
+ $vgpr10_vgpr11_vgpr12 = IMAGE_SAMPLE_LZ_V3_V2_nsa_gfx11 $vgpr3, $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
+ $vgpr20_vgpr21_vgpr22 = IMAGE_SAMPLE_LZ_V3_V2_nsa_gfx11 $vgpr3, $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 14, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
+...
+
+---
+name: mimg_nsa_mixed
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
+ ; CHECK-LABEL: name: mimg_nsa_mixed
+ ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr10_lo16, implicit-def $vgpr10_hi16, implicit-def $vgpr14, implicit-def $vgpr14_lo16, implicit-def $vgpr14_hi16, implicit-def $vgpr20_vgpr21_vgpr22, implicit-def $vgpr20, implicit-def $vgpr20_lo16, implicit-def $vgpr20_hi16, implicit-def $vgpr21, implicit-def $vgpr21_lo16, implicit-def $vgpr21_hi16, implicit-def $vgpr22, implicit-def $vgpr22_lo16, implicit-def $vgpr22_hi16, implicit-def $vgpr20_vgpr21, implicit-def $vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr8, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec, implicit $vgpr5_vgpr6 {
+ ; CHECK-NEXT: S_CLAUSE 2
+ ; CHECK-NEXT: $vgpr10 = IMAGE_SAMPLE_LZ_V1_V2_nsa_gfx11 $vgpr3, $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
+ ; CHECK-NEXT: $vgpr14 = IMAGE_SAMPLE_LZ_V1_V2_gfx11 $vgpr5_vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 7)
+ ; CHECK-NEXT: $vgpr20_vgpr21_vgpr22 = IMAGE_SAMPLE_LZ_V3_V2_nsa_gfx11 $vgpr3, $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 14, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
+ ; CHECK-NEXT: }
+ $vgpr10 = IMAGE_SAMPLE_LZ_V1_V2_nsa_gfx11 $vgpr3, $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
+ $vgpr14 = IMAGE_SAMPLE_LZ_V1_V2_gfx11 $vgpr5_vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 7)
+ $vgpr20_vgpr21_vgpr22 = IMAGE_SAMPLE_LZ_V3_V2_nsa_gfx11 $vgpr3, $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 14, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
+...
diff --git a/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx12.mir b/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx12.mir
new file mode 100644
index 000000000000..243a84562ab3
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx12.mir
@@ -0,0 +1,41 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass si-insert-hard-clauses %s -o - | FileCheck %s
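+# On gfx12 the pass likewise forms S_CLAUSE bundles for adjacent image-sample loads.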
+
+---
+name: mimg
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
+ ; CHECK-LABEL: name: mimg
+ ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: BUNDLE implicit-def $vgpr10_vgpr11_vgpr12, implicit-def $vgpr10, implicit-def $vgpr10_lo16, implicit-def $vgpr10_hi16, implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit-def $vgpr10_vgpr11, implicit-def $vgpr11_vgpr12, implicit-def $vgpr20_vgpr21_vgpr22, implicit-def $vgpr20, implicit-def $vgpr20_lo16, implicit-def $vgpr20_hi16, implicit-def $vgpr21, implicit-def $vgpr21_lo16, implicit-def $vgpr21_hi16, implicit-def $vgpr22, implicit-def $vgpr22_lo16, implicit-def $vgpr22_hi16, implicit-def $vgpr20_vgpr21, implicit-def $vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec {
+ ; CHECK-NEXT: S_CLAUSE 1
+ ; CHECK-NEXT: $vgpr10_vgpr11_vgpr12 = IMAGE_SAMPLE_LZ_V3_V2_gfx12 $vgpr3, $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
+ ; CHECK-NEXT: $vgpr20_vgpr21_vgpr22 = IMAGE_SAMPLE_LZ_V3_V2_gfx12 $vgpr3, $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 14, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
+ ; CHECK-NEXT: }
+ $vgpr10_vgpr11_vgpr12 = IMAGE_SAMPLE_LZ_V3_V2_gfx12 $vgpr3, $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
+ $vgpr20_vgpr21_vgpr22 = IMAGE_SAMPLE_LZ_V3_V2_gfx12 $vgpr3, $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 14, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
+...
+
+---
+name: mimg_mixed
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
+ ; CHECK-LABEL: name: mimg_mixed
+ ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr10_lo16, implicit-def $vgpr10_hi16, implicit-def $vgpr14, implicit-def $vgpr14_lo16, implicit-def $vgpr14_hi16, implicit-def $vgpr20_vgpr21_vgpr22, implicit-def $vgpr20, implicit-def $vgpr20_lo16, implicit-def $vgpr20_hi16, implicit-def $vgpr21, implicit-def $vgpr21_lo16, implicit-def $vgpr21_hi16, implicit-def $vgpr22, implicit-def $vgpr22_lo16, implicit-def $vgpr22_hi16, implicit-def $vgpr20_vgpr21, implicit-def $vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec, implicit $vgpr5, implicit $vgpr6 {
+ ; CHECK-NEXT: S_CLAUSE 2
+ ; CHECK-NEXT: $vgpr10 = IMAGE_SAMPLE_LZ_V1_V2_gfx12 $vgpr3, $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
+ ; CHECK-NEXT: $vgpr14 = IMAGE_SAMPLE_LZ_V1_V2_gfx12 $vgpr5, $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 7)
+ ; CHECK-NEXT: $vgpr20_vgpr21_vgpr22 = IMAGE_SAMPLE_LZ_V3_V2_gfx12 $vgpr3, $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 14, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
+ ; CHECK-NEXT: }
+ $vgpr10 = IMAGE_SAMPLE_LZ_V1_V2_gfx12 $vgpr3, $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
+ $vgpr14 = IMAGE_SAMPLE_LZ_V1_V2_gfx12 $vgpr5, $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 7)
+ $vgpr20_vgpr21_vgpr22 = IMAGE_SAMPLE_LZ_V3_V2_gfx12 $vgpr3, $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 14, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
+...
diff --git a/llvm/test/CodeGen/AMDGPU/hard-clauses.mir b/llvm/test/CodeGen/AMDGPU/hard-clauses.mir
index 1c6bdff51015..44b988a7121c 100644
--- a/llvm/test/CodeGen/AMDGPU/hard-clauses.mir
+++ b/llvm/test/CodeGen/AMDGPU/hard-clauses.mir
@@ -1,6 +1,7 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass si-insert-hard-clauses %s -o - | FileCheck %s
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -run-pass si-insert-hard-clauses %s -o - | FileCheck %s -check-prefix=GFX11
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass si-insert-hard-clauses %s -o - | FileCheck %s -check-prefix=GFX12
---
name: nop1
@@ -19,6 +20,12 @@ body: |
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
; GFX11-NEXT: S_NOP 2
+ ;
+ ; GFX12-LABEL: name: nop1
+ ; GFX12: liveins: $sgpr0_sgpr1
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
+ ; GFX12-NEXT: S_NOP 2
$sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
S_NOP 2
...
@@ -48,6 +55,16 @@ body: |
; GFX11-NEXT: S_NOP 2
; GFX11-NEXT: $sgpr3 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 4, 0
; GFX11-NEXT: }
+ ;
+ ; GFX12-LABEL: name: nop2
+ ; GFX12: liveins: $sgpr0_sgpr1
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: BUNDLE implicit-def $sgpr2, implicit-def $sgpr2_lo16, implicit-def $sgpr2_hi16, implicit-def $sgpr3, implicit-def $sgpr3_lo16, implicit-def $sgpr3_hi16, implicit $sgpr0_sgpr1 {
+ ; GFX12-NEXT: S_CLAUSE 2
+ ; GFX12-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
+ ; GFX12-NEXT: S_NOP 2
+ ; GFX12-NEXT: $sgpr3 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 4, 0
+ ; GFX12-NEXT: }
$sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
S_NOP 2
$sgpr3 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 4, 0
@@ -80,6 +97,17 @@ body: |
; GFX11-NEXT: $sgpr3 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 4, 0
; GFX11-NEXT: }
; GFX11-NEXT: S_NOP 2
+ ;
+ ; GFX12-LABEL: name: nop3
+ ; GFX12: liveins: $sgpr0_sgpr1
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: BUNDLE implicit-def $sgpr2, implicit-def $sgpr2_lo16, implicit-def $sgpr2_hi16, implicit-def $sgpr3, implicit-def $sgpr3_lo16, implicit-def $sgpr3_hi16, implicit $sgpr0_sgpr1 {
+ ; GFX12-NEXT: S_CLAUSE 2
+ ; GFX12-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
+ ; GFX12-NEXT: S_NOP 2
+ ; GFX12-NEXT: $sgpr3 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 4, 0
+ ; GFX12-NEXT: }
+ ; GFX12-NEXT: S_NOP 2
$sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
S_NOP 2
$sgpr3 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 4, 0
@@ -274,6 +302,99 @@ body: |
; GFX11-NEXT: $vgpr79 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 316, 0, 0, implicit $exec
; GFX11-NEXT: $vgpr80 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 320, 0, 0, implicit $exec
; GFX11-NEXT: }
+ ;
+ ; GFX12-LABEL: name: long_clause
+ ; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $vgpr0
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: BUNDLE implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit-def $vgpr3, implicit-def $vgpr3_lo16, implicit-def $vgpr3_hi16, implicit-def $vgpr4, implicit-def $vgpr4_lo16, implicit-def $vgpr4_hi16, implicit-def $vgpr5, implicit-def $vgpr5_lo16, implicit-def $vgpr5_hi16, implicit-def $vgpr6, implicit-def $vgpr6_lo16, implicit-def $vgpr6_hi16, implicit-def $vgpr7, implicit-def $vgpr7_lo16, implicit-def $vgpr7_hi16, implicit-def $vgpr8, implicit-def $vgpr8_lo16, implicit-def $vgpr8_hi16, implicit-def $vgpr9, implicit-def $vgpr9_lo16, implicit-def $vgpr9_hi16, implicit-def $vgpr10, implicit-def $vgpr10_lo16, implicit-def $vgpr10_hi16, implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit-def $vgpr13, implicit-def $vgpr13_lo16, implicit-def $vgpr13_hi16, implicit-def $vgpr14, implicit-def $vgpr14_lo16, implicit-def $vgpr14_hi16, implicit-def $vgpr15, implicit-def $vgpr15_lo16, implicit-def $vgpr15_hi16, implicit-def $vgpr16, implicit-def $vgpr16_lo16, implicit-def $vgpr16_hi16, implicit-def $vgpr17, implicit-def $vgpr17_lo16, implicit-def $vgpr17_hi16, implicit-def $vgpr18, implicit-def $vgpr18_lo16, implicit-def $vgpr18_hi16, implicit-def $vgpr19, implicit-def $vgpr19_lo16, implicit-def $vgpr19_hi16, implicit-def $vgpr20, implicit-def $vgpr20_lo16, implicit-def $vgpr20_hi16, implicit-def $vgpr21, implicit-def $vgpr21_lo16, implicit-def $vgpr21_hi16, implicit-def $vgpr22, implicit-def $vgpr22_lo16, implicit-def $vgpr22_hi16, implicit-def $vgpr23, implicit-def $vgpr23_lo16, implicit-def $vgpr23_hi16, implicit-def $vgpr24, implicit-def $vgpr24_lo16, implicit-def $vgpr24_hi16, implicit-def $vgpr25, implicit-def $vgpr25_lo16, implicit-def $vgpr25_hi16, implicit-def $vgpr26, implicit-def $vgpr26_lo16, implicit-def $vgpr26_hi16, implicit-def $vgpr27, implicit-def $vgpr27_lo16, implicit-def $vgpr27_hi16, implicit-def $vgpr28, implicit-def $vgpr28_lo16, implicit-def $vgpr28_hi16, implicit-def $vgpr29, implicit-def $vgpr29_lo16, implicit-def $vgpr29_hi16, implicit-def $vgpr30, implicit-def $vgpr30_lo16, implicit-def $vgpr30_hi16, implicit-def $vgpr31, implicit-def $vgpr31_lo16, implicit-def $vgpr31_hi16, implicit-def $vgpr32, implicit-def $vgpr32_lo16, implicit-def $vgpr32_hi16, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec {
+ ; GFX12-NEXT: S_CLAUSE 31
+ ; GFX12-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 8, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr3 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 12, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr4 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 16, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr5 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 20, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr6 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 24, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr7 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 28, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr8 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 32, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr9 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 36, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr10 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 40, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr11 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 44, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr12 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 48, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr13 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 52, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr14 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 56, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr15 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 60, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr16 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 64, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr17 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 68, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr18 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 72, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr19 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 76, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr20 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 80, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr21 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 84, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr22 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 88, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr23 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 92, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr24 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 96, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr25 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 100, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr26 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 104, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr27 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 108, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr28 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 112, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr29 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 116, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr30 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 120, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr31 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 124, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr32 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 128, 0, 0, implicit $exec
+ ; GFX12-NEXT: }
+ ; GFX12-NEXT: BUNDLE implicit-def $vgpr33, implicit-def $vgpr33_lo16, implicit-def $vgpr33_hi16, implicit-def $vgpr34, implicit-def $vgpr34_lo16, implicit-def $vgpr34_hi16, implicit-def $vgpr35, implicit-def $vgpr35_lo16, implicit-def $vgpr35_hi16, implicit-def $vgpr36, implicit-def $vgpr36_lo16, implicit-def $vgpr36_hi16, implicit-def $vgpr37, implicit-def $vgpr37_lo16, implicit-def $vgpr37_hi16, implicit-def $vgpr38, implicit-def $vgpr38_lo16, implicit-def $vgpr38_hi16, implicit-def $vgpr39, implicit-def $vgpr39_lo16, implicit-def $vgpr39_hi16, implicit-def $vgpr40, implicit-def $vgpr40_lo16, implicit-def $vgpr40_hi16, implicit-def $vgpr41, implicit-def $vgpr41_lo16, implicit-def $vgpr41_hi16, implicit-def $vgpr42, implicit-def $vgpr42_lo16, implicit-def $vgpr42_hi16, implicit-def $vgpr43, implicit-def $vgpr43_lo16, implicit-def $vgpr43_hi16, implicit-def $vgpr44, implicit-def $vgpr44_lo16, implicit-def $vgpr44_hi16, implicit-def $vgpr45, implicit-def $vgpr45_lo16, implicit-def $vgpr45_hi16, implicit-def $vgpr46, implicit-def $vgpr46_lo16, implicit-def $vgpr46_hi16, implicit-def $vgpr47, implicit-def $vgpr47_lo16, implicit-def $vgpr47_hi16, implicit-def $vgpr48, implicit-def $vgpr48_lo16, implicit-def $vgpr48_hi16, implicit-def $vgpr49, implicit-def $vgpr49_lo16, implicit-def $vgpr49_hi16, implicit-def $vgpr50, implicit-def $vgpr50_lo16, implicit-def $vgpr50_hi16, implicit-def $vgpr51, implicit-def $vgpr51_lo16, implicit-def $vgpr51_hi16, implicit-def $vgpr52, implicit-def $vgpr52_lo16, implicit-def $vgpr52_hi16, implicit-def $vgpr53, implicit-def $vgpr53_lo16, implicit-def $vgpr53_hi16, implicit-def $vgpr54, implicit-def $vgpr54_lo16, implicit-def $vgpr54_hi16, implicit-def $vgpr55, implicit-def $vgpr55_lo16, implicit-def $vgpr55_hi16, implicit-def $vgpr56, implicit-def $vgpr56_lo16, implicit-def $vgpr56_hi16, implicit-def $vgpr57, implicit-def $vgpr57_lo16, implicit-def $vgpr57_hi16, implicit-def $vgpr58, implicit-def $vgpr58_lo16, implicit-def $vgpr58_hi16, implicit-def $vgpr59, implicit-def $vgpr59_lo16, implicit-def $vgpr59_hi16, implicit-def $vgpr60, implicit-def $vgpr60_lo16, implicit-def $vgpr60_hi16, implicit-def $vgpr61, implicit-def $vgpr61_lo16, implicit-def $vgpr61_hi16, implicit-def $vgpr62, implicit-def $vgpr62_lo16, implicit-def $vgpr62_hi16, implicit-def $vgpr63, implicit-def $vgpr63_lo16, implicit-def $vgpr63_hi16, implicit-def $vgpr64, implicit-def $vgpr64_lo16, implicit-def $vgpr64_hi16, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec {
+ ; GFX12-NEXT: S_CLAUSE 31
+ ; GFX12-NEXT: $vgpr33 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 132, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr34 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 136, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr35 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 140, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr36 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 144, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr37 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 148, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr38 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 152, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr39 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 156, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr40 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 160, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr41 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 164, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr42 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 168, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr43 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 172, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr44 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 176, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr45 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 180, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr46 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 184, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr47 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 188, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr48 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 192, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr49 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 196, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr50 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 200, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr51 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 204, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr52 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 208, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr53 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 212, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr54 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 216, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr55 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 220, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr56 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 224, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr57 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 228, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr58 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 232, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr59 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 236, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr60 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 240, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr61 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 244, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr62 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 248, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr63 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 252, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr64 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 256, 0, 0, implicit $exec
+ ; GFX12-NEXT: }
+ ; GFX12-NEXT: BUNDLE implicit-def $vgpr65, implicit-def $vgpr65_lo16, implicit-def $vgpr65_hi16, implicit-def $vgpr66, implicit-def $vgpr66_lo16, implicit-def $vgpr66_hi16, implicit-def $vgpr67, implicit-def $vgpr67_lo16, implicit-def $vgpr67_hi16, implicit-def $vgpr68, implicit-def $vgpr68_lo16, implicit-def $vgpr68_hi16, implicit-def $vgpr69, implicit-def $vgpr69_lo16, implicit-def $vgpr69_hi16, implicit-def $vgpr70, implicit-def $vgpr70_lo16, implicit-def $vgpr70_hi16, implicit-def $vgpr71, implicit-def $vgpr71_lo16, implicit-def $vgpr71_hi16, implicit-def $vgpr72, implicit-def $vgpr72_lo16, implicit-def $vgpr72_hi16, implicit-def $vgpr73, implicit-def $vgpr73_lo16, implicit-def $vgpr73_hi16, implicit-def $vgpr74, implicit-def $vgpr74_lo16, implicit-def $vgpr74_hi16, implicit-def $vgpr75, implicit-def $vgpr75_lo16, implicit-def $vgpr75_hi16, implicit-def $vgpr76, implicit-def $vgpr76_lo16, implicit-def $vgpr76_hi16, implicit-def $vgpr77, implicit-def $vgpr77_lo16, implicit-def $vgpr77_hi16, implicit-def $vgpr78, implicit-def $vgpr78_lo16, implicit-def $vgpr78_hi16, implicit-def $vgpr79, implicit-def $vgpr79_lo16, implicit-def $vgpr79_hi16, implicit-def $vgpr80, implicit-def $vgpr80_lo16, implicit-def $vgpr80_hi16, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec {
+ ; GFX12-NEXT: S_CLAUSE 15
+ ; GFX12-NEXT: $vgpr65 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 260, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr66 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 264, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr67 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 268, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr68 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 272, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr69 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 276, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr70 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 280, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr71 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 284, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr72 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 288, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr73 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 292, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr74 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 296, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr75 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 300, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr76 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 304, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr77 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 308, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr78 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 312, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr79 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 316, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr80 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 320, 0, 0, implicit $exec
+ ; GFX12-NEXT: }
$vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec
$vgpr2 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 8, 0, 0, implicit $exec
$vgpr3 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 12, 0, 0, implicit $exec
@@ -357,57 +478,6 @@ body: |
...
---
-name: mimg_nsa
-tracksRegLiveness: true
-body: |
- bb.0:
- liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
- ; CHECK-LABEL: name: mimg_nsa
- ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_SAMPLE_D_V4_V9_nsa_gfx10 $vgpr3, $vgpr8, $vgpr7, $vgpr5, $vgpr4, $vgpr6, $vgpr0, $vgpr2, $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
- ; CHECK-NEXT: $vgpr20_vgpr21_vgpr22_vgpr23 = IMAGE_SAMPLE_D_V4_V9_nsa_gfx10 $vgpr3, $vgpr8, $vgpr7, $vgpr5, $vgpr4, $vgpr6, $vgpr0, $vgpr2, $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
- ;
- ; GFX11-LABEL: name: mimg_nsa
- ; GFX11: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
- ; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: BUNDLE implicit-def $vgpr10_vgpr11_vgpr12_vgpr13, implicit-def $vgpr10, implicit-def $vgpr10_lo16, implicit-def $vgpr10_hi16, implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit-def $vgpr13, implicit-def $vgpr13_lo16, implicit-def $vgpr13_hi16, implicit-def $vgpr10_vgpr11, implicit-def $vgpr10_vgpr11_vgpr12, implicit-def $vgpr11_vgpr12, implicit-def $vgpr11_vgpr12_vgpr13, implicit-def $vgpr12_vgpr13, implicit-def $vgpr20_vgpr21_vgpr22_vgpr23, implicit-def $vgpr20, implicit-def $vgpr20_lo16, implicit-def $vgpr20_hi16, implicit-def $vgpr21, implicit-def $vgpr21_lo16, implicit-def $vgpr21_hi16, implicit-def $vgpr22, implicit-def $vgpr22_lo16, implicit-def $vgpr22_hi16, implicit-def $vgpr23, implicit-def $vgpr23_lo16, implicit-def $vgpr23_hi16, implicit-def $vgpr20_vgpr21, implicit-def $vgpr20_vgpr21_vgpr22, implicit-def $vgpr21_vgpr22, implicit-def $vgpr21_vgpr22_vgpr23, implicit-def $vgpr22_vgpr23, implicit $vgpr3, implicit $vgpr8, implicit $vgpr7, implicit $vgpr5, implicit $vgpr4, implicit $vgpr6, implicit $vgpr0, implicit $vgpr2, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec {
- ; GFX11-NEXT: S_CLAUSE 1
- ; GFX11-NEXT: $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_SAMPLE_D_V4_V9_nsa_gfx10 $vgpr3, $vgpr8, $vgpr7, $vgpr5, $vgpr4, $vgpr6, $vgpr0, $vgpr2, $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
- ; GFX11-NEXT: $vgpr20_vgpr21_vgpr22_vgpr23 = IMAGE_SAMPLE_D_V4_V9_nsa_gfx10 $vgpr3, $vgpr8, $vgpr7, $vgpr5, $vgpr4, $vgpr6, $vgpr0, $vgpr2, $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
- ; GFX11-NEXT: }
- $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_SAMPLE_D_V4_V9_nsa_gfx10 $vgpr3, $vgpr8, $vgpr7, $vgpr5, $vgpr4, $vgpr6, $vgpr0, $vgpr2, $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
- $vgpr20_vgpr21_vgpr22_vgpr23 = IMAGE_SAMPLE_D_V4_V9_nsa_gfx10 $vgpr3, $vgpr8, $vgpr7, $vgpr5, $vgpr4, $vgpr6, $vgpr0, $vgpr2, $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
-...
-
----
-name: mimg_nsa_mixed
-tracksRegLiveness: true
-body: |
- bb.0:
- liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
- ; CHECK-LABEL: name: mimg_nsa_mixed
- ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_SAMPLE_D_V4_V9_nsa_gfx10 $vgpr3, $vgpr8, $vgpr7, $vgpr5, $vgpr4, $vgpr6, $vgpr0, $vgpr2, $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
- ; CHECK-NEXT: $vgpr14 = IMAGE_SAMPLE_LZ_V1_V2_gfx10 $vgpr5_vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 7)
- ; CHECK-NEXT: $vgpr20_vgpr21_vgpr22_vgpr23 = IMAGE_SAMPLE_D_V4_V9_nsa_gfx10 $vgpr3, $vgpr8, $vgpr7, $vgpr5, $vgpr4, $vgpr6, $vgpr0, $vgpr2, $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
- ;
- ; GFX11-LABEL: name: mimg_nsa_mixed
- ; GFX11: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
- ; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: BUNDLE implicit-def $vgpr10_vgpr11_vgpr12_vgpr13, implicit-def $vgpr10, implicit-def $vgpr10_lo16, implicit-def $vgpr10_hi16, implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit-def $vgpr13, implicit-def $vgpr13_lo16, implicit-def $vgpr13_hi16, implicit-def $vgpr10_vgpr11, implicit-def $vgpr10_vgpr11_vgpr12, implicit-def $vgpr11_vgpr12, implicit-def $vgpr11_vgpr12_vgpr13, implicit-def $vgpr12_vgpr13, implicit-def $vgpr14, implicit-def $vgpr14_lo16, implicit-def $vgpr14_hi16, implicit-def $vgpr20_vgpr21_vgpr22_vgpr23, implicit-def $vgpr20, implicit-def $vgpr20_lo16, implicit-def $vgpr20_hi16, implicit-def $vgpr21, implicit-def $vgpr21_lo16, implicit-def $vgpr21_hi16, implicit-def $vgpr22, implicit-def $vgpr22_lo16, implicit-def $vgpr22_hi16, implicit-def $vgpr23, implicit-def $vgpr23_lo16, implicit-def $vgpr23_hi16, implicit-def $vgpr20_vgpr21, implicit-def $vgpr20_vgpr21_vgpr22, implicit-def $vgpr21_vgpr22, implicit-def $vgpr21_vgpr22_vgpr23, implicit-def $vgpr22_vgpr23, implicit $vgpr3, implicit $vgpr8, implicit $vgpr7, implicit $vgpr5, implicit $vgpr4, implicit $vgpr6, implicit $vgpr0, implicit $vgpr2, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec, implicit $vgpr5_vgpr6 {
- ; GFX11-NEXT: S_CLAUSE 2
- ; GFX11-NEXT: $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_SAMPLE_D_V4_V9_nsa_gfx10 $vgpr3, $vgpr8, $vgpr7, $vgpr5, $vgpr4, $vgpr6, $vgpr0, $vgpr2, $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
- ; GFX11-NEXT: $vgpr14 = IMAGE_SAMPLE_LZ_V1_V2_gfx10 $vgpr5_vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 7)
- ; GFX11-NEXT: $vgpr20_vgpr21_vgpr22_vgpr23 = IMAGE_SAMPLE_D_V4_V9_nsa_gfx10 $vgpr3, $vgpr8, $vgpr7, $vgpr5, $vgpr4, $vgpr6, $vgpr0, $vgpr2, $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
- ; GFX11-NEXT: }
- $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_SAMPLE_D_V4_V9_nsa_gfx10 $vgpr3, $vgpr8, $vgpr7, $vgpr5, $vgpr4, $vgpr6, $vgpr0, $vgpr2, $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
- $vgpr14 = IMAGE_SAMPLE_LZ_V1_V2_gfx10 $vgpr5_vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 7)
- $vgpr20_vgpr21_vgpr22_vgpr23 = IMAGE_SAMPLE_D_V4_V9_nsa_gfx10 $vgpr3, $vgpr8, $vgpr7, $vgpr5, $vgpr4, $vgpr6, $vgpr0, $vgpr2, $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
-...
-
----
name: kill
tracksRegLiveness: true
body: |
@@ -432,6 +502,16 @@ body: |
; GFX11-NEXT: KILL undef renamable $sgpr4
; GFX11-NEXT: $sgpr3 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 4, 0
; GFX11-NEXT: }
+ ;
+ ; GFX12-LABEL: name: kill
+ ; GFX12: liveins: $sgpr0_sgpr1, $sgpr4
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: BUNDLE implicit-def $sgpr2, implicit-def $sgpr2_lo16, implicit-def $sgpr2_hi16, implicit-def $sgpr3, implicit-def $sgpr3_lo16, implicit-def $sgpr3_hi16, implicit $sgpr0_sgpr1, implicit undef $sgpr4 {
+ ; GFX12-NEXT: S_CLAUSE 1
+ ; GFX12-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
+ ; GFX12-NEXT: KILL undef renamable $sgpr4
+ ; GFX12-NEXT: $sgpr3 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 4, 0
+ ; GFX12-NEXT: }
$sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
KILL undef renamable $sgpr4
$sgpr3 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 4, 0
@@ -464,6 +544,17 @@ body: |
; GFX11-NEXT: $sgpr3 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 4, 0
; GFX11-NEXT: }
; GFX11-NEXT: KILL undef renamable $sgpr5
+ ;
+ ; GFX12-LABEL: name: kill2
+ ; GFX12: liveins: $sgpr0_sgpr1, $sgpr4, $sgpr5
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: BUNDLE implicit-def $sgpr2, implicit-def $sgpr2_lo16, implicit-def $sgpr2_hi16, implicit-def $sgpr3, implicit-def $sgpr3_lo16, implicit-def $sgpr3_hi16, implicit $sgpr0_sgpr1, implicit undef $sgpr4 {
+ ; GFX12-NEXT: S_CLAUSE 1
+ ; GFX12-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
+ ; GFX12-NEXT: KILL undef renamable $sgpr4
+ ; GFX12-NEXT: $sgpr3 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 4, 0
+ ; GFX12-NEXT: }
+ ; GFX12-NEXT: KILL undef renamable $sgpr5
$sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
KILL undef renamable $sgpr4
$sgpr3 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 4, 0
@@ -490,6 +581,12 @@ body: |
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: $vgpr3 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
; GFX11-NEXT: $vgpr4 = FLAT_ATOMIC_ADD_RTN $vgpr0_vgpr1, $vgpr2, 4, 0, implicit $exec, implicit $flat_scr
+ ;
+ ; GFX12-LABEL: name: flat_load_atomic
+ ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: $vgpr3 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ ; GFX12-NEXT: $vgpr4 = FLAT_ATOMIC_ADD_RTN $vgpr0_vgpr1, $vgpr2, 4, 0, implicit $exec, implicit $flat_scr
$vgpr3 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
$vgpr4 = FLAT_ATOMIC_ADD_RTN $vgpr0_vgpr1, $vgpr2, 4, 0, implicit $exec, implicit $flat_scr
...
@@ -514,6 +611,12 @@ body: |
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: $vgpr3 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
; GFX11-NEXT: $vgpr4 = GLOBAL_ATOMIC_ADD_RTN $vgpr0_vgpr1, $vgpr2, 4, 0, implicit $exec
+ ;
+ ; GFX12-LABEL: name: global_load_atomic
+ ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: $vgpr3 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr4 = GLOBAL_ATOMIC_ADD_RTN $vgpr0_vgpr1, $vgpr2, 4, 0, implicit $exec
$vgpr3 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
$vgpr4 = GLOBAL_ATOMIC_ADD_RTN $vgpr0_vgpr1, $vgpr2, 4, 0, implicit $exec
...
@@ -535,6 +638,12 @@ body: |
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: $vgpr2 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
; GFX11-NEXT: $vgpr3 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 4, 0, implicit $exec, implicit $flat_scr
+ ;
+ ; GFX12-LABEL: name: flat_global_load
+ ; GFX12: liveins: $vgpr0_vgpr1
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: $vgpr2 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ ; GFX12-NEXT: $vgpr3 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 4, 0, implicit $exec, implicit $flat_scr
$vgpr2 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
$vgpr3 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 4, 0, implicit $exec, implicit $flat_scr
...
@@ -559,6 +668,12 @@ body: |
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec
; GFX11-NEXT: $vgpr0 = BUFFER_ATOMIC_ADD_OFFSET_RTN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 4, 0, 0, implicit $exec
+ ;
+ ; GFX12-LABEL: name: buffer_load_atomic
+ ; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $vgpr0
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr0 = BUFFER_ATOMIC_ADD_OFFSET_RTN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 4, 0, 0, implicit $exec
$vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec
$vgpr0 = BUFFER_ATOMIC_ADD_OFFSET_RTN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 4, 0, 0, implicit $exec
...
@@ -580,6 +695,12 @@ body: |
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: $vgpr3 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
; GFX11-NEXT: FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 4, 0, implicit $exec, implicit $flat_scr
+ ;
+ ; GFX12-LABEL: name: flat_load_store
+ ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: $vgpr3 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ ; GFX12-NEXT: FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 4, 0, implicit $exec, implicit $flat_scr
$vgpr3 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 4, 0, implicit $exec, implicit $flat_scr
...
@@ -601,6 +722,12 @@ body: |
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: $vgpr3 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
; GFX11-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 4, 0, implicit $exec
+ ;
+ ; GFX12-LABEL: name: global_load_store
+ ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: $vgpr3 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
+ ; GFX12-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 4, 0, implicit $exec
$vgpr3 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 4, 0, implicit $exec
...
@@ -622,6 +749,12 @@ body: |
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec
; GFX11-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec
+ ;
+ ; GFX12-LABEL: name: buffer_load_store
+ ; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $vgpr0
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec
+ ; GFX12-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec
$vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec
BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec
...
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll
index e21d61036375..ffedde9416bb 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll
@@ -97,9 +97,7 @@ define amdgpu_cs void @test_cvt_sr_bf8_f32_byte0(i32 %a, i32 %r, i32 %old, ptr a
define amdgpu_cs void @test_cvt_sr_fp8_f32_byte1(i32 %a, i32 %r, i32 %old, ptr addrspace(1) %out) {
; GFX12-LABEL: test_cvt_sr_fp8_f32_byte1:
; GFX12: ; %bb.0:
-; GFX12-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,1,0]
+; GFX12-NEXT: v_cvt_sr_fp8_f32_e64_dpp v2, v0, v1 byte_sel:1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX12-NEXT: global_store_b32 v[3:4], v2, off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -114,9 +112,7 @@ define amdgpu_cs void @test_cvt_sr_fp8_f32_byte1(i32 %a, i32 %r, i32 %old, ptr a
define amdgpu_cs void @test_cvt_sr_fp8_f32_byte2(i32 %a, i32 %r, i32 %old, ptr addrspace(1) %out) {
; GFX12-LABEL: test_cvt_sr_fp8_f32_byte2:
; GFX12: ; %bb.0:
-; GFX12-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,0,1]
+; GFX12-NEXT: v_cvt_sr_fp8_f32_e64_dpp v2, v0, v1 byte_sel:2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX12-NEXT: global_store_b32 v[3:4], v2, off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll
index 9b8fdf901704..7662a3b78dea 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll
@@ -385,7 +385,7 @@ define i32 @test_cvt_sr_bf8_f32_byte1(float %x, i32 %r, i32 %old) {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_cvt_sr_bf8_f32 v2, v0, v1 op_sel:[0,0,1,0]
+; GFX12-NEXT: v_cvt_sr_bf8_f32 v2, v0, v1 byte_sel:1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mov_b32_e32 v0, v2
; GFX12-NEXT: s_setpc_b64 s[30:31]
@@ -409,7 +409,7 @@ define i32 @test_cvt_sr_bf8_f32_byte2(float %x, i32 %r, i32 %old) {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_cvt_sr_bf8_f32 v2, v0, v1 op_sel:[0,0,0,1]
+; GFX12-NEXT: v_cvt_sr_bf8_f32 v2, v0, v1 byte_sel:2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mov_b32_e32 v0, v2
; GFX12-NEXT: s_setpc_b64 s[30:31]
@@ -433,7 +433,7 @@ define i32 @test_cvt_sr_bf8_f32_byte3(float %x, i32 %r, i32 %old) {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_cvt_sr_bf8_f32 v2, v0, v1 op_sel:[0,0,1,1]
+; GFX12-NEXT: v_cvt_sr_bf8_f32 v2, v0, v1 byte_sel:3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mov_b32_e32 v0, v2
; GFX12-NEXT: s_setpc_b64 s[30:31]
@@ -479,7 +479,7 @@ define i32 @test_cvt_sr_fp8_f32_byte1(float %x, i32 %r, i32 %old) {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,1,0]
+; GFX12-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 byte_sel:1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mov_b32_e32 v0, v2
; GFX12-NEXT: s_setpc_b64 s[30:31]
@@ -503,7 +503,7 @@ define i32 @test_cvt_sr_fp8_f32_byte2(float %x, i32 %r, i32 %old) {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,0,1]
+; GFX12-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 byte_sel:2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mov_b32_e32 v0, v2
; GFX12-NEXT: s_setpc_b64 s[30:31]
@@ -527,7 +527,7 @@ define i32 @test_cvt_sr_fp8_f32_byte3(float %x, i32 %r, i32 %old) {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,1,1]
+; GFX12-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 byte_sel:3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mov_b32_e32 v0, v2
; GFX12-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/pal-userdata-regs.ll b/llvm/test/CodeGen/AMDGPU/pal-userdata-regs.ll
index 6d043e2b6b0a..591deda611b2 100644
--- a/llvm/test/CodeGen/AMDGPU/pal-userdata-regs.ll
+++ b/llvm/test/CodeGen/AMDGPU/pal-userdata-regs.ll
@@ -4,12 +4,12 @@
; full tessellation-and-geometry pipeline, compiled on gfx8 so it uses all six
; hardware shader types.
-; CHECK-DAG: 0x2c0c (SPI_SHADER_USER_DATA_PS_0): 0x10000000
-; CHECK-DAG: 0x2c4c (SPI_SHADER_USER_DATA_VS_0): 0x10000000
-; CHECK-DAG: 0x2c8c (SPI_SHADER_USER_DATA_GS_0): 0x10000000
-; CHECK-DAG: 0x2ccc (SPI_SHADER_USER_DATA_ES_0): 0x10000000
-; CHECK-DAG: 0x2d0c (SPI_SHADER_USER_DATA_HS_0): 0x10000000
-; CHECK-DAG: 0x2d4c (SPI_SHADER_USER_DATA_LS_0): 0x10000000
+; CHECK-DAG: '0x2c0c (SPI_SHADER_USER_DATA_PS_0)': 0x10000000
+; CHECK-DAG: '0x2c4c (SPI_SHADER_USER_DATA_VS_0)': 0x10000000
+; CHECK-DAG: '0x2c8c (SPI_SHADER_USER_DATA_GS_0)': 0x10000000
+; CHECK-DAG: '0x2ccc (SPI_SHADER_USER_DATA_ES_0)': 0x10000000
+; CHECK-DAG: '0x2d0c (SPI_SHADER_USER_DATA_HS_0)': 0x10000000
+; CHECK-DAG: '0x2d4c (SPI_SHADER_USER_DATA_LS_0)': 0x10000000
!amdgpu.pal.metadata.msgpack = !{!0}
diff --git a/llvm/test/CodeGen/AMDGPU/rem_i128.ll b/llvm/test/CodeGen/AMDGPU/rem_i128.ll
index 6ba66ccf7186..b068d87c4d6f 100644
--- a/llvm/test/CodeGen/AMDGPU/rem_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/rem_i128.ll
@@ -242,130 +242,137 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0: ; %bb.0: ; %_udiv-special-cases
; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-O0-NEXT: ; implicit-def: $vgpr8 : SGPR spill to VGPR lane
-; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6
-; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v14, v2
-; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1
-; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: v_mov_b32_e32 v4, v0
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7
+; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, v2
+; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, v1
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0
; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19]
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5
+; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
+; GFX9-O0-NEXT: v_mov_b32_e32 v5, v2
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6
+; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, v7
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7
+; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, v9
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec
-; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
-; GFX9-O0-NEXT: v_mov_b32_e32 v15, v1
+; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, v3
; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5
; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5
; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5
; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5
; GFX9-O0-NEXT: s_mov_b32 s4, 63
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, v14
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v15
-; GFX9-O0-NEXT: v_ashrrev_i64 v[12:13], s4, v[6:7]
-; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v11, v9
+; GFX9-O0-NEXT: v_mov_b32_e32 v10, v8
+; GFX9-O0-NEXT: v_ashrrev_i64 v[11:12], s4, v[10:11]
+; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, v12
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v13
-; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v14, v12
+; GFX9-O0-NEXT: v_mov_b32_e32 v13, v11
+; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v9
-; GFX9-O0-NEXT: v_ashrrev_i64 v[6:7], s4, v[6:7]
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v15
-; GFX9-O0-NEXT: v_mov_b32_e32 v10, v13
-; GFX9-O0-NEXT: v_xor_b32_e64 v1, v1, v10
-; GFX9-O0-NEXT: v_mov_b32_e32 v11, v14
-; GFX9-O0-NEXT: v_xor_b32_e64 v13, v11, v12
+; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v14, v7
+; GFX9-O0-NEXT: v_mov_b32_e32 v13, v6
+; GFX9-O0-NEXT: v_ashrrev_i64 v[15:16], s4, v[13:14]
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v9
+; GFX9-O0-NEXT: v_mov_b32_e32 v10, v12
+; GFX9-O0-NEXT: v_xor_b32_e64 v3, v3, v10
+; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 killed $vgpr8_vgpr9 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v12, v11
+; GFX9-O0-NEXT: v_xor_b32_e64 v13, v8, v12
; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v14, v1
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5
-; GFX9-O0-NEXT: v_xor_b32_e64 v1, v1, v10
-; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
-; GFX9-O0-NEXT: v_xor_b32_e64 v15, v4, v12
-; GFX9-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v16, v1
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v9
-; GFX9-O0-NEXT: v_mov_b32_e32 v4, v7
-; GFX9-O0-NEXT: v_xor_b32_e64 v1, v1, v4
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8
-; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec
-; GFX9-O0-NEXT: v_xor_b32_e64 v7, v5, v6
+; GFX9-O0-NEXT: v_mov_b32_e32 v14, v3
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2
+; GFX9-O0-NEXT: v_xor_b32_e64 v3, v3, v10
+; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $vgpr1_vgpr2 killed $exec
+; GFX9-O0-NEXT: v_xor_b32_e64 v1, v1, v12
+; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, v7
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v16
+; GFX9-O0-NEXT: v_xor_b32_e64 v9, v8, v3
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, v15
+; GFX9-O0-NEXT: v_xor_b32_e64 v7, v7, v6
; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v8, v1
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3
-; GFX9-O0-NEXT: v_xor_b32_e64 v1, v1, v4
-; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $vgpr2_vgpr3 killed $exec
-; GFX9-O0-NEXT: v_xor_b32_e64 v2, v2, v6
-; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-O0-NEXT: v_mov_b32_e32 v9, v15
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v16
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, v9
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, v5
+; GFX9-O0-NEXT: v_xor_b32_e64 v9, v9, v3
+; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
+; GFX9-O0-NEXT: v_xor_b32_e64 v4, v4, v6
+; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, v1
+; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $vgpr1_vgpr2 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v11, v13
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v14
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v14
; GFX9-O0-NEXT: v_sub_co_u32_e32 v9, vcc, v9, v12
-; GFX9-O0-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v10, vcc
+; GFX9-O0-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v10, vcc
; GFX9-O0-NEXT: v_subb_co_u32_e32 v13, vcc, v11, v12, vcc
-; GFX9-O0-NEXT: v_subb_co_u32_e32 v5, vcc, v5, v10, vcc
+; GFX9-O0-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v10, vcc
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v14, v5
+; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v10, v2
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v10, v1
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2
-; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $vgpr2_vgpr3 killed $exec
+; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v14, v1
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4
+; GFX9-O0-NEXT: v_mov_b32_e32 v4, v5
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7
; GFX9-O0-NEXT: v_mov_b32_e32 v2, v8
; GFX9-O0-NEXT: v_sub_co_u32_e32 v1, vcc, v1, v6
-; GFX9-O0-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v4, vcc
+; GFX9-O0-NEXT: v_subb_co_u32_e32 v4, vcc, v4, v3, vcc
; GFX9-O0-NEXT: v_subb_co_u32_e32 v11, vcc, v5, v6, vcc
-; GFX9-O0-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v4, vcc
+; GFX9-O0-NEXT: v_subb_co_u32_e32 v3, vcc, v2, v3, vcc
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v12, v2
+; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, v4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3
-; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v12, v3
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v13
+; GFX9-O0-NEXT: v_mov_b32_e32 v4, v14
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v9
+; GFX9-O0-NEXT: v_mov_b32_e32 v4, v10
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11
+; GFX9-O0-NEXT: v_mov_b32_e32 v4, v12
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v12
; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
@@ -438,7 +445,8 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v5
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9
-; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[12:13], v[11:12], s[6:7]
+; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[6:7]
+; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[12:13], v[11:12], s[12:13]
; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v10, s[12:13]
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8
@@ -474,7 +482,8 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v12, v5
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12
-; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[13:14], s[6:7]
+; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7]
+; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[13:14], s[8:9]
; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v8, s[8:9]
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11
@@ -589,27 +598,27 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_readlane_b32 s5, v0, 5
; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-O0-NEXT: ; %bb.2: ; %Flow
-; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(6)
-; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_branch .LBB0_5
; GFX9-O0-NEXT: .LBB0_3: ; %Flow2
; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
@@ -624,22 +633,22 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_branch .LBB0_9
; GFX9-O0-NEXT: .LBB0_4: ; %udiv-loop-exit
-; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b32 s4, 1
; GFX9-O0-NEXT: s_waitcnt vmcnt(2)
; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s4, v[0:1]
@@ -679,27 +688,27 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_readlane_b32 s4, v8, 6
; GFX9-O0-NEXT: v_readlane_b32 s5, v8, 7
; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_branch .LBB0_4
; GFX9-O0-NEXT: .LBB0_6: ; %udiv-do-while
; GFX9-O0-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -709,30 +718,30 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: v_readlane_b32 s6, v16, 8
; GFX9-O0-NEXT: v_readlane_b32 s7, v16, 9
-; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b32 s4, 63
; GFX9-O0-NEXT: s_waitcnt vmcnt(16)
; GFX9-O0-NEXT: v_lshrrev_b64 v[29:30], s4, v[2:3]
@@ -872,24 +881,24 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7]
; GFX9-O0-NEXT: v_mov_b32_e32 v18, v3
; GFX9-O0-NEXT: v_mov_b32_e32 v17, v2
-; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v18, v1
; GFX9-O0-NEXT: v_mov_b32_e32 v17, v0
-; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v18, v15
; GFX9-O0-NEXT: v_mov_b32_e32 v17, v14
-; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v18, v13
; GFX9-O0-NEXT: v_mov_b32_e32 v17, v12
-; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5]
; GFX9-O0-NEXT: v_writelane_b32 v16, s6, 4
; GFX9-O0-NEXT: v_writelane_b32 v16, s7, 5
@@ -899,42 +908,42 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19]
-; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX9-O0-NEXT: s_cbranch_execnz .LBB0_6
; GFX9-O0-NEXT: s_branch .LBB0_1
; GFX9-O0-NEXT: .LBB0_7: ; %udiv-preheader
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19]
@@ -1018,12 +1027,12 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v13, v17
; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX9-O0-NEXT: v_mov_b32_e32 v15, s9
; GFX9-O0-NEXT: v_mov_b32_e32 v14, s8
@@ -1034,30 +1043,30 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19]
-; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_branch .LBB0_6
; GFX9-O0-NEXT: .LBB0_8: ; %udiv-bb1
; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
@@ -1099,14 +1108,14 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v2
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1
-; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v10
-; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b32 s4, 0x7f
; GFX9-O0-NEXT: v_sub_u32_e64 v3, s4, v4
; GFX9-O0-NEXT: v_lshlrev_b64 v[5:6], v3, v[11:12]
@@ -1152,12 +1161,12 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v8, v3
-; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v10
; GFX9-O0-NEXT: v_or_b32_e64 v3, v3, v4
@@ -1172,18 +1181,18 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_mov_b32_e32 v4, s9
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s6
; GFX9-O0-NEXT: v_mov_b32_e32 v2, s7
-; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 s[6:7], exec
; GFX9-O0-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5]
; GFX9-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7]
@@ -1203,18 +1212,18 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b32 s4, 32
; GFX9-O0-NEXT: s_waitcnt vmcnt(2)
; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[5:6]
@@ -1486,11 +1495,11 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5
; GFX9-O0-NEXT: ; kill: killed $vgpr4
; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/wave_dispatch_regs.ll b/llvm/test/CodeGen/AMDGPU/wave_dispatch_regs.ll
index e73235857728..29520cb7468c 100644
--- a/llvm/test/CodeGen/AMDGPU/wave_dispatch_regs.ll
+++ b/llvm/test/CodeGen/AMDGPU/wave_dispatch_regs.ll
@@ -21,10 +21,10 @@
; VI-NEXT: .vgpr_count: 0x5
; GFX9-NEXT: .vgpr_count: 0x5
; GCN-NEXT: .registers:
-; SI-NEXT: 0x2e12 (COMPUTE_PGM_RSRC1): 0x{{[0-9a-f]*}}81
-; VI-NEXT: 0x2e12 (COMPUTE_PGM_RSRC1): 0x{{[0-9a-f]*}}c1
-; GFX9-NEXT: 0x2e12 (COMPUTE_PGM_RSRC1): 0x{{[0-9a-f]*}}81
-; GCN-NEXT: 0x2e13 (COMPUTE_PGM_RSRC2): 0
+; SI-NEXT: '0x2e12 (COMPUTE_PGM_RSRC1)': 0x{{[0-9a-f]*}}81
+; VI-NEXT: '0x2e12 (COMPUTE_PGM_RSRC1)': 0x{{[0-9a-f]*}}c1
+; GFX9-NEXT: '0x2e12 (COMPUTE_PGM_RSRC1)': 0x{{[0-9a-f]*}}81
+; GCN-NEXT: '0x2e13 (COMPUTE_PGM_RSRC2)': 0
; GCN-NEXT: ...
; GCN-NEXT: .end_amdgpu_pal_metadata
diff --git a/llvm/test/CodeGen/ARM/minnum-maxnum-intrinsics.ll b/llvm/test/CodeGen/ARM/minnum-maxnum-intrinsics.ll
index be741f536ac7..528bfe041173 100644
--- a/llvm/test/CodeGen/ARM/minnum-maxnum-intrinsics.ll
+++ b/llvm/test/CodeGen/ARM/minnum-maxnum-intrinsics.ll
@@ -46,12 +46,10 @@ define float @fminnum32_intrinsic(float %x, float %y) {
define float @fminnum32_nsz_intrinsic(float %x, float %y) {
; ARMV7-LABEL: fminnum32_nsz_intrinsic:
; ARMV7: @ %bb.0:
-; ARMV7-NEXT: vmov s0, r0
-; ARMV7-NEXT: vmov s2, r1
-; ARMV7-NEXT: vcmp.f32 s0, s2
-; ARMV7-NEXT: vmrs APSR_nzcv, fpscr
-; ARMV7-NEXT: vmovlt.f32 s2, s0
-; ARMV7-NEXT: vmov r0, s2
+; ARMV7-NEXT: vmov s0, r1
+; ARMV7-NEXT: vmov s2, r0
+; ARMV7-NEXT: vmin.f32 d0, d1, d0
+; ARMV7-NEXT: vmov r0, s0
; ARMV7-NEXT: bx lr
;
; ARMV8-LABEL: fminnum32_nsz_intrinsic:
@@ -78,9 +76,7 @@ define float @fminnum32_non_zero_intrinsic(float %x) {
; ARMV7: @ %bb.0:
; ARMV7-NEXT: vmov.f32 s0, #-1.000000e+00
; ARMV7-NEXT: vmov s2, r0
-; ARMV7-NEXT: vcmp.f32 s2, s0
-; ARMV7-NEXT: vmrs APSR_nzcv, fpscr
-; ARMV7-NEXT: vmovlt.f32 s0, s2
+; ARMV7-NEXT: vmin.f32 d0, d1, d0
; ARMV7-NEXT: vmov r0, s0
; ARMV7-NEXT: bx lr
;
@@ -136,12 +132,10 @@ define float @fmaxnum32_intrinsic(float %x, float %y) {
define float @fmaxnum32_nsz_intrinsic(float %x, float %y) {
; ARMV7-LABEL: fmaxnum32_nsz_intrinsic:
; ARMV7: @ %bb.0:
-; ARMV7-NEXT: vmov s0, r0
-; ARMV7-NEXT: vmov s2, r1
-; ARMV7-NEXT: vcmp.f32 s0, s2
-; ARMV7-NEXT: vmrs APSR_nzcv, fpscr
-; ARMV7-NEXT: vmovgt.f32 s2, s0
-; ARMV7-NEXT: vmov r0, s2
+; ARMV7-NEXT: vmov s0, r1
+; ARMV7-NEXT: vmov s2, r0
+; ARMV7-NEXT: vmax.f32 d0, d1, d0
+; ARMV7-NEXT: vmov r0, s0
; ARMV7-NEXT: bx lr
;
; ARMV8-LABEL: fmaxnum32_nsz_intrinsic:
@@ -210,9 +204,7 @@ define float @fmaxnum32_non_zero_intrinsic(float %x) {
; ARMV7: @ %bb.0:
; ARMV7-NEXT: vmov.f32 s0, #1.000000e+00
; ARMV7-NEXT: vmov s2, r0
-; ARMV7-NEXT: vcmp.f32 s2, s0
-; ARMV7-NEXT: vmrs APSR_nzcv, fpscr
-; ARMV7-NEXT: vmovgt.f32 s0, s2
+; ARMV7-NEXT: vmax.f32 d0, d1, d0
; ARMV7-NEXT: vmov r0, s0
; ARMV7-NEXT: bx lr
;
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-basic.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-basic.ll
index 8ca8a6602737..024ed04f6e5e 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-basic.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-basic.ll
@@ -108,8 +108,8 @@ define dso_local i32 @bpf_prog(ptr) local_unnamed_addr #0 !dbg !15 {
; CHECK-NEXT: .long 0
; CHECK-NEXT: .long 20
; CHECK-NEXT: .long 20
-; CHECK-NEXT: .long 124
-; CHECK-NEXT: .long 144
+; CHECK-NEXT: .long 108
+; CHECK-NEXT: .long 128
; CHECK-NEXT: .long 28
; CHECK-NEXT: .long 8 # FuncInfo
diff --git a/llvm/test/CodeGen/Hexagon/hexagon-copy-hoisting.mir b/llvm/test/CodeGen/Hexagon/hexagon-copy-hoisting.mir
new file mode 100644
index 000000000000..0836cac7f913
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/hexagon-copy-hoisting.mir
@@ -0,0 +1,53 @@
+# RUN: llc -march=hexagon -run-pass hexagon-move-phicopy -o - %s | FileCheck %s
+
+# CHECK-COUNT-1: %4:intregs = COPY %1
+
+# CHECK: bb.1
+# CHECK-NOT: %4:intregs = COPY %1
+
+# CHECK: bb.2
+# CHECK-NOT: %4:intregs = COPY %1
+# CHECK: %5:intregs = COPY %0
+
+---
+name: f0
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: intregs, preferred-register: '' }
+ - { id: 1, class: intregs, preferred-register: '' }
+ - { id: 2, class: predregs, preferred-register: '' }
+ - { id: 3, class: predregs, preferred-register: '' }
+ - { id: 4, class: intregs, preferred-register: '' }
+ - { id: 5, class: intregs, preferred-register: '' }
+liveins:
+ - { reg: '$r0', virtual-reg: '%0' }
+ - { reg: '$r1', virtual-reg: '%1' }
+stack:
+ - { id: 0, offset: 0, size: 4, alignment: 8 }
+body: |
+ bb.0:
+ successors: %bb.1, %bb.2
+ liveins: $r0, $r1
+
+ %1:intregs = COPY $r1
+ %0:intregs = COPY $r0
+ %2:predregs = C2_cmpgt %0, %1
+ %3:predregs = C2_not %2
+ J2_jumpt %3, %bb.2, implicit-def dead $pc
+ J2_jump %bb.1, implicit-def dead $pc
+
+ bb.1:
+ successors: %bb.0
+
+ %4:intregs = COPY %1
+ $r1 = COPY %4
+ J2_jump %bb.0, implicit-def dead $pc
+
+ bb.2:
+ successors: %bb.0
+
+ %4:intregs = COPY %1
+ %5:intregs = COPY %0
+ $r1 = COPY %4
+ J2_jump %bb.0, implicit-def dead $pc
+...
diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insertelement.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insertelement.ll
index 6629d3440549..25106b456d2f 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insertelement.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insertelement.ll
@@ -123,10 +123,9 @@ define void @insert_32xi8_idx(ptr %src, ptr %dst, i8 %in, i32 %idx) nounwind {
; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0
; CHECK-NEXT: xvld $xr0, $a0, 0
; CHECK-NEXT: xvst $xr0, $sp, 0
-; CHECK-NEXT: bstrpick.d $a0, $a3, 31, 0
-; CHECK-NEXT: addi.d $a3, $sp, 0
-; CHECK-NEXT: bstrins.d $a3, $a0, 4, 0
-; CHECK-NEXT: st.b $a2, $a3, 0
+; CHECK-NEXT: addi.d $a0, $sp, 0
+; CHECK-NEXT: bstrins.d $a0, $a3, 4, 0
+; CHECK-NEXT: st.b $a2, $a0, 0
; CHECK-NEXT: xvld $xr0, $sp, 0
; CHECK-NEXT: xvst $xr0, $a1, 0
; CHECK-NEXT: addi.d $sp, $fp, -64
@@ -150,10 +149,9 @@ define void @insert_16xi16_idx(ptr %src, ptr %dst, i16 %in, i32 %idx) nounwind {
; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0
; CHECK-NEXT: xvld $xr0, $a0, 0
; CHECK-NEXT: xvst $xr0, $sp, 0
-; CHECK-NEXT: bstrpick.d $a0, $a3, 31, 0
-; CHECK-NEXT: addi.d $a3, $sp, 0
-; CHECK-NEXT: bstrins.d $a3, $a0, 4, 1
-; CHECK-NEXT: st.h $a2, $a3, 0
+; CHECK-NEXT: addi.d $a0, $sp, 0
+; CHECK-NEXT: bstrins.d $a0, $a3, 4, 1
+; CHECK-NEXT: st.h $a2, $a0, 0
; CHECK-NEXT: xvld $xr0, $sp, 0
; CHECK-NEXT: xvst $xr0, $a1, 0
; CHECK-NEXT: addi.d $sp, $fp, -64
@@ -177,10 +175,9 @@ define void @insert_8xi32_idx(ptr %src, ptr %dst, i32 %in, i32 %idx) nounwind {
; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0
; CHECK-NEXT: xvld $xr0, $a0, 0
; CHECK-NEXT: xvst $xr0, $sp, 0
-; CHECK-NEXT: bstrpick.d $a0, $a3, 31, 0
-; CHECK-NEXT: addi.d $a3, $sp, 0
-; CHECK-NEXT: bstrins.d $a3, $a0, 4, 2
-; CHECK-NEXT: st.w $a2, $a3, 0
+; CHECK-NEXT: addi.d $a0, $sp, 0
+; CHECK-NEXT: bstrins.d $a0, $a3, 4, 2
+; CHECK-NEXT: st.w $a2, $a0, 0
; CHECK-NEXT: xvld $xr0, $sp, 0
; CHECK-NEXT: xvst $xr0, $a1, 0
; CHECK-NEXT: addi.d $sp, $fp, -64
@@ -204,10 +201,9 @@ define void @insert_4xi64_idx(ptr %src, ptr %dst, i64 %in, i32 %idx) nounwind {
; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0
; CHECK-NEXT: xvld $xr0, $a0, 0
; CHECK-NEXT: xvst $xr0, $sp, 0
-; CHECK-NEXT: bstrpick.d $a0, $a3, 31, 0
-; CHECK-NEXT: addi.d $a3, $sp, 0
-; CHECK-NEXT: bstrins.d $a3, $a0, 4, 3
-; CHECK-NEXT: st.d $a2, $a3, 0
+; CHECK-NEXT: addi.d $a0, $sp, 0
+; CHECK-NEXT: bstrins.d $a0, $a3, 4, 3
+; CHECK-NEXT: st.d $a2, $a0, 0
; CHECK-NEXT: xvld $xr0, $sp, 0
; CHECK-NEXT: xvst $xr0, $a1, 0
; CHECK-NEXT: addi.d $sp, $fp, -64
@@ -231,10 +227,9 @@ define void @insert_8xfloat_idx(ptr %src, ptr %dst, float %in, i32 %idx) nounwin
; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0
; CHECK-NEXT: xvld $xr1, $a0, 0
; CHECK-NEXT: xvst $xr1, $sp, 0
-; CHECK-NEXT: bstrpick.d $a0, $a2, 31, 0
-; CHECK-NEXT: addi.d $a2, $sp, 0
-; CHECK-NEXT: bstrins.d $a2, $a0, 4, 2
-; CHECK-NEXT: fst.s $fa0, $a2, 0
+; CHECK-NEXT: addi.d $a0, $sp, 0
+; CHECK-NEXT: bstrins.d $a0, $a2, 4, 2
+; CHECK-NEXT: fst.s $fa0, $a0, 0
; CHECK-NEXT: xvld $xr0, $sp, 0
; CHECK-NEXT: xvst $xr0, $a1, 0
; CHECK-NEXT: addi.d $sp, $fp, -64
@@ -258,10 +253,9 @@ define void @insert_4xdouble_idx(ptr %src, ptr %dst, double %in, i32 %idx) nounw
; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0
; CHECK-NEXT: xvld $xr1, $a0, 0
; CHECK-NEXT: xvst $xr1, $sp, 0
-; CHECK-NEXT: bstrpick.d $a0, $a2, 31, 0
-; CHECK-NEXT: addi.d $a2, $sp, 0
-; CHECK-NEXT: bstrins.d $a2, $a0, 4, 3
-; CHECK-NEXT: fst.d $fa0, $a2, 0
+; CHECK-NEXT: addi.d $a0, $sp, 0
+; CHECK-NEXT: bstrins.d $a0, $a2, 4, 3
+; CHECK-NEXT: fst.d $fa0, $a0, 0
; CHECK-NEXT: xvld $xr0, $sp, 0
; CHECK-NEXT: xvst $xr0, $a1, 0
; CHECK-NEXT: addi.d $sp, $fp, -64
diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/insertelement.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/insertelement.ll
index 19171b7d8ed7..7f232073ae12 100644
--- a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/insertelement.ll
+++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/insertelement.ll
@@ -87,10 +87,9 @@ define void @insert_16xi8_idx(ptr %src, ptr %dst, i8 %ins, i32 %idx) nounwind {
; CHECK-NEXT: addi.d $sp, $sp, -16
; CHECK-NEXT: vld $vr0, $a0, 0
; CHECK-NEXT: vst $vr0, $sp, 0
-; CHECK-NEXT: bstrpick.d $a0, $a3, 31, 0
-; CHECK-NEXT: addi.d $a3, $sp, 0
-; CHECK-NEXT: bstrins.d $a3, $a0, 3, 0
-; CHECK-NEXT: st.b $a2, $a3, 0
+; CHECK-NEXT: addi.d $a0, $sp, 0
+; CHECK-NEXT: bstrins.d $a0, $a3, 3, 0
+; CHECK-NEXT: st.b $a2, $a0, 0
; CHECK-NEXT: vld $vr0, $sp, 0
; CHECK-NEXT: vst $vr0, $a1, 0
; CHECK-NEXT: addi.d $sp, $sp, 16
@@ -107,10 +106,9 @@ define void @insert_8xi16_idx(ptr %src, ptr %dst, i16 %ins, i32 %idx) nounwind {
; CHECK-NEXT: addi.d $sp, $sp, -16
; CHECK-NEXT: vld $vr0, $a0, 0
; CHECK-NEXT: vst $vr0, $sp, 0
-; CHECK-NEXT: bstrpick.d $a0, $a3, 31, 0
-; CHECK-NEXT: addi.d $a3, $sp, 0
-; CHECK-NEXT: bstrins.d $a3, $a0, 3, 1
-; CHECK-NEXT: st.h $a2, $a3, 0
+; CHECK-NEXT: addi.d $a0, $sp, 0
+; CHECK-NEXT: bstrins.d $a0, $a3, 3, 1
+; CHECK-NEXT: st.h $a2, $a0, 0
; CHECK-NEXT: vld $vr0, $sp, 0
; CHECK-NEXT: vst $vr0, $a1, 0
; CHECK-NEXT: addi.d $sp, $sp, 16
@@ -127,10 +125,9 @@ define void @insert_4xi32_idx(ptr %src, ptr %dst, i32 %ins, i32 %idx) nounwind {
; CHECK-NEXT: addi.d $sp, $sp, -16
; CHECK-NEXT: vld $vr0, $a0, 0
; CHECK-NEXT: vst $vr0, $sp, 0
-; CHECK-NEXT: bstrpick.d $a0, $a3, 31, 0
-; CHECK-NEXT: addi.d $a3, $sp, 0
-; CHECK-NEXT: bstrins.d $a3, $a0, 3, 2
-; CHECK-NEXT: st.w $a2, $a3, 0
+; CHECK-NEXT: addi.d $a0, $sp, 0
+; CHECK-NEXT: bstrins.d $a0, $a3, 3, 2
+; CHECK-NEXT: st.w $a2, $a0, 0
; CHECK-NEXT: vld $vr0, $sp, 0
; CHECK-NEXT: vst $vr0, $a1, 0
; CHECK-NEXT: addi.d $sp, $sp, 16
@@ -147,10 +144,9 @@ define void @insert_2xi64_idx(ptr %src, ptr %dst, i64 %ins, i32 %idx) nounwind {
; CHECK-NEXT: addi.d $sp, $sp, -16
; CHECK-NEXT: vld $vr0, $a0, 0
; CHECK-NEXT: vst $vr0, $sp, 0
-; CHECK-NEXT: bstrpick.d $a0, $a3, 31, 0
-; CHECK-NEXT: addi.d $a3, $sp, 0
-; CHECK-NEXT: bstrins.d $a3, $a0, 3, 3
-; CHECK-NEXT: st.d $a2, $a3, 0
+; CHECK-NEXT: addi.d $a0, $sp, 0
+; CHECK-NEXT: bstrins.d $a0, $a3, 3, 3
+; CHECK-NEXT: st.d $a2, $a0, 0
; CHECK-NEXT: vld $vr0, $sp, 0
; CHECK-NEXT: vst $vr0, $a1, 0
; CHECK-NEXT: addi.d $sp, $sp, 16
@@ -167,10 +163,9 @@ define void @insert_4xfloat_idx(ptr %src, ptr %dst, float %ins, i32 %idx) nounwi
; CHECK-NEXT: addi.d $sp, $sp, -16
; CHECK-NEXT: vld $vr1, $a0, 0
; CHECK-NEXT: vst $vr1, $sp, 0
-; CHECK-NEXT: bstrpick.d $a0, $a2, 31, 0
-; CHECK-NEXT: addi.d $a2, $sp, 0
-; CHECK-NEXT: bstrins.d $a2, $a0, 3, 2
-; CHECK-NEXT: fst.s $fa0, $a2, 0
+; CHECK-NEXT: addi.d $a0, $sp, 0
+; CHECK-NEXT: bstrins.d $a0, $a2, 3, 2
+; CHECK-NEXT: fst.s $fa0, $a0, 0
; CHECK-NEXT: vld $vr0, $sp, 0
; CHECK-NEXT: vst $vr0, $a1, 0
; CHECK-NEXT: addi.d $sp, $sp, 16
@@ -187,10 +182,9 @@ define void @insert_2xdouble_idx(ptr %src, ptr %dst, double %ins, i32 %idx) noun
; CHECK-NEXT: addi.d $sp, $sp, -16
; CHECK-NEXT: vld $vr1, $a0, 0
; CHECK-NEXT: vst $vr1, $sp, 0
-; CHECK-NEXT: bstrpick.d $a0, $a2, 31, 0
-; CHECK-NEXT: addi.d $a2, $sp, 0
-; CHECK-NEXT: bstrins.d $a2, $a0, 3, 3
-; CHECK-NEXT: fst.d $fa0, $a2, 0
+; CHECK-NEXT: addi.d $a0, $sp, 0
+; CHECK-NEXT: bstrins.d $a0, $a2, 3, 3
+; CHECK-NEXT: fst.d $fa0, $a0, 0
; CHECK-NEXT: vld $vr0, $sp, 0
; CHECK-NEXT: vst $vr0, $a1, 0
; CHECK-NEXT: addi.d $sp, $sp, 16
diff --git a/llvm/test/CodeGen/LoongArch/sextw-removal.ll b/llvm/test/CodeGen/LoongArch/sextw-removal.ll
new file mode 100644
index 000000000000..6db9c1608b3c
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/sextw-removal.ll
@@ -0,0 +1,921 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s --mtriple=loongarch64 | FileCheck %s --check-prefixes=CHECK
+
+define void @test1(i32 signext %arg, i32 signext %arg1) nounwind {
+; CHECK-LABEL: test1:
+; CHECK: # %bb.0: # %bb
+; CHECK-NEXT: addi.d $sp, $sp, -32
+; CHECK-NEXT: st.d $ra, $sp, 24 # 8-byte Folded Spill
+; CHECK-NEXT: st.d $fp, $sp, 16 # 8-byte Folded Spill
+; CHECK-NEXT: st.d $s0, $sp, 8 # 8-byte Folded Spill
+; CHECK-NEXT: move $fp, $a1
+; CHECK-NEXT: sra.w $s0, $a0, $a1
+; CHECK-NEXT: .p2align 4, , 16
+; CHECK-NEXT: .LBB0_1: # %bb2
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: addi.w $a0, $s0, 0
+; CHECK-NEXT: bl %plt(bar)
+; CHECK-NEXT: sll.w $s0, $s0, $fp
+; CHECK-NEXT: bnez $a0, .LBB0_1
+; CHECK-NEXT: # %bb.2: # %bb7
+; CHECK-NEXT: ld.d $s0, $sp, 8 # 8-byte Folded Reload
+; CHECK-NEXT: ld.d $fp, $sp, 16 # 8-byte Folded Reload
+; CHECK-NEXT: ld.d $ra, $sp, 24 # 8-byte Folded Reload
+; CHECK-NEXT: addi.d $sp, $sp, 32
+; CHECK-NEXT: ret
+bb:
+ %i = ashr i32 %arg, %arg1
+ br label %bb2
+
+bb2: ; preds = %bb2, %bb
+ %i3 = phi i32 [ %i, %bb ], [ %i5, %bb2 ]
+ %i4 = tail call signext i32 @bar(i32 signext %i3)
+ %i5 = shl i32 %i3, %arg1
+ %i6 = icmp eq i32 %i4, 0
+ br i1 %i6, label %bb7, label %bb2
+
+bb7: ; preds = %bb2
+ ret void
+}
+
+declare signext i32 @bar(i32 signext)
+
+define signext i32 @test2(ptr %p, i32 signext %b) nounwind {
+; CHECK-LABEL: test2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ld.w $a0, $a0, 0
+; CHECK-NEXT: ori $a2, $zero, 1
+; CHECK-NEXT: sll.w $a1, $a2, $a1
+; CHECK-NEXT: andn $a0, $a0, $a1
+; CHECK-NEXT: addi.w $a0, $a0, 0
+; CHECK-NEXT: ret
+ %a = load i32, ptr %p
+ %shl = shl i32 1, %b
+ %neg = xor i32 %shl, -1
+ %and1 = and i32 %neg, %a
+ ret i32 %and1
+}
+
+define signext i32 @test3(ptr %p, i32 signext %b) nounwind {
+; CHECK-LABEL: test3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ld.w $a0, $a0, 0
+; CHECK-NEXT: ori $a2, $zero, 1
+; CHECK-NEXT: sll.w $a1, $a2, $a1
+; CHECK-NEXT: orn $a0, $a0, $a1
+; CHECK-NEXT: addi.w $a0, $a0, 0
+; CHECK-NEXT: ret
+ %a = load i32, ptr %p
+ %shl = shl i32 1, %b
+ %neg = xor i32 %shl, -1
+ %and1 = or i32 %neg, %a
+ ret i32 %and1
+}
+
+define signext i32 @test4(ptr %p, i32 signext %b) nounwind {
+; CHECK-LABEL: test4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ld.w $a0, $a0, 0
+; CHECK-NEXT: ori $a2, $zero, 1
+; CHECK-NEXT: sll.w $a1, $a2, $a1
+; CHECK-NEXT: xor $a0, $a1, $a0
+; CHECK-NEXT: nor $a0, $a0, $zero
+; CHECK-NEXT: addi.w $a0, $a0, 0
+; CHECK-NEXT: ret
+ %a = load i32, ptr %p
+ %shl = shl i32 1, %b
+ %neg = xor i32 %shl, -1
+ %and1 = xor i32 %neg, %a
+ ret i32 %and1
+}
+
+define void @test5(i32 signext %arg, i32 signext %arg1) nounwind {
+; CHECK-LABEL: test5:
+; CHECK: # %bb.0: # %bb
+; CHECK-NEXT: addi.d $sp, $sp, -48
+; CHECK-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill
+; CHECK-NEXT: st.d $fp, $sp, 32 # 8-byte Folded Spill
+; CHECK-NEXT: st.d $s0, $sp, 24 # 8-byte Folded Spill
+; CHECK-NEXT: st.d $s1, $sp, 16 # 8-byte Folded Spill
+; CHECK-NEXT: st.d $s2, $sp, 8 # 8-byte Folded Spill
+; CHECK-NEXT: sra.w $a1, $a0, $a1
+; CHECK-NEXT: lu12i.w $a0, 349525
+; CHECK-NEXT: ori $fp, $a0, 1365
+; CHECK-NEXT: lu12i.w $a0, 209715
+; CHECK-NEXT: ori $s0, $a0, 819
+; CHECK-NEXT: lu12i.w $a0, 61680
+; CHECK-NEXT: ori $s1, $a0, 3855
+; CHECK-NEXT: lu12i.w $a0, 4112
+; CHECK-NEXT: ori $s2, $a0, 257
+; CHECK-NEXT: .p2align 4, , 16
+; CHECK-NEXT: .LBB4_1: # %bb2
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: addi.w $a0, $a1, 0
+; CHECK-NEXT: bl %plt(bar)
+; CHECK-NEXT: srli.d $a1, $a0, 1
+; CHECK-NEXT: and $a1, $a1, $fp
+; CHECK-NEXT: sub.d $a1, $a0, $a1
+; CHECK-NEXT: and $a2, $a1, $s0
+; CHECK-NEXT: srli.d $a1, $a1, 2
+; CHECK-NEXT: and $a1, $a1, $s0
+; CHECK-NEXT: add.d $a1, $a2, $a1
+; CHECK-NEXT: srli.d $a2, $a1, 4
+; CHECK-NEXT: add.d $a1, $a1, $a2
+; CHECK-NEXT: and $a1, $a1, $s1
+; CHECK-NEXT: mul.d $a1, $a1, $s2
+; CHECK-NEXT: bstrpick.d $a1, $a1, 31, 24
+; CHECK-NEXT: bnez $a0, .LBB4_1
+; CHECK-NEXT: # %bb.2: # %bb7
+; CHECK-NEXT: ld.d $s2, $sp, 8 # 8-byte Folded Reload
+; CHECK-NEXT: ld.d $s1, $sp, 16 # 8-byte Folded Reload
+; CHECK-NEXT: ld.d $s0, $sp, 24 # 8-byte Folded Reload
+; CHECK-NEXT: ld.d $fp, $sp, 32 # 8-byte Folded Reload
+; CHECK-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload
+; CHECK-NEXT: addi.d $sp, $sp, 48
+; CHECK-NEXT: ret
+bb:
+ %i = ashr i32 %arg, %arg1
+ br label %bb2
+
+bb2: ; preds = %bb2, %bb
+ %i3 = phi i32 [ %i, %bb ], [ %i5, %bb2 ]
+ %i4 = tail call signext i32 @bar(i32 signext %i3)
+ %i5 = tail call i32 @llvm.ctpop.i32(i32 %i4)
+ %i6 = icmp eq i32 %i4, 0
+ br i1 %i6, label %bb7, label %bb2
+
+bb7: ; preds = %bb2
+ ret void
+}
+
+declare i32 @llvm.ctpop.i32(i32)
+
+define void @test6(i32 signext %arg, i32 signext %arg1) nounwind {
+; CHECK-LABEL: test6:
+; CHECK: # %bb.0: # %bb
+; CHECK-NEXT: addi.d $sp, $sp, -32
+; CHECK-NEXT: st.d $ra, $sp, 24 # 8-byte Folded Spill
+; CHECK-NEXT: st.d $fp, $sp, 16 # 8-byte Folded Spill
+; CHECK-NEXT: st.d $s0, $sp, 8 # 8-byte Folded Spill
+; CHECK-NEXT: sra.w $fp, $a0, $a1
+; CHECK-NEXT: .p2align 4, , 16
+; CHECK-NEXT: .LBB5_1: # %bb2
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: addi.w $a0, $fp, 0
+; CHECK-NEXT: bl %plt(baz)
+; CHECK-NEXT: bstrpick.d $s0, $a0, 31, 0
+; CHECK-NEXT: move $a0, $s0
+; CHECK-NEXT: bl %plt(__fixsfsi)
+; CHECK-NEXT: move $fp, $a0
+; CHECK-NEXT: move $a0, $s0
+; CHECK-NEXT: move $a1, $zero
+; CHECK-NEXT: bl %plt(__nesf2)
+; CHECK-NEXT: bnez $a0, .LBB5_1
+; CHECK-NEXT: # %bb.2: # %bb7
+; CHECK-NEXT: ld.d $s0, $sp, 8 # 8-byte Folded Reload
+; CHECK-NEXT: ld.d $fp, $sp, 16 # 8-byte Folded Reload
+; CHECK-NEXT: ld.d $ra, $sp, 24 # 8-byte Folded Reload
+; CHECK-NEXT: addi.d $sp, $sp, 32
+; CHECK-NEXT: ret
+bb:
+ %i = ashr i32 %arg, %arg1
+ br label %bb2
+
+bb2: ; preds = %bb2, %bb
+ %i3 = phi i32 [ %i, %bb ], [ %i5, %bb2 ]
+ %i4 = tail call float @baz(i32 signext %i3)
+ %i5 = fptosi float %i4 to i32
+ %i6 = fcmp oeq float %i4, zeroinitializer
+ br i1 %i6, label %bb7, label %bb2
+
+bb7: ; preds = %bb2
+ ret void
+}
+declare float @baz(i32 signext %i3)
+
+define void @test7(i32 signext %arg, i32 signext %arg1) nounwind {
+; CHECK-LABEL: test7:
+; CHECK: # %bb.0: # %bb
+; CHECK-NEXT: addi.d $sp, $sp, -48
+; CHECK-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill
+; CHECK-NEXT: st.d $fp, $sp, 32 # 8-byte Folded Spill
+; CHECK-NEXT: st.d $s0, $sp, 24 # 8-byte Folded Spill
+; CHECK-NEXT: st.d $s1, $sp, 16 # 8-byte Folded Spill
+; CHECK-NEXT: st.d $s2, $sp, 8 # 8-byte Folded Spill
+; CHECK-NEXT: sra.w $a0, $a0, $a1
+; CHECK-NEXT: lu12i.w $a1, 349525
+; CHECK-NEXT: ori $a1, $a1, 1365
+; CHECK-NEXT: lu32i.d $a1, 349525
+; CHECK-NEXT: lu52i.d $fp, $a1, 1365
+; CHECK-NEXT: lu12i.w $a1, 209715
+; CHECK-NEXT: ori $a1, $a1, 819
+; CHECK-NEXT: lu32i.d $a1, 209715
+; CHECK-NEXT: lu52i.d $s0, $a1, 819
+; CHECK-NEXT: lu12i.w $a1, 61680
+; CHECK-NEXT: ori $a1, $a1, 3855
+; CHECK-NEXT: lu32i.d $a1, -61681
+; CHECK-NEXT: lu52i.d $s1, $a1, 240
+; CHECK-NEXT: lu12i.w $a1, 4112
+; CHECK-NEXT: ori $a1, $a1, 257
+; CHECK-NEXT: lu32i.d $a1, 65793
+; CHECK-NEXT: lu52i.d $s2, $a1, 16
+; CHECK-NEXT: .p2align 4, , 16
+; CHECK-NEXT: .LBB6_1: # %bb2
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: addi.w $a0, $a0, 0
+; CHECK-NEXT: bl %plt(foo)
+; CHECK-NEXT: srli.d $a1, $a0, 1
+; CHECK-NEXT: and $a1, $a1, $fp
+; CHECK-NEXT: sub.d $a0, $a0, $a1
+; CHECK-NEXT: and $a1, $a0, $s0
+; CHECK-NEXT: srli.d $a0, $a0, 2
+; CHECK-NEXT: and $a0, $a0, $s0
+; CHECK-NEXT: add.d $a0, $a1, $a0
+; CHECK-NEXT: srli.d $a1, $a0, 4
+; CHECK-NEXT: add.d $a0, $a0, $a1
+; CHECK-NEXT: and $a0, $a0, $s1
+; CHECK-NEXT: mul.d $a0, $a0, $s2
+; CHECK-NEXT: srli.d $a0, $a0, 56
+; CHECK-NEXT: bnez $a0, .LBB6_1
+; CHECK-NEXT: # %bb.2: # %bb7
+; CHECK-NEXT: ld.d $s2, $sp, 8 # 8-byte Folded Reload
+; CHECK-NEXT: ld.d $s1, $sp, 16 # 8-byte Folded Reload
+; CHECK-NEXT: ld.d $s0, $sp, 24 # 8-byte Folded Reload
+; CHECK-NEXT: ld.d $fp, $sp, 32 # 8-byte Folded Reload
+; CHECK-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload
+; CHECK-NEXT: addi.d $sp, $sp, 48
+; CHECK-NEXT: ret
+bb:
+ %i = ashr i32 %arg, %arg1
+ br label %bb2
+
+bb2: ; preds = %bb2, %bb
+ %i3 = phi i32 [ %i, %bb ], [ %i6, %bb2 ]
+ %i4 = tail call signext i64 @foo(i32 signext %i3)
+ %i5 = tail call i64 @llvm.ctpop.i64(i64 %i4)
+ %i6 = trunc i64 %i5 to i32
+ %i7 = icmp eq i32 %i6, 0
+ br i1 %i7, label %bb7, label %bb2
+
+bb7: ; preds = %bb2
+ ret void
+}
+
+declare i64 @llvm.ctpop.i64(i64)
+
+define void @test8(i32 signext %arg, i32 signext %arg1) nounwind {
+; CHECK-LABEL: test8:
+; CHECK: # %bb.0: # %bb
+; CHECK-NEXT: addi.d $sp, $sp, -16
+; CHECK-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
+; CHECK-NEXT: st.d $fp, $sp, 0 # 8-byte Folded Spill
+; CHECK-NEXT: sra.w $a0, $a0, $a1
+; CHECK-NEXT: addi.w $fp, $zero, -256
+; CHECK-NEXT: .p2align 4, , 16
+; CHECK-NEXT: .LBB7_1: # %bb2
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: addi.w $a0, $a0, 0
+; CHECK-NEXT: bl %plt(foo)
+; CHECK-NEXT: or $a0, $a0, $fp
+; CHECK-NEXT: bnez $a0, .LBB7_1
+; CHECK-NEXT: # %bb.2: # %bb7
+; CHECK-NEXT: ld.d $fp, $sp, 0 # 8-byte Folded Reload
+; CHECK-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; CHECK-NEXT: addi.d $sp, $sp, 16
+; CHECK-NEXT: ret
+bb:
+ %i = ashr i32 %arg, %arg1
+ br label %bb2
+
+bb2: ; preds = %bb2, %bb
+ %i3 = phi i32 [ %i, %bb ], [ %i6, %bb2 ]
+ %i4 = tail call signext i64 @foo(i32 signext %i3)
+ %i5 = or i64 %i4, -256
+ %i6 = trunc i64 %i5 to i32
+ %i7 = icmp eq i32 %i6, 0
+ br i1 %i7, label %bb7, label %bb2
+
+bb7: ; preds = %bb2
+ ret void
+}
+
+declare i64 @foo(i32 signext)
+
+define void @test9(i32 signext %arg, i32 signext %arg1) nounwind {
+; CHECK-LABEL: test9:
+; CHECK: # %bb.0: # %bb
+; CHECK-NEXT: addi.d $sp, $sp, -16
+; CHECK-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
+; CHECK-NEXT: st.d $fp, $sp, 0 # 8-byte Folded Spill
+; CHECK-NEXT: sra.w $a1, $a0, $a1
+; CHECK-NEXT: ori $fp, $zero, 254
+; CHECK-NEXT: .p2align 4, , 16
+; CHECK-NEXT: .LBB8_1: # %bb2
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: addi.w $a0, $a1, 0
+; CHECK-NEXT: bl %plt(bar)
+; CHECK-NEXT: slti $a1, $a0, 255
+; CHECK-NEXT: blt $fp, $a0, .LBB8_1
+; CHECK-NEXT: # %bb.2: # %bb7
+; CHECK-NEXT: ld.d $fp, $sp, 0 # 8-byte Folded Reload
+; CHECK-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; CHECK-NEXT: addi.d $sp, $sp, 16
+; CHECK-NEXT: ret
+bb:
+ %i = ashr i32 %arg, %arg1
+ br label %bb2
+
+bb2: ; preds = %bb2, %bb
+ %i3 = phi i32 [ %i, %bb ], [ %i7, %bb2 ]
+ %i4 = tail call signext i32 @bar(i32 signext %i3)
+ %i5 = icmp slt i32 %i4, 255
+ %i6 = sext i1 %i5 to i32
+ %i7 = sub i32 0, %i6
+ br i1 %i5, label %bb7, label %bb2
+
+bb7: ; preds = %bb2
+ ret void
+}
+
+define void @test10(i32 signext %arg, i32 signext %arg1) nounwind {
+; CHECK-LABEL: test10:
+; CHECK: # %bb.0: # %bb
+; CHECK-NEXT: addi.d $sp, $sp, -16
+; CHECK-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
+; CHECK-NEXT: st.d $fp, $sp, 0 # 8-byte Folded Spill
+; CHECK-NEXT: sra.w $fp, $a0, $a1
+; CHECK-NEXT: .p2align 4, , 16
+; CHECK-NEXT: .LBB9_1: # %bb2
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: addi.w $a0, $fp, 0
+; CHECK-NEXT: bl %plt(baz)
+; CHECK-NEXT: move $fp, $a0
+; CHECK-NEXT: bstrpick.d $a0, $a0, 31, 0
+; CHECK-NEXT: move $a1, $zero
+; CHECK-NEXT: bl %plt(__nesf2)
+; CHECK-NEXT: bnez $a0, .LBB9_1
+; CHECK-NEXT: # %bb.2: # %bb7
+; CHECK-NEXT: ld.d $fp, $sp, 0 # 8-byte Folded Reload
+; CHECK-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; CHECK-NEXT: addi.d $sp, $sp, 16
+; CHECK-NEXT: ret
+bb:
+ %i = ashr i32 %arg, %arg1
+ br label %bb2
+
+bb2: ; preds = %bb2, %bb
+ %i3 = phi i32 [ %i, %bb ], [ %i5, %bb2 ]
+ %i4 = tail call float @baz(i32 signext %i3)
+ %i5 = bitcast float %i4 to i32
+ %i6 = fcmp oeq float %i4, zeroinitializer
+ br i1 %i6, label %bb7, label %bb2
+
+bb7: ; preds = %bb2
+ ret void
+}
+
+define signext i32 @test11(i64 %arg1, i64 %arg2, i64 %arg3) {
+; CHECK-LABEL: test11:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: addi.d $a2, $a2, -1
+; CHECK-NEXT: ori $a3, $zero, 256
+; CHECK-NEXT: .p2align 4, , 16
+; CHECK-NEXT: .LBB10_1: # %bb2
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: andi $a0, $a0, 1234
+; CHECK-NEXT: addi.d $a2, $a2, 1
+; CHECK-NEXT: add.d $a0, $a0, $a1
+; CHECK-NEXT: bltu $a2, $a3, .LBB10_1
+; CHECK-NEXT: # %bb.2: # %bb7
+; CHECK-NEXT: addi.w $a0, $a0, 0
+; CHECK-NEXT: ret
+entry:
+ br label %bb2
+
+bb2: ; preds = %bb2, %entry
+ %i1 = phi i64 [ %arg1, %entry ], [ %i5, %bb2 ]
+ %i2 = phi i64 [ %arg3, %entry ], [ %i3, %bb2 ]
+ %i3 = add i64 %i2, 1
+ %i4 = and i64 %i1, 1234
+ %i5 = add i64 %i4, %arg2
+ %i6 = icmp ugt i64 %i2, 255
+ br i1 %i6, label %bb7, label %bb2
+
+bb7: ; preds = %bb2
+ %i7 = trunc i64 %i5 to i32
+ ret i32 %i7
+}
+
+define signext i32 @test12(i64 %arg1, i64 %arg2, i64 %arg3) {
+; CHECK-LABEL: test12:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: addi.d $a2, $a2, -1
+; CHECK-NEXT: ori $a3, $zero, 256
+; CHECK-NEXT: .p2align 4, , 16
+; CHECK-NEXT: .LBB11_1: # %bb2
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: xor $a0, $a0, $a1
+; CHECK-NEXT: mul.d $a4, $a0, $a1
+; CHECK-NEXT: add.d $a0, $a0, $a4
+; CHECK-NEXT: and $a4, $a4, $a0
+; CHECK-NEXT: addi.d $a2, $a2, 1
+; CHECK-NEXT: add.d $a0, $a4, $a1
+; CHECK-NEXT: bltu $a2, $a3, .LBB11_1
+; CHECK-NEXT: # %bb.2: # %bb7
+; CHECK-NEXT: addi.w $a0, $a4, 0
+; CHECK-NEXT: ret
+entry:
+ br label %bb2
+
+bb2: ; preds = %bb2, %entry
+ %i1 = phi i64 [ %arg1, %entry ], [ %i6, %bb2 ]
+ %i2 = phi i64 [ %arg3, %entry ], [ %i3, %bb2 ]
+ %i3 = add i64 %i2, 1
+ %i4 = xor i64 %i1, %arg2
+ %i5 = mul i64 %i4, %arg2
+ %i9 = add i64 %i4, %i5
+ %i8 = and i64 %i5, %i9
+ %i6 = add i64 %i8, %arg2
+ %i7 = icmp ugt i64 %i2, 255
+ br i1 %i7, label %bb7, label %bb2
+
+bb7: ; preds = %bb2
+ %r = trunc i64 %i8 to i32
+ ret i32 %r
+}
+
+define signext i32 @test13(i64 %arg1, i64 %arg2, i64 %arg3) {
+; CHECK-LABEL: test13:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: addi.d $a2, $a2, -1
+; CHECK-NEXT: ori $a3, $zero, 256
+; CHECK-NEXT: .p2align 4, , 16
+; CHECK-NEXT: .LBB12_1: # %bb2
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: div.d $a0, $a0, $a1
+; CHECK-NEXT: addi.d $a2, $a2, 1
+; CHECK-NEXT: add.d $a0, $a0, $a1
+; CHECK-NEXT: bltu $a2, $a3, .LBB12_1
+; CHECK-NEXT: # %bb.2: # %bb7
+; CHECK-NEXT: addi.w $a0, $a0, 0
+; CHECK-NEXT: ret
+entry:
+ br label %bb2
+
+bb2: ; preds = %bb2, %entry
+ %i1 = phi i64 [ %arg1, %entry ], [ %i5, %bb2 ]
+ %i2 = phi i64 [ %arg3, %entry ], [ %i3, %bb2 ]
+ %i3 = add i64 %i2, 1
+ %i4 = sdiv i64 %i1, %arg2
+ %i5 = add i64 %i4, %arg2
+ %i6 = icmp ugt i64 %i2, 255
+ br i1 %i6, label %bb7, label %bb2
+
+bb7: ; preds = %bb2
+ %i8 = trunc i64 %i5 to i32
+ ret i32 %i8
+}
+
+define signext i32 @test14(i32 signext %0, i32 signext %1) {
+; CHECK-LABEL: test14:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ori $a2, $zero, 2
+; CHECK-NEXT: blt $a1, $a2, .LBB13_4
+; CHECK-NEXT: # %bb.1: # %.preheader
+; CHECK-NEXT: ori $a3, $zero, 1
+; CHECK-NEXT: addi.w $a2, $zero, -1
+; CHECK-NEXT: lu32i.d $a2, 0
+; CHECK-NEXT: ori $a4, $zero, 1000
+; CHECK-NEXT: .p2align 4, , 16
+; CHECK-NEXT: .LBB13_2: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: addi.w $a5, $a0, 0
+; CHECK-NEXT: blt $a4, $a5, .LBB13_5
+; CHECK-NEXT: # %bb.3: # in Loop: Header=BB13_2 Depth=1
+; CHECK-NEXT: add.d $a0, $a3, $a0
+; CHECK-NEXT: addi.w $a3, $a3, 1
+; CHECK-NEXT: blt $a3, $a1, .LBB13_2
+; CHECK-NEXT: .LBB13_4:
+; CHECK-NEXT: addi.w $a0, $a0, 0
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB13_5:
+; CHECK-NEXT: addi.w $a0, $a2, 0
+; CHECK-NEXT: ret
+ %3 = icmp sgt i32 %1, 1
+ br i1 %3, label %4, label %12
+
+4: ; preds = %2, %8
+ %5 = phi i32 [ %10, %8 ], [ 1, %2 ]
+ %6 = phi i32 [ %9, %8 ], [ %0, %2 ]
+ %7 = icmp sgt i32 %6, 1000
+ br i1 %7, label %12, label %8
+
+8: ; preds = %4
+ %9 = add nsw i32 %5, %6
+ %10 = add nuw nsw i32 %5, 1
+ %11 = icmp slt i32 %10, %1
+ br i1 %11, label %4, label %12
+
+12: ; preds = %8, %4, %2
+ %13 = phi i32 [ %0, %2 ], [ -1, %4 ], [ %9, %8 ]
+ ret i32 %13
+}
+
+define signext i32 @test14b(i32 %0, i32 signext %1) {
+; CHECK-LABEL: test14b:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ori $a2, $zero, 2
+; CHECK-NEXT: blt $a1, $a2, .LBB14_4
+; CHECK-NEXT: # %bb.1: # %.preheader
+; CHECK-NEXT: ori $a3, $zero, 1
+; CHECK-NEXT: addi.w $a2, $zero, -1
+; CHECK-NEXT: lu32i.d $a2, 0
+; CHECK-NEXT: ori $a4, $zero, 1000
+; CHECK-NEXT: .p2align 4, , 16
+; CHECK-NEXT: .LBB14_2: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: addi.w $a5, $a0, 0
+; CHECK-NEXT: blt $a4, $a5, .LBB14_5
+; CHECK-NEXT: # %bb.3: # in Loop: Header=BB14_2 Depth=1
+; CHECK-NEXT: add.d $a0, $a3, $a0
+; CHECK-NEXT: addi.w $a3, $a3, 1
+; CHECK-NEXT: blt $a3, $a1, .LBB14_2
+; CHECK-NEXT: .LBB14_4:
+; CHECK-NEXT: addi.w $a0, $a0, 0
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB14_5:
+; CHECK-NEXT: addi.w $a0, $a2, 0
+; CHECK-NEXT: ret
+ %3 = icmp sgt i32 %1, 1
+ br i1 %3, label %4, label %12
+
+4: ; preds = %2, %8
+ %5 = phi i32 [ %10, %8 ], [ 1, %2 ]
+ %6 = phi i32 [ %9, %8 ], [ %0, %2 ]
+ %7 = icmp sgt i32 %6, 1000
+ br i1 %7, label %12, label %8
+
+8: ; preds = %4
+ %9 = add nsw i32 %5, %6
+ %10 = add nuw nsw i32 %5, 1
+ %11 = icmp slt i32 %10, %1
+ br i1 %11, label %4, label %12
+
+12: ; preds = %8, %4, %2
+ %13 = phi i32 [ %0, %2 ], [ -1, %4 ], [ %9, %8 ]
+ ret i32 %13
+}
+
+define signext i32 @test14c(i32 zeroext %0, i32 signext %1) {
+; CHECK-LABEL: test14c:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ori $a2, $zero, 2
+; CHECK-NEXT: blt $a1, $a2, .LBB15_4
+; CHECK-NEXT: # %bb.1: # %.preheader
+; CHECK-NEXT: ori $a3, $zero, 1
+; CHECK-NEXT: addi.w $a2, $zero, -1
+; CHECK-NEXT: lu32i.d $a2, 0
+; CHECK-NEXT: ori $a4, $zero, 1000
+; CHECK-NEXT: .p2align 4, , 16
+; CHECK-NEXT: .LBB15_2: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: addi.w $a5, $a0, 0
+; CHECK-NEXT: blt $a4, $a5, .LBB15_5
+; CHECK-NEXT: # %bb.3: # in Loop: Header=BB15_2 Depth=1
+; CHECK-NEXT: add.d $a0, $a3, $a0
+; CHECK-NEXT: addi.w $a3, $a3, 1
+; CHECK-NEXT: blt $a3, $a1, .LBB15_2
+; CHECK-NEXT: .LBB15_4:
+; CHECK-NEXT: addi.w $a0, $a0, 0
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB15_5:
+; CHECK-NEXT: addi.w $a0, $a2, 0
+; CHECK-NEXT: ret
+ %3 = icmp sgt i32 %1, 1
+ br i1 %3, label %4, label %12
+
+4: ; preds = %2, %8
+ %5 = phi i32 [ %10, %8 ], [ 1, %2 ]
+ %6 = phi i32 [ %9, %8 ], [ %0, %2 ]
+ %7 = icmp sgt i32 %6, 1000
+ br i1 %7, label %12, label %8
+
+8: ; preds = %4
+ %9 = add nsw i32 %5, %6
+ %10 = add nuw nsw i32 %5, 1
+ %11 = icmp slt i32 %10, %1
+ br i1 %11, label %4, label %12
+
+12: ; preds = %8, %4, %2
+ %13 = phi i32 [ %0, %2 ], [ -1, %4 ], [ %9, %8 ]
+ ret i32 %13
+}
+
+define signext i32 @test14d(i31 zeroext %0, i32 signext %1) {
+; CHECK-LABEL: test14d:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ori $a2, $zero, 2
+; CHECK-NEXT: blt $a1, $a2, .LBB16_4
+; CHECK-NEXT: # %bb.1: # %.preheader
+; CHECK-NEXT: ori $a3, $zero, 1
+; CHECK-NEXT: addi.w $a2, $zero, -1
+; CHECK-NEXT: lu32i.d $a2, 0
+; CHECK-NEXT: ori $a4, $zero, 1000
+; CHECK-NEXT: .p2align 4, , 16
+; CHECK-NEXT: .LBB16_2: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: addi.w $a5, $a0, 0
+; CHECK-NEXT: blt $a4, $a5, .LBB16_5
+; CHECK-NEXT: # %bb.3: # in Loop: Header=BB16_2 Depth=1
+; CHECK-NEXT: add.d $a0, $a3, $a0
+; CHECK-NEXT: addi.w $a3, $a3, 1
+; CHECK-NEXT: blt $a3, $a1, .LBB16_2
+; CHECK-NEXT: .LBB16_4:
+; CHECK-NEXT: addi.w $a0, $a0, 0
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB16_5:
+; CHECK-NEXT: addi.w $a0, $a2, 0
+; CHECK-NEXT: ret
+ %zext = zext i31 %0 to i32
+ %3 = icmp sgt i32 %1, 1
+ br i1 %3, label %4, label %12
+
+4: ; preds = %2, %8
+ %5 = phi i32 [ %10, %8 ], [ 1, %2 ]
+ %6 = phi i32 [ %9, %8 ], [ %zext, %2 ]
+ %7 = icmp sgt i32 %6, 1000
+ br i1 %7, label %12, label %8
+
+8: ; preds = %4
+ %9 = add nsw i32 %5, %6
+ %10 = add nuw nsw i32 %5, 1
+ %11 = icmp slt i32 %10, %1
+ br i1 %11, label %4, label %12
+
+12: ; preds = %8, %4, %2
+ %13 = phi i32 [ %zext, %2 ], [ -1, %4 ], [ %9, %8 ]
+ ret i32 %13
+}
+
+define signext i32 @test15(i64 %arg1, i64 %arg2, i64 %arg3, ptr %arg4) {
+; CHECK-LABEL: test15:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: addi.d $a2, $a2, -1
+; CHECK-NEXT: ori $a4, $zero, 256
+; CHECK-NEXT: .p2align 4, , 16
+; CHECK-NEXT: .LBB17_1: # %bb2
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: andi $a0, $a0, 1234
+; CHECK-NEXT: add.d $a0, $a0, $a1
+; CHECK-NEXT: addi.d $a2, $a2, 1
+; CHECK-NEXT: st.w $a0, $a3, 0
+; CHECK-NEXT: bltu $a2, $a4, .LBB17_1
+; CHECK-NEXT: # %bb.2: # %bb7
+; CHECK-NEXT: addi.w $a0, $a0, 0
+; CHECK-NEXT: ret
+entry:
+ br label %bb2
+
+bb2: ; preds = %bb2, %entry
+ %i1 = phi i64 [ %arg1, %entry ], [ %i5, %bb2 ]
+ %i2 = phi i64 [ %arg3, %entry ], [ %i3, %bb2 ]
+ %i3 = add i64 %i2, 1
+ %i4 = and i64 %i1, 1234
+ %i5 = add i64 %i4, %arg2
+ %i8 = trunc i64 %i5 to i32
+ store i32 %i8, ptr %arg4
+ %i6 = icmp ugt i64 %i2, 255
+ br i1 %i6, label %bb7, label %bb2
+
+bb7: ; preds = %bb2
+ %i7 = trunc i64 %i5 to i32
+ ret i32 %i7
+}
+
+define signext i32 @bug(i32 signext %x) {
+; CHECK-LABEL: bug:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: beqz $a0, .LBB18_2
+; CHECK-NEXT: # %bb.1: # %if.end
+; CHECK-NEXT: bstrpick.d $a1, $a0, 31, 16
+; CHECK-NEXT: sltui $a1, $a1, 1
+; CHECK-NEXT: slli.d $a2, $a0, 16
+; CHECK-NEXT: masknez $a0, $a0, $a1
+; CHECK-NEXT: maskeqz $a2, $a2, $a1
+; CHECK-NEXT: or $a0, $a2, $a0
+; CHECK-NEXT: ori $a2, $zero, 32
+; CHECK-NEXT: masknez $a2, $a2, $a1
+; CHECK-NEXT: ori $a3, $zero, 16
+; CHECK-NEXT: maskeqz $a1, $a3, $a1
+; CHECK-NEXT: or $a1, $a1, $a2
+; CHECK-NEXT: bstrpick.d $a2, $a0, 31, 24
+; CHECK-NEXT: sltui $a2, $a2, 1
+; CHECK-NEXT: slli.d $a3, $a0, 8
+; CHECK-NEXT: addi.d $a4, $a1, -8
+; CHECK-NEXT: masknez $a0, $a0, $a2
+; CHECK-NEXT: maskeqz $a3, $a3, $a2
+; CHECK-NEXT: or $a0, $a3, $a0
+; CHECK-NEXT: masknez $a1, $a1, $a2
+; CHECK-NEXT: maskeqz $a2, $a4, $a2
+; CHECK-NEXT: or $a1, $a2, $a1
+; CHECK-NEXT: bstrpick.d $a2, $a0, 31, 28
+; CHECK-NEXT: sltui $a2, $a2, 1
+; CHECK-NEXT: slli.d $a3, $a0, 4
+; CHECK-NEXT: addi.d $a4, $a1, -4
+; CHECK-NEXT: masknez $a0, $a0, $a2
+; CHECK-NEXT: maskeqz $a3, $a3, $a2
+; CHECK-NEXT: or $a0, $a3, $a0
+; CHECK-NEXT: masknez $a1, $a1, $a2
+; CHECK-NEXT: maskeqz $a2, $a4, $a2
+; CHECK-NEXT: or $a1, $a2, $a1
+; CHECK-NEXT: bstrpick.d $a2, $a0, 31, 30
+; CHECK-NEXT: sltui $a2, $a2, 1
+; CHECK-NEXT: slli.d $a3, $a0, 2
+; CHECK-NEXT: addi.d $a4, $a1, -2
+; CHECK-NEXT: masknez $a0, $a0, $a2
+; CHECK-NEXT: maskeqz $a3, $a3, $a2
+; CHECK-NEXT: or $a0, $a3, $a0
+; CHECK-NEXT: addi.w $a0, $a0, 0
+; CHECK-NEXT: masknez $a1, $a1, $a2
+; CHECK-NEXT: maskeqz $a2, $a4, $a2
+; CHECK-NEXT: or $a1, $a2, $a1
+; CHECK-NEXT: srai.d $a0, $a0, 31
+; CHECK-NEXT: nor $a0, $a0, $zero
+; CHECK-NEXT: add.d $a0, $a1, $a0
+; CHECK-NEXT: addi.w $a0, $a0, 0
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB18_2:
+; CHECK-NEXT: addi.w $a0, $zero, 0
+; CHECK-NEXT: ret
+entry:
+ %tobool.not = icmp eq i32 %x, 0
+ br i1 %tobool.not, label %cleanup, label %if.end
+
+if.end: ; preds = %entry
+ %tobool1.not = icmp ult i32 %x, 65536
+ %shl = shl i32 %x, 16
+ %spec.select = select i1 %tobool1.not, i32 %shl, i32 %x
+ %spec.select43 = select i1 %tobool1.not, i32 16, i32 32
+ %tobool5.not = icmp ult i32 %spec.select, 16777216
+ %shl7 = shl i32 %spec.select, 8
+ %sub8 = add nsw i32 %spec.select43, -8
+ %x.addr.1 = select i1 %tobool5.not, i32 %shl7, i32 %spec.select
+ %r.1 = select i1 %tobool5.not, i32 %sub8, i32 %spec.select43
+ %tobool11.not = icmp ult i32 %x.addr.1, 268435456
+ %shl13 = shl i32 %x.addr.1, 4
+ %sub14 = add nsw i32 %r.1, -4
+ %x.addr.2 = select i1 %tobool11.not, i32 %shl13, i32 %x.addr.1
+ %r.2 = select i1 %tobool11.not, i32 %sub14, i32 %r.1
+ %tobool17.not = icmp ult i32 %x.addr.2, 1073741824
+ %shl19 = shl i32 %x.addr.2, 2
+ %sub20 = add nsw i32 %r.2, -2
+ %x.addr.3 = select i1 %tobool17.not, i32 %shl19, i32 %x.addr.2
+ %r.3 = select i1 %tobool17.not, i32 %sub20, i32 %r.2
+ %x.addr.3.lobit = ashr i32 %x.addr.3, 31
+ %x.addr.3.lobit.not = xor i32 %x.addr.3.lobit, -1
+ %r.4 = add nsw i32 %r.3, %x.addr.3.lobit.not
+ br label %cleanup
+
+cleanup: ; preds = %entry, %if.end
+ %retval.0 = phi i32 [ %r.4, %if.end ], [ 0, %entry ]
+ ret i32 %retval.0
+}
+
+define void @test16(i32 signext %arg, i32 signext %arg1) nounwind {
+; CHECK-LABEL: test16:
+; CHECK: # %bb.0: # %bb
+; CHECK-NEXT: addi.d $sp, $sp, -32
+; CHECK-NEXT: st.d $ra, $sp, 24 # 8-byte Folded Spill
+; CHECK-NEXT: st.d $fp, $sp, 16 # 8-byte Folded Spill
+; CHECK-NEXT: st.d $s0, $sp, 8 # 8-byte Folded Spill
+; CHECK-NEXT: move $fp, $a1
+; CHECK-NEXT: bl %plt(bar)
+; CHECK-NEXT: move $s0, $a0
+; CHECK-NEXT: .p2align 4, , 16
+; CHECK-NEXT: .LBB19_1: # %bb2
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: addi.w $a0, $s0, 0
+; CHECK-NEXT: bl %plt(bar)
+; CHECK-NEXT: sll.w $s0, $s0, $fp
+; CHECK-NEXT: bnez $a0, .LBB19_1
+; CHECK-NEXT: # %bb.2: # %bb7
+; CHECK-NEXT: ld.d $s0, $sp, 8 # 8-byte Folded Reload
+; CHECK-NEXT: ld.d $fp, $sp, 16 # 8-byte Folded Reload
+; CHECK-NEXT: ld.d $ra, $sp, 24 # 8-byte Folded Reload
+; CHECK-NEXT: addi.d $sp, $sp, 32
+; CHECK-NEXT: ret
+bb:
+ %i = call signext i32 @bar(i32 signext %arg)
+ br label %bb2
+
+bb2: ; preds = %bb2, %bb
+ %i3 = phi i32 [ %i, %bb ], [ %i5, %bb2 ]
+ %i4 = tail call signext i32 @bar(i32 signext %i3)
+ %i5 = shl i32 %i3, %arg1
+ %i6 = icmp eq i32 %i4, 0
+ br i1 %i6, label %bb7, label %bb2
+
+bb7: ; preds = %bb2
+ ret void
+}
+
+define void @test17(i32 signext %arg, i32 signext %arg1) nounwind {
+; CHECK-LABEL: test17:
+; CHECK: # %bb.0: # %bb
+; CHECK-NEXT: addi.d $sp, $sp, -32
+; CHECK-NEXT: st.d $ra, $sp, 24 # 8-byte Folded Spill
+; CHECK-NEXT: st.d $fp, $sp, 16 # 8-byte Folded Spill
+; CHECK-NEXT: st.d $s0, $sp, 8 # 8-byte Folded Spill
+; CHECK-NEXT: move $fp, $a1
+; CHECK-NEXT: bl %plt(bat)
+; CHECK-NEXT: move $s0, $a0
+; CHECK-NEXT: .p2align 4, , 16
+; CHECK-NEXT: .LBB20_1: # %bb2
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: addi.w $a0, $s0, 0
+; CHECK-NEXT: bl %plt(bar)
+; CHECK-NEXT: sll.w $s0, $s0, $fp
+; CHECK-NEXT: bnez $a0, .LBB20_1
+; CHECK-NEXT: # %bb.2: # %bb7
+; CHECK-NEXT: ld.d $s0, $sp, 8 # 8-byte Folded Reload
+; CHECK-NEXT: ld.d $fp, $sp, 16 # 8-byte Folded Reload
+; CHECK-NEXT: ld.d $ra, $sp, 24 # 8-byte Folded Reload
+; CHECK-NEXT: addi.d $sp, $sp, 32
+; CHECK-NEXT: ret
+bb:
+ %i = call zeroext i16 @bat(i32 signext %arg)
+ %zext = zext i16 %i to i32
+ br label %bb2
+
+bb2: ; preds = %bb2, %bb
+ %i3 = phi i32 [ %zext, %bb ], [ %i5, %bb2 ]
+ %i4 = tail call signext i32 @bar(i32 signext %i3)
+ %i5 = shl i32 %i3, %arg1
+ %i6 = icmp eq i32 %i4, 0
+ br i1 %i6, label %bb7, label %bb2
+
+bb7: ; preds = %bb2
+ ret void
+}
+declare zeroext i16 @bat(i32 signext)
+
+define signext i32 @sextw_sh2add(i1 zeroext %0, ptr %1, i32 signext %2, i32 signext %3, i32 signext %4) {
+; CHECK-LABEL: sextw_sh2add:
+; CHECK: # %bb.0:
+; CHECK-NEXT: alsl.d $a2, $a2, $a3, 2
+; CHECK-NEXT: beqz $a0, .LBB21_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: st.w $a2, $a1, 0
+; CHECK-NEXT: .LBB21_2:
+; CHECK-NEXT: add.w $a0, $a2, $a4
+; CHECK-NEXT: ret
+ %6 = shl i32 %2, 2
+ %7 = add i32 %6, %3
+ br i1 %0, label %8, label %9
+
+8: ; preds = %5
+ store i32 %7, ptr %1, align 4
+ br label %9
+
+9: ; preds = %5, %8
+ %10 = add i32 %7, %4
+ ret i32 %10
+}
+
+define signext i32 @test19(i64 %arg, i1 zeroext %c1, i1 zeroext %c2, ptr %p) nounwind {
+; CHECK-LABEL: test19:
+; CHECK: # %bb.0: # %bb
+; CHECK-NEXT: addi.d $sp, $sp, -16
+; CHECK-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
+; CHECK-NEXT: st.d $fp, $sp, 0 # 8-byte Folded Spill
+; CHECK-NEXT: ori $a0, $zero, 35
+; CHECK-NEXT: lu32i.d $a0, 1
+; CHECK-NEXT: maskeqz $fp, $a0, $a1
+; CHECK-NEXT: st.d $fp, $a3, 0
+; CHECK-NEXT: beqz $a2, .LBB22_2
+; CHECK-NEXT: # %bb.1: # %bb2
+; CHECK-NEXT: move $a0, $zero
+; CHECK-NEXT: bl %plt(bar)
+; CHECK-NEXT: move $fp, $a0
+; CHECK-NEXT: .LBB22_2: # %bb7
+; CHECK-NEXT: bl %plt(side_effect)
+; CHECK-NEXT: addi.w $a0, $fp, 0
+; CHECK-NEXT: ld.d $fp, $sp, 0 # 8-byte Folded Reload
+; CHECK-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; CHECK-NEXT: addi.d $sp, $sp, 16
+; CHECK-NEXT: ret
+bb:
+ %sel = select i1 %c1, i64 4294967331, i64 0
+ store i64 %sel, ptr %p, align 8
+ br i1 %c2, label %bb2, label %bb7
+
+bb2: ; preds = %bb2, %bb
+ %i4 = call signext i32 @bar(i32 0)
+ %i4.sext = sext i32 %i4 to i64
+ br label %bb7
+
+bb7: ; preds = %bb2
+ %phi = phi i64 [ %sel, %bb ], [ %i4.sext, %bb2 ]
+ %trunc = trunc i64 %phi to i32
+ call void @side_effect()
+ ret i32 %trunc
+}
+
+declare void @side_effect(i64)
diff --git a/llvm/test/CodeGen/M68k/Arith/add-with-overflow.ll b/llvm/test/CodeGen/M68k/Arith/add-with-overflow.ll
index bd5e593edb33..70479b0b3ec6 100644
--- a/llvm/test/CodeGen/M68k/Arith/add-with-overflow.ll
+++ b/llvm/test/CodeGen/M68k/Arith/add-with-overflow.ll
@@ -35,7 +35,7 @@ define fastcc i1 @test6(i32 %v1, i32 %v2, ptr %X) nounwind {
; CHECK-NEXT: ; %bb.1: ; %normal
; CHECK-NEXT: move.l #0, (%a0)
; CHECK-NEXT: .LBB1_2: ; %carry
-; CHECK-NEXT: move.b #0, %d0
+; CHECK-NEXT: moveq #0, %d0
; CHECK-NEXT: rts
entry:
%t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v1, i32 %v2)
diff --git a/llvm/test/CodeGen/M68k/Arith/add.ll b/llvm/test/CodeGen/M68k/Arith/add.ll
index 281751e3e183..a9eb0bb815b0 100644
--- a/llvm/test/CodeGen/M68k/Arith/add.ll
+++ b/llvm/test/CodeGen/M68k/Arith/add.ll
@@ -43,7 +43,7 @@ define fastcc void @test3(ptr inreg %a) nounwind {
; CHECK-NEXT: suba.l #4, %sp
; CHECK-NEXT: movem.l %d2, (0,%sp) ; 8-byte Folded Spill
; CHECK-NEXT: move.l (%a0), %d0
-; CHECK-NEXT: move.l #0, %d1
+; CHECK-NEXT: moveq #0, %d1
; CHECK-NEXT: move.l #-2147483648, %d2
; CHECK-NEXT: add.l (4,%a0), %d2
; CHECK-NEXT: addx.l %d0, %d1
@@ -64,7 +64,7 @@ define fastcc void @test4(ptr inreg %a) nounwind {
; CHECK-NEXT: suba.l #4, %sp
; CHECK-NEXT: movem.l %d2, (0,%sp) ; 8-byte Folded Spill
; CHECK-NEXT: move.l (%a0), %d0
-; CHECK-NEXT: move.l #0, %d1
+; CHECK-NEXT: moveq #0, %d1
; CHECK-NEXT: move.l #128, %d2
; CHECK-NEXT: add.l (4,%a0), %d2
; CHECK-NEXT: addx.l %d0, %d1
diff --git a/llvm/test/CodeGen/M68k/Arith/bitwise.ll b/llvm/test/CodeGen/M68k/Arith/bitwise.ll
index 70e4dd42bfb6..74fc543a5fb8 100644
--- a/llvm/test/CodeGen/M68k/Arith/bitwise.ll
+++ b/llvm/test/CodeGen/M68k/Arith/bitwise.ll
@@ -242,7 +242,7 @@ define i64 @lshr64(i64 %a, i64 %b) nounwind {
; CHECK-NEXT: add.l #-32, %d1
; CHECK-NEXT: bmi .LBB18_1
; CHECK-NEXT: ; %bb.2:
-; CHECK-NEXT: move.l #0, %d0
+; CHECK-NEXT: moveq #0, %d0
; CHECK-NEXT: bra .LBB18_3
; CHECK-NEXT: .LBB18_1:
; CHECK-NEXT: move.l %d2, %d0
@@ -301,7 +301,7 @@ define i64 @ashr64(i64 %a, i64 %b) nounwind {
; CHECK-NEXT: add.l #-32, %d3
; CHECK-NEXT: bmi .LBB19_5
; CHECK-NEXT: ; %bb.4:
-; CHECK-NEXT: move.l #31, %d2
+; CHECK-NEXT: moveq #31, %d2
; CHECK-NEXT: .LBB19_5:
; CHECK-NEXT: asr.l %d2, %d0
; CHECK-NEXT: movem.l (0,%sp), %d2-%d3 ; 12-byte Folded Reload
@@ -322,7 +322,7 @@ define i64 @shl64(i64 %a, i64 %b) nounwind {
; CHECK-NEXT: add.l #-32, %d0
; CHECK-NEXT: bmi .LBB20_1
; CHECK-NEXT: ; %bb.2:
-; CHECK-NEXT: move.l #0, %d1
+; CHECK-NEXT: moveq #0, %d1
; CHECK-NEXT: bra .LBB20_3
; CHECK-NEXT: .LBB20_1:
; CHECK-NEXT: move.l %d2, %d1
diff --git a/llvm/test/CodeGen/M68k/Arith/divide-by-constant.ll b/llvm/test/CodeGen/M68k/Arith/divide-by-constant.ll
index 834dfe1c26f0..fcc8dd3e7662 100644
--- a/llvm/test/CodeGen/M68k/Arith/divide-by-constant.ll
+++ b/llvm/test/CodeGen/M68k/Arith/divide-by-constant.ll
@@ -40,7 +40,7 @@ define zeroext i8 @test3(i8 zeroext %x, i8 zeroext %c) {
; CHECK-NEXT: move.b (11,%sp), %d0
; CHECK-NEXT: and.l #255, %d0
; CHECK-NEXT: muls #171, %d0
-; CHECK-NEXT: move.w #9, %d1
+; CHECK-NEXT: moveq #9, %d1
; CHECK-NEXT: lsr.w %d1, %d0
; CHECK-NEXT: and.l #65535, %d0
; CHECK-NEXT: rts
@@ -58,7 +58,7 @@ define signext i16 @test4(i16 signext %x) nounwind {
; CHECK-NEXT: muls #1986, %d0
; CHECK-NEXT: asr.l #8, %d0
; CHECK-NEXT: asr.l #8, %d0
-; CHECK-NEXT: move.w #15, %d1
+; CHECK-NEXT: moveq #15, %d1
; CHECK-NEXT: move.w %d0, %d2
; CHECK-NEXT: lsr.w %d1, %d2
; CHECK-NEXT: add.w %d2, %d0
@@ -94,7 +94,7 @@ define signext i16 @test6(i16 signext %x) nounwind {
; CHECK-NEXT: muls #26215, %d0
; CHECK-NEXT: asr.l #8, %d0
; CHECK-NEXT: asr.l #8, %d0
-; CHECK-NEXT: move.w #15, %d1
+; CHECK-NEXT: moveq #15, %d1
; CHECK-NEXT: move.w %d0, %d2
; CHECK-NEXT: lsr.w %d1, %d2
; CHECK-NEXT: asr.w #2, %d0
@@ -128,7 +128,7 @@ define i8 @test8(i8 %x) nounwind {
; CHECK-NEXT: lsr.b #1, %d0
; CHECK-NEXT: and.l #255, %d0
; CHECK-NEXT: muls #211, %d0
-; CHECK-NEXT: move.w #13, %d1
+; CHECK-NEXT: moveq #13, %d1
; CHECK-NEXT: lsr.w %d1, %d0
; CHECK-NEXT: ; kill: def $bd0 killed $bd0 killed $d0
; CHECK-NEXT: rts
@@ -143,7 +143,7 @@ define i8 @test9(i8 %x) nounwind {
; CHECK-NEXT: lsr.b #2, %d0
; CHECK-NEXT: and.l #255, %d0
; CHECK-NEXT: muls #71, %d0
-; CHECK-NEXT: move.w #11, %d1
+; CHECK-NEXT: moveq #11, %d1
; CHECK-NEXT: lsr.w %d1, %d0
; CHECK-NEXT: ; kill: def $bd0 killed $bd0 killed $d0
; CHECK-NEXT: rts
@@ -156,11 +156,11 @@ define i32 @testsize1(i32 %x) minsize nounwind {
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: suba.l #4, %sp
; CHECK-NEXT: movem.l %d2, (0,%sp) ; 8-byte Folded Spill
-; CHECK-NEXT: move.l #31, %d1
+; CHECK-NEXT: moveq #31, %d1
; CHECK-NEXT: move.l (8,%sp), %d0
; CHECK-NEXT: move.l %d0, %d2
; CHECK-NEXT: asr.l %d1, %d2
-; CHECK-NEXT: move.l #27, %d1
+; CHECK-NEXT: moveq #27, %d1
; CHECK-NEXT: lsr.l %d1, %d2
; CHECK-NEXT: add.l %d2, %d0
; CHECK-NEXT: asr.l #5, %d0
diff --git a/llvm/test/CodeGen/M68k/Arith/imul.ll b/llvm/test/CodeGen/M68k/Arith/imul.ll
index f53568395c29..a1846e4d51bd 100644
--- a/llvm/test/CodeGen/M68k/Arith/imul.ll
+++ b/llvm/test/CodeGen/M68k/Arith/imul.ll
@@ -19,7 +19,7 @@ define i64 @mul4_64(i64 %A) {
; CHECK-NEXT: suba.l #4, %sp
; CHECK-NEXT: .cfi_def_cfa_offset -8
; CHECK-NEXT: movem.l %d2, (0,%sp) ; 8-byte Folded Spill
-; CHECK-NEXT: move.l #30, %d0
+; CHECK-NEXT: moveq #30, %d0
; CHECK-NEXT: move.l (12,%sp), %d1
; CHECK-NEXT: move.l %d1, %d2
; CHECK-NEXT: lsr.l %d0, %d2
@@ -38,7 +38,7 @@ define i32 @mul4096_32(i32 %A) {
; CHECK-LABEL: mul4096_32:
; CHECK: .cfi_startproc
; CHECK-NEXT: ; %bb.0:
-; CHECK-NEXT: move.l #12, %d1
+; CHECK-NEXT: moveq #12, %d1
; CHECK-NEXT: move.l (4,%sp), %d0
; CHECK-NEXT: lsl.l %d1, %d0
; CHECK-NEXT: rts
@@ -53,11 +53,11 @@ define i64 @mul4096_64(i64 %A) {
; CHECK-NEXT: suba.l #8, %sp
; CHECK-NEXT: .cfi_def_cfa_offset -12
; CHECK-NEXT: movem.l %d2-%d3, (0,%sp) ; 12-byte Folded Spill
-; CHECK-NEXT: move.l #20, %d0
+; CHECK-NEXT: moveq #20, %d0
; CHECK-NEXT: move.l (16,%sp), %d1
; CHECK-NEXT: move.l %d1, %d2
; CHECK-NEXT: lsr.l %d0, %d2
-; CHECK-NEXT: move.l #12, %d3
+; CHECK-NEXT: moveq #12, %d3
; CHECK-NEXT: move.l (12,%sp), %d0
; CHECK-NEXT: lsl.l %d3, %d0
; CHECK-NEXT: or.l %d2, %d0
@@ -73,7 +73,7 @@ define i32 @mulmin4096_32(i32 %A) {
; CHECK-LABEL: mulmin4096_32:
; CHECK: .cfi_startproc
; CHECK-NEXT: ; %bb.0:
-; CHECK-NEXT: move.l #12, %d1
+; CHECK-NEXT: moveq #12, %d1
; CHECK-NEXT: move.l (4,%sp), %d0
; CHECK-NEXT: lsl.l %d1, %d0
; CHECK-NEXT: neg.l %d0
@@ -89,11 +89,11 @@ define i64 @mulmin4096_64(i64 %A) {
; CHECK-NEXT: suba.l #8, %sp
; CHECK-NEXT: .cfi_def_cfa_offset -12
; CHECK-NEXT: movem.l %d2-%d3, (0,%sp) ; 12-byte Folded Spill
-; CHECK-NEXT: move.l #20, %d0
+; CHECK-NEXT: moveq #20, %d0
; CHECK-NEXT: move.l (16,%sp), %d1
; CHECK-NEXT: move.l %d1, %d2
; CHECK-NEXT: lsr.l %d0, %d2
-; CHECK-NEXT: move.l #12, %d3
+; CHECK-NEXT: moveq #12, %d3
; CHECK-NEXT: move.l (12,%sp), %d0
; CHECK-NEXT: lsl.l %d3, %d0
; CHECK-NEXT: or.l %d2, %d0
@@ -258,7 +258,7 @@ define i32 @mul0_32(i32 %A) {
; CHECK-LABEL: mul0_32:
; CHECK: .cfi_startproc
; CHECK-NEXT: ; %bb.0:
-; CHECK-NEXT: move.l #0, %d0
+; CHECK-NEXT: moveq #0, %d0
; CHECK-NEXT: rts
%mul = mul i32 %A, 0
ret i32 %mul
diff --git a/llvm/test/CodeGen/M68k/Arith/smul-with-overflow.ll b/llvm/test/CodeGen/M68k/Arith/smul-with-overflow.ll
index 5bd4d5d48bc8..10a797f13441 100644
--- a/llvm/test/CodeGen/M68k/Arith/smul-with-overflow.ll
+++ b/llvm/test/CodeGen/M68k/Arith/smul-with-overflow.ll
@@ -24,7 +24,7 @@ entry:
define zeroext i8 @smul_i8_no_ovf(i8 signext %a, i8 signext %b) nounwind ssp {
; CHECK-LABEL: smul_i8_no_ovf:
; CHECK: ; %bb.0: ; %entry
-; CHECK-NEXT: move.l #42, %d0
+; CHECK-NEXT: moveq #42, %d0
; CHECK-NEXT: rts
entry:
%smul = tail call { i8, i1 } @llvm.smul.with.overflow.i8(i8 %a, i8 %b)
@@ -70,7 +70,7 @@ define fastcc i1 @test1(i32 %v1, i32 %v2) nounwind {
; CHECK-NEXT: lea (no,%pc), %a0
; CHECK-NEXT: move.l %a0, (%sp)
; CHECK-NEXT: jsr printf@PLT
-; CHECK-NEXT: move.b #0, %d0
+; CHECK-NEXT: moveq #0, %d0
; CHECK-NEXT: adda.l #12, %sp
; CHECK-NEXT: rts
; CHECK-NEXT: .LBB3_1: ; %normal
@@ -78,7 +78,7 @@ define fastcc i1 @test1(i32 %v1, i32 %v2) nounwind {
; CHECK-NEXT: lea (ok,%pc), %a0
; CHECK-NEXT: move.l %a0, (%sp)
; CHECK-NEXT: jsr printf@PLT
-; CHECK-NEXT: move.b #1, %d0
+; CHECK-NEXT: moveq #1, %d0
; CHECK-NEXT: adda.l #12, %sp
; CHECK-NEXT: rts
entry:
@@ -108,7 +108,7 @@ define fastcc i1 @test2(i32 %v1, i32 %v2) nounwind {
; CHECK-NEXT: lea (no,%pc), %a0
; CHECK-NEXT: move.l %a0, (%sp)
; CHECK-NEXT: jsr printf@PLT
-; CHECK-NEXT: move.b #0, %d0
+; CHECK-NEXT: moveq #0, %d0
; CHECK-NEXT: adda.l #12, %sp
; CHECK-NEXT: rts
; CHECK-NEXT: .LBB4_2: ; %normal
@@ -116,7 +116,7 @@ define fastcc i1 @test2(i32 %v1, i32 %v2) nounwind {
; CHECK-NEXT: lea (ok,%pc), %a0
; CHECK-NEXT: move.l %a0, (%sp)
; CHECK-NEXT: jsr printf@PLT
-; CHECK-NEXT: move.b #1, %d0
+; CHECK-NEXT: moveq #1, %d0
; CHECK-NEXT: adda.l #12, %sp
; CHECK-NEXT: rts
entry:
@@ -155,7 +155,7 @@ define i32 @test4(i32 %a, i32 %b) nounwind readnone {
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: move.l (8,%sp), %d0
; CHECK-NEXT: add.l (4,%sp), %d0
-; CHECK-NEXT: move.l #4, %d1
+; CHECK-NEXT: moveq #4, %d1
; CHECK-NEXT: muls.l %d1, %d0
; CHECK-NEXT: rts
entry:
diff --git a/llvm/test/CodeGen/M68k/Arith/sub-with-overflow.ll b/llvm/test/CodeGen/M68k/Arith/sub-with-overflow.ll
index 8d47c7ebf7e5..be3223156986 100644
--- a/llvm/test/CodeGen/M68k/Arith/sub-with-overflow.ll
+++ b/llvm/test/CodeGen/M68k/Arith/sub-with-overflow.ll
@@ -19,7 +19,7 @@ define i1 @func1(i32 %v1, i32 %v2) nounwind {
; CHECK-NEXT: lea (no,%pc), %a0
; CHECK-NEXT: move.l %a0, (%sp)
; CHECK-NEXT: jsr printf@PLT
-; CHECK-NEXT: move.b #0, %d0
+; CHECK-NEXT: moveq #0, %d0
; CHECK-NEXT: adda.l #12, %sp
; CHECK-NEXT: rts
; CHECK-NEXT: .LBB0_1: ; %normal
@@ -27,7 +27,7 @@ define i1 @func1(i32 %v1, i32 %v2) nounwind {
; CHECK-NEXT: lea (ok,%pc), %a0
; CHECK-NEXT: move.l %a0, (%sp)
; CHECK-NEXT: jsr printf@PLT
-; CHECK-NEXT: move.b #1, %d0
+; CHECK-NEXT: moveq #1, %d0
; CHECK-NEXT: adda.l #12, %sp
; CHECK-NEXT: rts
entry:
@@ -56,7 +56,7 @@ define i1 @func2(i32 %v1, i32 %v2) nounwind {
; CHECK-NEXT: lea (no,%pc), %a0
; CHECK-NEXT: move.l %a0, (%sp)
; CHECK-NEXT: jsr printf@PLT
-; CHECK-NEXT: move.b #0, %d0
+; CHECK-NEXT: moveq #0, %d0
; CHECK-NEXT: adda.l #12, %sp
; CHECK-NEXT: rts
; CHECK-NEXT: .LBB1_1: ; %normal
@@ -64,7 +64,7 @@ define i1 @func2(i32 %v1, i32 %v2) nounwind {
; CHECK-NEXT: lea (ok,%pc), %a0
; CHECK-NEXT: move.l %a0, (%sp)
; CHECK-NEXT: jsr printf@PLT
-; CHECK-NEXT: move.b #1, %d0
+; CHECK-NEXT: moveq #1, %d0
; CHECK-NEXT: adda.l #12, %sp
; CHECK-NEXT: rts
entry:
@@ -85,7 +85,7 @@ carry:
define i1 @func3(i32 %x) nounwind {
; CHECK-LABEL: func3:
; CHECK: ; %bb.0: ; %entry
-; CHECK-NEXT: move.l #-1, %d0
+; CHECK-NEXT: moveq #-1, %d0
; CHECK-NEXT: add.l (4,%sp), %d0
; CHECK-NEXT: svs %d0
; CHECK-NEXT: rts
diff --git a/llvm/test/CodeGen/M68k/Arith/sub.ll b/llvm/test/CodeGen/M68k/Arith/sub.ll
index fff3601000df..16d0498b3dbb 100644
--- a/llvm/test/CodeGen/M68k/Arith/sub.ll
+++ b/llvm/test/CodeGen/M68k/Arith/sub.ll
@@ -7,7 +7,7 @@ define i32 @test1(i32 %x) {
; CHECK-NEXT: ; %bb.0:
; CHECK-NEXT: move.l (4,%sp), %d1
; CHECK-NEXT: eori.l #31, %d1
-; CHECK-NEXT: move.l #32, %d0
+; CHECK-NEXT: moveq #32, %d0
; CHECK-NEXT: sub.l %d1, %d0
; CHECK-NEXT: rts
%xor = xor i32 %x, 31
diff --git a/llvm/test/CodeGen/M68k/Arith/umul-with-overflow.ll b/llvm/test/CodeGen/M68k/Arith/umul-with-overflow.ll
index fd128a3e52bd..3314e65399c4 100644
--- a/llvm/test/CodeGen/M68k/Arith/umul-with-overflow.ll
+++ b/llvm/test/CodeGen/M68k/Arith/umul-with-overflow.ll
@@ -24,7 +24,7 @@ entry:
define zeroext i8 @umul_i8_no_ovf(i8 signext %a, i8 signext %b) nounwind ssp {
; CHECK-LABEL: umul_i8_no_ovf:
; CHECK: ; %bb.0: ; %entry
-; CHECK-NEXT: move.l #42, %d0
+; CHECK-NEXT: moveq #42, %d0
; CHECK-NEXT: rts
entry:
%umul = tail call { i8, i1 } @llvm.umul.with.overflow.i8(i8 %a, i8 %b)
@@ -59,7 +59,7 @@ declare {i32, i1} @llvm.umul.with.overflow.i32(i32 %a, i32 %b)
define i1 @a(i32 %x) nounwind {
; CHECK-LABEL: a:
; CHECK: ; %bb.0:
-; CHECK-NEXT: move.l #3, %d0
+; CHECK-NEXT: moveq #3, %d0
; CHECK-NEXT: move.l (4,%sp), %d1
; CHECK-NEXT: mulu.l %d0, %d1
; CHECK-NEXT: svs %d0
@@ -90,7 +90,7 @@ define i32 @test3(i32 %a, i32 %b) nounwind readnone {
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: move.l (8,%sp), %d0
; CHECK-NEXT: add.l (4,%sp), %d0
-; CHECK-NEXT: move.l #4, %d1
+; CHECK-NEXT: moveq #4, %d1
; CHECK-NEXT: mulu.l %d1, %d0
; CHECK-NEXT: rts
entry:
diff --git a/llvm/test/CodeGen/M68k/CConv/c-call.ll b/llvm/test/CodeGen/M68k/CConv/c-call.ll
index a9638eec6a31..badd4e31f37d 100644
--- a/llvm/test/CodeGen/M68k/CConv/c-call.ll
+++ b/llvm/test/CodeGen/M68k/CConv/c-call.ll
@@ -14,7 +14,7 @@ define i32 @test1() nounwind {
; CHECK-NEXT: move.l #2, (4,%sp)
; CHECK-NEXT: move.l #1, (%sp)
; CHECK-NEXT: jsr (test1_callee@PLT,%pc)
-; CHECK-NEXT: move.l #0, %d0
+; CHECK-NEXT: moveq #0, %d0
; CHECK-NEXT: adda.l #20, %sp
; CHECK-NEXT: rts
entry:
@@ -34,7 +34,7 @@ define i16 @test2() nounwind {
; CHECK-NEXT: move.l #2, (4,%sp)
; CHECK-NEXT: move.l #1, (%sp)
; CHECK-NEXT: jsr (test2_callee@PLT,%pc)
-; CHECK-NEXT: move.w #0, %d0
+; CHECK-NEXT: moveq #0, %d0
; CHECK-NEXT: adda.l #20, %sp
; CHECK-NEXT: rts
entry:
@@ -54,7 +54,7 @@ define i8 @test3() nounwind {
; CHECK-NEXT: move.l #2, (4,%sp)
; CHECK-NEXT: move.l #1, (%sp)
; CHECK-NEXT: jsr (test3_callee@PLT,%pc)
-; CHECK-NEXT: move.b #0, %d0
+; CHECK-NEXT: moveq #0, %d0
; CHECK-NEXT: adda.l #20, %sp
; CHECK-NEXT: rts
entry:
diff --git a/llvm/test/CodeGen/M68k/CConv/fastcc-call.ll b/llvm/test/CodeGen/M68k/CConv/fastcc-call.ll
index 4b0f8ed254a5..8d40ebd5228f 100644
--- a/llvm/test/CodeGen/M68k/CConv/fastcc-call.ll
+++ b/llvm/test/CodeGen/M68k/CConv/fastcc-call.ll
@@ -11,12 +11,12 @@ define i32 @foo1() nounwind uwtable {
; CHECK-NEXT: suba.l #4, %sp
; CHECK-NEXT: .cfi_def_cfa_offset -8
; CHECK-NEXT: move.l #5, (%sp)
-; CHECK-NEXT: move.l #1, %d0
-; CHECK-NEXT: move.l #2, %d1
+; CHECK-NEXT: moveq #1, %d0
+; CHECK-NEXT: moveq #2, %d1
; CHECK-NEXT: move.l #3, %a0
; CHECK-NEXT: move.l #4, %a1
; CHECK-NEXT: jsr (bar1@PLT,%pc)
-; CHECK-NEXT: move.l #0, %d0
+; CHECK-NEXT: moveq #0, %d0
; CHECK-NEXT: adda.l #4, %sp
; CHECK-NEXT: rts
entry:
@@ -34,11 +34,11 @@ define i32 @foo2() nounwind uwtable {
; CHECK-NEXT: suba.l #12, %sp
; CHECK-NEXT: .cfi_def_cfa_offset -16
; CHECK-NEXT: lea (8,%sp), %a0
-; CHECK-NEXT: move.l #2, %d0
+; CHECK-NEXT: moveq #2, %d0
; CHECK-NEXT: lea (4,%sp), %a1
-; CHECK-NEXT: move.l #4, %d1
+; CHECK-NEXT: moveq #4, %d1
; CHECK-NEXT: jsr (bar2@PLT,%pc)
-; CHECK-NEXT: move.l #0, %d0
+; CHECK-NEXT: moveq #0, %d0
; CHECK-NEXT: adda.l #12, %sp
; CHECK-NEXT: rts
entry:
diff --git a/llvm/test/CodeGen/M68k/CodeModel/medium-pie-global-access.ll b/llvm/test/CodeGen/M68k/CodeModel/medium-pie-global-access.ll
index ce8f2d0a6ba7..3d398afe7dc4 100644
--- a/llvm/test/CodeGen/M68k/CodeModel/medium-pie-global-access.ll
+++ b/llvm/test/CodeGen/M68k/CodeModel/medium-pie-global-access.ll
@@ -71,7 +71,7 @@ define i32 @my_access_global_store_d() #0 {
; CHECK-NEXT: ; %bb.0: ; %entry
; CHECK-NEXT: move.l (d@GOTPCREL,%pc), %a0
; CHECK-NEXT: move.l #2, (%a0)
-; CHECK-NEXT: move.l #0, %d0
+; CHECK-NEXT: moveq #0, %d0
; CHECK-NEXT: rts
entry:
store i32 2, ptr @d, align 4
@@ -105,7 +105,7 @@ define linkonce_odr i32 @bar() comdat {
; CHECK-LABEL: bar:
; CHECK: .cfi_startproc
; CHECK-NEXT: ; %bb.0: ; %entry
-; CHECK-NEXT: move.l #0, %d0
+; CHECK-NEXT: moveq #0, %d0
; CHECK-NEXT: rts
entry:
ret i32 0
diff --git a/llvm/test/CodeGen/M68k/CodeModel/small-pie-global-access.ll b/llvm/test/CodeGen/M68k/CodeModel/small-pie-global-access.ll
index 668f8a96ac6f..030f72bb3753 100644
--- a/llvm/test/CodeGen/M68k/CodeModel/small-pie-global-access.ll
+++ b/llvm/test/CodeGen/M68k/CodeModel/small-pie-global-access.ll
@@ -69,7 +69,7 @@ define i32 @my_access_global_store_d() #0 {
; CHECK-NEXT: ; %bb.0: ; %entry
; CHECK-NEXT: move.l (d@GOTPCREL,%pc), %a0
; CHECK-NEXT: move.l #2, (%a0)
-; CHECK-NEXT: move.l #0, %d0
+; CHECK-NEXT: moveq #0, %d0
; CHECK-NEXT: rts
entry:
store i32 2, ptr @d, align 4
@@ -103,7 +103,7 @@ define linkonce_odr i32 @bar() comdat {
; CHECK-LABEL: bar:
; CHECK: .cfi_startproc
; CHECK-NEXT: ; %bb.0: ; %entry
-; CHECK-NEXT: move.l #0, %d0
+; CHECK-NEXT: moveq #0, %d0
; CHECK-NEXT: rts
entry:
ret i32 0
diff --git a/llvm/test/CodeGen/M68k/Control/cmp.ll b/llvm/test/CodeGen/M68k/Control/cmp.ll
index 634c08760a4e..d3a8bbb0b0c8 100644
--- a/llvm/test/CodeGen/M68k/Control/cmp.ll
+++ b/llvm/test/CodeGen/M68k/Control/cmp.ll
@@ -8,10 +8,10 @@ define i32 @test1(ptr %y) nounwind {
; CHECK-NEXT: cmpi.l #0, (%a0)
; CHECK-NEXT: beq .LBB0_2
; CHECK-NEXT: ; %bb.1: ; %cond_false
-; CHECK-NEXT: move.l #0, %d0
+; CHECK-NEXT: moveq #0, %d0
; CHECK-NEXT: rts
; CHECK-NEXT: .LBB0_2: ; %cond_true
-; CHECK-NEXT: move.l #1, %d0
+; CHECK-NEXT: moveq #1, %d0
; CHECK-NEXT: rts
%tmp = load i32, ptr %y ; <i32> [#uses=1]
%tmp.upgrd.1 = icmp eq i32 %tmp, 0 ; <i1> [#uses=1]
@@ -33,10 +33,10 @@ define i32 @test2(ptr %y) nounwind {
; CHECK-NEXT: cmpi.l #0, %d0
; CHECK-NEXT: beq .LBB1_2
; CHECK-NEXT: ; %bb.1: ; %cond_false
-; CHECK-NEXT: move.l #0, %d0
+; CHECK-NEXT: moveq #0, %d0
; CHECK-NEXT: rts
; CHECK-NEXT: .LBB1_2: ; %cond_true
-; CHECK-NEXT: move.l #1, %d0
+; CHECK-NEXT: moveq #1, %d0
; CHECK-NEXT: rts
%tmp = load i32, ptr %y ; <i32> [#uses=1]
%tmp1 = shl i32 %tmp, 3 ; <i32> [#uses=1]
@@ -59,10 +59,10 @@ define i8 @test2b(ptr %y) nounwind {
; CHECK-NEXT: cmpi.b #0, %d0
; CHECK-NEXT: beq .LBB2_2
; CHECK-NEXT: ; %bb.1: ; %cond_false
-; CHECK-NEXT: move.b #0, %d0
+; CHECK-NEXT: moveq #0, %d0
; CHECK-NEXT: rts
; CHECK-NEXT: .LBB2_2: ; %cond_true
-; CHECK-NEXT: move.b #1, %d0
+; CHECK-NEXT: moveq #1, %d0
; CHECK-NEXT: rts
%tmp = load i8, ptr %y ; <i8> [#uses=1]
%tmp1 = shl i8 %tmp, 3 ; <i8> [#uses=1]
@@ -84,7 +84,7 @@ define i64 @test3(i64 %x) nounwind {
; CHECK-NEXT: seq %d0
; CHECK-NEXT: move.l %d0, %d1
; CHECK-NEXT: and.l #255, %d1
-; CHECK-NEXT: move.l #0, %d0
+; CHECK-NEXT: moveq #0, %d0
; CHECK-NEXT: rts
%t = icmp eq i64 %x, 0
%r = zext i1 %t to i64
@@ -97,7 +97,7 @@ define i64 @test4(i64 %x) nounwind {
; CHECK-NEXT: suba.l #4, %sp
; CHECK-NEXT: movem.l %d2, (0,%sp) ; 8-byte Folded Spill
; CHECK-NEXT: move.l (8,%sp), %d1
-; CHECK-NEXT: move.l #0, %d0
+; CHECK-NEXT: moveq #0, %d0
; CHECK-NEXT: move.l (12,%sp), %d2
; CHECK-NEXT: sub.l #1, %d2
; CHECK-NEXT: subx.l %d0, %d1
@@ -119,11 +119,11 @@ define i32 @test6() nounwind align 2 {
; CHECK-NEXT: or.l (8,%sp), %d0
; CHECK-NEXT: beq .LBB5_1
; CHECK-NEXT: ; %bb.2: ; %F
-; CHECK-NEXT: move.l #0, %d0
+; CHECK-NEXT: moveq #0, %d0
; CHECK-NEXT: adda.l #20, %sp
; CHECK-NEXT: rts
; CHECK-NEXT: .LBB5_1: ; %T
-; CHECK-NEXT: move.l #1, %d0
+; CHECK-NEXT: moveq #1, %d0
; CHECK-NEXT: adda.l #20, %sp
; CHECK-NEXT: rts
%A = alloca {i64, i64}, align 8
@@ -229,7 +229,7 @@ define zeroext i1 @test15(i32 %bf.load, i32 %n) {
; CHECK-LABEL: test15:
; CHECK: .cfi_startproc
; CHECK-NEXT: ; %bb.0:
-; CHECK-NEXT: move.l #16, %d0
+; CHECK-NEXT: moveq #16, %d0
; CHECK-NEXT: move.l (4,%sp), %d1
; CHECK-NEXT: lsr.l %d0, %d1
; CHECK-NEXT: move.l %d1, %d0
@@ -252,7 +252,7 @@ define i8 @test16(i16 signext %L) {
; CHECK-LABEL: test16:
; CHECK: .cfi_startproc
; CHECK-NEXT: ; %bb.0:
-; CHECK-NEXT: move.w #15, %d1
+; CHECK-NEXT: moveq #15, %d1
; CHECK-NEXT: move.w (6,%sp), %d0
; CHECK-NEXT: lsr.w %d1, %d0
; CHECK-NEXT: eori.b #1, %d0
@@ -268,7 +268,7 @@ define i8 @test18(i64 %L) {
; CHECK-LABEL: test18:
; CHECK: .cfi_startproc
; CHECK-NEXT: ; %bb.0:
-; CHECK-NEXT: move.l #31, %d1
+; CHECK-NEXT: moveq #31, %d1
; CHECK-NEXT: move.l (4,%sp), %d0
; CHECK-NEXT: lsr.l %d1, %d0
; CHECK-NEXT: eori.b #1, %d0
diff --git a/llvm/test/CodeGen/M68k/Control/long-setcc.ll b/llvm/test/CodeGen/M68k/Control/long-setcc.ll
index b089af5f2ae8..45a617599c1e 100644
--- a/llvm/test/CodeGen/M68k/Control/long-setcc.ll
+++ b/llvm/test/CodeGen/M68k/Control/long-setcc.ll
@@ -4,7 +4,7 @@
define i1 @t1(i64 %x) nounwind {
; CHECK-LABEL: t1:
; CHECK: ; %bb.0:
-; CHECK-NEXT: move.l #31, %d1
+; CHECK-NEXT: moveq #31, %d1
; CHECK-NEXT: move.l (4,%sp), %d0
; CHECK-NEXT: lsr.l %d1, %d0
; CHECK-NEXT: ; kill: def $bd0 killed $bd0 killed $d0
@@ -26,7 +26,7 @@ define i1 @t2(i64 %x) nounwind {
define i1 @t3(i32 %x) nounwind {
; CHECK-LABEL: t3:
; CHECK: ; %bb.0:
-; CHECK-NEXT: move.b #0, %d0
+; CHECK-NEXT: moveq #0, %d0
; CHECK-NEXT: rts
%tmp = icmp ugt i32 %x, -1
ret i1 %tmp
diff --git a/llvm/test/CodeGen/M68k/Control/setcc.ll b/llvm/test/CodeGen/M68k/Control/setcc.ll
index 63856e278c9e..9e03f9b90842 100644
--- a/llvm/test/CodeGen/M68k/Control/setcc.ll
+++ b/llvm/test/CodeGen/M68k/Control/setcc.ll
@@ -40,7 +40,7 @@ define fastcc i64 @t3(i64 %x) nounwind readnone ssp {
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: suba.l #4, %sp
; CHECK-NEXT: movem.l %d2, (0,%sp) ; 8-byte Folded Spill
-; CHECK-NEXT: move.l #0, %d2
+; CHECK-NEXT: moveq #0, %d2
; CHECK-NEXT: sub.l #18, %d1
; CHECK-NEXT: subx.l %d2, %d0
; CHECK-NEXT: scs %d0
@@ -61,7 +61,7 @@ define i8 @t5(i32 %a) {
; CHECK-LABEL: t5:
; CHECK: .cfi_startproc
; CHECK-NEXT: ; %bb.0: ; %entry
-; CHECK-NEXT: move.l #31, %d1
+; CHECK-NEXT: moveq #31, %d1
; CHECK-NEXT: move.l (4,%sp), %d0
; CHECK-NEXT: lsr.l %d1, %d0
; CHECK-NEXT: eori.b #1, %d0
@@ -86,7 +86,7 @@ define zeroext i1 @t6(i32 %a) {
; CHECK-LABEL: t6:
; CHECK: .cfi_startproc
; CHECK-NEXT: ; %bb.0: ; %entry
-; CHECK-NEXT: move.l #31, %d0
+; CHECK-NEXT: moveq #31, %d0
; CHECK-NEXT: move.l (4,%sp), %d1
; CHECK-NEXT: lsr.l %d0, %d1
; CHECK-NEXT: eori.b #1, %d1
diff --git a/llvm/test/CodeGen/M68k/PR57660.ll b/llvm/test/CodeGen/M68k/PR57660.ll
index 184c30a33d79..bad949b08caf 100644
--- a/llvm/test/CodeGen/M68k/PR57660.ll
+++ b/llvm/test/CodeGen/M68k/PR57660.ll
@@ -7,7 +7,7 @@ define dso_local void @foo1() {
; CHECK-NEXT: ; %bb.0: ; %entry
; CHECK-NEXT: suba.l #2, %sp
; CHECK-NEXT: .cfi_def_cfa_offset -6
-; CHECK-NEXT: move.b #0, %d0
+; CHECK-NEXT: moveq #0, %d0
; CHECK-NEXT: move.b %d0, (0,%sp) ; 1-byte Folded Spill
; CHECK-NEXT: .LBB0_1: ; %do.body
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
diff --git a/llvm/test/CodeGen/M68k/gcc_except_table.ll b/llvm/test/CodeGen/M68k/gcc_except_table.ll
index a7d2a6662724..fe0ed7861dfe 100644
--- a/llvm/test/CodeGen/M68k/gcc_except_table.ll
+++ b/llvm/test/CodeGen/M68k/gcc_except_table.ll
@@ -19,7 +19,7 @@ define i32 @foo() uwtable ssp personality ptr @__gxx_personality_v0 {
; CHECK-NEXT: jsr _Z1fv@PLT
; CHECK-NEXT: .Ltmp1:
; CHECK-NEXT: ; %bb.1: ; %try.cont
-; CHECK-NEXT: move.l #0, %d0
+; CHECK-NEXT: moveq #0, %d0
; CHECK-NEXT: adda.l #4, %sp
; CHECK-NEXT: rts
; CHECK-NEXT: .LBB0_2: ; %lpad
diff --git a/llvm/test/CodeGen/M68k/link-unlnk.ll b/llvm/test/CodeGen/M68k/link-unlnk.ll
index dfdd80e66ade..fe39a9a13494 100644
--- a/llvm/test/CodeGen/M68k/link-unlnk.ll
+++ b/llvm/test/CodeGen/M68k/link-unlnk.ll
@@ -105,7 +105,7 @@ define i32 @test_gep() {
; FP-NEXT: .cfi_def_cfa_register %a6
; FP-NEXT: move.l #21, (-4,%a6)
; FP-NEXT: move.l #12, (-256,%a6)
-; FP-NEXT: move.l #0, %d0
+; FP-NEXT: moveq #0, %d0
; FP-NEXT: unlk %a6
; FP-NEXT: rts
;
@@ -116,7 +116,7 @@ define i32 @test_gep() {
; NO-FP-NEXT: .cfi_def_cfa_offset -260
; NO-FP-NEXT: move.l #21, (252,%sp)
; NO-FP-NEXT: move.l #12, (0,%sp)
-; NO-FP-NEXT: move.l #0, %d0
+; NO-FP-NEXT: moveq #0, %d0
; NO-FP-NEXT: adda.l #256, %sp
; NO-FP-NEXT: rts
entry:
diff --git a/llvm/test/CodeGen/Mips/mipsr6-minmaxnum.ll b/llvm/test/CodeGen/Mips/mipsr6-minmaxnum.ll
new file mode 100644
index 000000000000..e14e89916e6d
--- /dev/null
+++ b/llvm/test/CodeGen/Mips/mipsr6-minmaxnum.ll
@@ -0,0 +1,71 @@
+; RUN: llc %s -mtriple=mipsisa32r6el-linux-gnu -o - | \
+; RUN: FileCheck %s --check-prefix=MIPS32R6EL
+; RUN: llc %s -mtriple=mipsisa64r6el-linux-gnuabi64 -o - | \
+; RUN: FileCheck %s --check-prefix=MIPS64R6EL
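+; Check that llvm.minnum/llvm.maxnum select to the single-instruction R6
+; min.s/max.s and min.d/max.d forms on both 32-bit and 64-bit targets.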
+
+define float @mins(float %x, float %y) {
+; MIPS32R6EL-LABEL: mins
+; MIPS32R6EL: # %bb.0:
+; MIPS32R6EL-NEXT: jr $ra
+; MIPS32R6EL-NEXT: min.s $f0, $f12, $f14
+;
+; MIPS64R6EL-LABEL: mins
+; MIPS64R6EL: # %bb.0:
+; MIPS64R6EL-NEXT: jr $ra
+; MIPS64R6EL-NEXT: min.s $f0, $f12, $f13
+
+ %r = tail call float @llvm.minnum.f32(float %x, float %y)
+ ret float %r
+}
+
+define float @maxs(float %x, float %y) {
+; MIPS32R6EL-LABEL: maxs
+; MIPS32R6EL: # %bb.0:
+; MIPS32R6EL-NEXT: jr $ra
+; MIPS32R6EL-NEXT: max.s $f0, $f12, $f14
+;
+; MIPS64R6EL-LABEL: maxs
+; MIPS64R6EL: # %bb.0:
+; MIPS64R6EL-NEXT: jr $ra
+; MIPS64R6EL-NEXT: max.s $f0, $f12, $f13
+
+ %r = tail call float @llvm.maxnum.f32(float %x, float %y)
+ ret float %r
+}
+
+define double @mind(double %x, double %y) {
+; MIPS32R6EL-LABEL: mind
+; MIPS32R6EL: # %bb.0:
+; MIPS32R6EL-NEXT: jr $ra
+; MIPS32R6EL-NEXT: min.d $f0, $f12, $f14
+;
+; MIPS64R6EL-LABEL: mind
+; MIPS64R6EL: # %bb.0:
+; MIPS64R6EL-NEXT: jr $ra
+; MIPS64R6EL-NEXT: min.d $f0, $f12, $f13
+
+ %r = tail call double @llvm.minnum.f64(double %x, double %y)
+ ret double %r
+}
+
+define double @maxd(double %x, double %y) {
+; MIPS32R6EL-LABEL: maxd
+; MIPS32R6EL: # %bb.0:
+; MIPS32R6EL-NEXT: jr $ra
+; MIPS32R6EL-NEXT: max.d $f0, $f12, $f14
+;
+; MIPS64R6EL-LABEL: maxd
+; MIPS64R6EL: # %bb.0:
+; MIPS64R6EL-NEXT: jr $ra
+; MIPS64R6EL-NEXT: max.d $f0, $f12, $f13
+
+ %r = tail call double @llvm.maxnum.f64(double %x, double %y)
+ ret double %r
+}
+
+declare float @llvm.minnum.f32(float, float)
+declare float @llvm.maxnum.f32(float, float)
+declare double @llvm.minnum.f64(double, double)
+declare double @llvm.maxnum.f64(double, double)
diff --git a/llvm/test/CodeGen/Mips/msa/f16-llvm-ir.ll b/llvm/test/CodeGen/Mips/msa/f16-llvm-ir.ll
index 45c7ab980edd..fe68bee408fc 100644
--- a/llvm/test/CodeGen/Mips/msa/f16-llvm-ir.ll
+++ b/llvm/test/CodeGen/Mips/msa/f16-llvm-ir.ll
@@ -2365,101 +2365,159 @@ entry:
declare float @llvm.minnum.f32(float %Val, float %b)
define void @fminnum(float %b) {
-; MIPS32-LABEL: fminnum:
-; MIPS32: # %bb.0: # %entry
-; MIPS32-NEXT: lui $2, %hi(_gp_disp)
-; MIPS32-NEXT: addiu $2, $2, %lo(_gp_disp)
-; MIPS32-NEXT: addiu $sp, $sp, -24
-; MIPS32-NEXT: .cfi_def_cfa_offset 24
-; MIPS32-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill
-; MIPS32-NEXT: sw $16, 16($sp) # 4-byte Folded Spill
-; MIPS32-NEXT: .cfi_offset 31, -4
-; MIPS32-NEXT: .cfi_offset 16, -8
-; MIPS32-NEXT: addu $gp, $2, $25
-; MIPS32-NEXT: mov.s $f14, $f12
-; MIPS32-NEXT: lw $16, %got(g)($gp)
-; MIPS32-NEXT: lh $1, 0($16)
-; MIPS32-NEXT: fill.h $w0, $1
-; MIPS32-NEXT: fexupr.w $w0, $w0
-; MIPS32-NEXT: copy_s.w $1, $w0[0]
-; MIPS32-NEXT: lw $25, %call16(fminf)($gp)
-; MIPS32-NEXT: jalr $25
-; MIPS32-NEXT: mtc1 $1, $f12
-; MIPS32-NEXT: mfc1 $1, $f0
-; MIPS32-NEXT: fill.w $w0, $1
-; MIPS32-NEXT: fexdo.h $w0, $w0, $w0
-; MIPS32-NEXT: copy_u.h $1, $w0[0]
-; MIPS32-NEXT: sh $1, 0($16)
-; MIPS32-NEXT: lw $16, 16($sp) # 4-byte Folded Reload
-; MIPS32-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload
-; MIPS32-NEXT: jr $ra
-; MIPS32-NEXT: addiu $sp, $sp, 24
+; MIPS32-O32-LABEL: fminnum:
+; MIPS32-O32: # %bb.0: # %entry
+; MIPS32-O32-NEXT: lui $2, %hi(_gp_disp)
+; MIPS32-O32-NEXT: addiu $2, $2, %lo(_gp_disp)
+; MIPS32-O32-NEXT: addiu $sp, $sp, -24
+; MIPS32-O32-NEXT: .cfi_def_cfa_offset 24
+; MIPS32-O32-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill
+; MIPS32-O32-NEXT: sw $16, 16($sp) # 4-byte Folded Spill
+; MIPS32-O32-NEXT: .cfi_offset 31, -4
+; MIPS32-O32-NEXT: .cfi_offset 16, -8
+; MIPS32-O32-NEXT: addu $gp, $2, $25
+; MIPS32-O32-NEXT: mov.s $f14, $f12
+; MIPS32-O32-NEXT: lw $16, %got(g)($gp)
+; MIPS32-O32-NEXT: lh $1, 0($16)
+; MIPS32-O32-NEXT: fill.h $w0, $1
+; MIPS32-O32-NEXT: fexupr.w $w0, $w0
+; MIPS32-O32-NEXT: copy_s.w $1, $w0[0]
+; MIPS32-O32-NEXT: lw $25, %call16(fminf)($gp)
+; MIPS32-O32-NEXT: jalr $25
+; MIPS32-O32-NEXT: mtc1 $1, $f12
+; MIPS32-O32-NEXT: mfc1 $1, $f0
+; MIPS32-O32-NEXT: fill.w $w0, $1
+; MIPS32-O32-NEXT: fexdo.h $w0, $w0, $w0
+; MIPS32-O32-NEXT: copy_u.h $1, $w0[0]
+; MIPS32-O32-NEXT: sh $1, 0($16)
+; MIPS32-O32-NEXT: lw $16, 16($sp) # 4-byte Folded Reload
+; MIPS32-O32-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload
+; MIPS32-O32-NEXT: jr $ra
+; MIPS32-O32-NEXT: addiu $sp, $sp, 24
;
-; MIPS64-N32-LABEL: fminnum:
-; MIPS64-N32: # %bb.0: # %entry
-; MIPS64-N32-NEXT: addiu $sp, $sp, -32
-; MIPS64-N32-NEXT: .cfi_def_cfa_offset 32
-; MIPS64-N32-NEXT: sd $ra, 24($sp) # 8-byte Folded Spill
-; MIPS64-N32-NEXT: sd $gp, 16($sp) # 8-byte Folded Spill
-; MIPS64-N32-NEXT: sd $16, 8($sp) # 8-byte Folded Spill
-; MIPS64-N32-NEXT: .cfi_offset 31, -8
-; MIPS64-N32-NEXT: .cfi_offset 28, -16
-; MIPS64-N32-NEXT: .cfi_offset 16, -24
-; MIPS64-N32-NEXT: lui $1, %hi(%neg(%gp_rel(fminnum)))
-; MIPS64-N32-NEXT: addu $1, $1, $25
-; MIPS64-N32-NEXT: addiu $gp, $1, %lo(%neg(%gp_rel(fminnum)))
-; MIPS64-N32-NEXT: mov.s $f13, $f12
-; MIPS64-N32-NEXT: lw $16, %got_disp(g)($gp)
-; MIPS64-N32-NEXT: lh $1, 0($16)
-; MIPS64-N32-NEXT: fill.h $w0, $1
-; MIPS64-N32-NEXT: fexupr.w $w0, $w0
-; MIPS64-N32-NEXT: copy_s.w $1, $w0[0]
-; MIPS64-N32-NEXT: lw $25, %call16(fminf)($gp)
-; MIPS64-N32-NEXT: jalr $25
-; MIPS64-N32-NEXT: mtc1 $1, $f12
-; MIPS64-N32-NEXT: mfc1 $1, $f0
-; MIPS64-N32-NEXT: fill.w $w0, $1
-; MIPS64-N32-NEXT: fexdo.h $w0, $w0, $w0
-; MIPS64-N32-NEXT: copy_u.h $1, $w0[0]
-; MIPS64-N32-NEXT: sh $1, 0($16)
-; MIPS64-N32-NEXT: ld $16, 8($sp) # 8-byte Folded Reload
-; MIPS64-N32-NEXT: ld $gp, 16($sp) # 8-byte Folded Reload
-; MIPS64-N32-NEXT: ld $ra, 24($sp) # 8-byte Folded Reload
-; MIPS64-N32-NEXT: jr $ra
-; MIPS64-N32-NEXT: addiu $sp, $sp, 32
+; MIPS64R5-N32-LABEL: fminnum:
+; MIPS64R5-N32: # %bb.0: # %entry
+; MIPS64R5-N32-NEXT: addiu $sp, $sp, -32
+; MIPS64R5-N32-NEXT: .cfi_def_cfa_offset 32
+; MIPS64R5-N32-NEXT: sd $ra, 24($sp) # 8-byte Folded Spill
+; MIPS64R5-N32-NEXT: sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64R5-N32-NEXT: sd $16, 8($sp) # 8-byte Folded Spill
+; MIPS64R5-N32-NEXT: .cfi_offset 31, -8
+; MIPS64R5-N32-NEXT: .cfi_offset 28, -16
+; MIPS64R5-N32-NEXT: .cfi_offset 16, -24
+; MIPS64R5-N32-NEXT: lui $1, %hi(%neg(%gp_rel(fminnum)))
+; MIPS64R5-N32-NEXT: addu $1, $1, $25
+; MIPS64R5-N32-NEXT: addiu $gp, $1, %lo(%neg(%gp_rel(fminnum)))
+; MIPS64R5-N32-NEXT: mov.s $f13, $f12
+; MIPS64R5-N32-NEXT: lw $16, %got_disp(g)($gp)
+; MIPS64R5-N32-NEXT: lh $1, 0($16)
+; MIPS64R5-N32-NEXT: fill.h $w0, $1
+; MIPS64R5-N32-NEXT: fexupr.w $w0, $w0
+; MIPS64R5-N32-NEXT: copy_s.w $1, $w0[0]
+; MIPS64R5-N32-NEXT: lw $25, %call16(fminf)($gp)
+; MIPS64R5-N32-NEXT: jalr $25
+; MIPS64R5-N32-NEXT: mtc1 $1, $f12
+; MIPS64R5-N32-NEXT: mfc1 $1, $f0
+; MIPS64R5-N32-NEXT: fill.w $w0, $1
+; MIPS64R5-N32-NEXT: fexdo.h $w0, $w0, $w0
+; MIPS64R5-N32-NEXT: copy_u.h $1, $w0[0]
+; MIPS64R5-N32-NEXT: sh $1, 0($16)
+; MIPS64R5-N32-NEXT: ld $16, 8($sp) # 8-byte Folded Reload
+; MIPS64R5-N32-NEXT: ld $gp, 16($sp) # 8-byte Folded Reload
+; MIPS64R5-N32-NEXT: ld $ra, 24($sp) # 8-byte Folded Reload
+; MIPS64R5-N32-NEXT: jr $ra
+; MIPS64R5-N32-NEXT: addiu $sp, $sp, 32
+;
+; MIPS64R5-N64-LABEL: fminnum:
+; MIPS64R5-N64: # %bb.0: # %entry
+; MIPS64R5-N64-NEXT: daddiu $sp, $sp, -32
+; MIPS64R5-N64-NEXT: .cfi_def_cfa_offset 32
+; MIPS64R5-N64-NEXT: sd $ra, 24($sp) # 8-byte Folded Spill
+; MIPS64R5-N64-NEXT: sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64R5-N64-NEXT: sd $16, 8($sp) # 8-byte Folded Spill
+; MIPS64R5-N64-NEXT: .cfi_offset 31, -8
+; MIPS64R5-N64-NEXT: .cfi_offset 28, -16
+; MIPS64R5-N64-NEXT: .cfi_offset 16, -24
+; MIPS64R5-N64-NEXT: lui $1, %hi(%neg(%gp_rel(fminnum)))
+; MIPS64R5-N64-NEXT: daddu $1, $1, $25
+; MIPS64R5-N64-NEXT: daddiu $gp, $1, %lo(%neg(%gp_rel(fminnum)))
+; MIPS64R5-N64-NEXT: mov.s $f13, $f12
+; MIPS64R5-N64-NEXT: ld $16, %got_disp(g)($gp)
+; MIPS64R5-N64-NEXT: lh $1, 0($16)
+; MIPS64R5-N64-NEXT: fill.h $w0, $1
+; MIPS64R5-N64-NEXT: fexupr.w $w0, $w0
+; MIPS64R5-N64-NEXT: copy_s.w $1, $w0[0]
+; MIPS64R5-N64-NEXT: ld $25, %call16(fminf)($gp)
+; MIPS64R5-N64-NEXT: jalr $25
+; MIPS64R5-N64-NEXT: mtc1 $1, $f12
+; MIPS64R5-N64-NEXT: mfc1 $1, $f0
+; MIPS64R5-N64-NEXT: fill.w $w0, $1
+; MIPS64R5-N64-NEXT: fexdo.h $w0, $w0, $w0
+; MIPS64R5-N64-NEXT: copy_u.h $1, $w0[0]
+; MIPS64R5-N64-NEXT: sh $1, 0($16)
+; MIPS64R5-N64-NEXT: ld $16, 8($sp) # 8-byte Folded Reload
+; MIPS64R5-N64-NEXT: ld $gp, 16($sp) # 8-byte Folded Reload
+; MIPS64R5-N64-NEXT: ld $ra, 24($sp) # 8-byte Folded Reload
+; MIPS64R5-N64-NEXT: jr $ra
+; MIPS64R5-N64-NEXT: daddiu $sp, $sp, 32
+;
+; MIPSR6-O32-LABEL: fminnum:
+; MIPSR6-O32: # %bb.0: # %entry
+; MIPSR6-O32-NEXT: lui $2, %hi(_gp_disp)
+; MIPSR6-O32-NEXT: addiu $2, $2, %lo(_gp_disp)
+; MIPSR6-O32-NEXT: addu $1, $2, $25
+; MIPSR6-O32-NEXT: lw $1, %got(g)($1)
+; MIPSR6-O32-NEXT: lh $2, 0($1)
+; MIPSR6-O32-NEXT: fill.h $w0, $2
+; MIPSR6-O32-NEXT: fexupr.w $w0, $w0
+; MIPSR6-O32-NEXT: copy_s.w $2, $w0[0]
+; MIPSR6-O32-NEXT: mtc1 $2, $f0
+; MIPSR6-O32-NEXT: min.s $f0, $f0, $f12
+; MIPSR6-O32-NEXT: mfc1 $2, $f0
+; MIPSR6-O32-NEXT: fill.w $w0, $2
+; MIPSR6-O32-NEXT: fexdo.h $w0, $w0, $w0
+; MIPSR6-O32-NEXT: copy_u.h $2, $w0[0]
+; MIPSR6-O32-NEXT: jr $ra
+; MIPSR6-O32-NEXT: sh $2, 0($1)
+;
+; MIPSR6-N32-LABEL: fminnum:
+; MIPSR6-N32: # %bb.0: # %entry
+; MIPSR6-N32-NEXT: lui $1, %hi(%neg(%gp_rel(fminnum)))
+; MIPSR6-N32-NEXT: addu $1, $1, $25
+; MIPSR6-N32-NEXT: addiu $1, $1, %lo(%neg(%gp_rel(fminnum)))
+; MIPSR6-N32-NEXT: lw $1, %got_disp(g)($1)
+; MIPSR6-N32-NEXT: lh $2, 0($1)
+; MIPSR6-N32-NEXT: fill.h $w0, $2
+; MIPSR6-N32-NEXT: fexupr.w $w0, $w0
+; MIPSR6-N32-NEXT: copy_s.w $2, $w0[0]
+; MIPSR6-N32-NEXT: mtc1 $2, $f0
+; MIPSR6-N32-NEXT: min.s $f0, $f0, $f12
+; MIPSR6-N32-NEXT: mfc1 $2, $f0
+; MIPSR6-N32-NEXT: fill.w $w0, $2
+; MIPSR6-N32-NEXT: fexdo.h $w0, $w0, $w0
+; MIPSR6-N32-NEXT: copy_u.h $2, $w0[0]
+; MIPSR6-N32-NEXT: jr $ra
+; MIPSR6-N32-NEXT: sh $2, 0($1)
+;
+; MIPSR6-N64-LABEL: fminnum:
+; MIPSR6-N64: # %bb.0: # %entry
+; MIPSR6-N64-NEXT: lui $1, %hi(%neg(%gp_rel(fminnum)))
+; MIPSR6-N64-NEXT: daddu $1, $1, $25
+; MIPSR6-N64-NEXT: daddiu $1, $1, %lo(%neg(%gp_rel(fminnum)))
+; MIPSR6-N64-NEXT: ld $1, %got_disp(g)($1)
+; MIPSR6-N64-NEXT: lh $2, 0($1)
+; MIPSR6-N64-NEXT: fill.h $w0, $2
+; MIPSR6-N64-NEXT: fexupr.w $w0, $w0
+; MIPSR6-N64-NEXT: copy_s.w $2, $w0[0]
+; MIPSR6-N64-NEXT: mtc1 $2, $f0
+; MIPSR6-N64-NEXT: min.s $f0, $f0, $f12
+; MIPSR6-N64-NEXT: mfc1 $2, $f0
+; MIPSR6-N64-NEXT: fill.w $w0, $2
+; MIPSR6-N64-NEXT: fexdo.h $w0, $w0, $w0
+; MIPSR6-N64-NEXT: copy_u.h $2, $w0[0]
+; MIPSR6-N64-NEXT: jr $ra
+; MIPSR6-N64-NEXT: sh $2, 0($1)
;
-; MIPS64-N64-LABEL: fminnum:
-; MIPS64-N64: # %bb.0: # %entry
-; MIPS64-N64-NEXT: daddiu $sp, $sp, -32
-; MIPS64-N64-NEXT: .cfi_def_cfa_offset 32
-; MIPS64-N64-NEXT: sd $ra, 24($sp) # 8-byte Folded Spill
-; MIPS64-N64-NEXT: sd $gp, 16($sp) # 8-byte Folded Spill
-; MIPS64-N64-NEXT: sd $16, 8($sp) # 8-byte Folded Spill
-; MIPS64-N64-NEXT: .cfi_offset 31, -8
-; MIPS64-N64-NEXT: .cfi_offset 28, -16
-; MIPS64-N64-NEXT: .cfi_offset 16, -24
-; MIPS64-N64-NEXT: lui $1, %hi(%neg(%gp_rel(fminnum)))
-; MIPS64-N64-NEXT: daddu $1, $1, $25
-; MIPS64-N64-NEXT: daddiu $gp, $1, %lo(%neg(%gp_rel(fminnum)))
-; MIPS64-N64-NEXT: mov.s $f13, $f12
-; MIPS64-N64-NEXT: ld $16, %got_disp(g)($gp)
-; MIPS64-N64-NEXT: lh $1, 0($16)
-; MIPS64-N64-NEXT: fill.h $w0, $1
-; MIPS64-N64-NEXT: fexupr.w $w0, $w0
-; MIPS64-N64-NEXT: copy_s.w $1, $w0[0]
-; MIPS64-N64-NEXT: ld $25, %call16(fminf)($gp)
-; MIPS64-N64-NEXT: jalr $25
-; MIPS64-N64-NEXT: mtc1 $1, $f12
-; MIPS64-N64-NEXT: mfc1 $1, $f0
-; MIPS64-N64-NEXT: fill.w $w0, $1
-; MIPS64-N64-NEXT: fexdo.h $w0, $w0, $w0
-; MIPS64-N64-NEXT: copy_u.h $1, $w0[0]
-; MIPS64-N64-NEXT: sh $1, 0($16)
-; MIPS64-N64-NEXT: ld $16, 8($sp) # 8-byte Folded Reload
-; MIPS64-N64-NEXT: ld $gp, 16($sp) # 8-byte Folded Reload
-; MIPS64-N64-NEXT: ld $ra, 24($sp) # 8-byte Folded Reload
-; MIPS64-N64-NEXT: jr $ra
-; MIPS64-N64-NEXT: daddiu $sp, $sp, 32
entry:
%0 = load i16, ptr @g, align 2
%1 = call float @llvm.convert.from.fp16.f32(i16 %0)
@@ -2477,101 +2535,158 @@ entry:
declare float @llvm.maxnum.f32(float %Val, float %b)
define void @fmaxnum(float %b) {
-; MIPS32-LABEL: fmaxnum:
-; MIPS32: # %bb.0: # %entry
-; MIPS32-NEXT: lui $2, %hi(_gp_disp)
-; MIPS32-NEXT: addiu $2, $2, %lo(_gp_disp)
-; MIPS32-NEXT: addiu $sp, $sp, -24
-; MIPS32-NEXT: .cfi_def_cfa_offset 24
-; MIPS32-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill
-; MIPS32-NEXT: sw $16, 16($sp) # 4-byte Folded Spill
-; MIPS32-NEXT: .cfi_offset 31, -4
-; MIPS32-NEXT: .cfi_offset 16, -8
-; MIPS32-NEXT: addu $gp, $2, $25
-; MIPS32-NEXT: mov.s $f14, $f12
-; MIPS32-NEXT: lw $16, %got(g)($gp)
-; MIPS32-NEXT: lh $1, 0($16)
-; MIPS32-NEXT: fill.h $w0, $1
-; MIPS32-NEXT: fexupr.w $w0, $w0
-; MIPS32-NEXT: copy_s.w $1, $w0[0]
-; MIPS32-NEXT: lw $25, %call16(fmaxf)($gp)
-; MIPS32-NEXT: jalr $25
-; MIPS32-NEXT: mtc1 $1, $f12
-; MIPS32-NEXT: mfc1 $1, $f0
-; MIPS32-NEXT: fill.w $w0, $1
-; MIPS32-NEXT: fexdo.h $w0, $w0, $w0
-; MIPS32-NEXT: copy_u.h $1, $w0[0]
-; MIPS32-NEXT: sh $1, 0($16)
-; MIPS32-NEXT: lw $16, 16($sp) # 4-byte Folded Reload
-; MIPS32-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload
-; MIPS32-NEXT: jr $ra
-; MIPS32-NEXT: addiu $sp, $sp, 24
+; MIPS32-O32-LABEL: fmaxnum:
+; MIPS32-O32: # %bb.0: # %entry
+; MIPS32-O32-NEXT: lui $2, %hi(_gp_disp)
+; MIPS32-O32-NEXT: addiu $2, $2, %lo(_gp_disp)
+; MIPS32-O32-NEXT: addiu $sp, $sp, -24
+; MIPS32-O32-NEXT: .cfi_def_cfa_offset 24
+; MIPS32-O32-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill
+; MIPS32-O32-NEXT: sw $16, 16($sp) # 4-byte Folded Spill
+; MIPS32-O32-NEXT: .cfi_offset 31, -4
+; MIPS32-O32-NEXT: .cfi_offset 16, -8
+; MIPS32-O32-NEXT: addu $gp, $2, $25
+; MIPS32-O32-NEXT: mov.s $f14, $f12
+; MIPS32-O32-NEXT: lw $16, %got(g)($gp)
+; MIPS32-O32-NEXT: lh $1, 0($16)
+; MIPS32-O32-NEXT: fill.h $w0, $1
+; MIPS32-O32-NEXT: fexupr.w $w0, $w0
+; MIPS32-O32-NEXT: copy_s.w $1, $w0[0]
+; MIPS32-O32-NEXT: lw $25, %call16(fmaxf)($gp)
+; MIPS32-O32-NEXT: jalr $25
+; MIPS32-O32-NEXT: mtc1 $1, $f12
+; MIPS32-O32-NEXT: mfc1 $1, $f0
+; MIPS32-O32-NEXT: fill.w $w0, $1
+; MIPS32-O32-NEXT: fexdo.h $w0, $w0, $w0
+; MIPS32-O32-NEXT: copy_u.h $1, $w0[0]
+; MIPS32-O32-NEXT: sh $1, 0($16)
+; MIPS32-O32-NEXT: lw $16, 16($sp) # 4-byte Folded Reload
+; MIPS32-O32-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload
+; MIPS32-O32-NEXT: jr $ra
+; MIPS32-O32-NEXT: addiu $sp, $sp, 24
;
-; MIPS64-N32-LABEL: fmaxnum:
-; MIPS64-N32: # %bb.0: # %entry
-; MIPS64-N32-NEXT: addiu $sp, $sp, -32
-; MIPS64-N32-NEXT: .cfi_def_cfa_offset 32
-; MIPS64-N32-NEXT: sd $ra, 24($sp) # 8-byte Folded Spill
-; MIPS64-N32-NEXT: sd $gp, 16($sp) # 8-byte Folded Spill
-; MIPS64-N32-NEXT: sd $16, 8($sp) # 8-byte Folded Spill
-; MIPS64-N32-NEXT: .cfi_offset 31, -8
-; MIPS64-N32-NEXT: .cfi_offset 28, -16
-; MIPS64-N32-NEXT: .cfi_offset 16, -24
-; MIPS64-N32-NEXT: lui $1, %hi(%neg(%gp_rel(fmaxnum)))
-; MIPS64-N32-NEXT: addu $1, $1, $25
-; MIPS64-N32-NEXT: addiu $gp, $1, %lo(%neg(%gp_rel(fmaxnum)))
-; MIPS64-N32-NEXT: mov.s $f13, $f12
-; MIPS64-N32-NEXT: lw $16, %got_disp(g)($gp)
-; MIPS64-N32-NEXT: lh $1, 0($16)
-; MIPS64-N32-NEXT: fill.h $w0, $1
-; MIPS64-N32-NEXT: fexupr.w $w0, $w0
-; MIPS64-N32-NEXT: copy_s.w $1, $w0[0]
-; MIPS64-N32-NEXT: lw $25, %call16(fmaxf)($gp)
-; MIPS64-N32-NEXT: jalr $25
-; MIPS64-N32-NEXT: mtc1 $1, $f12
-; MIPS64-N32-NEXT: mfc1 $1, $f0
-; MIPS64-N32-NEXT: fill.w $w0, $1
-; MIPS64-N32-NEXT: fexdo.h $w0, $w0, $w0
-; MIPS64-N32-NEXT: copy_u.h $1, $w0[0]
-; MIPS64-N32-NEXT: sh $1, 0($16)
-; MIPS64-N32-NEXT: ld $16, 8($sp) # 8-byte Folded Reload
-; MIPS64-N32-NEXT: ld $gp, 16($sp) # 8-byte Folded Reload
-; MIPS64-N32-NEXT: ld $ra, 24($sp) # 8-byte Folded Reload
-; MIPS64-N32-NEXT: jr $ra
-; MIPS64-N32-NEXT: addiu $sp, $sp, 32
+; MIPS64R5-N32-LABEL: fmaxnum:
+; MIPS64R5-N32: # %bb.0: # %entry
+; MIPS64R5-N32-NEXT: addiu $sp, $sp, -32
+; MIPS64R5-N32-NEXT: .cfi_def_cfa_offset 32
+; MIPS64R5-N32-NEXT: sd $ra, 24($sp) # 8-byte Folded Spill
+; MIPS64R5-N32-NEXT: sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64R5-N32-NEXT: sd $16, 8($sp) # 8-byte Folded Spill
+; MIPS64R5-N32-NEXT: .cfi_offset 31, -8
+; MIPS64R5-N32-NEXT: .cfi_offset 28, -16
+; MIPS64R5-N32-NEXT: .cfi_offset 16, -24
+; MIPS64R5-N32-NEXT: lui $1, %hi(%neg(%gp_rel(fmaxnum)))
+; MIPS64R5-N32-NEXT: addu $1, $1, $25
+; MIPS64R5-N32-NEXT: addiu $gp, $1, %lo(%neg(%gp_rel(fmaxnum)))
+; MIPS64R5-N32-NEXT: mov.s $f13, $f12
+; MIPS64R5-N32-NEXT: lw $16, %got_disp(g)($gp)
+; MIPS64R5-N32-NEXT: lh $1, 0($16)
+; MIPS64R5-N32-NEXT: fill.h $w0, $1
+; MIPS64R5-N32-NEXT: fexupr.w $w0, $w0
+; MIPS64R5-N32-NEXT: copy_s.w $1, $w0[0]
+; MIPS64R5-N32-NEXT: lw $25, %call16(fmaxf)($gp)
+; MIPS64R5-N32-NEXT: jalr $25
+; MIPS64R5-N32-NEXT: mtc1 $1, $f12
+; MIPS64R5-N32-NEXT: mfc1 $1, $f0
+; MIPS64R5-N32-NEXT: fill.w $w0, $1
+; MIPS64R5-N32-NEXT: fexdo.h $w0, $w0, $w0
+; MIPS64R5-N32-NEXT: copy_u.h $1, $w0[0]
+; MIPS64R5-N32-NEXT: sh $1, 0($16)
+; MIPS64R5-N32-NEXT: ld $16, 8($sp) # 8-byte Folded Reload
+; MIPS64R5-N32-NEXT: ld $gp, 16($sp) # 8-byte Folded Reload
+; MIPS64R5-N32-NEXT: ld $ra, 24($sp) # 8-byte Folded Reload
+; MIPS64R5-N32-NEXT: jr $ra
+; MIPS64R5-N32-NEXT: addiu $sp, $sp, 32
;
-; MIPS64-N64-LABEL: fmaxnum:
-; MIPS64-N64: # %bb.0: # %entry
-; MIPS64-N64-NEXT: daddiu $sp, $sp, -32
-; MIPS64-N64-NEXT: .cfi_def_cfa_offset 32
-; MIPS64-N64-NEXT: sd $ra, 24($sp) # 8-byte Folded Spill
-; MIPS64-N64-NEXT: sd $gp, 16($sp) # 8-byte Folded Spill
-; MIPS64-N64-NEXT: sd $16, 8($sp) # 8-byte Folded Spill
-; MIPS64-N64-NEXT: .cfi_offset 31, -8
-; MIPS64-N64-NEXT: .cfi_offset 28, -16
-; MIPS64-N64-NEXT: .cfi_offset 16, -24
-; MIPS64-N64-NEXT: lui $1, %hi(%neg(%gp_rel(fmaxnum)))
-; MIPS64-N64-NEXT: daddu $1, $1, $25
-; MIPS64-N64-NEXT: daddiu $gp, $1, %lo(%neg(%gp_rel(fmaxnum)))
-; MIPS64-N64-NEXT: mov.s $f13, $f12
-; MIPS64-N64-NEXT: ld $16, %got_disp(g)($gp)
-; MIPS64-N64-NEXT: lh $1, 0($16)
-; MIPS64-N64-NEXT: fill.h $w0, $1
-; MIPS64-N64-NEXT: fexupr.w $w0, $w0
-; MIPS64-N64-NEXT: copy_s.w $1, $w0[0]
-; MIPS64-N64-NEXT: ld $25, %call16(fmaxf)($gp)
-; MIPS64-N64-NEXT: jalr $25
-; MIPS64-N64-NEXT: mtc1 $1, $f12
-; MIPS64-N64-NEXT: mfc1 $1, $f0
-; MIPS64-N64-NEXT: fill.w $w0, $1
-; MIPS64-N64-NEXT: fexdo.h $w0, $w0, $w0
-; MIPS64-N64-NEXT: copy_u.h $1, $w0[0]
-; MIPS64-N64-NEXT: sh $1, 0($16)
-; MIPS64-N64-NEXT: ld $16, 8($sp) # 8-byte Folded Reload
-; MIPS64-N64-NEXT: ld $gp, 16($sp) # 8-byte Folded Reload
-; MIPS64-N64-NEXT: ld $ra, 24($sp) # 8-byte Folded Reload
-; MIPS64-N64-NEXT: jr $ra
-; MIPS64-N64-NEXT: daddiu $sp, $sp, 32
+; MIPS64R5-N64-LABEL: fmaxnum:
+; MIPS64R5-N64: # %bb.0: # %entry
+; MIPS64R5-N64-NEXT: daddiu $sp, $sp, -32
+; MIPS64R5-N64-NEXT: .cfi_def_cfa_offset 32
+; MIPS64R5-N64-NEXT: sd $ra, 24($sp) # 8-byte Folded Spill
+; MIPS64R5-N64-NEXT: sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64R5-N64-NEXT: sd $16, 8($sp) # 8-byte Folded Spill
+; MIPS64R5-N64-NEXT: .cfi_offset 31, -8
+; MIPS64R5-N64-NEXT: .cfi_offset 28, -16
+; MIPS64R5-N64-NEXT: .cfi_offset 16, -24
+; MIPS64R5-N64-NEXT: lui $1, %hi(%neg(%gp_rel(fmaxnum)))
+; MIPS64R5-N64-NEXT: daddu $1, $1, $25
+; MIPS64R5-N64-NEXT: daddiu $gp, $1, %lo(%neg(%gp_rel(fmaxnum)))
+; MIPS64R5-N64-NEXT: mov.s $f13, $f12
+; MIPS64R5-N64-NEXT: ld $16, %got_disp(g)($gp)
+; MIPS64R5-N64-NEXT: lh $1, 0($16)
+; MIPS64R5-N64-NEXT: fill.h $w0, $1
+; MIPS64R5-N64-NEXT: fexupr.w $w0, $w0
+; MIPS64R5-N64-NEXT: copy_s.w $1, $w0[0]
+; MIPS64R5-N64-NEXT: ld $25, %call16(fmaxf)($gp)
+; MIPS64R5-N64-NEXT: jalr $25
+; MIPS64R5-N64-NEXT: mtc1 $1, $f12
+; MIPS64R5-N64-NEXT: mfc1 $1, $f0
+; MIPS64R5-N64-NEXT: fill.w $w0, $1
+; MIPS64R5-N64-NEXT: fexdo.h $w0, $w0, $w0
+; MIPS64R5-N64-NEXT: copy_u.h $1, $w0[0]
+; MIPS64R5-N64-NEXT: sh $1, 0($16)
+; MIPS64R5-N64-NEXT: ld $16, 8($sp) # 8-byte Folded Reload
+; MIPS64R5-N64-NEXT: ld $gp, 16($sp) # 8-byte Folded Reload
+; MIPS64R5-N64-NEXT: ld $ra, 24($sp) # 8-byte Folded Reload
+; MIPS64R5-N64-NEXT: jr $ra
+; MIPS64R5-N64-NEXT: daddiu $sp, $sp, 32
+;
+; MIPSR6-O32-LABEL: fmaxnum:
+; MIPSR6-O32: # %bb.0:
+; MIPSR6-O32-NEXT: lui $2, %hi(_gp_disp)
+; MIPSR6-O32-NEXT: addiu $2, $2, %lo(_gp_disp)
+; MIPSR6-O32-NEXT: addu $1, $2, $25
+; MIPSR6-O32-NEXT: lw $1, %got(g)($1)
+; MIPSR6-O32-NEXT: lh $2, 0($1)
+; MIPSR6-O32-NEXT: fill.h $w0, $2
+; MIPSR6-O32-NEXT: fexupr.w $w0, $w0
+; MIPSR6-O32-NEXT: copy_s.w $2, $w0[0]
+; MIPSR6-O32-NEXT: mtc1 $2, $f0
+; MIPSR6-O32-NEXT: max.s $f0, $f0, $f12
+; MIPSR6-O32-NEXT: mfc1 $2, $f0
+; MIPSR6-O32-NEXT: fill.w $w0, $2
+; MIPSR6-O32-NEXT: fexdo.h $w0, $w0, $w0
+; MIPSR6-O32-NEXT: copy_u.h $2, $w0[0]
+; MIPSR6-O32-NEXT: jr $ra
+; MIPSR6-O32-NEXT: sh $2, 0($1)
+;
+; MIPSR6-N32-LABEL: fmaxnum:
+; MIPSR6-N32: # %bb.0:
+; MIPSR6-N32-NEXT: lui $1, %hi(%neg(%gp_rel(fmaxnum)))
+; MIPSR6-N32-NEXT: addu $1, $1, $25
+; MIPSR6-N32-NEXT: addiu $1, $1, %lo(%neg(%gp_rel(fmaxnum)))
+; MIPSR6-N32-NEXT: lw $1, %got_disp(g)($1)
+; MIPSR6-N32-NEXT: lh $2, 0($1)
+; MIPSR6-N32-NEXT: fill.h $w0, $2
+; MIPSR6-N32-NEXT: fexupr.w $w0, $w0
+; MIPSR6-N32-NEXT: copy_s.w $2, $w0[0]
+; MIPSR6-N32-NEXT: mtc1 $2, $f0
+; MIPSR6-N32-NEXT: max.s $f0, $f0, $f12
+; MIPSR6-N32-NEXT: mfc1 $2, $f0
+; MIPSR6-N32-NEXT: fill.w $w0, $2
+; MIPSR6-N32-NEXT: fexdo.h $w0, $w0, $w0
+; MIPSR6-N32-NEXT: copy_u.h $2, $w0[0]
+; MIPSR6-N32-NEXT: jr $ra
+; MIPSR6-N32-NEXT: sh $2, 0($1)
+;
+; MIPSR6-N64-LABEL: fmaxnum:
+; MIPSR6-N64: # %bb.0:
+; MIPSR6-N64-NEXT: lui $1, %hi(%neg(%gp_rel(fmaxnum)))
+; MIPSR6-N64-NEXT: daddu $1, $1, $25
+; MIPSR6-N64-NEXT: daddiu $1, $1, %lo(%neg(%gp_rel(fmaxnum)))
+; MIPSR6-N64-NEXT: ld $1, %got_disp(g)($1)
+; MIPSR6-N64-NEXT: lh $2, 0($1)
+; MIPSR6-N64-NEXT: fill.h $w0, $2
+; MIPSR6-N64-NEXT: fexupr.w $w0, $w0
+; MIPSR6-N64-NEXT: copy_s.w $2, $w0[0]
+; MIPSR6-N64-NEXT: mtc1 $2, $f0
+; MIPSR6-N64-NEXT: max.s $f0, $f0, $f12
+; MIPSR6-N64-NEXT: mfc1 $2, $f0
+; MIPSR6-N64-NEXT: fill.w $w0, $2
+; MIPSR6-N64-NEXT: fexdo.h $w0, $w0, $w0
+; MIPSR6-N64-NEXT: copy_u.h $2, $w0[0]
+; MIPSR6-N64-NEXT: jr $ra
+; MIPSR6-N64-NEXT: sh $2, 0($1)
entry:
%0 = load i16, ptr @g, align 2
%1 = call float @llvm.convert.from.fp16.f32(i16 %0)
diff --git a/llvm/test/CodeGen/PowerPC/fminimum-fmaximum-f128.ll b/llvm/test/CodeGen/PowerPC/fminimum-fmaximum-f128.ll
new file mode 100644
index 000000000000..6d9eb1337682
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/fminimum-fmaximum-f128.ll
@@ -0,0 +1,99 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr9 < %s | FileCheck %s
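+; Test the fp128 lowering of llvm.minimum/llvm.maximum on POWER9, where
+; xscmpuqp supplies the quad-precision compare and xststdcqp the data-class tests.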
+
+define fp128 @f128_minimum(fp128 %a, fp128 %b) {
+; CHECK-LABEL: f128_minimum:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xscmpuqp 0, 2, 3
+; CHECK-NEXT: vmr 4, 2
+; CHECK-NEXT: bge 0, .LBB0_8
+; CHECK-NEXT: # %bb.1: # %entry
+; CHECK-NEXT: bun 0, .LBB0_9
+; CHECK-NEXT: .LBB0_2: # %entry
+; CHECK-NEXT: xststdcqp 0, 2, 4
+; CHECK-NEXT: bc 4, 2, .LBB0_10
+; CHECK-NEXT: .LBB0_3: # %entry
+; CHECK-NEXT: xststdcqp 0, 3, 4
+; CHECK-NEXT: bc 12, 2, .LBB0_5
+; CHECK-NEXT: .LBB0_4: # %entry
+; CHECK-NEXT: vmr 3, 2
+; CHECK-NEXT: .LBB0_5: # %entry
+; CHECK-NEXT: addis 3, 2, .LCPI0_1@toc@ha
+; CHECK-NEXT: addi 3, 3, .LCPI0_1@toc@l
+; CHECK-NEXT: lxv 34, 0(3)
+; CHECK-NEXT: xscmpuqp 0, 4, 2
+; CHECK-NEXT: beq 0, .LBB0_7
+; CHECK-NEXT: # %bb.6: # %entry
+; CHECK-NEXT: vmr 3, 4
+; CHECK-NEXT: .LBB0_7: # %entry
+; CHECK-NEXT: vmr 2, 3
+; CHECK-NEXT: blr
+; CHECK-NEXT: .LBB0_8: # %entry
+; CHECK-NEXT: vmr 4, 3
+; CHECK-NEXT: bnu 0, .LBB0_2
+; CHECK-NEXT: .LBB0_9:
+; CHECK-NEXT: addis 3, 2, .LCPI0_0@toc@ha
+; CHECK-NEXT: addi 3, 3, .LCPI0_0@toc@l
+; CHECK-NEXT: lxv 36, 0(3)
+; CHECK-NEXT: xststdcqp 0, 2, 4
+; CHECK-NEXT: bc 12, 2, .LBB0_3
+; CHECK-NEXT: .LBB0_10: # %entry
+; CHECK-NEXT: vmr 2, 4
+; CHECK-NEXT: xststdcqp 0, 3, 4
+; CHECK-NEXT: bc 4, 2, .LBB0_4
+; CHECK-NEXT: b .LBB0_5
+entry:
+ %m = call fp128 @llvm.minimum.f128(fp128 %a, fp128 %b)
+ ret fp128 %m
+}
+
+define fp128 @f128_maximum(fp128 %a, fp128 %b) {
+; CHECK-LABEL: f128_maximum:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xscmpuqp 0, 2, 3
+; CHECK-NEXT: vmr 4, 2
+; CHECK-NEXT: ble 0, .LBB1_8
+; CHECK-NEXT: # %bb.1: # %entry
+; CHECK-NEXT: bun 0, .LBB1_9
+; CHECK-NEXT: .LBB1_2: # %entry
+; CHECK-NEXT: xststdcqp 0, 2, 8
+; CHECK-NEXT: bc 4, 2, .LBB1_10
+; CHECK-NEXT: .LBB1_3: # %entry
+; CHECK-NEXT: xststdcqp 0, 3, 8
+; CHECK-NEXT: bc 12, 2, .LBB1_5
+; CHECK-NEXT: .LBB1_4: # %entry
+; CHECK-NEXT: vmr 3, 2
+; CHECK-NEXT: .LBB1_5: # %entry
+; CHECK-NEXT: addis 3, 2, .LCPI1_1@toc@ha
+; CHECK-NEXT: addi 3, 3, .LCPI1_1@toc@l
+; CHECK-NEXT: lxv 34, 0(3)
+; CHECK-NEXT: xscmpuqp 0, 4, 2
+; CHECK-NEXT: beq 0, .LBB1_7
+; CHECK-NEXT: # %bb.6: # %entry
+; CHECK-NEXT: vmr 3, 4
+; CHECK-NEXT: .LBB1_7: # %entry
+; CHECK-NEXT: vmr 2, 3
+; CHECK-NEXT: blr
+; CHECK-NEXT: .LBB1_8: # %entry
+; CHECK-NEXT: vmr 4, 3
+; CHECK-NEXT: bnu 0, .LBB1_2
+; CHECK-NEXT: .LBB1_9:
+; CHECK-NEXT: addis 3, 2, .LCPI1_0@toc@ha
+; CHECK-NEXT: addi 3, 3, .LCPI1_0@toc@l
+; CHECK-NEXT: lxv 36, 0(3)
+; CHECK-NEXT: xststdcqp 0, 2, 8
+; CHECK-NEXT: bc 12, 2, .LBB1_3
+; CHECK-NEXT: .LBB1_10: # %entry
+; CHECK-NEXT: vmr 2, 4
+; CHECK-NEXT: xststdcqp 0, 3, 8
+; CHECK-NEXT: bc 4, 2, .LBB1_4
+; CHECK-NEXT: b .LBB1_5
+entry:
+ %m = call fp128 @llvm.maximum.f128(fp128 %a, fp128 %b)
+ ret fp128 %m
+}
+
+declare fp128 @llvm.minimum.f128(fp128, fp128)
+declare fp128 @llvm.maximum.f128(fp128, fp128)
diff --git a/llvm/test/CodeGen/PowerPC/fminimum-fmaximum.ll b/llvm/test/CodeGen/PowerPC/fminimum-fmaximum.ll
new file mode 100644
index 000000000000..c33875dbfee4
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/fminimum-fmaximum.ll
@@ -0,0 +1,849 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 -mattr=-vsx < %s | FileCheck %s --check-prefix=NOVSX
+; RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 < %s | FileCheck %s --check-prefix=VSX
+; RUN: llc -mtriple=powerpc64-ibm-aix -mcpu=pwr8 < %s | FileCheck %s --check-prefix=AIX
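+; Check the expanded llvm.minimum/llvm.maximum sequences, which, unlike
+; minnum/maxnum, must propagate NaN inputs and treat -0.0 as less than +0.0.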
+
+define float @f32_minimum(float %a, float %b) {
+; NOVSX-LABEL: f32_minimum:
+; NOVSX: # %bb.0: # %entry
+; NOVSX-NEXT: fcmpu 0, 1, 2
+; NOVSX-NEXT: fmr 0, 1
+; NOVSX-NEXT: stfs 2, -8(1)
+; NOVSX-NEXT: stfs 1, -4(1)
+; NOVSX-NEXT: bc 12, 0, .LBB0_2
+; NOVSX-NEXT: # %bb.1: # %entry
+; NOVSX-NEXT: fmr 0, 2
+; NOVSX-NEXT: .LBB0_2: # %entry
+; NOVSX-NEXT: lwz 3, -4(1)
+; NOVSX-NEXT: bc 4, 3, .LBB0_4
+; NOVSX-NEXT: # %bb.3:
+; NOVSX-NEXT: addis 4, 2, .LCPI0_0@toc@ha
+; NOVSX-NEXT: lfs 0, .LCPI0_0@toc@l(4)
+; NOVSX-NEXT: .LBB0_4: # %entry
+; NOVSX-NEXT: xoris 3, 3, 32768
+; NOVSX-NEXT: lwz 4, -8(1)
+; NOVSX-NEXT: cmplwi 3, 0
+; NOVSX-NEXT: bc 12, 2, .LBB0_6
+; NOVSX-NEXT: # %bb.5: # %entry
+; NOVSX-NEXT: fmr 1, 0
+; NOVSX-NEXT: .LBB0_6: # %entry
+; NOVSX-NEXT: xoris 3, 4, 32768
+; NOVSX-NEXT: cmplwi 3, 0
+; NOVSX-NEXT: bc 12, 2, .LBB0_8
+; NOVSX-NEXT: # %bb.7: # %entry
+; NOVSX-NEXT: fmr 2, 1
+; NOVSX-NEXT: .LBB0_8: # %entry
+; NOVSX-NEXT: addis 3, 2, .LCPI0_1@toc@ha
+; NOVSX-NEXT: lfs 1, .LCPI0_1@toc@l(3)
+; NOVSX-NEXT: fcmpu 0, 0, 1
+; NOVSX-NEXT: bc 12, 2, .LBB0_10
+; NOVSX-NEXT: # %bb.9: # %entry
+; NOVSX-NEXT: fmr 2, 0
+; NOVSX-NEXT: .LBB0_10: # %entry
+; NOVSX-NEXT: fmr 1, 2
+; NOVSX-NEXT: blr
+;
+; VSX-LABEL: f32_minimum:
+; VSX: # %bb.0: # %entry
+; VSX-NEXT: xscvdpspn 0, 1
+; VSX-NEXT: fcmpu 0, 1, 2
+; VSX-NEXT: xscvdpspn 3, 2
+; VSX-NEXT: mffprwz 3, 0
+; VSX-NEXT: bc 12, 3, .LBB0_2
+; VSX-NEXT: # %bb.1: # %entry
+; VSX-NEXT: xsmindp 0, 1, 2
+; VSX-NEXT: b .LBB0_3
+; VSX-NEXT: .LBB0_2:
+; VSX-NEXT: addis 4, 2, .LCPI0_0@toc@ha
+; VSX-NEXT: lfs 0, .LCPI0_0@toc@l(4)
+; VSX-NEXT: .LBB0_3: # %entry
+; VSX-NEXT: xoris 3, 3, 32768
+; VSX-NEXT: mffprwz 4, 3
+; VSX-NEXT: cmplwi 3, 0
+; VSX-NEXT: bc 12, 2, .LBB0_5
+; VSX-NEXT: # %bb.4: # %entry
+; VSX-NEXT: fmr 1, 0
+; VSX-NEXT: .LBB0_5: # %entry
+; VSX-NEXT: xoris 3, 4, 32768
+; VSX-NEXT: cmplwi 3, 0
+; VSX-NEXT: bc 12, 2, .LBB0_7
+; VSX-NEXT: # %bb.6: # %entry
+; VSX-NEXT: fmr 2, 1
+; VSX-NEXT: .LBB0_7: # %entry
+; VSX-NEXT: xxlxor 1, 1, 1
+; VSX-NEXT: fcmpu 0, 0, 1
+; VSX-NEXT: bc 12, 2, .LBB0_9
+; VSX-NEXT: # %bb.8: # %entry
+; VSX-NEXT: fmr 2, 0
+; VSX-NEXT: .LBB0_9: # %entry
+; VSX-NEXT: fmr 1, 2
+; VSX-NEXT: blr
+;
+; AIX-LABEL: f32_minimum:
+; AIX: # %bb.0: # %entry
+; AIX-NEXT: xscvdpspn 0, 1
+; AIX-NEXT: fcmpu 0, 1, 2
+; AIX-NEXT: xscvdpspn 3, 2
+; AIX-NEXT: mffprwz 3, 0
+; AIX-NEXT: bc 12, 3, L..BB0_2
+; AIX-NEXT: # %bb.1: # %entry
+; AIX-NEXT: xsmindp 0, 1, 2
+; AIX-NEXT: b L..BB0_3
+; AIX-NEXT: L..BB0_2:
+; AIX-NEXT: ld 4, L..C0(2) # %const.0
+; AIX-NEXT: lfs 0, 0(4)
+; AIX-NEXT: L..BB0_3: # %entry
+; AIX-NEXT: xoris 3, 3, 32768
+; AIX-NEXT: mffprwz 4, 3
+; AIX-NEXT: cmplwi 3, 0
+; AIX-NEXT: bc 12, 2, L..BB0_5
+; AIX-NEXT: # %bb.4: # %entry
+; AIX-NEXT: fmr 1, 0
+; AIX-NEXT: L..BB0_5: # %entry
+; AIX-NEXT: xoris 3, 4, 32768
+; AIX-NEXT: cmplwi 3, 0
+; AIX-NEXT: bc 12, 2, L..BB0_7
+; AIX-NEXT: # %bb.6: # %entry
+; AIX-NEXT: fmr 2, 1
+; AIX-NEXT: L..BB0_7: # %entry
+; AIX-NEXT: xxlxor 1, 1, 1
+; AIX-NEXT: fcmpu 0, 0, 1
+; AIX-NEXT: bc 12, 2, L..BB0_9
+; AIX-NEXT: # %bb.8: # %entry
+; AIX-NEXT: fmr 2, 0
+; AIX-NEXT: L..BB0_9: # %entry
+; AIX-NEXT: fmr 1, 2
+; AIX-NEXT: blr
+entry:
+ %m = call float @llvm.minimum.f32(float %a, float %b)
+ ret float %m
+}
+
+define float @f32_maximum(float %a, float %b) {
+; NOVSX-LABEL: f32_maximum:
+; NOVSX: # %bb.0: # %entry
+; NOVSX-NEXT: fcmpu 0, 1, 2
+; NOVSX-NEXT: fmr 0, 1
+; NOVSX-NEXT: stfs 2, -8(1)
+; NOVSX-NEXT: stfs 1, -4(1)
+; NOVSX-NEXT: bc 12, 1, .LBB1_2
+; NOVSX-NEXT: # %bb.1: # %entry
+; NOVSX-NEXT: fmr 0, 2
+; NOVSX-NEXT: .LBB1_2: # %entry
+; NOVSX-NEXT: lwz 3, -4(1)
+; NOVSX-NEXT: bc 4, 3, .LBB1_4
+; NOVSX-NEXT: # %bb.3:
+; NOVSX-NEXT: addis 4, 2, .LCPI1_0@toc@ha
+; NOVSX-NEXT: lfs 0, .LCPI1_0@toc@l(4)
+; NOVSX-NEXT: .LBB1_4: # %entry
+; NOVSX-NEXT: cmpwi 3, 0
+; NOVSX-NEXT: lwz 4, -8(1)
+; NOVSX-NEXT: bc 12, 2, .LBB1_6
+; NOVSX-NEXT: # %bb.5: # %entry
+; NOVSX-NEXT: fmr 1, 0
+; NOVSX-NEXT: .LBB1_6: # %entry
+; NOVSX-NEXT: cmpwi 4, 0
+; NOVSX-NEXT: bc 12, 2, .LBB1_8
+; NOVSX-NEXT: # %bb.7: # %entry
+; NOVSX-NEXT: fmr 2, 1
+; NOVSX-NEXT: .LBB1_8: # %entry
+; NOVSX-NEXT: addis 3, 2, .LCPI1_1@toc@ha
+; NOVSX-NEXT: lfs 1, .LCPI1_1@toc@l(3)
+; NOVSX-NEXT: fcmpu 0, 0, 1
+; NOVSX-NEXT: bc 12, 2, .LBB1_10
+; NOVSX-NEXT: # %bb.9: # %entry
+; NOVSX-NEXT: fmr 2, 0
+; NOVSX-NEXT: .LBB1_10: # %entry
+; NOVSX-NEXT: fmr 1, 2
+; NOVSX-NEXT: blr
+;
+; VSX-LABEL: f32_maximum:
+; VSX: # %bb.0: # %entry
+; VSX-NEXT: xscvdpspn 0, 1
+; VSX-NEXT: fcmpu 0, 1, 2
+; VSX-NEXT: xscvdpspn 3, 2
+; VSX-NEXT: mffprwz 3, 0
+; VSX-NEXT: bc 12, 3, .LBB1_2
+; VSX-NEXT: # %bb.1: # %entry
+; VSX-NEXT: xsmaxdp 0, 1, 2
+; VSX-NEXT: b .LBB1_3
+; VSX-NEXT: .LBB1_2:
+; VSX-NEXT: addis 4, 2, .LCPI1_0@toc@ha
+; VSX-NEXT: lfs 0, .LCPI1_0@toc@l(4)
+; VSX-NEXT: .LBB1_3: # %entry
+; VSX-NEXT: mffprwz 4, 3
+; VSX-NEXT: cmpwi 3, 0
+; VSX-NEXT: bc 12, 2, .LBB1_5
+; VSX-NEXT: # %bb.4: # %entry
+; VSX-NEXT: fmr 1, 0
+; VSX-NEXT: .LBB1_5: # %entry
+; VSX-NEXT: cmpwi 4, 0
+; VSX-NEXT: bc 12, 2, .LBB1_7
+; VSX-NEXT: # %bb.6: # %entry
+; VSX-NEXT: fmr 2, 1
+; VSX-NEXT: .LBB1_7: # %entry
+; VSX-NEXT: xxlxor 1, 1, 1
+; VSX-NEXT: fcmpu 0, 0, 1
+; VSX-NEXT: bc 12, 2, .LBB1_9
+; VSX-NEXT: # %bb.8: # %entry
+; VSX-NEXT: fmr 2, 0
+; VSX-NEXT: .LBB1_9: # %entry
+; VSX-NEXT: fmr 1, 2
+; VSX-NEXT: blr
+;
+; AIX-LABEL: f32_maximum:
+; AIX: # %bb.0: # %entry
+; AIX-NEXT: xscvdpspn 0, 1
+; AIX-NEXT: fcmpu 0, 1, 2
+; AIX-NEXT: xscvdpspn 3, 2
+; AIX-NEXT: mffprwz 3, 0
+; AIX-NEXT: bc 12, 3, L..BB1_2
+; AIX-NEXT: # %bb.1: # %entry
+; AIX-NEXT: xsmaxdp 0, 1, 2
+; AIX-NEXT: b L..BB1_3
+; AIX-NEXT: L..BB1_2:
+; AIX-NEXT: ld 4, L..C1(2) # %const.0
+; AIX-NEXT: lfs 0, 0(4)
+; AIX-NEXT: L..BB1_3: # %entry
+; AIX-NEXT: mffprwz 4, 3
+; AIX-NEXT: cmpwi 3, 0
+; AIX-NEXT: bc 12, 2, L..BB1_5
+; AIX-NEXT: # %bb.4: # %entry
+; AIX-NEXT: fmr 1, 0
+; AIX-NEXT: L..BB1_5: # %entry
+; AIX-NEXT: cmpwi 4, 0
+; AIX-NEXT: bc 12, 2, L..BB1_7
+; AIX-NEXT: # %bb.6: # %entry
+; AIX-NEXT: fmr 2, 1
+; AIX-NEXT: L..BB1_7: # %entry
+; AIX-NEXT: xxlxor 1, 1, 1
+; AIX-NEXT: fcmpu 0, 0, 1
+; AIX-NEXT: bc 12, 2, L..BB1_9
+; AIX-NEXT: # %bb.8: # %entry
+; AIX-NEXT: fmr 2, 0
+; AIX-NEXT: L..BB1_9: # %entry
+; AIX-NEXT: fmr 1, 2
+; AIX-NEXT: blr
+entry:
+ %m = call float @llvm.maximum.f32(float %a, float %b)
+ ret float %m
+}
+
+define double @f64_minimum(double %a, double %b) {
+; NOVSX-LABEL: f64_minimum:
+; NOVSX: # %bb.0: # %entry
+; NOVSX-NEXT: fcmpu 0, 1, 2
+; NOVSX-NEXT: fmr 0, 1
+; NOVSX-NEXT: stfd 2, -16(1)
+; NOVSX-NEXT: stfd 1, -8(1)
+; NOVSX-NEXT: bc 12, 0, .LBB2_2
+; NOVSX-NEXT: # %bb.1: # %entry
+; NOVSX-NEXT: fmr 0, 2
+; NOVSX-NEXT: .LBB2_2: # %entry
+; NOVSX-NEXT: ld 3, -8(1)
+; NOVSX-NEXT: bc 4, 3, .LBB2_4
+; NOVSX-NEXT: # %bb.3:
+; NOVSX-NEXT: addis 4, 2, .LCPI2_0@toc@ha
+; NOVSX-NEXT: lfs 0, .LCPI2_0@toc@l(4)
+; NOVSX-NEXT: .LBB2_4: # %entry
+; NOVSX-NEXT: li 5, 1
+; NOVSX-NEXT: ld 4, -16(1)
+; NOVSX-NEXT: rldic 5, 5, 63, 0
+; NOVSX-NEXT: cmpd 3, 5
+; NOVSX-NEXT: bc 12, 2, .LBB2_6
+; NOVSX-NEXT: # %bb.5: # %entry
+; NOVSX-NEXT: fmr 1, 0
+; NOVSX-NEXT: .LBB2_6: # %entry
+; NOVSX-NEXT: cmpd 4, 5
+; NOVSX-NEXT: bc 12, 2, .LBB2_8
+; NOVSX-NEXT: # %bb.7: # %entry
+; NOVSX-NEXT: fmr 2, 1
+; NOVSX-NEXT: .LBB2_8: # %entry
+; NOVSX-NEXT: addis 3, 2, .LCPI2_1@toc@ha
+; NOVSX-NEXT: lfs 1, .LCPI2_1@toc@l(3)
+; NOVSX-NEXT: fcmpu 0, 0, 1
+; NOVSX-NEXT: bc 12, 2, .LBB2_10
+; NOVSX-NEXT: # %bb.9: # %entry
+; NOVSX-NEXT: fmr 2, 0
+; NOVSX-NEXT: .LBB2_10: # %entry
+; NOVSX-NEXT: fmr 1, 2
+; NOVSX-NEXT: blr
+;
+; VSX-LABEL: f64_minimum:
+; VSX: # %bb.0: # %entry
+; VSX-NEXT: fcmpu 0, 1, 2
+; VSX-NEXT: mffprd 3, 1
+; VSX-NEXT: bc 12, 3, .LBB2_2
+; VSX-NEXT: # %bb.1: # %entry
+; VSX-NEXT: xsmindp 0, 1, 2
+; VSX-NEXT: b .LBB2_3
+; VSX-NEXT: .LBB2_2:
+; VSX-NEXT: addis 4, 2, .LCPI2_0@toc@ha
+; VSX-NEXT: lfs 0, .LCPI2_0@toc@l(4)
+; VSX-NEXT: .LBB2_3: # %entry
+; VSX-NEXT: li 5, 1
+; VSX-NEXT: mffprd 4, 2
+; VSX-NEXT: rldic 5, 5, 63, 0
+; VSX-NEXT: cmpd 3, 5
+; VSX-NEXT: bc 12, 2, .LBB2_5
+; VSX-NEXT: # %bb.4: # %entry
+; VSX-NEXT: fmr 1, 0
+; VSX-NEXT: .LBB2_5: # %entry
+; VSX-NEXT: cmpd 4, 5
+; VSX-NEXT: bc 12, 2, .LBB2_7
+; VSX-NEXT: # %bb.6: # %entry
+; VSX-NEXT: fmr 2, 1
+; VSX-NEXT: .LBB2_7: # %entry
+; VSX-NEXT: xxlxor 1, 1, 1
+; VSX-NEXT: fcmpu 0, 0, 1
+; VSX-NEXT: bc 12, 2, .LBB2_9
+; VSX-NEXT: # %bb.8: # %entry
+; VSX-NEXT: fmr 2, 0
+; VSX-NEXT: .LBB2_9: # %entry
+; VSX-NEXT: fmr 1, 2
+; VSX-NEXT: blr
+;
+; AIX-LABEL: f64_minimum:
+; AIX: # %bb.0: # %entry
+; AIX-NEXT: fcmpu 0, 1, 2
+; AIX-NEXT: mffprd 3, 1
+; AIX-NEXT: bc 12, 3, L..BB2_2
+; AIX-NEXT: # %bb.1: # %entry
+; AIX-NEXT: xsmindp 0, 1, 2
+; AIX-NEXT: b L..BB2_3
+; AIX-NEXT: L..BB2_2:
+; AIX-NEXT: ld 4, L..C2(2) # %const.0
+; AIX-NEXT: lfs 0, 0(4)
+; AIX-NEXT: L..BB2_3: # %entry
+; AIX-NEXT: li 5, 1
+; AIX-NEXT: mffprd 4, 2
+; AIX-NEXT: rldic 5, 5, 63, 0
+; AIX-NEXT: cmpd 3, 5
+; AIX-NEXT: bc 12, 2, L..BB2_5
+; AIX-NEXT: # %bb.4: # %entry
+; AIX-NEXT: fmr 1, 0
+; AIX-NEXT: L..BB2_5: # %entry
+; AIX-NEXT: cmpd 4, 5
+; AIX-NEXT: bc 12, 2, L..BB2_7
+; AIX-NEXT: # %bb.6: # %entry
+; AIX-NEXT: fmr 2, 1
+; AIX-NEXT: L..BB2_7: # %entry
+; AIX-NEXT: xxlxor 1, 1, 1
+; AIX-NEXT: fcmpu 0, 0, 1
+; AIX-NEXT: bc 12, 2, L..BB2_9
+; AIX-NEXT: # %bb.8: # %entry
+; AIX-NEXT: fmr 2, 0
+; AIX-NEXT: L..BB2_9: # %entry
+; AIX-NEXT: fmr 1, 2
+; AIX-NEXT: blr
+entry:
+ %m = call double @llvm.minimum.f64(double %a, double %b)
+ ret double %m
+}
+
+define double @f64_maximum(double %a, double %b) {
+; NOVSX-LABEL: f64_maximum:
+; NOVSX: # %bb.0: # %entry
+; NOVSX-NEXT: fcmpu 0, 1, 2
+; NOVSX-NEXT: fmr 0, 1
+; NOVSX-NEXT: stfd 2, -16(1)
+; NOVSX-NEXT: stfd 1, -8(1)
+; NOVSX-NEXT: bc 12, 1, .LBB3_2
+; NOVSX-NEXT: # %bb.1: # %entry
+; NOVSX-NEXT: fmr 0, 2
+; NOVSX-NEXT: .LBB3_2: # %entry
+; NOVSX-NEXT: ld 3, -8(1)
+; NOVSX-NEXT: bc 4, 3, .LBB3_4
+; NOVSX-NEXT: # %bb.3:
+; NOVSX-NEXT: addis 4, 2, .LCPI3_0@toc@ha
+; NOVSX-NEXT: lfs 0, .LCPI3_0@toc@l(4)
+; NOVSX-NEXT: .LBB3_4: # %entry
+; NOVSX-NEXT: cmpdi 3, 0
+; NOVSX-NEXT: ld 4, -16(1)
+; NOVSX-NEXT: bc 12, 2, .LBB3_6
+; NOVSX-NEXT: # %bb.5: # %entry
+; NOVSX-NEXT: fmr 1, 0
+; NOVSX-NEXT: .LBB3_6: # %entry
+; NOVSX-NEXT: cmpdi 4, 0
+; NOVSX-NEXT: bc 12, 2, .LBB3_8
+; NOVSX-NEXT: # %bb.7: # %entry
+; NOVSX-NEXT: fmr 2, 1
+; NOVSX-NEXT: .LBB3_8: # %entry
+; NOVSX-NEXT: addis 3, 2, .LCPI3_1@toc@ha
+; NOVSX-NEXT: lfs 1, .LCPI3_1@toc@l(3)
+; NOVSX-NEXT: fcmpu 0, 0, 1
+; NOVSX-NEXT: bc 12, 2, .LBB3_10
+; NOVSX-NEXT: # %bb.9: # %entry
+; NOVSX-NEXT: fmr 2, 0
+; NOVSX-NEXT: .LBB3_10: # %entry
+; NOVSX-NEXT: fmr 1, 2
+; NOVSX-NEXT: blr
+;
+; VSX-LABEL: f64_maximum:
+; VSX: # %bb.0: # %entry
+; VSX-NEXT: fcmpu 0, 1, 2
+; VSX-NEXT: mffprd 3, 1
+; VSX-NEXT: bc 12, 3, .LBB3_2
+; VSX-NEXT: # %bb.1: # %entry
+; VSX-NEXT: xsmaxdp 0, 1, 2
+; VSX-NEXT: b .LBB3_3
+; VSX-NEXT: .LBB3_2:
+; VSX-NEXT: addis 4, 2, .LCPI3_0@toc@ha
+; VSX-NEXT: lfs 0, .LCPI3_0@toc@l(4)
+; VSX-NEXT: .LBB3_3: # %entry
+; VSX-NEXT: mffprd 4, 2
+; VSX-NEXT: cmpdi 3, 0
+; VSX-NEXT: bc 12, 2, .LBB3_5
+; VSX-NEXT: # %bb.4: # %entry
+; VSX-NEXT: fmr 1, 0
+; VSX-NEXT: .LBB3_5: # %entry
+; VSX-NEXT: cmpdi 4, 0
+; VSX-NEXT: bc 12, 2, .LBB3_7
+; VSX-NEXT: # %bb.6: # %entry
+; VSX-NEXT: fmr 2, 1
+; VSX-NEXT: .LBB3_7: # %entry
+; VSX-NEXT: xxlxor 1, 1, 1
+; VSX-NEXT: fcmpu 0, 0, 1
+; VSX-NEXT: bc 12, 2, .LBB3_9
+; VSX-NEXT: # %bb.8: # %entry
+; VSX-NEXT: fmr 2, 0
+; VSX-NEXT: .LBB3_9: # %entry
+; VSX-NEXT: fmr 1, 2
+; VSX-NEXT: blr
+;
+; AIX-LABEL: f64_maximum:
+; AIX: # %bb.0: # %entry
+; AIX-NEXT: fcmpu 0, 1, 2
+; AIX-NEXT: mffprd 3, 1
+; AIX-NEXT: bc 12, 3, L..BB3_2
+; AIX-NEXT: # %bb.1: # %entry
+; AIX-NEXT: xsmaxdp 0, 1, 2
+; AIX-NEXT: b L..BB3_3
+; AIX-NEXT: L..BB3_2:
+; AIX-NEXT: ld 4, L..C3(2) # %const.0
+; AIX-NEXT: lfs 0, 0(4)
+; AIX-NEXT: L..BB3_3: # %entry
+; AIX-NEXT: mffprd 4, 2
+; AIX-NEXT: cmpdi 3, 0
+; AIX-NEXT: bc 12, 2, L..BB3_5
+; AIX-NEXT: # %bb.4: # %entry
+; AIX-NEXT: fmr 1, 0
+; AIX-NEXT: L..BB3_5: # %entry
+; AIX-NEXT: cmpdi 4, 0
+; AIX-NEXT: bc 12, 2, L..BB3_7
+; AIX-NEXT: # %bb.6: # %entry
+; AIX-NEXT: fmr 2, 1
+; AIX-NEXT: L..BB3_7: # %entry
+; AIX-NEXT: xxlxor 1, 1, 1
+; AIX-NEXT: fcmpu 0, 0, 1
+; AIX-NEXT: bc 12, 2, L..BB3_9
+; AIX-NEXT: # %bb.8: # %entry
+; AIX-NEXT: fmr 2, 0
+; AIX-NEXT: L..BB3_9: # %entry
+; AIX-NEXT: fmr 1, 2
+; AIX-NEXT: blr
+entry:
+ %m = call double @llvm.maximum.f64(double %a, double %b)
+ ret double %m
+}
+
+define <4 x float> @v4f32_minimum(<4 x float> %a, <4 x float> %b) {
+; NOVSX-LABEL: v4f32_minimum:
+; NOVSX: # %bb.0: # %entry
+; NOVSX-NEXT: vcmpeqfp 0, 3, 3
+; NOVSX-NEXT: vcmpeqfp 1, 2, 2
+; NOVSX-NEXT: addis 3, 2, .LCPI4_0@toc@ha
+; NOVSX-NEXT: addi 3, 3, .LCPI4_0@toc@l
+; NOVSX-NEXT: vnot 0, 0
+; NOVSX-NEXT: vnot 1, 1
+; NOVSX-NEXT: vspltisb 4, -1
+; NOVSX-NEXT: vcmpgtfp 5, 3, 2
+; NOVSX-NEXT: vslw 4, 4, 4
+; NOVSX-NEXT: vor 0, 1, 0
+; NOVSX-NEXT: lvx 1, 0, 3
+; NOVSX-NEXT: vsel 5, 3, 2, 5
+; NOVSX-NEXT: vsel 5, 5, 1, 0
+; NOVSX-NEXT: vcmpequw 0, 2, 4
+; NOVSX-NEXT: vcmpequw 4, 3, 4
+; NOVSX-NEXT: vsel 2, 5, 2, 0
+; NOVSX-NEXT: vsel 2, 2, 3, 4
+; NOVSX-NEXT: vxor 3, 3, 3
+; NOVSX-NEXT: vcmpeqfp 3, 5, 3
+; NOVSX-NEXT: vsel 2, 5, 2, 3
+; NOVSX-NEXT: blr
+;
+; VSX-LABEL: v4f32_minimum:
+; VSX: # %bb.0: # %entry
+; VSX-NEXT: xvcmpeqsp 1, 35, 35
+; VSX-NEXT: xvcmpeqsp 2, 34, 34
+; VSX-NEXT: addis 3, 2, .LCPI4_0@toc@ha
+; VSX-NEXT: xxleqv 36, 36, 36
+; VSX-NEXT: xvminsp 0, 34, 35
+; VSX-NEXT: vslw 4, 4, 4
+; VSX-NEXT: addi 3, 3, .LCPI4_0@toc@l
+; VSX-NEXT: xxlnor 1, 1, 1
+; VSX-NEXT: xxlnor 2, 2, 2
+; VSX-NEXT: vcmpequw 5, 2, 4
+; VSX-NEXT: xxlor 1, 2, 1
+; VSX-NEXT: lxvd2x 2, 0, 3
+; VSX-NEXT: xxsel 0, 0, 2, 1
+; VSX-NEXT: xxlxor 2, 2, 2
+; VSX-NEXT: xvcmpeqsp 2, 0, 2
+; VSX-NEXT: xxsel 1, 0, 34, 37
+; VSX-NEXT: vcmpequw 2, 3, 4
+; VSX-NEXT: xxsel 1, 1, 35, 34
+; VSX-NEXT: xxsel 34, 0, 1, 2
+; VSX-NEXT: blr
+;
+; AIX-LABEL: v4f32_minimum:
+; AIX: # %bb.0: # %entry
+; AIX-NEXT: xvcmpeqsp 1, 35, 35
+; AIX-NEXT: xvcmpeqsp 2, 34, 34
+; AIX-NEXT: ld 3, L..C4(2) # %const.0
+; AIX-NEXT: xxleqv 36, 36, 36
+; AIX-NEXT: xvminsp 0, 34, 35
+; AIX-NEXT: vslw 4, 4, 4
+; AIX-NEXT: xxlnor 1, 1, 1
+; AIX-NEXT: xxlnor 2, 2, 2
+; AIX-NEXT: vcmpequw 5, 2, 4
+; AIX-NEXT: xxlor 1, 2, 1
+; AIX-NEXT: lxvw4x 2, 0, 3
+; AIX-NEXT: xxsel 0, 0, 2, 1
+; AIX-NEXT: xxlxor 2, 2, 2
+; AIX-NEXT: xvcmpeqsp 2, 0, 2
+; AIX-NEXT: xxsel 1, 0, 34, 37
+; AIX-NEXT: vcmpequw 2, 3, 4
+; AIX-NEXT: xxsel 1, 1, 35, 34
+; AIX-NEXT: xxsel 34, 0, 1, 2
+; AIX-NEXT: blr
+entry:
+ %m = call <4 x float> @llvm.minimum.v4f32(<4 x float> %a, <4 x float> %b)
+ ret <4 x float> %m
+}
+
+define <4 x float> @v4f32_maximum(<4 x float> %a, <4 x float> %b) {
+; NOVSX-LABEL: v4f32_maximum:
+; NOVSX: # %bb.0: # %entry
+; NOVSX-NEXT: vcmpeqfp 5, 3, 3
+; NOVSX-NEXT: vcmpeqfp 0, 2, 2
+; NOVSX-NEXT: addis 3, 2, .LCPI5_0@toc@ha
+; NOVSX-NEXT: addi 3, 3, .LCPI5_0@toc@l
+; NOVSX-NEXT: vnot 5, 5
+; NOVSX-NEXT: vnot 0, 0
+; NOVSX-NEXT: vcmpgtfp 4, 2, 3
+; NOVSX-NEXT: vor 5, 0, 5
+; NOVSX-NEXT: lvx 0, 0, 3
+; NOVSX-NEXT: vsel 4, 3, 2, 4
+; NOVSX-NEXT: vsel 4, 4, 0, 5
+; NOVSX-NEXT: vxor 5, 5, 5
+; NOVSX-NEXT: vcmpequw 0, 2, 5
+; NOVSX-NEXT: vsel 2, 4, 2, 0
+; NOVSX-NEXT: vcmpequw 0, 3, 5
+; NOVSX-NEXT: vsel 2, 2, 3, 0
+; NOVSX-NEXT: vcmpeqfp 3, 4, 5
+; NOVSX-NEXT: vsel 2, 4, 2, 3
+; NOVSX-NEXT: blr
+;
+; VSX-LABEL: v4f32_maximum:
+; VSX: # %bb.0: # %entry
+; VSX-NEXT: xvcmpeqsp 1, 35, 35
+; VSX-NEXT: xvcmpeqsp 2, 34, 34
+; VSX-NEXT: addis 3, 2, .LCPI5_0@toc@ha
+; VSX-NEXT: addi 3, 3, .LCPI5_0@toc@l
+; VSX-NEXT: xxlnor 1, 1, 1
+; VSX-NEXT: xxlnor 2, 2, 2
+; VSX-NEXT: xvmaxsp 0, 34, 35
+; VSX-NEXT: xxlxor 36, 36, 36
+; VSX-NEXT: vcmpequw 5, 2, 4
+; VSX-NEXT: xxlor 1, 2, 1
+; VSX-NEXT: lxvd2x 2, 0, 3
+; VSX-NEXT: xxsel 0, 0, 2, 1
+; VSX-NEXT: xvcmpeqsp 2, 0, 36
+; VSX-NEXT: xxsel 1, 0, 34, 37
+; VSX-NEXT: vcmpequw 2, 3, 4
+; VSX-NEXT: xxsel 1, 1, 35, 34
+; VSX-NEXT: xxsel 34, 0, 1, 2
+; VSX-NEXT: blr
+;
+; AIX-LABEL: v4f32_maximum:
+; AIX: # %bb.0: # %entry
+; AIX-NEXT: xvcmpeqsp 1, 35, 35
+; AIX-NEXT: xvcmpeqsp 2, 34, 34
+; AIX-NEXT: ld 3, L..C5(2) # %const.0
+; AIX-NEXT: xvmaxsp 0, 34, 35
+; AIX-NEXT: xxlxor 36, 36, 36
+; AIX-NEXT: xxlnor 1, 1, 1
+; AIX-NEXT: xxlnor 2, 2, 2
+; AIX-NEXT: vcmpequw 5, 2, 4
+; AIX-NEXT: xxlor 1, 2, 1
+; AIX-NEXT: lxvw4x 2, 0, 3
+; AIX-NEXT: xxsel 0, 0, 2, 1
+; AIX-NEXT: xvcmpeqsp 2, 0, 36
+; AIX-NEXT: xxsel 1, 0, 34, 37
+; AIX-NEXT: vcmpequw 2, 3, 4
+; AIX-NEXT: xxsel 1, 1, 35, 34
+; AIX-NEXT: xxsel 34, 0, 1, 2
+; AIX-NEXT: blr
+entry:
+ %m = call <4 x float> @llvm.maximum.v4f32(<4 x float> %a, <4 x float> %b)
+ ret <4 x float> %m
+}
+
+define <2 x double> @v2f64_minimum(<2 x double> %a, <2 x double> %b) {
+; NOVSX-LABEL: v2f64_minimum:
+; NOVSX: # %bb.0: # %entry
+; NOVSX-NEXT: fcmpu 0, 1, 3
+; NOVSX-NEXT: fmr 6, 1
+; NOVSX-NEXT: stfd 4, -16(1)
+; NOVSX-NEXT: stfd 2, -8(1)
+; NOVSX-NEXT: stfd 3, -32(1)
+; NOVSX-NEXT: stfd 1, -24(1)
+; NOVSX-NEXT: bc 12, 0, .LBB6_2
+; NOVSX-NEXT: # %bb.1: # %entry
+; NOVSX-NEXT: fmr 6, 3
+; NOVSX-NEXT: .LBB6_2: # %entry
+; NOVSX-NEXT: addis 3, 2, .LCPI6_0@toc@ha
+; NOVSX-NEXT: ld 4, -24(1)
+; NOVSX-NEXT: lfs 0, .LCPI6_0@toc@l(3)
+; NOVSX-NEXT: fmr 5, 0
+; NOVSX-NEXT: bc 12, 3, .LBB6_4
+; NOVSX-NEXT: # %bb.3: # %entry
+; NOVSX-NEXT: fmr 5, 6
+; NOVSX-NEXT: .LBB6_4: # %entry
+; NOVSX-NEXT: li 3, 1
+; NOVSX-NEXT: ld 5, -32(1)
+; NOVSX-NEXT: rldic 3, 3, 63, 0
+; NOVSX-NEXT: cmpd 4, 3
+; NOVSX-NEXT: bc 12, 2, .LBB6_6
+; NOVSX-NEXT: # %bb.5: # %entry
+; NOVSX-NEXT: fmr 1, 5
+; NOVSX-NEXT: .LBB6_6: # %entry
+; NOVSX-NEXT: cmpd 5, 3
+; NOVSX-NEXT: bc 12, 2, .LBB6_8
+; NOVSX-NEXT: # %bb.7: # %entry
+; NOVSX-NEXT: fmr 3, 1
+; NOVSX-NEXT: .LBB6_8: # %entry
+; NOVSX-NEXT: addis 4, 2, .LCPI6_1@toc@ha
+; NOVSX-NEXT: lfs 1, .LCPI6_1@toc@l(4)
+; NOVSX-NEXT: fcmpu 0, 5, 1
+; NOVSX-NEXT: bc 12, 2, .LBB6_10
+; NOVSX-NEXT: # %bb.9: # %entry
+; NOVSX-NEXT: fmr 3, 5
+; NOVSX-NEXT: .LBB6_10: # %entry
+; NOVSX-NEXT: fcmpu 0, 2, 4
+; NOVSX-NEXT: fmr 5, 2
+; NOVSX-NEXT: bc 12, 0, .LBB6_12
+; NOVSX-NEXT: # %bb.11: # %entry
+; NOVSX-NEXT: fmr 5, 4
+; NOVSX-NEXT: .LBB6_12: # %entry
+; NOVSX-NEXT: ld 5, -8(1)
+; NOVSX-NEXT: bc 12, 3, .LBB6_14
+; NOVSX-NEXT: # %bb.13: # %entry
+; NOVSX-NEXT: fmr 0, 5
+; NOVSX-NEXT: .LBB6_14: # %entry
+; NOVSX-NEXT: cmpd 5, 3
+; NOVSX-NEXT: ld 4, -16(1)
+; NOVSX-NEXT: bc 4, 2, .LBB6_19
+; NOVSX-NEXT: # %bb.15: # %entry
+; NOVSX-NEXT: cmpd 4, 3
+; NOVSX-NEXT: bc 4, 2, .LBB6_20
+; NOVSX-NEXT: .LBB6_16: # %entry
+; NOVSX-NEXT: fcmpu 0, 0, 1
+; NOVSX-NEXT: bc 12, 2, .LBB6_18
+; NOVSX-NEXT: .LBB6_17: # %entry
+; NOVSX-NEXT: fmr 4, 0
+; NOVSX-NEXT: .LBB6_18: # %entry
+; NOVSX-NEXT: fmr 1, 3
+; NOVSX-NEXT: fmr 2, 4
+; NOVSX-NEXT: blr
+; NOVSX-NEXT: .LBB6_19: # %entry
+; NOVSX-NEXT: fmr 2, 0
+; NOVSX-NEXT: cmpd 4, 3
+; NOVSX-NEXT: bc 12, 2, .LBB6_16
+; NOVSX-NEXT: .LBB6_20: # %entry
+; NOVSX-NEXT: fmr 4, 2
+; NOVSX-NEXT: fcmpu 0, 0, 1
+; NOVSX-NEXT: bc 4, 2, .LBB6_17
+; NOVSX-NEXT: b .LBB6_18
+;
+; VSX-LABEL: v2f64_minimum:
+; VSX: # %bb.0: # %entry
+; VSX-NEXT: addis 3, 2, .LCPI6_0@toc@ha
+; VSX-NEXT: xvcmpeqdp 36, 35, 35
+; VSX-NEXT: xvcmpeqdp 37, 34, 34
+; VSX-NEXT: addi 3, 3, .LCPI6_0@toc@l
+; VSX-NEXT: xxlnor 36, 36, 36
+; VSX-NEXT: xxlnor 37, 37, 37
+; VSX-NEXT: xvmindp 0, 34, 35
+; VSX-NEXT: lxvd2x 2, 0, 3
+; VSX-NEXT: addis 3, 2, .LCPI6_1@toc@ha
+; VSX-NEXT: xxlor 1, 37, 36
+; VSX-NEXT: addi 3, 3, .LCPI6_1@toc@l
+; VSX-NEXT: lxvd2x 36, 0, 3
+; VSX-NEXT: vcmpequd 5, 2, 4
+; VSX-NEXT: xxsel 0, 0, 2, 1
+; VSX-NEXT: xxlxor 2, 2, 2
+; VSX-NEXT: xxsel 1, 0, 34, 37
+; VSX-NEXT: vcmpequd 2, 3, 4
+; VSX-NEXT: xxsel 1, 1, 35, 34
+; VSX-NEXT: xvcmpeqdp 34, 0, 2
+; VSX-NEXT: xxsel 34, 0, 1, 34
+; VSX-NEXT: blr
+;
+; AIX-LABEL: v2f64_minimum:
+; AIX: # %bb.0: # %entry
+; AIX-NEXT: ld 3, L..C6(2) # %const.0
+; AIX-NEXT: xvcmpeqdp 36, 35, 35
+; AIX-NEXT: xvcmpeqdp 37, 34, 34
+; AIX-NEXT: lxvd2x 2, 0, 3
+; AIX-NEXT: ld 3, L..C7(2) # %const.1
+; AIX-NEXT: xxlnor 36, 36, 36
+; AIX-NEXT: xxlnor 37, 37, 37
+; AIX-NEXT: xvmindp 0, 34, 35
+; AIX-NEXT: xxlor 1, 37, 36
+; AIX-NEXT: lxvd2x 36, 0, 3
+; AIX-NEXT: vcmpequd 5, 2, 4
+; AIX-NEXT: xxsel 0, 0, 2, 1
+; AIX-NEXT: xxlxor 2, 2, 2
+; AIX-NEXT: xxsel 1, 0, 34, 37
+; AIX-NEXT: vcmpequd 2, 3, 4
+; AIX-NEXT: xxsel 1, 1, 35, 34
+; AIX-NEXT: xvcmpeqdp 34, 0, 2
+; AIX-NEXT: xxsel 34, 0, 1, 34
+; AIX-NEXT: blr
+entry:
+ %m = call <2 x double> @llvm.minimum.v2f64(<2 x double> %a, <2 x double> %b)
+ ret <2 x double> %m
+}
+
+define <2 x double> @v2f64_maximum(<2 x double> %a, <2 x double> %b) {
+; NOVSX-LABEL: v2f64_maximum:
+; NOVSX: # %bb.0: # %entry
+; NOVSX-NEXT: fcmpu 0, 1, 3
+; NOVSX-NEXT: fmr 6, 1
+; NOVSX-NEXT: stfd 4, -16(1)
+; NOVSX-NEXT: stfd 2, -8(1)
+; NOVSX-NEXT: stfd 3, -32(1)
+; NOVSX-NEXT: stfd 1, -24(1)
+; NOVSX-NEXT: bc 12, 1, .LBB7_2
+; NOVSX-NEXT: # %bb.1: # %entry
+; NOVSX-NEXT: fmr 6, 3
+; NOVSX-NEXT: .LBB7_2: # %entry
+; NOVSX-NEXT: addis 4, 2, .LCPI7_0@toc@ha
+; NOVSX-NEXT: ld 3, -24(1)
+; NOVSX-NEXT: lfs 0, .LCPI7_0@toc@l(4)
+; NOVSX-NEXT: fmr 5, 0
+; NOVSX-NEXT: bc 12, 3, .LBB7_4
+; NOVSX-NEXT: # %bb.3: # %entry
+; NOVSX-NEXT: fmr 5, 6
+; NOVSX-NEXT: .LBB7_4: # %entry
+; NOVSX-NEXT: cmpdi 3, 0
+; NOVSX-NEXT: ld 4, -32(1)
+; NOVSX-NEXT: bc 12, 2, .LBB7_6
+; NOVSX-NEXT: # %bb.5: # %entry
+; NOVSX-NEXT: fmr 1, 5
+; NOVSX-NEXT: .LBB7_6: # %entry
+; NOVSX-NEXT: cmpdi 4, 0
+; NOVSX-NEXT: bc 12, 2, .LBB7_8
+; NOVSX-NEXT: # %bb.7: # %entry
+; NOVSX-NEXT: fmr 3, 1
+; NOVSX-NEXT: .LBB7_8: # %entry
+; NOVSX-NEXT: addis 3, 2, .LCPI7_1@toc@ha
+; NOVSX-NEXT: lfs 1, .LCPI7_1@toc@l(3)
+; NOVSX-NEXT: fcmpu 0, 5, 1
+; NOVSX-NEXT: bc 12, 2, .LBB7_10
+; NOVSX-NEXT: # %bb.9: # %entry
+; NOVSX-NEXT: fmr 3, 5
+; NOVSX-NEXT: .LBB7_10: # %entry
+; NOVSX-NEXT: fcmpu 0, 2, 4
+; NOVSX-NEXT: fmr 5, 2
+; NOVSX-NEXT: bc 12, 1, .LBB7_12
+; NOVSX-NEXT: # %bb.11: # %entry
+; NOVSX-NEXT: fmr 5, 4
+; NOVSX-NEXT: .LBB7_12: # %entry
+; NOVSX-NEXT: ld 4, -8(1)
+; NOVSX-NEXT: bc 12, 3, .LBB7_14
+; NOVSX-NEXT: # %bb.13: # %entry
+; NOVSX-NEXT: fmr 0, 5
+; NOVSX-NEXT: .LBB7_14: # %entry
+; NOVSX-NEXT: cmpdi 4, 0
+; NOVSX-NEXT: ld 3, -16(1)
+; NOVSX-NEXT: bc 4, 2, .LBB7_19
+; NOVSX-NEXT: # %bb.15: # %entry
+; NOVSX-NEXT: cmpdi 3, 0
+; NOVSX-NEXT: bc 4, 2, .LBB7_20
+; NOVSX-NEXT: .LBB7_16: # %entry
+; NOVSX-NEXT: fcmpu 0, 0, 1
+; NOVSX-NEXT: bc 12, 2, .LBB7_18
+; NOVSX-NEXT: .LBB7_17: # %entry
+; NOVSX-NEXT: fmr 4, 0
+; NOVSX-NEXT: .LBB7_18: # %entry
+; NOVSX-NEXT: fmr 1, 3
+; NOVSX-NEXT: fmr 2, 4
+; NOVSX-NEXT: blr
+; NOVSX-NEXT: .LBB7_19: # %entry
+; NOVSX-NEXT: fmr 2, 0
+; NOVSX-NEXT: cmpdi 3, 0
+; NOVSX-NEXT: bc 12, 2, .LBB7_16
+; NOVSX-NEXT: .LBB7_20: # %entry
+; NOVSX-NEXT: fmr 4, 2
+; NOVSX-NEXT: fcmpu 0, 0, 1
+; NOVSX-NEXT: bc 4, 2, .LBB7_17
+; NOVSX-NEXT: b .LBB7_18
+;
+; VSX-LABEL: v2f64_maximum:
+; VSX: # %bb.0: # %entry
+; VSX-NEXT: addis 3, 2, .LCPI7_0@toc@ha
+; VSX-NEXT: xvcmpeqdp 36, 35, 35
+; VSX-NEXT: xvcmpeqdp 37, 34, 34
+; VSX-NEXT: addi 3, 3, .LCPI7_0@toc@l
+; VSX-NEXT: xxlnor 36, 36, 36
+; VSX-NEXT: xxlnor 37, 37, 37
+; VSX-NEXT: xvmaxdp 0, 34, 35
+; VSX-NEXT: lxvd2x 2, 0, 3
+; VSX-NEXT: xxlor 1, 37, 36
+; VSX-NEXT: xxlxor 36, 36, 36
+; VSX-NEXT: vcmpequd 5, 2, 4
+; VSX-NEXT: xxsel 0, 0, 2, 1
+; VSX-NEXT: xxsel 1, 0, 34, 37
+; VSX-NEXT: vcmpequd 2, 3, 4
+; VSX-NEXT: xxsel 1, 1, 35, 34
+; VSX-NEXT: xvcmpeqdp 34, 0, 36
+; VSX-NEXT: xxsel 34, 0, 1, 34
+; VSX-NEXT: blr
+;
+; AIX-LABEL: v2f64_maximum:
+; AIX: # %bb.0: # %entry
+; AIX-NEXT: ld 3, L..C8(2) # %const.0
+; AIX-NEXT: xvcmpeqdp 36, 35, 35
+; AIX-NEXT: xvcmpeqdp 37, 34, 34
+; AIX-NEXT: lxvd2x 2, 0, 3
+; AIX-NEXT: xxlnor 36, 36, 36
+; AIX-NEXT: xxlnor 37, 37, 37
+; AIX-NEXT: xvmaxdp 0, 34, 35
+; AIX-NEXT: xxlor 1, 37, 36
+; AIX-NEXT: xxlxor 36, 36, 36
+; AIX-NEXT: vcmpequd 5, 2, 4
+; AIX-NEXT: xxsel 0, 0, 2, 1
+; AIX-NEXT: xxsel 1, 0, 34, 37
+; AIX-NEXT: vcmpequd 2, 3, 4
+; AIX-NEXT: xxsel 1, 1, 35, 34
+; AIX-NEXT: xvcmpeqdp 34, 0, 36
+; AIX-NEXT: xxsel 34, 0, 1, 34
+; AIX-NEXT: blr
+entry:
+ %m = call <2 x double> @llvm.maximum.v2f64(<2 x double> %a, <2 x double> %b)
+ ret <2 x double> %m
+}
+
+declare float @llvm.maximum.f32(float, float)
+declare double @llvm.maximum.f64(double, double)
+declare <4 x float> @llvm.maximum.v4f32(<4 x float>, <4 x float>)
+declare <2 x double> @llvm.maximum.v2f64(<2 x double>, <2 x double>)
+
+declare float @llvm.minimum.f32(float, float)
+declare double @llvm.minimum.f64(double, double)
+declare <4 x float> @llvm.minimum.v4f32(<4 x float>, <4 x float>)
+declare <2 x double> @llvm.minimum.v2f64(<2 x double>, <2 x double>)
diff --git a/llvm/test/CodeGen/PowerPC/peephole-counter-XToI.mir b/llvm/test/CodeGen/PowerPC/peephole-counter-XToI.mir
new file mode 100644
index 000000000000..d8f2b08adaf2
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/peephole-counter-XToI.mir
@@ -0,0 +1,87 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
+# REQUIRES: asserts
+# RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -verify-machineinstrs \
+# RUN: -run-pass ppc-mi-peepholes %s -o - | FileCheck %s --check-prefix=ALL
+# RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -verify-machineinstrs \
+# RUN: -run-pass ppc-mi-peepholes %s -o - -debug-counter=ppc-xtoi-peephole-skip=0,ppc-xtoi-peephole-count=8 \
+# RUN: | FileCheck %s --check-prefix=ALL
+# RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -verify-machineinstrs \
+# RUN: -run-pass ppc-mi-peepholes %s -o - -debug-counter=ppc-xtoi-peephole-skip=3,ppc-xtoi-peephole-count=2 \
+# RUN: | FileCheck %s --check-prefix=ONE-FIRSTSTORE
+# RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -verify-machineinstrs \
+# RUN: -run-pass ppc-mi-peepholes %s -o - -debug-counter=ppc-xtoi-peephole-skip=5,ppc-xtoi-peephole-count=2 \
+# RUN: | FileCheck %s --check-prefix=ONE-SECONDSTORE
+# RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -verify-machineinstrs \
+# RUN: -run-pass ppc-mi-peepholes %s -o - -debug-counter=ppc-xtoi-peephole-skip=3,ppc-xtoi-peephole-count=4 \
+# RUN: | FileCheck %s --check-prefix=TWO
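+# Check that the ppc-xtoi-peephole debug counter selectively enables folding
+# each ADDI8 into the displacement of the dependent STD.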
+
+---
+name: foldDForm
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x3
+
+ ; ALL-LABEL: name: foldDForm
+ ; ALL: liveins: $x3
+ ; ALL-NEXT: {{ $}}
+ ; ALL-NEXT: [[COPY:%[0-9]+]]:g8rc_and_g8rc_nox0 = COPY killed $x3
+ ; ALL-NEXT: dead [[ADDI8_:%[0-9]+]]:g8rc_and_g8rc_nox0 = ADDI8 [[COPY]], 144
+ ; ALL-NEXT: [[LI8_:%[0-9]+]]:g8rc = LI8 0
+ ; ALL-NEXT: STD [[LI8_]], 160, [[COPY]]
+ ; ALL-NEXT: dead [[ADDI8_1:%[0-9]+]]:g8rc_and_g8rc_nox0 = ADDI8 [[COPY]], 160
+ ; ALL-NEXT: STD [[LI8_]], 176, [[COPY]]
+ ; ALL-NEXT: dead [[ADDI8_2:%[0-9]+]]:g8rc_and_g8rc_nox0 = ADDI8 [[COPY]], 176
+ ; ALL-NEXT: STD killed [[LI8_]], 192, killed [[COPY]]
+ ; ALL-NEXT: BLR8 implicit $lr8, implicit $rm
+ ;
+ ; ONE-FIRSTSTORE-LABEL: name: foldDForm
+ ; ONE-FIRSTSTORE: liveins: $x3
+ ; ONE-FIRSTSTORE-NEXT: {{ $}}
+ ; ONE-FIRSTSTORE-NEXT: [[COPY:%[0-9]+]]:g8rc_and_g8rc_nox0 = COPY killed $x3
+ ; ONE-FIRSTSTORE-NEXT: dead [[ADDI8_:%[0-9]+]]:g8rc_and_g8rc_nox0 = ADDI8 [[COPY]], 144
+ ; ONE-FIRSTSTORE-NEXT: [[LI8_:%[0-9]+]]:g8rc = LI8 0
+ ; ONE-FIRSTSTORE-NEXT: STD [[LI8_]], 160, [[COPY]]
+ ; ONE-FIRSTSTORE-NEXT: [[ADDI8_1:%[0-9]+]]:g8rc_and_g8rc_nox0 = ADDI8 [[COPY]], 160
+ ; ONE-FIRSTSTORE-NEXT: STD [[LI8_]], 16, killed [[ADDI8_1]]
+ ; ONE-FIRSTSTORE-NEXT: [[ADDI8_2:%[0-9]+]]:g8rc_and_g8rc_nox0 = ADDI8 killed [[COPY]], 176
+ ; ONE-FIRSTSTORE-NEXT: STD killed [[LI8_]], 16, killed [[ADDI8_2]]
+ ; ONE-FIRSTSTORE-NEXT: BLR8 implicit $lr8, implicit $rm
+ ;
+ ; ONE-SECONDSTORE-LABEL: name: foldDForm
+ ; ONE-SECONDSTORE: liveins: $x3
+ ; ONE-SECONDSTORE-NEXT: {{ $}}
+ ; ONE-SECONDSTORE-NEXT: [[COPY:%[0-9]+]]:g8rc_and_g8rc_nox0 = COPY killed $x3
+ ; ONE-SECONDSTORE-NEXT: [[ADDI8_:%[0-9]+]]:g8rc_and_g8rc_nox0 = ADDI8 [[COPY]], 144
+ ; ONE-SECONDSTORE-NEXT: [[LI8_:%[0-9]+]]:g8rc = LI8 0
+ ; ONE-SECONDSTORE-NEXT: STD [[LI8_]], 16, killed [[ADDI8_]]
+ ; ONE-SECONDSTORE-NEXT: dead [[ADDI8_1:%[0-9]+]]:g8rc_and_g8rc_nox0 = ADDI8 [[COPY]], 160
+ ; ONE-SECONDSTORE-NEXT: STD [[LI8_]], 176, [[COPY]]
+ ; ONE-SECONDSTORE-NEXT: [[ADDI8_2:%[0-9]+]]:g8rc_and_g8rc_nox0 = ADDI8 killed [[COPY]], 176
+ ; ONE-SECONDSTORE-NEXT: STD killed [[LI8_]], 16, killed [[ADDI8_2]]
+ ; ONE-SECONDSTORE-NEXT: BLR8 implicit $lr8, implicit $rm
+ ;
+ ; TWO-LABEL: name: foldDForm
+ ; TWO: liveins: $x3
+ ; TWO-NEXT: {{ $}}
+ ; TWO-NEXT: [[COPY:%[0-9]+]]:g8rc_and_g8rc_nox0 = COPY killed $x3
+ ; TWO-NEXT: dead [[ADDI8_:%[0-9]+]]:g8rc_and_g8rc_nox0 = ADDI8 [[COPY]], 144
+ ; TWO-NEXT: [[LI8_:%[0-9]+]]:g8rc = LI8 0
+ ; TWO-NEXT: STD [[LI8_]], 160, [[COPY]]
+ ; TWO-NEXT: dead [[ADDI8_1:%[0-9]+]]:g8rc_and_g8rc_nox0 = ADDI8 [[COPY]], 160
+ ; TWO-NEXT: STD [[LI8_]], 176, [[COPY]]
+ ; TWO-NEXT: [[ADDI8_2:%[0-9]+]]:g8rc_and_g8rc_nox0 = ADDI8 killed [[COPY]], 176
+ ; TWO-NEXT: STD killed [[LI8_]], 16, killed [[ADDI8_2]]
+ ; TWO-NEXT: BLR8 implicit $lr8, implicit $rm
+ %0:g8rc_and_g8rc_nox0 = COPY $x3
+ %1:g8rc_and_g8rc_nox0 = ADDI8 %0:g8rc_and_g8rc_nox0, 144
+ %2:g8rc = LI8 0
+ STD %2:g8rc, 16, %1:g8rc_and_g8rc_nox0
+ %3:g8rc_and_g8rc_nox0 = ADDI8 %0:g8rc_and_g8rc_nox0, 160
+ STD %2:g8rc, 16, %3:g8rc_and_g8rc_nox0
+ %4:g8rc_and_g8rc_nox0 = ADDI8 %0:g8rc_and_g8rc_nox0, 176
+ STD killed %2:g8rc, 16, %4:g8rc_and_g8rc_nox0
+ BLR8 implicit $lr8, implicit $rm
+...
diff --git a/llvm/test/CodeGen/PowerPC/peephole-counter-perOp.mir b/llvm/test/CodeGen/PowerPC/peephole-counter-perOp.mir
new file mode 100644
index 000000000000..cf3ff291e26c
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/peephole-counter-perOp.mir
@@ -0,0 +1,76 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
+# REQUIRES: asserts
+# RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -verify-machineinstrs \
+# RUN: -run-pass ppc-mi-peepholes %s -o - | FileCheck %s --check-prefix=ALL
+# RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -verify-machineinstrs \
+# RUN: -run-pass ppc-mi-peepholes %s -o - -debug-counter=ppc-per-op-peephole-skip=0,ppc-per-op-peephole-count=6 \
+# RUN: | FileCheck %s --check-prefix=ALL
+# RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -verify-machineinstrs \
+# RUN: -run-pass ppc-mi-peepholes %s -o - -debug-counter=ppc-per-op-peephole-skip=3,ppc-per-op-peephole-count=1 \
+# RUN: | FileCheck %s --check-prefix=ONE-FIRST-RLWINM
+# RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -verify-machineinstrs \
+# RUN: -run-pass ppc-mi-peepholes %s -o - -debug-counter=ppc-per-op-peephole-skip=4,ppc-per-op-peephole-count=1 \
+# RUN: | FileCheck %s --check-prefix=ONE-SECOND-RLWINM
+# RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -verify-machineinstrs \
+# RUN: -run-pass ppc-mi-peepholes %s -o - -debug-counter=ppc-per-op-peephole-skip=3,ppc-per-op-peephole-count=2 \
+# RUN: | FileCheck %s --check-prefix=TWO
+
+---
+name: testFoldRLWINM
+#CHECK: name: testFoldRLWINM
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $x3
+ ; ALL-LABEL: name: testFoldRLWINM
+ ; ALL: liveins: $x3
+ ; ALL-NEXT: {{ $}}
+ ; ALL-NEXT: [[COPY:%[0-9]+]]:g8rc = COPY killed $x3
+ ; ALL-NEXT: [[COPY1:%[0-9]+]]:gprc = COPY killed [[COPY]].sub_32
+ ; ALL-NEXT: dead [[RLWINM:%[0-9]+]]:gprc = RLWINM [[COPY1]], 14, 0, 12
+ ; ALL-NEXT: dead [[RLWINM1:%[0-9]+]]:gprc = RLWINM [[COPY1]], 14, 0, 11
+ ; ALL-NEXT: dead [[RLWINM2:%[0-9]+]]:gprc = RLWINM killed [[COPY1]], 14, 0, 10
+ ; ALL-NEXT: BLR8 implicit $lr8, implicit $rm
+ ;
+ ; ONE-FIRST-RLWINM-LABEL: name: testFoldRLWINM
+ ; ONE-FIRST-RLWINM: liveins: $x3
+ ; ONE-FIRST-RLWINM-NEXT: {{ $}}
+ ; ONE-FIRST-RLWINM-NEXT: [[COPY:%[0-9]+]]:g8rc = COPY killed $x3
+ ; ONE-FIRST-RLWINM-NEXT: [[COPY1:%[0-9]+]]:gprc = COPY killed [[COPY]].sub_32
+ ; ONE-FIRST-RLWINM-NEXT: [[RLWINM:%[0-9]+]]:gprc = RLWINM [[COPY1]], 27, 5, 31
+ ; ONE-FIRST-RLWINM-NEXT: dead [[RLWINM1:%[0-9]+]]:gprc = RLWINM killed [[COPY1]], 14, 0, 12
+ ; ONE-FIRST-RLWINM-NEXT: dead [[RLWINM2:%[0-9]+]]:gprc = RLWINM [[RLWINM]], 19, 0, 11
+ ; ONE-FIRST-RLWINM-NEXT: dead [[RLWINM3:%[0-9]+]]:gprc = RLWINM killed [[RLWINM]], 19, 0, 10
+ ; ONE-FIRST-RLWINM-NEXT: BLR8 implicit $lr8, implicit $rm
+ ;
+ ; ONE-SECOND-RLWINM-LABEL: name: testFoldRLWINM
+ ; ONE-SECOND-RLWINM: liveins: $x3
+ ; ONE-SECOND-RLWINM-NEXT: {{ $}}
+ ; ONE-SECOND-RLWINM-NEXT: [[COPY:%[0-9]+]]:g8rc = COPY killed $x3
+ ; ONE-SECOND-RLWINM-NEXT: [[COPY1:%[0-9]+]]:gprc = COPY killed [[COPY]].sub_32
+ ; ONE-SECOND-RLWINM-NEXT: [[RLWINM:%[0-9]+]]:gprc = RLWINM [[COPY1]], 27, 5, 31
+ ; ONE-SECOND-RLWINM-NEXT: dead [[RLWINM1:%[0-9]+]]:gprc = RLWINM [[RLWINM]], 19, 0, 12
+ ; ONE-SECOND-RLWINM-NEXT: dead [[RLWINM2:%[0-9]+]]:gprc = RLWINM killed [[COPY1]], 14, 0, 11
+ ; ONE-SECOND-RLWINM-NEXT: dead [[RLWINM3:%[0-9]+]]:gprc = RLWINM killed [[RLWINM]], 19, 0, 10
+ ; ONE-SECOND-RLWINM-NEXT: BLR8 implicit $lr8, implicit $rm
+ ;
+ ; TWO-LABEL: name: testFoldRLWINM
+ ; TWO: liveins: $x3
+ ; TWO-NEXT: {{ $}}
+ ; TWO-NEXT: [[COPY:%[0-9]+]]:g8rc = COPY killed $x3
+ ; TWO-NEXT: [[COPY1:%[0-9]+]]:gprc = COPY killed [[COPY]].sub_32
+ ; TWO-NEXT: [[RLWINM:%[0-9]+]]:gprc = RLWINM [[COPY1]], 27, 5, 31
+ ; TWO-NEXT: dead [[RLWINM1:%[0-9]+]]:gprc = RLWINM [[COPY1]], 14, 0, 12
+ ; TWO-NEXT: dead [[RLWINM2:%[0-9]+]]:gprc = RLWINM killed [[COPY1]], 14, 0, 11
+ ; TWO-NEXT: dead [[RLWINM3:%[0-9]+]]:gprc = RLWINM killed [[RLWINM]], 19, 0, 10
+ ; TWO-NEXT: BLR8 implicit $lr8, implicit $rm
+ %0:g8rc = COPY $x3
+ %1:gprc = COPY %0.sub_32:g8rc
+ %2:gprc = RLWINM %1:gprc, 27, 5, 31
+ %3:gprc = RLWINM %2:gprc, 19, 0, 12
+ %4:gprc = RLWINM %2:gprc, 19, 0, 11
+ %5:gprc = RLWINM %2:gprc, 19, 0, 10
+ BLR8 implicit $lr8, implicit $rm
+...
diff --git a/llvm/test/CodeGen/RISCV/alu64.ll b/llvm/test/CodeGen/RISCV/alu64.ll
index d2ee80e6aa95..f032756e007b 100644
--- a/llvm/test/CodeGen/RISCV/alu64.ll
+++ b/llvm/test/CodeGen/RISCV/alu64.ll
@@ -57,8 +57,8 @@ define i64 @sltiu(i64 %a) nounwind {
;
; RV32I-LABEL: sltiu:
; RV32I: # %bb.0:
-; RV32I-NEXT: seqz a1, a1
; RV32I-NEXT: sltiu a0, a0, 3
+; RV32I-NEXT: seqz a1, a1
; RV32I-NEXT: and a0, a1, a0
; RV32I-NEXT: li a1, 0
; RV32I-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll b/llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll
index f96e1bad2e38..a5a2ae79966c 100644
--- a/llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll
+++ b/llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll
@@ -372,10 +372,10 @@ define i32 @atomicrmw_uinc_wrap_i32(ptr %ptr, i32 %val) {
; RV32IA-NEXT: # =>This Loop Header: Depth=1
; RV32IA-NEXT: # Child Loop BB2_3 Depth 2
; RV32IA-NEXT: mv a3, a2
-; RV32IA-NEXT: addi a4, a2, 1
-; RV32IA-NEXT: sltu a2, a2, a1
-; RV32IA-NEXT: neg a2, a2
-; RV32IA-NEXT: and a4, a2, a4
+; RV32IA-NEXT: addi a2, a2, 1
+; RV32IA-NEXT: sltu a4, a3, a1
+; RV32IA-NEXT: neg a4, a4
+; RV32IA-NEXT: and a4, a4, a2
; RV32IA-NEXT: .LBB2_3: # %atomicrmw.start
; RV32IA-NEXT: # Parent Loop BB2_1 Depth=1
; RV32IA-NEXT: # => This Inner Loop Header: Depth=2
@@ -607,10 +607,10 @@ define i64 @atomicrmw_uinc_wrap_i64(ptr %ptr, i64 %val) {
; RV64IA-NEXT: # =>This Loop Header: Depth=1
; RV64IA-NEXT: # Child Loop BB3_3 Depth 2
; RV64IA-NEXT: mv a3, a2
-; RV64IA-NEXT: addi a4, a2, 1
-; RV64IA-NEXT: sltu a2, a2, a1
-; RV64IA-NEXT: neg a2, a2
-; RV64IA-NEXT: and a4, a2, a4
+; RV64IA-NEXT: addi a2, a2, 1
+; RV64IA-NEXT: sltu a4, a3, a1
+; RV64IA-NEXT: neg a4, a4
+; RV64IA-NEXT: and a4, a4, a2
; RV64IA-NEXT: .LBB3_3: # %atomicrmw.start
; RV64IA-NEXT: # Parent Loop BB3_1 Depth=1
; RV64IA-NEXT: # => This Inner Loop Header: Depth=2
diff --git a/llvm/test/CodeGen/RISCV/attributes.ll b/llvm/test/CodeGen/RISCV/attributes.ll
index 141d5ea41828..7bd3440c9dc0 100644
--- a/llvm/test/CodeGen/RISCV/attributes.ll
+++ b/llvm/test/CodeGen/RISCV/attributes.ll
@@ -256,6 +256,19 @@
; RUN: llc -mtriple=riscv64 -mattr=+experimental-supm %s -o - | FileCheck --check-prefix=RV64SUPM %s
; RUN: llc -mtriple=riscv64 -mattr=+experimental-ssqosid %s -o - | FileCheck --check-prefix=RV64SSQOSID %s

+; Tests for profile features.
+; RUN: llc -mtriple=riscv32 -mattr=+rvi20u32 %s -o - | FileCheck --check-prefix=RVI20U32 %s
+; RUN: llc -mtriple=riscv64 -mattr=+rvi20u64 %s -o - | FileCheck --check-prefix=RVI20U64 %s
+; RUN: llc -mtriple=riscv64 -mattr=+rva20u64 %s -o - | FileCheck --check-prefix=RVA20U64 %s
+; RUN: llc -mtriple=riscv64 -mattr=+rva20s64 %s -o - | FileCheck --check-prefix=RVA20S64 %s
+; RUN: llc -mtriple=riscv64 -mattr=+rva22u64 %s -o - | FileCheck --check-prefix=RVA22U64 %s
+; RUN: llc -mtriple=riscv64 -mattr=+rva22s64 %s -o - | FileCheck --check-prefix=RVA22S64 %s
+; RUN: llc -mtriple=riscv64 -mattr=+rva23u64 %s -o - | FileCheck --check-prefix=RVA23U64 %s
+; RUN: llc -mtriple=riscv64 -mattr=+rva23s64 %s -o - | FileCheck --check-prefix=RVA23S64 %s
+; RUN: llc -mtriple=riscv64 -mattr=+rvb23u64 %s -o - | FileCheck --check-prefix=RVB23U64 %s
+; RUN: llc -mtriple=riscv64 -mattr=+rvb23s64 %s -o - | FileCheck --check-prefix=RVB23S64 %s
+; RUN: llc -mtriple=riscv32 -mattr=+rvm23u32 %s -o - | FileCheck --check-prefix=RVM23U32 %s
+
; CHECK: .attribute 4, 16

; RV32M: .attribute 5, "rv32i2p1_m2p0"
@@ -512,6 +525,18 @@
; RV64SUPM: .attribute 5, "rv64i2p1_supm0p8"
; RV64SSQOSID: .attribute 5, "rv64i2p1_ssqosid1p0"

+; RVI20U32: .attribute 5, "rv32i2p1"
+; RVI20U64: .attribute 5, "rv64i2p1"
+; RVA20U64: .attribute 5, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_ziccamoa1p0_ziccif1p0_zicclsm1p0_ziccrse1p0_zicntr2p0_zicsr2p0_za128rs1p0"
+; RVA20S64: .attribute 5, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_ziccamoa1p0_ziccif1p0_zicclsm1p0_ziccrse1p0_zicntr2p0_zicsr2p0_zifencei2p0_za128rs1p0_ssccptr1p0_sstvala1p0_sstvecd1p0_svade1p0_svbare1p0"
+; RVA22U64: .attribute 5, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zic64b1p0_zicbom1p0_zicbop1p0_zicboz1p0_ziccamoa1p0_ziccif1p0_zicclsm1p0_ziccrse1p0_zicntr2p0_zicsr2p0_zihintpause2p0_zihpm2p0_za64rs1p0_zfhmin1p0_zba1p0_zbb1p0_zbs1p0_zkt1p0"
+; RVA22S64: .attribute 5, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zic64b1p0_zicbom1p0_zicbop1p0_zicboz1p0_ziccamoa1p0_ziccif1p0_zicclsm1p0_ziccrse1p0_zicntr2p0_zicsr2p0_zifencei2p0_zihintpause2p0_zihpm2p0_za64rs1p0_zfhmin1p0_zba1p0_zbb1p0_zbs1p0_zkt1p0_ssccptr1p0_sscounterenw1p0_sstvala1p0_sstvecd1p0_svade1p0_svbare1p0_svinval1p0_svpbmt1p0"
+; RVA23U64: .attribute 5, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_v1p0_zic64b1p0_zicbom1p0_zicbop1p0_zicboz1p0_ziccamoa1p0_ziccif1p0_zicclsm1p0_ziccrse1p0_zicntr2p0_zicond1p0_zicsr2p0_zihintntl1p0_zihintpause2p0_zihpm2p0_zimop1p0_za64rs1p0_zawrs1p0_zfa1p0_zfhmin1p0_zca1p0_zcb1p0_zcmop1p0_zba1p0_zbb1p0_zbs1p0_zkt1p0_zvbb1p0_zve32f1p0_zve32x1p0_zve64d1p0_zve64f1p0_zve64x1p0_zvfhmin1p0_zvkb1p0_zvkt1p0_zvl128b1p0_zvl32b1p0_zvl64b1p0"
+; RVA23S64: .attribute 5, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_v1p0_h1p0_zic64b1p0_zicbom1p0_zicbop1p0_zicboz1p0_ziccamoa1p0_ziccif1p0_zicclsm1p0_ziccrse1p0_zicntr2p0_zicond1p0_zicsr2p0_zifencei2p0_zihintntl1p0_zihintpause2p0_zihpm2p0_zimop1p0_za64rs1p0_zawrs1p0_zfa1p0_zfhmin1p0_zca1p0_zcb1p0_zcmop1p0_zba1p0_zbb1p0_zbs1p0_zkt1p0_zvbb1p0_zve32f1p0_zve32x1p0_zve64d1p0_zve64f1p0_zve64x1p0_zvfhmin1p0_zvkb1p0_zvkt1p0_zvl128b1p0_zvl32b1p0_zvl64b1p0_shcounterenw1p0_shgatpa1p0_shtvala1p0_shvsatpa1p0_shvstvala1p0_shvstvecd1p0_ssccptr1p0_sscofpmf1p0_sscounterenw1p0_ssnpm0p8_ssstateen1p0_sstc1p0_sstvala1p0_sstvecd1p0_ssu64xl1p0_svade1p0_svbare1p0_svinval1p0_svnapot1p0_svpbmt1p0"
+; RVB23U64: .attribute 5, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zic64b1p0_zicbom1p0_zicbop1p0_zicboz1p0_ziccamoa1p0_ziccif1p0_zicclsm1p0_ziccrse1p0_zicntr2p0_zicond1p0_zicsr2p0_zihintntl1p0_zihintpause2p0_zihpm2p0_zimop1p0_za64rs1p0_zawrs1p0_zfa1p0_zca1p0_zcb1p0_zcmop1p0_zba1p0_zbb1p0_zbs1p0_zkt1p0"
+; RVB23S64: .attribute 5, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zic64b1p0_zicbom1p0_zicbop1p0_zicboz1p0_ziccamoa1p0_ziccif1p0_zicclsm1p0_ziccrse1p0_zicntr2p0_zicond1p0_zicsr2p0_zifencei2p0_zihintntl1p0_zihintpause2p0_zihpm2p0_zimop1p0_za64rs1p0_zawrs1p0_zfa1p0_zca1p0_zcb1p0_zcmop1p0_zba1p0_zbb1p0_zbs1p0_zkt1p0_ssccptr1p0_sscofpmf1p0_sscounterenw1p0_sstc1p0_sstvala1p0_sstvecd1p0_ssu64xl1p0_svade1p0_svbare1p0_svinval1p0_svnapot1p0_svpbmt1p0"
+; RVM23U32: .attribute 5, "rv32i2p1_m2p0_zicbop1p0_zicond1p0_zicsr2p0_zihintntl1p0_zihintpause2p0_zimop1p0_zca1p0_zcb1p0_zce1p0_zcmop1p0_zcmp1p0_zcmt1p0_zba1p0_zbb1p0_zbs1p0"
+
define i32 @addi(i32 %a) {
%1 = add i32 %a, 1
ret i32 %1
diff --git a/llvm/test/CodeGen/RISCV/bfloat-convert.ll b/llvm/test/CodeGen/RISCV/bfloat-convert.ll
index 9e2b0b5c3cbb..770dcccee882 100644
--- a/llvm/test/CodeGen/RISCV/bfloat-convert.ll
+++ b/llvm/test/CodeGen/RISCV/bfloat-convert.ll
@@ -456,92 +456,80 @@ define i64 @fcvt_l_bf16(bfloat %a) nounwind {
define i64 @fcvt_l_bf16_sat(bfloat %a) nounwind {
; RV32IZFBFMIN-LABEL: fcvt_l_bf16_sat:
; RV32IZFBFMIN: # %bb.0: # %start
-; RV32IZFBFMIN-NEXT: addi sp, sp, -32
-; RV32IZFBFMIN-NEXT: sw ra, 28(sp) # 4-byte Folded Spill
-; RV32IZFBFMIN-NEXT: sw s0, 24(sp) # 4-byte Folded Spill
-; RV32IZFBFMIN-NEXT: sw s1, 20(sp) # 4-byte Folded Spill
-; RV32IZFBFMIN-NEXT: sw s2, 16(sp) # 4-byte Folded Spill
-; RV32IZFBFMIN-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
-; RV32IZFBFMIN-NEXT: fsw fs0, 8(sp) # 4-byte Folded Spill
-; RV32IZFBFMIN-NEXT: lui a0, %hi(.LCPI10_0)
-; RV32IZFBFMIN-NEXT: flw fa5, %lo(.LCPI10_0)(a0)
+; RV32IZFBFMIN-NEXT: addi sp, sp, -16
+; RV32IZFBFMIN-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
+; RV32IZFBFMIN-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
+; RV32IZFBFMIN-NEXT: fsw fs0, 4(sp) # 4-byte Folded Spill
; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fs0, fa0
-; RV32IZFBFMIN-NEXT: flt.s s0, fa5, fs0
-; RV32IZFBFMIN-NEXT: neg s1, s0
; RV32IZFBFMIN-NEXT: lui a0, 913408
; RV32IZFBFMIN-NEXT: fmv.w.x fa5, a0
-; RV32IZFBFMIN-NEXT: fle.s s2, fa5, fs0
-; RV32IZFBFMIN-NEXT: neg s3, s2
+; RV32IZFBFMIN-NEXT: fle.s s0, fa5, fs0
; RV32IZFBFMIN-NEXT: fmv.s fa0, fs0
; RV32IZFBFMIN-NEXT: call __fixsfdi
-; RV32IZFBFMIN-NEXT: and a0, s3, a0
-; RV32IZFBFMIN-NEXT: or a0, s1, a0
-; RV32IZFBFMIN-NEXT: feq.s a2, fs0, fs0
-; RV32IZFBFMIN-NEXT: neg a2, a2
; RV32IZFBFMIN-NEXT: lui a4, 524288
-; RV32IZFBFMIN-NEXT: lui a3, 524288
-; RV32IZFBFMIN-NEXT: beqz s2, .LBB10_2
+; RV32IZFBFMIN-NEXT: lui a2, 524288
+; RV32IZFBFMIN-NEXT: beqz s0, .LBB10_2
; RV32IZFBFMIN-NEXT: # %bb.1: # %start
-; RV32IZFBFMIN-NEXT: mv a3, a1
+; RV32IZFBFMIN-NEXT: mv a2, a1
; RV32IZFBFMIN-NEXT: .LBB10_2: # %start
-; RV32IZFBFMIN-NEXT: and a0, a2, a0
-; RV32IZFBFMIN-NEXT: beqz s0, .LBB10_4
+; RV32IZFBFMIN-NEXT: lui a1, %hi(.LCPI10_0)
+; RV32IZFBFMIN-NEXT: flw fa5, %lo(.LCPI10_0)(a1)
+; RV32IZFBFMIN-NEXT: flt.s a3, fa5, fs0
+; RV32IZFBFMIN-NEXT: beqz a3, .LBB10_4
; RV32IZFBFMIN-NEXT: # %bb.3:
-; RV32IZFBFMIN-NEXT: addi a3, a4, -1
+; RV32IZFBFMIN-NEXT: addi a2, a4, -1
; RV32IZFBFMIN-NEXT: .LBB10_4: # %start
-; RV32IZFBFMIN-NEXT: and a1, a2, a3
-; RV32IZFBFMIN-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
-; RV32IZFBFMIN-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
-; RV32IZFBFMIN-NEXT: lw s1, 20(sp) # 4-byte Folded Reload
-; RV32IZFBFMIN-NEXT: lw s2, 16(sp) # 4-byte Folded Reload
-; RV32IZFBFMIN-NEXT: lw s3, 12(sp) # 4-byte Folded Reload
-; RV32IZFBFMIN-NEXT: flw fs0, 8(sp) # 4-byte Folded Reload
-; RV32IZFBFMIN-NEXT: addi sp, sp, 32
+; RV32IZFBFMIN-NEXT: feq.s a1, fs0, fs0
+; RV32IZFBFMIN-NEXT: neg a4, a1
+; RV32IZFBFMIN-NEXT: and a1, a4, a2
+; RV32IZFBFMIN-NEXT: neg a2, a3
+; RV32IZFBFMIN-NEXT: neg a3, s0
+; RV32IZFBFMIN-NEXT: and a0, a3, a0
+; RV32IZFBFMIN-NEXT: or a0, a2, a0
+; RV32IZFBFMIN-NEXT: and a0, a4, a0
+; RV32IZFBFMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
+; RV32IZFBFMIN-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
+; RV32IZFBFMIN-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload
+; RV32IZFBFMIN-NEXT: addi sp, sp, 16
; RV32IZFBFMIN-NEXT: ret
;
; R32IDZFBFMIN-LABEL: fcvt_l_bf16_sat:
; R32IDZFBFMIN: # %bb.0: # %start
-; R32IDZFBFMIN-NEXT: addi sp, sp, -32
-; R32IDZFBFMIN-NEXT: sw ra, 28(sp) # 4-byte Folded Spill
-; R32IDZFBFMIN-NEXT: sw s0, 24(sp) # 4-byte Folded Spill
-; R32IDZFBFMIN-NEXT: sw s1, 20(sp) # 4-byte Folded Spill
-; R32IDZFBFMIN-NEXT: sw s2, 16(sp) # 4-byte Folded Spill
-; R32IDZFBFMIN-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
+; R32IDZFBFMIN-NEXT: addi sp, sp, -16
+; R32IDZFBFMIN-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
+; R32IDZFBFMIN-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
; R32IDZFBFMIN-NEXT: fsd fs0, 0(sp) # 8-byte Folded Spill
-; R32IDZFBFMIN-NEXT: lui a0, %hi(.LCPI10_0)
-; R32IDZFBFMIN-NEXT: flw fa5, %lo(.LCPI10_0)(a0)
; R32IDZFBFMIN-NEXT: fcvt.s.bf16 fs0, fa0
-; R32IDZFBFMIN-NEXT: flt.s s0, fa5, fs0
-; R32IDZFBFMIN-NEXT: neg s1, s0
; R32IDZFBFMIN-NEXT: lui a0, 913408
; R32IDZFBFMIN-NEXT: fmv.w.x fa5, a0
-; R32IDZFBFMIN-NEXT: fle.s s2, fa5, fs0
-; R32IDZFBFMIN-NEXT: neg s3, s2
+; R32IDZFBFMIN-NEXT: fle.s s0, fa5, fs0
; R32IDZFBFMIN-NEXT: fmv.s fa0, fs0
; R32IDZFBFMIN-NEXT: call __fixsfdi
-; R32IDZFBFMIN-NEXT: and a0, s3, a0
-; R32IDZFBFMIN-NEXT: or a0, s1, a0
-; R32IDZFBFMIN-NEXT: feq.s a2, fs0, fs0
-; R32IDZFBFMIN-NEXT: neg a2, a2
; R32IDZFBFMIN-NEXT: lui a4, 524288
-; R32IDZFBFMIN-NEXT: lui a3, 524288
-; R32IDZFBFMIN-NEXT: beqz s2, .LBB10_2
+; R32IDZFBFMIN-NEXT: lui a2, 524288
+; R32IDZFBFMIN-NEXT: beqz s0, .LBB10_2
; R32IDZFBFMIN-NEXT: # %bb.1: # %start
-; R32IDZFBFMIN-NEXT: mv a3, a1
+; R32IDZFBFMIN-NEXT: mv a2, a1
; R32IDZFBFMIN-NEXT: .LBB10_2: # %start
-; R32IDZFBFMIN-NEXT: and a0, a2, a0
-; R32IDZFBFMIN-NEXT: beqz s0, .LBB10_4
+; R32IDZFBFMIN-NEXT: lui a1, %hi(.LCPI10_0)
+; R32IDZFBFMIN-NEXT: flw fa5, %lo(.LCPI10_0)(a1)
+; R32IDZFBFMIN-NEXT: flt.s a3, fa5, fs0
+; R32IDZFBFMIN-NEXT: beqz a3, .LBB10_4
; R32IDZFBFMIN-NEXT: # %bb.3:
-; R32IDZFBFMIN-NEXT: addi a3, a4, -1
+; R32IDZFBFMIN-NEXT: addi a2, a4, -1
; R32IDZFBFMIN-NEXT: .LBB10_4: # %start
-; R32IDZFBFMIN-NEXT: and a1, a2, a3
-; R32IDZFBFMIN-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
-; R32IDZFBFMIN-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
-; R32IDZFBFMIN-NEXT: lw s1, 20(sp) # 4-byte Folded Reload
-; R32IDZFBFMIN-NEXT: lw s2, 16(sp) # 4-byte Folded Reload
-; R32IDZFBFMIN-NEXT: lw s3, 12(sp) # 4-byte Folded Reload
+; R32IDZFBFMIN-NEXT: feq.s a1, fs0, fs0
+; R32IDZFBFMIN-NEXT: neg a4, a1
+; R32IDZFBFMIN-NEXT: and a1, a4, a2
+; R32IDZFBFMIN-NEXT: neg a2, a3
+; R32IDZFBFMIN-NEXT: neg a3, s0
+; R32IDZFBFMIN-NEXT: and a0, a3, a0
+; R32IDZFBFMIN-NEXT: or a0, a2, a0
+; R32IDZFBFMIN-NEXT: and a0, a4, a0
+; R32IDZFBFMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
+; R32IDZFBFMIN-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; R32IDZFBFMIN-NEXT: fld fs0, 0(sp) # 8-byte Folded Reload
-; R32IDZFBFMIN-NEXT: addi sp, sp, 32
+; R32IDZFBFMIN-NEXT: addi sp, sp, 16
; R32IDZFBFMIN-NEXT: ret
;
; RV32ID-LABEL: fcvt_l_bf16_sat:
diff --git a/llvm/test/CodeGen/RISCV/double-convert.ll b/llvm/test/CodeGen/RISCV/double-convert.ll
index c147d6ec6d9b..6024a29da33d 100644
--- a/llvm/test/CodeGen/RISCV/double-convert.ll
+++ b/llvm/test/CodeGen/RISCV/double-convert.ll
@@ -692,28 +692,27 @@ define i64 @fcvt_l_d_sat(double %a) nounwind {
; RV32IFD-NEXT: fmv.d fs0, fa0
; RV32IFD-NEXT: fle.d s0, fa5, fa0
; RV32IFD-NEXT: call __fixdfdi
-; RV32IFD-NEXT: lui a3, 524288
-; RV32IFD-NEXT: li a4, 1
+; RV32IFD-NEXT: lui a4, 524288
; RV32IFD-NEXT: lui a2, 524288
-; RV32IFD-NEXT: bne s0, a4, .LBB12_2
+; RV32IFD-NEXT: beqz s0, .LBB12_2
; RV32IFD-NEXT: # %bb.1: # %start
; RV32IFD-NEXT: mv a2, a1
; RV32IFD-NEXT: .LBB12_2: # %start
; RV32IFD-NEXT: lui a1, %hi(.LCPI12_1)
; RV32IFD-NEXT: fld fa5, %lo(.LCPI12_1)(a1)
-; RV32IFD-NEXT: flt.d a4, fa5, fs0
-; RV32IFD-NEXT: beqz a4, .LBB12_4
+; RV32IFD-NEXT: flt.d a3, fa5, fs0
+; RV32IFD-NEXT: beqz a3, .LBB12_4
; RV32IFD-NEXT: # %bb.3:
-; RV32IFD-NEXT: addi a2, a3, -1
+; RV32IFD-NEXT: addi a2, a4, -1
; RV32IFD-NEXT: .LBB12_4: # %start
; RV32IFD-NEXT: feq.d a1, fs0, fs0
-; RV32IFD-NEXT: neg a3, a1
-; RV32IFD-NEXT: and a1, a3, a2
-; RV32IFD-NEXT: neg a2, a4
-; RV32IFD-NEXT: neg a4, s0
-; RV32IFD-NEXT: and a0, a4, a0
-; RV32IFD-NEXT: or a0, a2, a0
+; RV32IFD-NEXT: neg a4, a1
+; RV32IFD-NEXT: and a1, a4, a2
+; RV32IFD-NEXT: neg a2, a3
+; RV32IFD-NEXT: neg a3, s0
; RV32IFD-NEXT: and a0, a3, a0
+; RV32IFD-NEXT: or a0, a2, a0
+; RV32IFD-NEXT: and a0, a4, a0
; RV32IFD-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IFD-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; RV32IFD-NEXT: fld fs0, 0(sp) # 8-byte Folded Reload
@@ -790,33 +789,32 @@ define i64 @fcvt_l_d_sat(double %a) nounwind {
; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s6, 0(sp) # 4-byte Folded Spill
; RV32I-NEXT: mv s0, a1
; RV32I-NEXT: mv s1, a0
+; RV32I-NEXT: lui a3, 278016
+; RV32I-NEXT: addi a3, a3, -1
+; RV32I-NEXT: li a2, -1
+; RV32I-NEXT: call __gtdf2
+; RV32I-NEXT: mv s2, a0
; RV32I-NEXT: lui a3, 802304
+; RV32I-NEXT: mv a0, s1
+; RV32I-NEXT: mv a1, s0
; RV32I-NEXT: li a2, 0
; RV32I-NEXT: call __gedf2
-; RV32I-NEXT: mv s2, a0
+; RV32I-NEXT: mv s3, a0
; RV32I-NEXT: mv a0, s1
; RV32I-NEXT: mv a1, s0
; RV32I-NEXT: call __fixdfdi
-; RV32I-NEXT: mv s3, a0
-; RV32I-NEXT: mv s4, a1
-; RV32I-NEXT: lui s6, 524288
-; RV32I-NEXT: bgez s2, .LBB12_2
+; RV32I-NEXT: mv s4, a0
+; RV32I-NEXT: mv s5, a1
+; RV32I-NEXT: lui a0, 524288
+; RV32I-NEXT: bgez s3, .LBB12_2
; RV32I-NEXT: # %bb.1: # %start
-; RV32I-NEXT: lui s4, 524288
+; RV32I-NEXT: lui s5, 524288
; RV32I-NEXT: .LBB12_2: # %start
-; RV32I-NEXT: lui a3, 278016
-; RV32I-NEXT: addi a3, a3, -1
-; RV32I-NEXT: li a2, -1
-; RV32I-NEXT: mv a0, s1
-; RV32I-NEXT: mv a1, s0
-; RV32I-NEXT: call __gtdf2
-; RV32I-NEXT: mv s5, a0
-; RV32I-NEXT: blez a0, .LBB12_4
+; RV32I-NEXT: blez s2, .LBB12_4
; RV32I-NEXT: # %bb.3: # %start
-; RV32I-NEXT: addi s4, s6, -1
+; RV32I-NEXT: addi s5, a0, -1
; RV32I-NEXT: .LBB12_4: # %start
; RV32I-NEXT: mv a0, s1
; RV32I-NEXT: mv a1, s0
@@ -825,11 +823,11 @@ define i64 @fcvt_l_d_sat(double %a) nounwind {
; RV32I-NEXT: call __unorddf2
; RV32I-NEXT: snez a0, a0
; RV32I-NEXT: addi a0, a0, -1
-; RV32I-NEXT: and a1, a0, s4
-; RV32I-NEXT: slti a2, s2, 0
+; RV32I-NEXT: and a1, a0, s5
+; RV32I-NEXT: slti a2, s3, 0
; RV32I-NEXT: addi a2, a2, -1
-; RV32I-NEXT: and a2, a2, s3
-; RV32I-NEXT: sgtz a3, s5
+; RV32I-NEXT: and a2, a2, s4
+; RV32I-NEXT: sgtz a3, s2
; RV32I-NEXT: neg a3, a3
; RV32I-NEXT: or a2, a3, a2
; RV32I-NEXT: and a0, a0, a2
@@ -840,7 +838,6 @@ define i64 @fcvt_l_d_sat(double %a) nounwind {
; RV32I-NEXT: lw s3, 12(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s4, 8(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s5, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s6, 0(sp) # 4-byte Folded Reload
; RV32I-NEXT: addi sp, sp, 32
; RV32I-NEXT: ret
;
@@ -949,22 +946,23 @@ define i64 @fcvt_lu_d_sat(double %a) nounwind {
; RV32IFD-NEXT: addi sp, sp, -16
; RV32IFD-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32IFD-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
-; RV32IFD-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
-; RV32IFD-NEXT: lui a0, %hi(.LCPI14_0)
-; RV32IFD-NEXT: fld fa5, %lo(.LCPI14_0)(a0)
-; RV32IFD-NEXT: flt.d a0, fa5, fa0
-; RV32IFD-NEXT: neg s0, a0
+; RV32IFD-NEXT: fsd fs0, 0(sp) # 8-byte Folded Spill
+; RV32IFD-NEXT: fmv.d fs0, fa0
; RV32IFD-NEXT: fcvt.d.w fa5, zero
; RV32IFD-NEXT: fle.d a0, fa5, fa0
-; RV32IFD-NEXT: neg s1, a0
+; RV32IFD-NEXT: neg s0, a0
; RV32IFD-NEXT: call __fixunsdfdi
-; RV32IFD-NEXT: and a0, s1, a0
-; RV32IFD-NEXT: or a0, s0, a0
-; RV32IFD-NEXT: and a1, s1, a1
-; RV32IFD-NEXT: or a1, s0, a1
+; RV32IFD-NEXT: lui a2, %hi(.LCPI14_0)
+; RV32IFD-NEXT: fld fa5, %lo(.LCPI14_0)(a2)
+; RV32IFD-NEXT: and a0, s0, a0
+; RV32IFD-NEXT: flt.d a2, fa5, fs0
+; RV32IFD-NEXT: neg a2, a2
+; RV32IFD-NEXT: or a0, a2, a0
+; RV32IFD-NEXT: and a1, s0, a1
+; RV32IFD-NEXT: or a1, a2, a1
; RV32IFD-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IFD-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
-; RV32IFD-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
+; RV32IFD-NEXT: fld fs0, 0(sp) # 8-byte Folded Reload
; RV32IFD-NEXT: addi sp, sp, 16
; RV32IFD-NEXT: ret
;
@@ -983,27 +981,24 @@ define i64 @fcvt_lu_d_sat(double %a) nounwind {
; RV32IZFINXZDINX-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32IZFINXZDINX-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
; RV32IZFINXZDINX-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
-; RV32IZFINXZDINX-NEXT: sw s2, 0(sp) # 4-byte Folded Spill
; RV32IZFINXZDINX-NEXT: mv s1, a1
-; RV32IZFINXZDINX-NEXT: fcvt.d.w a2, zero
; RV32IZFINXZDINX-NEXT: mv s0, a0
-; RV32IZFINXZDINX-NEXT: fle.d a0, a2, s0
-; RV32IZFINXZDINX-NEXT: neg s2, a0
-; RV32IZFINXZDINX-NEXT: mv a0, s0
; RV32IZFINXZDINX-NEXT: call __fixunsdfdi
-; RV32IZFINXZDINX-NEXT: lui a2, %hi(.LCPI14_0)
-; RV32IZFINXZDINX-NEXT: lw a3, %lo(.LCPI14_0+4)(a2)
-; RV32IZFINXZDINX-NEXT: lw a2, %lo(.LCPI14_0)(a2)
-; RV32IZFINXZDINX-NEXT: and a0, s2, a0
-; RV32IZFINXZDINX-NEXT: flt.d a2, a2, s0
+; RV32IZFINXZDINX-NEXT: fcvt.d.w a2, zero
+; RV32IZFINXZDINX-NEXT: lui a4, %hi(.LCPI14_0)
+; RV32IZFINXZDINX-NEXT: lw a5, %lo(.LCPI14_0+4)(a4)
+; RV32IZFINXZDINX-NEXT: lw a4, %lo(.LCPI14_0)(a4)
+; RV32IZFINXZDINX-NEXT: fle.d a2, a2, s0
; RV32IZFINXZDINX-NEXT: neg a2, a2
-; RV32IZFINXZDINX-NEXT: or a0, a2, a0
-; RV32IZFINXZDINX-NEXT: and a1, s2, a1
-; RV32IZFINXZDINX-NEXT: or a1, a2, a1
+; RV32IZFINXZDINX-NEXT: and a0, a2, a0
+; RV32IZFINXZDINX-NEXT: flt.d a3, a4, s0
+; RV32IZFINXZDINX-NEXT: neg a3, a3
+; RV32IZFINXZDINX-NEXT: or a0, a3, a0
+; RV32IZFINXZDINX-NEXT: and a1, a2, a1
+; RV32IZFINXZDINX-NEXT: or a1, a3, a1
; RV32IZFINXZDINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IZFINXZDINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; RV32IZFINXZDINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
-; RV32IZFINXZDINX-NEXT: lw s2, 0(sp) # 4-byte Folded Reload
; RV32IZFINXZDINX-NEXT: addi sp, sp, 16
; RV32IZFINXZDINX-NEXT: ret
;
diff --git a/llvm/test/CodeGen/RISCV/double-round-conv-sat.ll b/llvm/test/CodeGen/RISCV/double-round-conv-sat.ll
index f1c56b320b76..927eee2e9e54 100644
--- a/llvm/test/CodeGen/RISCV/double-round-conv-sat.ll
+++ b/llvm/test/CodeGen/RISCV/double-round-conv-sat.ll
@@ -53,28 +53,27 @@ define i64 @test_floor_si64(double %x) nounwind {
; RV32IFD-NEXT: fmv.d fs0, fa0
; RV32IFD-NEXT: fle.d s0, fa5, fa0
; RV32IFD-NEXT: call __fixdfdi
-; RV32IFD-NEXT: lui a3, 524288
-; RV32IFD-NEXT: li a4, 1
+; RV32IFD-NEXT: lui a4, 524288
; RV32IFD-NEXT: lui a2, 524288
-; RV32IFD-NEXT: bne s0, a4, .LBB1_2
+; RV32IFD-NEXT: beqz s0, .LBB1_2
; RV32IFD-NEXT: # %bb.1:
; RV32IFD-NEXT: mv a2, a1
; RV32IFD-NEXT: .LBB1_2:
; RV32IFD-NEXT: lui a1, %hi(.LCPI1_1)
; RV32IFD-NEXT: fld fa5, %lo(.LCPI1_1)(a1)
-; RV32IFD-NEXT: flt.d a4, fa5, fs0
-; RV32IFD-NEXT: beqz a4, .LBB1_4
+; RV32IFD-NEXT: flt.d a3, fa5, fs0
+; RV32IFD-NEXT: beqz a3, .LBB1_4
; RV32IFD-NEXT: # %bb.3:
-; RV32IFD-NEXT: addi a2, a3, -1
+; RV32IFD-NEXT: addi a2, a4, -1
; RV32IFD-NEXT: .LBB1_4:
; RV32IFD-NEXT: feq.d a1, fs0, fs0
-; RV32IFD-NEXT: neg a3, a1
-; RV32IFD-NEXT: and a1, a3, a2
-; RV32IFD-NEXT: neg a2, a4
-; RV32IFD-NEXT: neg a4, s0
-; RV32IFD-NEXT: and a0, a4, a0
-; RV32IFD-NEXT: or a0, a2, a0
+; RV32IFD-NEXT: neg a4, a1
+; RV32IFD-NEXT: and a1, a4, a2
+; RV32IFD-NEXT: neg a2, a3
+; RV32IFD-NEXT: neg a3, s0
; RV32IFD-NEXT: and a0, a3, a0
+; RV32IFD-NEXT: or a0, a2, a0
+; RV32IFD-NEXT: and a0, a4, a0
; RV32IFD-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IFD-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; RV32IFD-NEXT: fld fs0, 0(sp) # 8-byte Folded Reload
@@ -299,28 +298,27 @@ define i64 @test_ceil_si64(double %x) nounwind {
; RV32IFD-NEXT: fmv.d fs0, fa0
; RV32IFD-NEXT: fle.d s0, fa5, fa0
; RV32IFD-NEXT: call __fixdfdi
-; RV32IFD-NEXT: lui a3, 524288
-; RV32IFD-NEXT: li a4, 1
+; RV32IFD-NEXT: lui a4, 524288
; RV32IFD-NEXT: lui a2, 524288
-; RV32IFD-NEXT: bne s0, a4, .LBB5_2
+; RV32IFD-NEXT: beqz s0, .LBB5_2
; RV32IFD-NEXT: # %bb.1:
; RV32IFD-NEXT: mv a2, a1
; RV32IFD-NEXT: .LBB5_2:
; RV32IFD-NEXT: lui a1, %hi(.LCPI5_1)
; RV32IFD-NEXT: fld fa5, %lo(.LCPI5_1)(a1)
-; RV32IFD-NEXT: flt.d a4, fa5, fs0
-; RV32IFD-NEXT: beqz a4, .LBB5_4
+; RV32IFD-NEXT: flt.d a3, fa5, fs0
+; RV32IFD-NEXT: beqz a3, .LBB5_4
; RV32IFD-NEXT: # %bb.3:
-; RV32IFD-NEXT: addi a2, a3, -1
+; RV32IFD-NEXT: addi a2, a4, -1
; RV32IFD-NEXT: .LBB5_4:
; RV32IFD-NEXT: feq.d a1, fs0, fs0
-; RV32IFD-NEXT: neg a3, a1
-; RV32IFD-NEXT: and a1, a3, a2
-; RV32IFD-NEXT: neg a2, a4
-; RV32IFD-NEXT: neg a4, s0
-; RV32IFD-NEXT: and a0, a4, a0
-; RV32IFD-NEXT: or a0, a2, a0
+; RV32IFD-NEXT: neg a4, a1
+; RV32IFD-NEXT: and a1, a4, a2
+; RV32IFD-NEXT: neg a2, a3
+; RV32IFD-NEXT: neg a3, s0
; RV32IFD-NEXT: and a0, a3, a0
+; RV32IFD-NEXT: or a0, a2, a0
+; RV32IFD-NEXT: and a0, a4, a0
; RV32IFD-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IFD-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; RV32IFD-NEXT: fld fs0, 0(sp) # 8-byte Folded Reload
@@ -545,28 +543,27 @@ define i64 @test_trunc_si64(double %x) nounwind {
; RV32IFD-NEXT: fmv.d fs0, fa0
; RV32IFD-NEXT: fle.d s0, fa5, fa0
; RV32IFD-NEXT: call __fixdfdi
-; RV32IFD-NEXT: lui a3, 524288
-; RV32IFD-NEXT: li a4, 1
+; RV32IFD-NEXT: lui a4, 524288
; RV32IFD-NEXT: lui a2, 524288
-; RV32IFD-NEXT: bne s0, a4, .LBB9_2
+; RV32IFD-NEXT: beqz s0, .LBB9_2
; RV32IFD-NEXT: # %bb.1:
; RV32IFD-NEXT: mv a2, a1
; RV32IFD-NEXT: .LBB9_2:
; RV32IFD-NEXT: lui a1, %hi(.LCPI9_1)
; RV32IFD-NEXT: fld fa5, %lo(.LCPI9_1)(a1)
-; RV32IFD-NEXT: flt.d a4, fa5, fs0
-; RV32IFD-NEXT: beqz a4, .LBB9_4
+; RV32IFD-NEXT: flt.d a3, fa5, fs0
+; RV32IFD-NEXT: beqz a3, .LBB9_4
; RV32IFD-NEXT: # %bb.3:
-; RV32IFD-NEXT: addi a2, a3, -1
+; RV32IFD-NEXT: addi a2, a4, -1
; RV32IFD-NEXT: .LBB9_4:
; RV32IFD-NEXT: feq.d a1, fs0, fs0
-; RV32IFD-NEXT: neg a3, a1
-; RV32IFD-NEXT: and a1, a3, a2
-; RV32IFD-NEXT: neg a2, a4
-; RV32IFD-NEXT: neg a4, s0
-; RV32IFD-NEXT: and a0, a4, a0
-; RV32IFD-NEXT: or a0, a2, a0
+; RV32IFD-NEXT: neg a4, a1
+; RV32IFD-NEXT: and a1, a4, a2
+; RV32IFD-NEXT: neg a2, a3
+; RV32IFD-NEXT: neg a3, s0
; RV32IFD-NEXT: and a0, a3, a0
+; RV32IFD-NEXT: or a0, a2, a0
+; RV32IFD-NEXT: and a0, a4, a0
; RV32IFD-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IFD-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; RV32IFD-NEXT: fld fs0, 0(sp) # 8-byte Folded Reload
@@ -791,28 +788,27 @@ define i64 @test_round_si64(double %x) nounwind {
; RV32IFD-NEXT: fmv.d fs0, fa0
; RV32IFD-NEXT: fle.d s0, fa5, fa0
; RV32IFD-NEXT: call __fixdfdi
-; RV32IFD-NEXT: lui a3, 524288
-; RV32IFD-NEXT: li a4, 1
+; RV32IFD-NEXT: lui a4, 524288
; RV32IFD-NEXT: lui a2, 524288
-; RV32IFD-NEXT: bne s0, a4, .LBB13_2
+; RV32IFD-NEXT: beqz s0, .LBB13_2
; RV32IFD-NEXT: # %bb.1:
; RV32IFD-NEXT: mv a2, a1
; RV32IFD-NEXT: .LBB13_2:
; RV32IFD-NEXT: lui a1, %hi(.LCPI13_1)
; RV32IFD-NEXT: fld fa5, %lo(.LCPI13_1)(a1)
-; RV32IFD-NEXT: flt.d a4, fa5, fs0
-; RV32IFD-NEXT: beqz a4, .LBB13_4
+; RV32IFD-NEXT: flt.d a3, fa5, fs0
+; RV32IFD-NEXT: beqz a3, .LBB13_4
; RV32IFD-NEXT: # %bb.3:
-; RV32IFD-NEXT: addi a2, a3, -1
+; RV32IFD-NEXT: addi a2, a4, -1
; RV32IFD-NEXT: .LBB13_4:
; RV32IFD-NEXT: feq.d a1, fs0, fs0
-; RV32IFD-NEXT: neg a3, a1
-; RV32IFD-NEXT: and a1, a3, a2
-; RV32IFD-NEXT: neg a2, a4
-; RV32IFD-NEXT: neg a4, s0
-; RV32IFD-NEXT: and a0, a4, a0
-; RV32IFD-NEXT: or a0, a2, a0
+; RV32IFD-NEXT: neg a4, a1
+; RV32IFD-NEXT: and a1, a4, a2
+; RV32IFD-NEXT: neg a2, a3
+; RV32IFD-NEXT: neg a3, s0
; RV32IFD-NEXT: and a0, a3, a0
+; RV32IFD-NEXT: or a0, a2, a0
+; RV32IFD-NEXT: and a0, a4, a0
; RV32IFD-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IFD-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; RV32IFD-NEXT: fld fs0, 0(sp) # 8-byte Folded Reload
@@ -1037,28 +1033,27 @@ define i64 @test_roundeven_si64(double %x) nounwind {
; RV32IFD-NEXT: fmv.d fs0, fa0
; RV32IFD-NEXT: fle.d s0, fa5, fa0
; RV32IFD-NEXT: call __fixdfdi
-; RV32IFD-NEXT: lui a3, 524288
-; RV32IFD-NEXT: li a4, 1
+; RV32IFD-NEXT: lui a4, 524288
; RV32IFD-NEXT: lui a2, 524288
-; RV32IFD-NEXT: bne s0, a4, .LBB17_2
+; RV32IFD-NEXT: beqz s0, .LBB17_2
; RV32IFD-NEXT: # %bb.1:
; RV32IFD-NEXT: mv a2, a1
; RV32IFD-NEXT: .LBB17_2:
; RV32IFD-NEXT: lui a1, %hi(.LCPI17_1)
; RV32IFD-NEXT: fld fa5, %lo(.LCPI17_1)(a1)
-; RV32IFD-NEXT: flt.d a4, fa5, fs0
-; RV32IFD-NEXT: beqz a4, .LBB17_4
+; RV32IFD-NEXT: flt.d a3, fa5, fs0
+; RV32IFD-NEXT: beqz a3, .LBB17_4
; RV32IFD-NEXT: # %bb.3:
-; RV32IFD-NEXT: addi a2, a3, -1
+; RV32IFD-NEXT: addi a2, a4, -1
; RV32IFD-NEXT: .LBB17_4:
; RV32IFD-NEXT: feq.d a1, fs0, fs0
-; RV32IFD-NEXT: neg a3, a1
-; RV32IFD-NEXT: and a1, a3, a2
-; RV32IFD-NEXT: neg a2, a4
-; RV32IFD-NEXT: neg a4, s0
-; RV32IFD-NEXT: and a0, a4, a0
-; RV32IFD-NEXT: or a0, a2, a0
+; RV32IFD-NEXT: neg a4, a1
+; RV32IFD-NEXT: and a1, a4, a2
+; RV32IFD-NEXT: neg a2, a3
+; RV32IFD-NEXT: neg a3, s0
; RV32IFD-NEXT: and a0, a3, a0
+; RV32IFD-NEXT: or a0, a2, a0
+; RV32IFD-NEXT: and a0, a4, a0
; RV32IFD-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IFD-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; RV32IFD-NEXT: fld fs0, 0(sp) # 8-byte Folded Reload
@@ -1283,28 +1278,27 @@ define i64 @test_rint_si64(double %x) nounwind {
; RV32IFD-NEXT: fmv.d fs0, fa0
; RV32IFD-NEXT: fle.d s0, fa5, fa0
; RV32IFD-NEXT: call __fixdfdi
-; RV32IFD-NEXT: lui a3, 524288
-; RV32IFD-NEXT: li a4, 1
+; RV32IFD-NEXT: lui a4, 524288
; RV32IFD-NEXT: lui a2, 524288
-; RV32IFD-NEXT: bne s0, a4, .LBB21_2
+; RV32IFD-NEXT: beqz s0, .LBB21_2
; RV32IFD-NEXT: # %bb.1:
; RV32IFD-NEXT: mv a2, a1
; RV32IFD-NEXT: .LBB21_2:
; RV32IFD-NEXT: lui a1, %hi(.LCPI21_1)
; RV32IFD-NEXT: fld fa5, %lo(.LCPI21_1)(a1)
-; RV32IFD-NEXT: flt.d a4, fa5, fs0
-; RV32IFD-NEXT: beqz a4, .LBB21_4
+; RV32IFD-NEXT: flt.d a3, fa5, fs0
+; RV32IFD-NEXT: beqz a3, .LBB21_4
; RV32IFD-NEXT: # %bb.3:
-; RV32IFD-NEXT: addi a2, a3, -1
+; RV32IFD-NEXT: addi a2, a4, -1
; RV32IFD-NEXT: .LBB21_4:
; RV32IFD-NEXT: feq.d a1, fs0, fs0
-; RV32IFD-NEXT: neg a3, a1
-; RV32IFD-NEXT: and a1, a3, a2
-; RV32IFD-NEXT: neg a2, a4
-; RV32IFD-NEXT: neg a4, s0
-; RV32IFD-NEXT: and a0, a4, a0
-; RV32IFD-NEXT: or a0, a2, a0
+; RV32IFD-NEXT: neg a4, a1
+; RV32IFD-NEXT: and a1, a4, a2
+; RV32IFD-NEXT: neg a2, a3
+; RV32IFD-NEXT: neg a3, s0
; RV32IFD-NEXT: and a0, a3, a0
+; RV32IFD-NEXT: or a0, a2, a0
+; RV32IFD-NEXT: and a0, a4, a0
; RV32IFD-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IFD-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; RV32IFD-NEXT: fld fs0, 0(sp) # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/float-convert.ll b/llvm/test/CodeGen/RISCV/float-convert.ll
index 653b64ec7304..7eabd3f5f227 100644
--- a/llvm/test/CodeGen/RISCV/float-convert.ll
+++ b/llvm/test/CodeGen/RISCV/float-convert.ll
@@ -275,26 +275,24 @@ define i32 @fcvt_wu_s_sat(float %a) nounwind {
; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s2, 0(sp) # 4-byte Folded Spill
; RV32I-NEXT: mv s0, a0
-; RV32I-NEXT: lui a1, 325632
-; RV32I-NEXT: addi a1, a1, -1
-; RV32I-NEXT: call __gtsf2
-; RV32I-NEXT: sgtz a0, a0
-; RV32I-NEXT: neg s1, a0
-; RV32I-NEXT: mv a0, s0
; RV32I-NEXT: li a1, 0
; RV32I-NEXT: call __gesf2
; RV32I-NEXT: slti a0, a0, 0
-; RV32I-NEXT: addi s2, a0, -1
+; RV32I-NEXT: addi s1, a0, -1
; RV32I-NEXT: mv a0, s0
; RV32I-NEXT: call __fixunssfsi
-; RV32I-NEXT: and a0, s2, a0
-; RV32I-NEXT: or a0, s1, a0
+; RV32I-NEXT: and s1, s1, a0
+; RV32I-NEXT: lui a1, 325632
+; RV32I-NEXT: addi a1, a1, -1
+; RV32I-NEXT: mv a0, s0
+; RV32I-NEXT: call __gtsf2
+; RV32I-NEXT: sgtz a0, a0
+; RV32I-NEXT: neg a0, a0
+; RV32I-NEXT: or a0, a0, s1
; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s2, 0(sp) # 4-byte Folded Reload
; RV32I-NEXT: addi sp, sp, 16
; RV32I-NEXT: ret
;
@@ -618,38 +616,36 @@ define i64 @fcvt_l_s_sat(float %a) nounwind {
; RV32IF-NEXT: addi sp, sp, -16
; RV32IF-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32IF-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
-; RV32IF-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
-; RV32IF-NEXT: fsw fs0, 0(sp) # 4-byte Folded Spill
+; RV32IF-NEXT: fsw fs0, 4(sp) # 4-byte Folded Spill
; RV32IF-NEXT: fmv.s fs0, fa0
; RV32IF-NEXT: lui a0, 913408
; RV32IF-NEXT: fmv.w.x fa5, a0
; RV32IF-NEXT: fle.s s0, fa5, fa0
-; RV32IF-NEXT: neg s1, s0
; RV32IF-NEXT: call __fixsfdi
-; RV32IF-NEXT: lui a2, %hi(.LCPI12_0)
-; RV32IF-NEXT: flw fa5, %lo(.LCPI12_0)(a2)
-; RV32IF-NEXT: and a0, s1, a0
-; RV32IF-NEXT: flt.s a3, fa5, fs0
-; RV32IF-NEXT: neg a2, a3
-; RV32IF-NEXT: or a0, a2, a0
-; RV32IF-NEXT: feq.s a2, fs0, fs0
-; RV32IF-NEXT: neg a2, a2
-; RV32IF-NEXT: lui a5, 524288
; RV32IF-NEXT: lui a4, 524288
+; RV32IF-NEXT: lui a2, 524288
; RV32IF-NEXT: beqz s0, .LBB12_2
; RV32IF-NEXT: # %bb.1: # %start
-; RV32IF-NEXT: mv a4, a1
+; RV32IF-NEXT: mv a2, a1
; RV32IF-NEXT: .LBB12_2: # %start
-; RV32IF-NEXT: and a0, a2, a0
+; RV32IF-NEXT: lui a1, %hi(.LCPI12_0)
+; RV32IF-NEXT: flw fa5, %lo(.LCPI12_0)(a1)
+; RV32IF-NEXT: flt.s a3, fa5, fs0
; RV32IF-NEXT: beqz a3, .LBB12_4
; RV32IF-NEXT: # %bb.3:
-; RV32IF-NEXT: addi a4, a5, -1
+; RV32IF-NEXT: addi a2, a4, -1
; RV32IF-NEXT: .LBB12_4: # %start
-; RV32IF-NEXT: and a1, a2, a4
+; RV32IF-NEXT: feq.s a1, fs0, fs0
+; RV32IF-NEXT: neg a4, a1
+; RV32IF-NEXT: and a1, a4, a2
+; RV32IF-NEXT: neg a2, s0
+; RV32IF-NEXT: and a0, a2, a0
+; RV32IF-NEXT: neg a2, a3
+; RV32IF-NEXT: or a0, a2, a0
+; RV32IF-NEXT: and a0, a4, a0
; RV32IF-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IF-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
-; RV32IF-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
-; RV32IF-NEXT: flw fs0, 0(sp) # 4-byte Folded Reload
+; RV32IF-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload
; RV32IF-NEXT: addi sp, sp, 16
; RV32IF-NEXT: ret
;
@@ -867,22 +863,23 @@ define i64 @fcvt_lu_s_sat(float %a) nounwind {
; RV32IF-NEXT: addi sp, sp, -16
; RV32IF-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32IF-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
-; RV32IF-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
-; RV32IF-NEXT: lui a0, %hi(.LCPI14_0)
-; RV32IF-NEXT: flw fa5, %lo(.LCPI14_0)(a0)
-; RV32IF-NEXT: flt.s a0, fa5, fa0
-; RV32IF-NEXT: neg s0, a0
+; RV32IF-NEXT: fsw fs0, 4(sp) # 4-byte Folded Spill
+; RV32IF-NEXT: fmv.s fs0, fa0
; RV32IF-NEXT: fmv.w.x fa5, zero
; RV32IF-NEXT: fle.s a0, fa5, fa0
-; RV32IF-NEXT: neg s1, a0
+; RV32IF-NEXT: neg s0, a0
; RV32IF-NEXT: call __fixunssfdi
-; RV32IF-NEXT: and a0, s1, a0
-; RV32IF-NEXT: or a0, s0, a0
-; RV32IF-NEXT: and a1, s1, a1
-; RV32IF-NEXT: or a1, s0, a1
+; RV32IF-NEXT: lui a2, %hi(.LCPI14_0)
+; RV32IF-NEXT: flw fa5, %lo(.LCPI14_0)(a2)
+; RV32IF-NEXT: and a0, s0, a0
+; RV32IF-NEXT: flt.s a2, fa5, fs0
+; RV32IF-NEXT: neg a2, a2
+; RV32IF-NEXT: or a0, a2, a0
+; RV32IF-NEXT: and a1, s0, a1
+; RV32IF-NEXT: or a1, a2, a1
; RV32IF-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IF-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
-; RV32IF-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
+; RV32IF-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload
; RV32IF-NEXT: addi sp, sp, 16
; RV32IF-NEXT: ret
;
@@ -901,17 +898,19 @@ define i64 @fcvt_lu_s_sat(float %a) nounwind {
; RV32IZFINX-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32IZFINX-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
; RV32IZFINX-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
-; RV32IZFINX-NEXT: lui a1, %hi(.LCPI14_0)
-; RV32IZFINX-NEXT: lw a1, %lo(.LCPI14_0)(a1)
-; RV32IZFINX-NEXT: flt.s a1, a1, a0
-; RV32IZFINX-NEXT: neg s0, a1
-; RV32IZFINX-NEXT: fle.s a1, zero, a0
-; RV32IZFINX-NEXT: neg s1, a1
+; RV32IZFINX-NEXT: mv s0, a0
+; RV32IZFINX-NEXT: fle.s a0, zero, a0
+; RV32IZFINX-NEXT: neg s1, a0
+; RV32IZFINX-NEXT: mv a0, s0
; RV32IZFINX-NEXT: call __fixunssfdi
+; RV32IZFINX-NEXT: lui a2, %hi(.LCPI14_0)
+; RV32IZFINX-NEXT: lw a2, %lo(.LCPI14_0)(a2)
; RV32IZFINX-NEXT: and a0, s1, a0
-; RV32IZFINX-NEXT: or a0, s0, a0
+; RV32IZFINX-NEXT: flt.s a2, a2, s0
+; RV32IZFINX-NEXT: neg a2, a2
+; RV32IZFINX-NEXT: or a0, a2, a0
; RV32IZFINX-NEXT: and a1, s1, a1
-; RV32IZFINX-NEXT: or a1, s0, a1
+; RV32IZFINX-NEXT: or a1, a2, a1
; RV32IZFINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IZFINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; RV32IZFINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
@@ -929,33 +928,36 @@ define i64 @fcvt_lu_s_sat(float %a) nounwind {
;
; RV32I-LABEL: fcvt_lu_s_sat:
; RV32I: # %bb.0: # %start
-; RV32I-NEXT: addi sp, sp, -16
-; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s2, 0(sp) # 4-byte Folded Spill
+; RV32I-NEXT: addi sp, sp, -32
+; RV32I-NEXT: sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
; RV32I-NEXT: mv s0, a0
-; RV32I-NEXT: lui a1, 391168
-; RV32I-NEXT: addi a1, a1, -1
-; RV32I-NEXT: call __gtsf2
-; RV32I-NEXT: sgtz a0, a0
-; RV32I-NEXT: neg s1, a0
-; RV32I-NEXT: mv a0, s0
; RV32I-NEXT: li a1, 0
; RV32I-NEXT: call __gesf2
; RV32I-NEXT: slti a0, a0, 0
; RV32I-NEXT: addi s2, a0, -1
; RV32I-NEXT: mv a0, s0
; RV32I-NEXT: call __fixunssfdi
-; RV32I-NEXT: and a0, s2, a0
-; RV32I-NEXT: or a0, s1, a0
-; RV32I-NEXT: and a1, s2, a1
-; RV32I-NEXT: or a1, s1, a1
-; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s2, 0(sp) # 4-byte Folded Reload
-; RV32I-NEXT: addi sp, sp, 16
+; RV32I-NEXT: mv s1, a1
+; RV32I-NEXT: and s3, s2, a0
+; RV32I-NEXT: lui a1, 391168
+; RV32I-NEXT: addi a1, a1, -1
+; RV32I-NEXT: mv a0, s0
+; RV32I-NEXT: call __gtsf2
+; RV32I-NEXT: sgtz a0, a0
+; RV32I-NEXT: neg a1, a0
+; RV32I-NEXT: or a0, a1, s3
+; RV32I-NEXT: and a2, s2, s1
+; RV32I-NEXT: or a1, a1, a2
+; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 32
; RV32I-NEXT: ret
;
; RV64I-LABEL: fcvt_lu_s_sat:
@@ -2089,26 +2091,24 @@ define zeroext i32 @fcvt_wu_s_sat_zext(float %a) nounwind {
; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s2, 0(sp) # 4-byte Folded Spill
; RV32I-NEXT: mv s0, a0
-; RV32I-NEXT: lui a1, 325632
-; RV32I-NEXT: addi a1, a1, -1
-; RV32I-NEXT: call __gtsf2
-; RV32I-NEXT: sgtz a0, a0
-; RV32I-NEXT: neg s1, a0
-; RV32I-NEXT: mv a0, s0
; RV32I-NEXT: li a1, 0
; RV32I-NEXT: call __gesf2
; RV32I-NEXT: slti a0, a0, 0
-; RV32I-NEXT: addi s2, a0, -1
+; RV32I-NEXT: addi s1, a0, -1
; RV32I-NEXT: mv a0, s0
; RV32I-NEXT: call __fixunssfsi
-; RV32I-NEXT: and a0, s2, a0
-; RV32I-NEXT: or a0, s1, a0
+; RV32I-NEXT: and s1, s1, a0
+; RV32I-NEXT: lui a1, 325632
+; RV32I-NEXT: addi a1, a1, -1
+; RV32I-NEXT: mv a0, s0
+; RV32I-NEXT: call __gtsf2
+; RV32I-NEXT: sgtz a0, a0
+; RV32I-NEXT: neg a0, a0
+; RV32I-NEXT: or a0, a0, s1
; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s2, 0(sp) # 4-byte Folded Reload
; RV32I-NEXT: addi sp, sp, 16
; RV32I-NEXT: ret
;
diff --git a/llvm/test/CodeGen/RISCV/float-round-conv-sat.ll b/llvm/test/CodeGen/RISCV/float-round-conv-sat.ll
index 4f747c278da0..5e99c7eb9056 100644
--- a/llvm/test/CodeGen/RISCV/float-round-conv-sat.ll
+++ b/llvm/test/CodeGen/RISCV/float-round-conv-sat.ll
@@ -37,8 +37,7 @@ define i64 @test_floor_si64(float %x) nounwind {
; RV32IF-NEXT: addi sp, sp, -16
; RV32IF-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32IF-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
-; RV32IF-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
-; RV32IF-NEXT: fsw fs0, 0(sp) # 4-byte Folded Spill
+; RV32IF-NEXT: fsw fs0, 4(sp) # 4-byte Folded Spill
; RV32IF-NEXT: fmv.s fs0, fa0
; RV32IF-NEXT: lui a0, 307200
; RV32IF-NEXT: fmv.w.x fa5, a0
@@ -53,33 +52,32 @@ define i64 @test_floor_si64(float %x) nounwind {
; RV32IF-NEXT: lui a0, 913408
; RV32IF-NEXT: fmv.w.x fa5, a0
; RV32IF-NEXT: fle.s s0, fa5, fs0
-; RV32IF-NEXT: neg s1, s0
; RV32IF-NEXT: fmv.s fa0, fs0
; RV32IF-NEXT: call __fixsfdi
-; RV32IF-NEXT: lui a2, %hi(.LCPI1_0)
-; RV32IF-NEXT: flw fa5, %lo(.LCPI1_0)(a2)
-; RV32IF-NEXT: and a0, s1, a0
-; RV32IF-NEXT: flt.s a3, fa5, fs0
-; RV32IF-NEXT: neg a2, a3
-; RV32IF-NEXT: or a0, a2, a0
-; RV32IF-NEXT: feq.s a2, fs0, fs0
-; RV32IF-NEXT: neg a2, a2
-; RV32IF-NEXT: lui a5, 524288
; RV32IF-NEXT: lui a4, 524288
+; RV32IF-NEXT: lui a2, 524288
; RV32IF-NEXT: beqz s0, .LBB1_4
; RV32IF-NEXT: # %bb.3:
-; RV32IF-NEXT: mv a4, a1
+; RV32IF-NEXT: mv a2, a1
; RV32IF-NEXT: .LBB1_4:
-; RV32IF-NEXT: and a0, a2, a0
+; RV32IF-NEXT: lui a1, %hi(.LCPI1_0)
+; RV32IF-NEXT: flw fa5, %lo(.LCPI1_0)(a1)
+; RV32IF-NEXT: flt.s a3, fa5, fs0
; RV32IF-NEXT: beqz a3, .LBB1_6
; RV32IF-NEXT: # %bb.5:
-; RV32IF-NEXT: addi a4, a5, -1
+; RV32IF-NEXT: addi a2, a4, -1
; RV32IF-NEXT: .LBB1_6:
-; RV32IF-NEXT: and a1, a2, a4
+; RV32IF-NEXT: feq.s a1, fs0, fs0
+; RV32IF-NEXT: neg a4, a1
+; RV32IF-NEXT: and a1, a4, a2
+; RV32IF-NEXT: neg a2, s0
+; RV32IF-NEXT: and a0, a2, a0
+; RV32IF-NEXT: neg a2, a3
+; RV32IF-NEXT: or a0, a2, a0
+; RV32IF-NEXT: and a0, a4, a0
; RV32IF-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IF-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
-; RV32IF-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
-; RV32IF-NEXT: flw fs0, 0(sp) # 4-byte Folded Reload
+; RV32IF-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload
; RV32IF-NEXT: addi sp, sp, 16
; RV32IF-NEXT: ret
;
@@ -117,23 +115,23 @@ define i64 @test_floor_si64(float %x) nounwind {
; RV32IZFINX-NEXT: lui a2, %hi(.LCPI1_0)
; RV32IZFINX-NEXT: lw a2, %lo(.LCPI1_0)(a2)
; RV32IZFINX-NEXT: and a0, s2, a0
-; RV32IZFINX-NEXT: flt.s a3, a2, s0
-; RV32IZFINX-NEXT: neg a2, a3
+; RV32IZFINX-NEXT: flt.s a4, a2, s0
+; RV32IZFINX-NEXT: neg a2, a4
; RV32IZFINX-NEXT: or a0, a2, a0
; RV32IZFINX-NEXT: feq.s a2, s0, s0
; RV32IZFINX-NEXT: neg a2, a2
; RV32IZFINX-NEXT: lui a5, 524288
-; RV32IZFINX-NEXT: lui a4, 524288
+; RV32IZFINX-NEXT: lui a3, 524288
; RV32IZFINX-NEXT: beqz s1, .LBB1_4
; RV32IZFINX-NEXT: # %bb.3:
-; RV32IZFINX-NEXT: mv a4, a1
+; RV32IZFINX-NEXT: mv a3, a1
; RV32IZFINX-NEXT: .LBB1_4:
; RV32IZFINX-NEXT: and a0, a2, a0
-; RV32IZFINX-NEXT: beqz a3, .LBB1_6
+; RV32IZFINX-NEXT: beqz a4, .LBB1_6
; RV32IZFINX-NEXT: # %bb.5:
-; RV32IZFINX-NEXT: addi a4, a5, -1
+; RV32IZFINX-NEXT: addi a3, a5, -1
; RV32IZFINX-NEXT: .LBB1_6:
-; RV32IZFINX-NEXT: and a1, a2, a4
+; RV32IZFINX-NEXT: and a1, a2, a3
; RV32IZFINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IZFINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; RV32IZFINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
@@ -299,8 +297,7 @@ define i64 @test_ceil_si64(float %x) nounwind {
; RV32IF-NEXT: addi sp, sp, -16
; RV32IF-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32IF-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
-; RV32IF-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
-; RV32IF-NEXT: fsw fs0, 0(sp) # 4-byte Folded Spill
+; RV32IF-NEXT: fsw fs0, 4(sp) # 4-byte Folded Spill
; RV32IF-NEXT: fmv.s fs0, fa0
; RV32IF-NEXT: lui a0, 307200
; RV32IF-NEXT: fmv.w.x fa5, a0
@@ -315,33 +312,32 @@ define i64 @test_ceil_si64(float %x) nounwind {
; RV32IF-NEXT: lui a0, 913408
; RV32IF-NEXT: fmv.w.x fa5, a0
; RV32IF-NEXT: fle.s s0, fa5, fs0
-; RV32IF-NEXT: neg s1, s0
; RV32IF-NEXT: fmv.s fa0, fs0
; RV32IF-NEXT: call __fixsfdi
-; RV32IF-NEXT: lui a2, %hi(.LCPI5_0)
-; RV32IF-NEXT: flw fa5, %lo(.LCPI5_0)(a2)
-; RV32IF-NEXT: and a0, s1, a0
-; RV32IF-NEXT: flt.s a3, fa5, fs0
-; RV32IF-NEXT: neg a2, a3
-; RV32IF-NEXT: or a0, a2, a0
-; RV32IF-NEXT: feq.s a2, fs0, fs0
-; RV32IF-NEXT: neg a2, a2
-; RV32IF-NEXT: lui a5, 524288
; RV32IF-NEXT: lui a4, 524288
+; RV32IF-NEXT: lui a2, 524288
; RV32IF-NEXT: beqz s0, .LBB5_4
; RV32IF-NEXT: # %bb.3:
-; RV32IF-NEXT: mv a4, a1
+; RV32IF-NEXT: mv a2, a1
; RV32IF-NEXT: .LBB5_4:
-; RV32IF-NEXT: and a0, a2, a0
+; RV32IF-NEXT: lui a1, %hi(.LCPI5_0)
+; RV32IF-NEXT: flw fa5, %lo(.LCPI5_0)(a1)
+; RV32IF-NEXT: flt.s a3, fa5, fs0
; RV32IF-NEXT: beqz a3, .LBB5_6
; RV32IF-NEXT: # %bb.5:
-; RV32IF-NEXT: addi a4, a5, -1
+; RV32IF-NEXT: addi a2, a4, -1
; RV32IF-NEXT: .LBB5_6:
-; RV32IF-NEXT: and a1, a2, a4
+; RV32IF-NEXT: feq.s a1, fs0, fs0
+; RV32IF-NEXT: neg a4, a1
+; RV32IF-NEXT: and a1, a4, a2
+; RV32IF-NEXT: neg a2, s0
+; RV32IF-NEXT: and a0, a2, a0
+; RV32IF-NEXT: neg a2, a3
+; RV32IF-NEXT: or a0, a2, a0
+; RV32IF-NEXT: and a0, a4, a0
; RV32IF-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IF-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
-; RV32IF-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
-; RV32IF-NEXT: flw fs0, 0(sp) # 4-byte Folded Reload
+; RV32IF-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload
; RV32IF-NEXT: addi sp, sp, 16
; RV32IF-NEXT: ret
;
@@ -379,23 +375,23 @@ define i64 @test_ceil_si64(float %x) nounwind {
; RV32IZFINX-NEXT: lui a2, %hi(.LCPI5_0)
; RV32IZFINX-NEXT: lw a2, %lo(.LCPI5_0)(a2)
; RV32IZFINX-NEXT: and a0, s2, a0
-; RV32IZFINX-NEXT: flt.s a3, a2, s0
-; RV32IZFINX-NEXT: neg a2, a3
+; RV32IZFINX-NEXT: flt.s a4, a2, s0
+; RV32IZFINX-NEXT: neg a2, a4
; RV32IZFINX-NEXT: or a0, a2, a0
; RV32IZFINX-NEXT: feq.s a2, s0, s0
; RV32IZFINX-NEXT: neg a2, a2
; RV32IZFINX-NEXT: lui a5, 524288
-; RV32IZFINX-NEXT: lui a4, 524288
+; RV32IZFINX-NEXT: lui a3, 524288
; RV32IZFINX-NEXT: beqz s1, .LBB5_4
; RV32IZFINX-NEXT: # %bb.3:
-; RV32IZFINX-NEXT: mv a4, a1
+; RV32IZFINX-NEXT: mv a3, a1
; RV32IZFINX-NEXT: .LBB5_4:
; RV32IZFINX-NEXT: and a0, a2, a0
-; RV32IZFINX-NEXT: beqz a3, .LBB5_6
+; RV32IZFINX-NEXT: beqz a4, .LBB5_6
; RV32IZFINX-NEXT: # %bb.5:
-; RV32IZFINX-NEXT: addi a4, a5, -1
+; RV32IZFINX-NEXT: addi a3, a5, -1
; RV32IZFINX-NEXT: .LBB5_6:
-; RV32IZFINX-NEXT: and a1, a2, a4
+; RV32IZFINX-NEXT: and a1, a2, a3
; RV32IZFINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IZFINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; RV32IZFINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
@@ -561,8 +557,7 @@ define i64 @test_trunc_si64(float %x) nounwind {
; RV32IF-NEXT: addi sp, sp, -16
; RV32IF-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32IF-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
-; RV32IF-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
-; RV32IF-NEXT: fsw fs0, 0(sp) # 4-byte Folded Spill
+; RV32IF-NEXT: fsw fs0, 4(sp) # 4-byte Folded Spill
; RV32IF-NEXT: fmv.s fs0, fa0
; RV32IF-NEXT: lui a0, 307200
; RV32IF-NEXT: fmv.w.x fa5, a0
@@ -577,33 +572,32 @@ define i64 @test_trunc_si64(float %x) nounwind {
; RV32IF-NEXT: lui a0, 913408
; RV32IF-NEXT: fmv.w.x fa5, a0
; RV32IF-NEXT: fle.s s0, fa5, fs0
-; RV32IF-NEXT: neg s1, s0
; RV32IF-NEXT: fmv.s fa0, fs0
; RV32IF-NEXT: call __fixsfdi
-; RV32IF-NEXT: lui a2, %hi(.LCPI9_0)
-; RV32IF-NEXT: flw fa5, %lo(.LCPI9_0)(a2)
-; RV32IF-NEXT: and a0, s1, a0
-; RV32IF-NEXT: flt.s a3, fa5, fs0
-; RV32IF-NEXT: neg a2, a3
-; RV32IF-NEXT: or a0, a2, a0
-; RV32IF-NEXT: feq.s a2, fs0, fs0
-; RV32IF-NEXT: neg a2, a2
-; RV32IF-NEXT: lui a5, 524288
; RV32IF-NEXT: lui a4, 524288
+; RV32IF-NEXT: lui a2, 524288
; RV32IF-NEXT: beqz s0, .LBB9_4
; RV32IF-NEXT: # %bb.3:
-; RV32IF-NEXT: mv a4, a1
+; RV32IF-NEXT: mv a2, a1
; RV32IF-NEXT: .LBB9_4:
-; RV32IF-NEXT: and a0, a2, a0
+; RV32IF-NEXT: lui a1, %hi(.LCPI9_0)
+; RV32IF-NEXT: flw fa5, %lo(.LCPI9_0)(a1)
+; RV32IF-NEXT: flt.s a3, fa5, fs0
; RV32IF-NEXT: beqz a3, .LBB9_6
; RV32IF-NEXT: # %bb.5:
-; RV32IF-NEXT: addi a4, a5, -1
+; RV32IF-NEXT: addi a2, a4, -1
; RV32IF-NEXT: .LBB9_6:
-; RV32IF-NEXT: and a1, a2, a4
+; RV32IF-NEXT: feq.s a1, fs0, fs0
+; RV32IF-NEXT: neg a4, a1
+; RV32IF-NEXT: and a1, a4, a2
+; RV32IF-NEXT: neg a2, s0
+; RV32IF-NEXT: and a0, a2, a0
+; RV32IF-NEXT: neg a2, a3
+; RV32IF-NEXT: or a0, a2, a0
+; RV32IF-NEXT: and a0, a4, a0
; RV32IF-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IF-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
-; RV32IF-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
-; RV32IF-NEXT: flw fs0, 0(sp) # 4-byte Folded Reload
+; RV32IF-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload
; RV32IF-NEXT: addi sp, sp, 16
; RV32IF-NEXT: ret
;
@@ -641,23 +635,23 @@ define i64 @test_trunc_si64(float %x) nounwind {
; RV32IZFINX-NEXT: lui a2, %hi(.LCPI9_0)
; RV32IZFINX-NEXT: lw a2, %lo(.LCPI9_0)(a2)
; RV32IZFINX-NEXT: and a0, s2, a0
-; RV32IZFINX-NEXT: flt.s a3, a2, s0
-; RV32IZFINX-NEXT: neg a2, a3
+; RV32IZFINX-NEXT: flt.s a4, a2, s0
+; RV32IZFINX-NEXT: neg a2, a4
; RV32IZFINX-NEXT: or a0, a2, a0
; RV32IZFINX-NEXT: feq.s a2, s0, s0
; RV32IZFINX-NEXT: neg a2, a2
; RV32IZFINX-NEXT: lui a5, 524288
-; RV32IZFINX-NEXT: lui a4, 524288
+; RV32IZFINX-NEXT: lui a3, 524288
; RV32IZFINX-NEXT: beqz s1, .LBB9_4
; RV32IZFINX-NEXT: # %bb.3:
-; RV32IZFINX-NEXT: mv a4, a1
+; RV32IZFINX-NEXT: mv a3, a1
; RV32IZFINX-NEXT: .LBB9_4:
; RV32IZFINX-NEXT: and a0, a2, a0
-; RV32IZFINX-NEXT: beqz a3, .LBB9_6
+; RV32IZFINX-NEXT: beqz a4, .LBB9_6
; RV32IZFINX-NEXT: # %bb.5:
-; RV32IZFINX-NEXT: addi a4, a5, -1
+; RV32IZFINX-NEXT: addi a3, a5, -1
; RV32IZFINX-NEXT: .LBB9_6:
-; RV32IZFINX-NEXT: and a1, a2, a4
+; RV32IZFINX-NEXT: and a1, a2, a3
; RV32IZFINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IZFINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; RV32IZFINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
@@ -823,8 +817,7 @@ define i64 @test_round_si64(float %x) nounwind {
; RV32IF-NEXT: addi sp, sp, -16
; RV32IF-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32IF-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
-; RV32IF-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
-; RV32IF-NEXT: fsw fs0, 0(sp) # 4-byte Folded Spill
+; RV32IF-NEXT: fsw fs0, 4(sp) # 4-byte Folded Spill
; RV32IF-NEXT: fmv.s fs0, fa0
; RV32IF-NEXT: lui a0, 307200
; RV32IF-NEXT: fmv.w.x fa5, a0
@@ -839,33 +832,32 @@ define i64 @test_round_si64(float %x) nounwind {
; RV32IF-NEXT: lui a0, 913408
; RV32IF-NEXT: fmv.w.x fa5, a0
; RV32IF-NEXT: fle.s s0, fa5, fs0
-; RV32IF-NEXT: neg s1, s0
; RV32IF-NEXT: fmv.s fa0, fs0
; RV32IF-NEXT: call __fixsfdi
-; RV32IF-NEXT: lui a2, %hi(.LCPI13_0)
-; RV32IF-NEXT: flw fa5, %lo(.LCPI13_0)(a2)
-; RV32IF-NEXT: and a0, s1, a0
-; RV32IF-NEXT: flt.s a3, fa5, fs0
-; RV32IF-NEXT: neg a2, a3
-; RV32IF-NEXT: or a0, a2, a0
-; RV32IF-NEXT: feq.s a2, fs0, fs0
-; RV32IF-NEXT: neg a2, a2
-; RV32IF-NEXT: lui a5, 524288
; RV32IF-NEXT: lui a4, 524288
+; RV32IF-NEXT: lui a2, 524288
; RV32IF-NEXT: beqz s0, .LBB13_4
; RV32IF-NEXT: # %bb.3:
-; RV32IF-NEXT: mv a4, a1
+; RV32IF-NEXT: mv a2, a1
; RV32IF-NEXT: .LBB13_4:
-; RV32IF-NEXT: and a0, a2, a0
+; RV32IF-NEXT: lui a1, %hi(.LCPI13_0)
+; RV32IF-NEXT: flw fa5, %lo(.LCPI13_0)(a1)
+; RV32IF-NEXT: flt.s a3, fa5, fs0
; RV32IF-NEXT: beqz a3, .LBB13_6
; RV32IF-NEXT: # %bb.5:
-; RV32IF-NEXT: addi a4, a5, -1
+; RV32IF-NEXT: addi a2, a4, -1
; RV32IF-NEXT: .LBB13_6:
-; RV32IF-NEXT: and a1, a2, a4
+; RV32IF-NEXT: feq.s a1, fs0, fs0
+; RV32IF-NEXT: neg a4, a1
+; RV32IF-NEXT: and a1, a4, a2
+; RV32IF-NEXT: neg a2, s0
+; RV32IF-NEXT: and a0, a2, a0
+; RV32IF-NEXT: neg a2, a3
+; RV32IF-NEXT: or a0, a2, a0
+; RV32IF-NEXT: and a0, a4, a0
; RV32IF-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IF-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
-; RV32IF-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
-; RV32IF-NEXT: flw fs0, 0(sp) # 4-byte Folded Reload
+; RV32IF-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload
; RV32IF-NEXT: addi sp, sp, 16
; RV32IF-NEXT: ret
;
@@ -903,23 +895,23 @@ define i64 @test_round_si64(float %x) nounwind {
; RV32IZFINX-NEXT: lui a2, %hi(.LCPI13_0)
; RV32IZFINX-NEXT: lw a2, %lo(.LCPI13_0)(a2)
; RV32IZFINX-NEXT: and a0, s2, a0
-; RV32IZFINX-NEXT: flt.s a3, a2, s0
-; RV32IZFINX-NEXT: neg a2, a3
+; RV32IZFINX-NEXT: flt.s a4, a2, s0
+; RV32IZFINX-NEXT: neg a2, a4
; RV32IZFINX-NEXT: or a0, a2, a0
; RV32IZFINX-NEXT: feq.s a2, s0, s0
; RV32IZFINX-NEXT: neg a2, a2
; RV32IZFINX-NEXT: lui a5, 524288
-; RV32IZFINX-NEXT: lui a4, 524288
+; RV32IZFINX-NEXT: lui a3, 524288
; RV32IZFINX-NEXT: beqz s1, .LBB13_4
; RV32IZFINX-NEXT: # %bb.3:
-; RV32IZFINX-NEXT: mv a4, a1
+; RV32IZFINX-NEXT: mv a3, a1
; RV32IZFINX-NEXT: .LBB13_4:
; RV32IZFINX-NEXT: and a0, a2, a0
-; RV32IZFINX-NEXT: beqz a3, .LBB13_6
+; RV32IZFINX-NEXT: beqz a4, .LBB13_6
; RV32IZFINX-NEXT: # %bb.5:
-; RV32IZFINX-NEXT: addi a4, a5, -1
+; RV32IZFINX-NEXT: addi a3, a5, -1
; RV32IZFINX-NEXT: .LBB13_6:
-; RV32IZFINX-NEXT: and a1, a2, a4
+; RV32IZFINX-NEXT: and a1, a2, a3
; RV32IZFINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IZFINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; RV32IZFINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
@@ -1085,8 +1077,7 @@ define i64 @test_roundeven_si64(float %x) nounwind {
; RV32IF-NEXT: addi sp, sp, -16
; RV32IF-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32IF-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
-; RV32IF-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
-; RV32IF-NEXT: fsw fs0, 0(sp) # 4-byte Folded Spill
+; RV32IF-NEXT: fsw fs0, 4(sp) # 4-byte Folded Spill
; RV32IF-NEXT: fmv.s fs0, fa0
; RV32IF-NEXT: lui a0, 307200
; RV32IF-NEXT: fmv.w.x fa5, a0
@@ -1101,33 +1092,32 @@ define i64 @test_roundeven_si64(float %x) nounwind {
; RV32IF-NEXT: lui a0, 913408
; RV32IF-NEXT: fmv.w.x fa5, a0
; RV32IF-NEXT: fle.s s0, fa5, fs0
-; RV32IF-NEXT: neg s1, s0
; RV32IF-NEXT: fmv.s fa0, fs0
; RV32IF-NEXT: call __fixsfdi
-; RV32IF-NEXT: lui a2, %hi(.LCPI17_0)
-; RV32IF-NEXT: flw fa5, %lo(.LCPI17_0)(a2)
-; RV32IF-NEXT: and a0, s1, a0
-; RV32IF-NEXT: flt.s a3, fa5, fs0
-; RV32IF-NEXT: neg a2, a3
-; RV32IF-NEXT: or a0, a2, a0
-; RV32IF-NEXT: feq.s a2, fs0, fs0
-; RV32IF-NEXT: neg a2, a2
-; RV32IF-NEXT: lui a5, 524288
; RV32IF-NEXT: lui a4, 524288
+; RV32IF-NEXT: lui a2, 524288
; RV32IF-NEXT: beqz s0, .LBB17_4
; RV32IF-NEXT: # %bb.3:
-; RV32IF-NEXT: mv a4, a1
+; RV32IF-NEXT: mv a2, a1
; RV32IF-NEXT: .LBB17_4:
-; RV32IF-NEXT: and a0, a2, a0
+; RV32IF-NEXT: lui a1, %hi(.LCPI17_0)
+; RV32IF-NEXT: flw fa5, %lo(.LCPI17_0)(a1)
+; RV32IF-NEXT: flt.s a3, fa5, fs0
; RV32IF-NEXT: beqz a3, .LBB17_6
; RV32IF-NEXT: # %bb.5:
-; RV32IF-NEXT: addi a4, a5, -1
+; RV32IF-NEXT: addi a2, a4, -1
; RV32IF-NEXT: .LBB17_6:
-; RV32IF-NEXT: and a1, a2, a4
+; RV32IF-NEXT: feq.s a1, fs0, fs0
+; RV32IF-NEXT: neg a4, a1
+; RV32IF-NEXT: and a1, a4, a2
+; RV32IF-NEXT: neg a2, s0
+; RV32IF-NEXT: and a0, a2, a0
+; RV32IF-NEXT: neg a2, a3
+; RV32IF-NEXT: or a0, a2, a0
+; RV32IF-NEXT: and a0, a4, a0
; RV32IF-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IF-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
-; RV32IF-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
-; RV32IF-NEXT: flw fs0, 0(sp) # 4-byte Folded Reload
+; RV32IF-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload
; RV32IF-NEXT: addi sp, sp, 16
; RV32IF-NEXT: ret
;
@@ -1165,23 +1155,23 @@ define i64 @test_roundeven_si64(float %x) nounwind {
; RV32IZFINX-NEXT: lui a2, %hi(.LCPI17_0)
; RV32IZFINX-NEXT: lw a2, %lo(.LCPI17_0)(a2)
; RV32IZFINX-NEXT: and a0, s2, a0
-; RV32IZFINX-NEXT: flt.s a3, a2, s0
-; RV32IZFINX-NEXT: neg a2, a3
+; RV32IZFINX-NEXT: flt.s a4, a2, s0
+; RV32IZFINX-NEXT: neg a2, a4
; RV32IZFINX-NEXT: or a0, a2, a0
; RV32IZFINX-NEXT: feq.s a2, s0, s0
; RV32IZFINX-NEXT: neg a2, a2
; RV32IZFINX-NEXT: lui a5, 524288
-; RV32IZFINX-NEXT: lui a4, 524288
+; RV32IZFINX-NEXT: lui a3, 524288
; RV32IZFINX-NEXT: beqz s1, .LBB17_4
; RV32IZFINX-NEXT: # %bb.3:
-; RV32IZFINX-NEXT: mv a4, a1
+; RV32IZFINX-NEXT: mv a3, a1
; RV32IZFINX-NEXT: .LBB17_4:
; RV32IZFINX-NEXT: and a0, a2, a0
-; RV32IZFINX-NEXT: beqz a3, .LBB17_6
+; RV32IZFINX-NEXT: beqz a4, .LBB17_6
; RV32IZFINX-NEXT: # %bb.5:
-; RV32IZFINX-NEXT: addi a4, a5, -1
+; RV32IZFINX-NEXT: addi a3, a5, -1
; RV32IZFINX-NEXT: .LBB17_6:
-; RV32IZFINX-NEXT: and a1, a2, a4
+; RV32IZFINX-NEXT: and a1, a2, a3
; RV32IZFINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IZFINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; RV32IZFINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
@@ -1347,8 +1337,7 @@ define i64 @test_rint_si64(float %x) nounwind {
; RV32IF-NEXT: addi sp, sp, -16
; RV32IF-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32IF-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
-; RV32IF-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
-; RV32IF-NEXT: fsw fs0, 0(sp) # 4-byte Folded Spill
+; RV32IF-NEXT: fsw fs0, 4(sp) # 4-byte Folded Spill
; RV32IF-NEXT: fmv.s fs0, fa0
; RV32IF-NEXT: lui a0, 307200
; RV32IF-NEXT: fmv.w.x fa5, a0
@@ -1363,33 +1352,32 @@ define i64 @test_rint_si64(float %x) nounwind {
; RV32IF-NEXT: lui a0, 913408
; RV32IF-NEXT: fmv.w.x fa5, a0
; RV32IF-NEXT: fle.s s0, fa5, fs0
-; RV32IF-NEXT: neg s1, s0
; RV32IF-NEXT: fmv.s fa0, fs0
; RV32IF-NEXT: call __fixsfdi
-; RV32IF-NEXT: lui a2, %hi(.LCPI21_0)
-; RV32IF-NEXT: flw fa5, %lo(.LCPI21_0)(a2)
-; RV32IF-NEXT: and a0, s1, a0
-; RV32IF-NEXT: flt.s a3, fa5, fs0
-; RV32IF-NEXT: neg a2, a3
-; RV32IF-NEXT: or a0, a2, a0
-; RV32IF-NEXT: feq.s a2, fs0, fs0
-; RV32IF-NEXT: neg a2, a2
-; RV32IF-NEXT: lui a5, 524288
; RV32IF-NEXT: lui a4, 524288
+; RV32IF-NEXT: lui a2, 524288
; RV32IF-NEXT: beqz s0, .LBB21_4
; RV32IF-NEXT: # %bb.3:
-; RV32IF-NEXT: mv a4, a1
+; RV32IF-NEXT: mv a2, a1
; RV32IF-NEXT: .LBB21_4:
-; RV32IF-NEXT: and a0, a2, a0
+; RV32IF-NEXT: lui a1, %hi(.LCPI21_0)
+; RV32IF-NEXT: flw fa5, %lo(.LCPI21_0)(a1)
+; RV32IF-NEXT: flt.s a3, fa5, fs0
; RV32IF-NEXT: beqz a3, .LBB21_6
; RV32IF-NEXT: # %bb.5:
-; RV32IF-NEXT: addi a4, a5, -1
+; RV32IF-NEXT: addi a2, a4, -1
; RV32IF-NEXT: .LBB21_6:
-; RV32IF-NEXT: and a1, a2, a4
+; RV32IF-NEXT: feq.s a1, fs0, fs0
+; RV32IF-NEXT: neg a4, a1
+; RV32IF-NEXT: and a1, a4, a2
+; RV32IF-NEXT: neg a2, s0
+; RV32IF-NEXT: and a0, a2, a0
+; RV32IF-NEXT: neg a2, a3
+; RV32IF-NEXT: or a0, a2, a0
+; RV32IF-NEXT: and a0, a4, a0
; RV32IF-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IF-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
-; RV32IF-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
-; RV32IF-NEXT: flw fs0, 0(sp) # 4-byte Folded Reload
+; RV32IF-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload
; RV32IF-NEXT: addi sp, sp, 16
; RV32IF-NEXT: ret
;
@@ -1427,23 +1415,23 @@ define i64 @test_rint_si64(float %x) nounwind {
; RV32IZFINX-NEXT: lui a2, %hi(.LCPI21_0)
; RV32IZFINX-NEXT: lw a2, %lo(.LCPI21_0)(a2)
; RV32IZFINX-NEXT: and a0, s2, a0
-; RV32IZFINX-NEXT: flt.s a3, a2, s0
-; RV32IZFINX-NEXT: neg a2, a3
+; RV32IZFINX-NEXT: flt.s a4, a2, s0
+; RV32IZFINX-NEXT: neg a2, a4
; RV32IZFINX-NEXT: or a0, a2, a0
; RV32IZFINX-NEXT: feq.s a2, s0, s0
; RV32IZFINX-NEXT: neg a2, a2
; RV32IZFINX-NEXT: lui a5, 524288
-; RV32IZFINX-NEXT: lui a4, 524288
+; RV32IZFINX-NEXT: lui a3, 524288
; RV32IZFINX-NEXT: beqz s1, .LBB21_4
; RV32IZFINX-NEXT: # %bb.3:
-; RV32IZFINX-NEXT: mv a4, a1
+; RV32IZFINX-NEXT: mv a3, a1
; RV32IZFINX-NEXT: .LBB21_4:
; RV32IZFINX-NEXT: and a0, a2, a0
-; RV32IZFINX-NEXT: beqz a3, .LBB21_6
+; RV32IZFINX-NEXT: beqz a4, .LBB21_6
; RV32IZFINX-NEXT: # %bb.5:
-; RV32IZFINX-NEXT: addi a4, a5, -1
+; RV32IZFINX-NEXT: addi a3, a5, -1
; RV32IZFINX-NEXT: .LBB21_6:
-; RV32IZFINX-NEXT: and a1, a2, a4
+; RV32IZFINX-NEXT: and a1, a2, a3
; RV32IZFINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IZFINX-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; RV32IZFINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/forced-atomics.ll b/llvm/test/CodeGen/RISCV/forced-atomics.ll
index c303690aadff..f6a53a9d76dd 100644
--- a/llvm/test/CodeGen/RISCV/forced-atomics.ll
+++ b/llvm/test/CodeGen/RISCV/forced-atomics.ll
@@ -3567,8 +3567,8 @@ define i64 @rmw64_umax_seq_cst(ptr %p) nounwind {
; RV32-NEXT: # in Loop: Header=BB51_2 Depth=1
; RV32-NEXT: neg a3, a0
; RV32-NEXT: and a3, a3, a1
-; RV32-NEXT: sw a1, 4(sp)
; RV32-NEXT: sw a4, 0(sp)
+; RV32-NEXT: sw a1, 4(sp)
; RV32-NEXT: mv a1, sp
; RV32-NEXT: li a4, 5
; RV32-NEXT: li a5, 5
@@ -3659,8 +3659,8 @@ define i64 @rmw64_umin_seq_cst(ptr %p) nounwind {
; RV32-NEXT: # in Loop: Header=BB52_2 Depth=1
; RV32-NEXT: neg a3, a0
; RV32-NEXT: and a3, a3, a1
-; RV32-NEXT: sw a1, 4(sp)
; RV32-NEXT: sw a4, 0(sp)
+; RV32-NEXT: sw a1, 4(sp)
; RV32-NEXT: mv a1, sp
; RV32-NEXT: li a4, 5
; RV32-NEXT: li a5, 5
diff --git a/llvm/test/CodeGen/RISCV/fpclamptosat.ll b/llvm/test/CodeGen/RISCV/fpclamptosat.ll
index 06ab813faf02..deb5a6d4013d 100644
--- a/llvm/test/CodeGen/RISCV/fpclamptosat.ll
+++ b/llvm/test/CodeGen/RISCV/fpclamptosat.ll
@@ -114,8 +114,8 @@ define i32 @utest_f64i32(double %x) {
; RV32IF-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32IF-NEXT: .cfi_offset ra, -4
; RV32IF-NEXT: call __fixunsdfdi
-; RV32IF-NEXT: seqz a1, a1
; RV32IF-NEXT: sltiu a2, a0, -1
+; RV32IF-NEXT: seqz a1, a1
; RV32IF-NEXT: and a1, a1, a2
; RV32IF-NEXT: addi a1, a1, -1
; RV32IF-NEXT: or a0, a1, a0
@@ -429,8 +429,8 @@ define i32 @utesth_f16i32(half %x) {
; RV32-NEXT: .cfi_offset ra, -4
; RV32-NEXT: call __extendhfsf2
; RV32-NEXT: call __fixunssfdi
-; RV32-NEXT: seqz a1, a1
; RV32-NEXT: sltiu a2, a0, -1
+; RV32-NEXT: seqz a1, a1
; RV32-NEXT: and a1, a1, a2
; RV32-NEXT: addi a1, a1, -1
; RV32-NEXT: or a0, a1, a0
diff --git a/llvm/test/CodeGen/RISCV/half-convert.ll b/llvm/test/CodeGen/RISCV/half-convert.ll
index 277749c75bbb..31fb6e2ee9c8 100644
--- a/llvm/test/CodeGen/RISCV/half-convert.ll
+++ b/llvm/test/CodeGen/RISCV/half-convert.ll
@@ -2145,47 +2145,41 @@ define i64 @fcvt_l_h(half %a) nounwind {
define i64 @fcvt_l_h_sat(half %a) nounwind {
; RV32IZFH-LABEL: fcvt_l_h_sat:
; RV32IZFH: # %bb.0: # %start
-; RV32IZFH-NEXT: addi sp, sp, -32
-; RV32IZFH-NEXT: sw ra, 28(sp) # 4-byte Folded Spill
-; RV32IZFH-NEXT: sw s0, 24(sp) # 4-byte Folded Spill
-; RV32IZFH-NEXT: sw s1, 20(sp) # 4-byte Folded Spill
-; RV32IZFH-NEXT: sw s2, 16(sp) # 4-byte Folded Spill
-; RV32IZFH-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
-; RV32IZFH-NEXT: fsw fs0, 8(sp) # 4-byte Folded Spill
-; RV32IZFH-NEXT: lui a0, %hi(.LCPI10_0)
-; RV32IZFH-NEXT: flw fa5, %lo(.LCPI10_0)(a0)
+; RV32IZFH-NEXT: addi sp, sp, -16
+; RV32IZFH-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
+; RV32IZFH-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
+; RV32IZFH-NEXT: fsw fs0, 4(sp) # 4-byte Folded Spill
; RV32IZFH-NEXT: fcvt.s.h fs0, fa0
-; RV32IZFH-NEXT: flt.s s0, fa5, fs0
-; RV32IZFH-NEXT: neg s1, s0
; RV32IZFH-NEXT: lui a0, 913408
; RV32IZFH-NEXT: fmv.w.x fa5, a0
-; RV32IZFH-NEXT: fle.s s2, fa5, fs0
-; RV32IZFH-NEXT: neg s3, s2
+; RV32IZFH-NEXT: fle.s s0, fa5, fs0
; RV32IZFH-NEXT: fmv.s fa0, fs0
; RV32IZFH-NEXT: call __fixsfdi
-; RV32IZFH-NEXT: and a0, s3, a0
-; RV32IZFH-NEXT: or a0, s1, a0
-; RV32IZFH-NEXT: feq.s a2, fs0, fs0
-; RV32IZFH-NEXT: neg a2, a2
; RV32IZFH-NEXT: lui a4, 524288
-; RV32IZFH-NEXT: lui a3, 524288
-; RV32IZFH-NEXT: beqz s2, .LBB10_2
+; RV32IZFH-NEXT: lui a2, 524288
+; RV32IZFH-NEXT: beqz s0, .LBB10_2
; RV32IZFH-NEXT: # %bb.1: # %start
-; RV32IZFH-NEXT: mv a3, a1
+; RV32IZFH-NEXT: mv a2, a1
; RV32IZFH-NEXT: .LBB10_2: # %start
-; RV32IZFH-NEXT: and a0, a2, a0
-; RV32IZFH-NEXT: beqz s0, .LBB10_4
+; RV32IZFH-NEXT: lui a1, %hi(.LCPI10_0)
+; RV32IZFH-NEXT: flw fa5, %lo(.LCPI10_0)(a1)
+; RV32IZFH-NEXT: flt.s a3, fa5, fs0
+; RV32IZFH-NEXT: beqz a3, .LBB10_4
; RV32IZFH-NEXT: # %bb.3:
-; RV32IZFH-NEXT: addi a3, a4, -1
+; RV32IZFH-NEXT: addi a2, a4, -1
; RV32IZFH-NEXT: .LBB10_4: # %start
-; RV32IZFH-NEXT: and a1, a2, a3
-; RV32IZFH-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
-; RV32IZFH-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
-; RV32IZFH-NEXT: lw s1, 20(sp) # 4-byte Folded Reload
-; RV32IZFH-NEXT: lw s2, 16(sp) # 4-byte Folded Reload
-; RV32IZFH-NEXT: lw s3, 12(sp) # 4-byte Folded Reload
-; RV32IZFH-NEXT: flw fs0, 8(sp) # 4-byte Folded Reload
-; RV32IZFH-NEXT: addi sp, sp, 32
+; RV32IZFH-NEXT: feq.s a1, fs0, fs0
+; RV32IZFH-NEXT: neg a4, a1
+; RV32IZFH-NEXT: and a1, a4, a2
+; RV32IZFH-NEXT: neg a2, a3
+; RV32IZFH-NEXT: neg a3, s0
+; RV32IZFH-NEXT: and a0, a3, a0
+; RV32IZFH-NEXT: or a0, a2, a0
+; RV32IZFH-NEXT: and a0, a4, a0
+; RV32IZFH-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
+; RV32IZFH-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
+; RV32IZFH-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload
+; RV32IZFH-NEXT: addi sp, sp, 16
; RV32IZFH-NEXT: ret
;
; RV64IZFH-LABEL: fcvt_l_h_sat:
@@ -2199,47 +2193,41 @@ define i64 @fcvt_l_h_sat(half %a) nounwind {
;
; RV32IDZFH-LABEL: fcvt_l_h_sat:
; RV32IDZFH: # %bb.0: # %start
-; RV32IDZFH-NEXT: addi sp, sp, -32
-; RV32IDZFH-NEXT: sw ra, 28(sp) # 4-byte Folded Spill
-; RV32IDZFH-NEXT: sw s0, 24(sp) # 4-byte Folded Spill
-; RV32IDZFH-NEXT: sw s1, 20(sp) # 4-byte Folded Spill
-; RV32IDZFH-NEXT: sw s2, 16(sp) # 4-byte Folded Spill
-; RV32IDZFH-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
+; RV32IDZFH-NEXT: addi sp, sp, -16
+; RV32IDZFH-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
+; RV32IDZFH-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
; RV32IDZFH-NEXT: fsd fs0, 0(sp) # 8-byte Folded Spill
-; RV32IDZFH-NEXT: lui a0, %hi(.LCPI10_0)
-; RV32IDZFH-NEXT: flw fa5, %lo(.LCPI10_0)(a0)
; RV32IDZFH-NEXT: fcvt.s.h fs0, fa0
-; RV32IDZFH-NEXT: flt.s s0, fa5, fs0
-; RV32IDZFH-NEXT: neg s1, s0
; RV32IDZFH-NEXT: lui a0, 913408
; RV32IDZFH-NEXT: fmv.w.x fa5, a0
-; RV32IDZFH-NEXT: fle.s s2, fa5, fs0
-; RV32IDZFH-NEXT: neg s3, s2
+; RV32IDZFH-NEXT: fle.s s0, fa5, fs0
; RV32IDZFH-NEXT: fmv.s fa0, fs0
; RV32IDZFH-NEXT: call __fixsfdi
-; RV32IDZFH-NEXT: and a0, s3, a0
-; RV32IDZFH-NEXT: or a0, s1, a0
-; RV32IDZFH-NEXT: feq.s a2, fs0, fs0
-; RV32IDZFH-NEXT: neg a2, a2
; RV32IDZFH-NEXT: lui a4, 524288
-; RV32IDZFH-NEXT: lui a3, 524288
-; RV32IDZFH-NEXT: beqz s2, .LBB10_2
+; RV32IDZFH-NEXT: lui a2, 524288
+; RV32IDZFH-NEXT: beqz s0, .LBB10_2
; RV32IDZFH-NEXT: # %bb.1: # %start
-; RV32IDZFH-NEXT: mv a3, a1
+; RV32IDZFH-NEXT: mv a2, a1
; RV32IDZFH-NEXT: .LBB10_2: # %start
-; RV32IDZFH-NEXT: and a0, a2, a0
-; RV32IDZFH-NEXT: beqz s0, .LBB10_4
+; RV32IDZFH-NEXT: lui a1, %hi(.LCPI10_0)
+; RV32IDZFH-NEXT: flw fa5, %lo(.LCPI10_0)(a1)
+; RV32IDZFH-NEXT: flt.s a3, fa5, fs0
+; RV32IDZFH-NEXT: beqz a3, .LBB10_4
; RV32IDZFH-NEXT: # %bb.3:
-; RV32IDZFH-NEXT: addi a3, a4, -1
+; RV32IDZFH-NEXT: addi a2, a4, -1
; RV32IDZFH-NEXT: .LBB10_4: # %start
-; RV32IDZFH-NEXT: and a1, a2, a3
-; RV32IDZFH-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
-; RV32IDZFH-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
-; RV32IDZFH-NEXT: lw s1, 20(sp) # 4-byte Folded Reload
-; RV32IDZFH-NEXT: lw s2, 16(sp) # 4-byte Folded Reload
-; RV32IDZFH-NEXT: lw s3, 12(sp) # 4-byte Folded Reload
+; RV32IDZFH-NEXT: feq.s a1, fs0, fs0
+; RV32IDZFH-NEXT: neg a4, a1
+; RV32IDZFH-NEXT: and a1, a4, a2
+; RV32IDZFH-NEXT: neg a2, a3
+; RV32IDZFH-NEXT: neg a3, s0
+; RV32IDZFH-NEXT: and a0, a3, a0
+; RV32IDZFH-NEXT: or a0, a2, a0
+; RV32IDZFH-NEXT: and a0, a4, a0
+; RV32IDZFH-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
+; RV32IDZFH-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; RV32IDZFH-NEXT: fld fs0, 0(sp) # 8-byte Folded Reload
-; RV32IDZFH-NEXT: addi sp, sp, 32
+; RV32IDZFH-NEXT: addi sp, sp, 16
; RV32IDZFH-NEXT: ret
;
; RV64IDZFH-LABEL: fcvt_l_h_sat:
@@ -2515,47 +2503,41 @@ define i64 @fcvt_l_h_sat(half %a) nounwind {
;
; RV32ID-LABEL: fcvt_l_h_sat:
; RV32ID: # %bb.0: # %start
-; RV32ID-NEXT: addi sp, sp, -32
-; RV32ID-NEXT: sw ra, 28(sp) # 4-byte Folded Spill
-; RV32ID-NEXT: sw s0, 24(sp) # 4-byte Folded Spill
-; RV32ID-NEXT: sw s1, 20(sp) # 4-byte Folded Spill
-; RV32ID-NEXT: sw s2, 16(sp) # 4-byte Folded Spill
-; RV32ID-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
+; RV32ID-NEXT: addi sp, sp, -16
+; RV32ID-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
+; RV32ID-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
; RV32ID-NEXT: fsd fs0, 0(sp) # 8-byte Folded Spill
; RV32ID-NEXT: call __extendhfsf2
-; RV32ID-NEXT: lui a0, %hi(.LCPI10_0)
-; RV32ID-NEXT: flw fa5, %lo(.LCPI10_0)(a0)
; RV32ID-NEXT: fmv.s fs0, fa0
-; RV32ID-NEXT: flt.s s0, fa5, fa0
-; RV32ID-NEXT: neg s1, s0
; RV32ID-NEXT: lui a0, 913408
; RV32ID-NEXT: fmv.w.x fa5, a0
-; RV32ID-NEXT: fle.s s2, fa5, fa0
-; RV32ID-NEXT: neg s3, s2
+; RV32ID-NEXT: fle.s s0, fa5, fa0
; RV32ID-NEXT: call __fixsfdi
-; RV32ID-NEXT: and a0, s3, a0
-; RV32ID-NEXT: or a0, s1, a0
-; RV32ID-NEXT: feq.s a2, fs0, fs0
-; RV32ID-NEXT: neg a2, a2
; RV32ID-NEXT: lui a4, 524288
-; RV32ID-NEXT: lui a3, 524288
-; RV32ID-NEXT: beqz s2, .LBB10_2
+; RV32ID-NEXT: lui a2, 524288
+; RV32ID-NEXT: beqz s0, .LBB10_2
; RV32ID-NEXT: # %bb.1: # %start
-; RV32ID-NEXT: mv a3, a1
+; RV32ID-NEXT: mv a2, a1
; RV32ID-NEXT: .LBB10_2: # %start
-; RV32ID-NEXT: and a0, a2, a0
-; RV32ID-NEXT: beqz s0, .LBB10_4
+; RV32ID-NEXT: lui a1, %hi(.LCPI10_0)
+; RV32ID-NEXT: flw fa5, %lo(.LCPI10_0)(a1)
+; RV32ID-NEXT: flt.s a3, fa5, fs0
+; RV32ID-NEXT: beqz a3, .LBB10_4
; RV32ID-NEXT: # %bb.3:
-; RV32ID-NEXT: addi a3, a4, -1
+; RV32ID-NEXT: addi a2, a4, -1
; RV32ID-NEXT: .LBB10_4: # %start
-; RV32ID-NEXT: and a1, a2, a3
-; RV32ID-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
-; RV32ID-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
-; RV32ID-NEXT: lw s1, 20(sp) # 4-byte Folded Reload
-; RV32ID-NEXT: lw s2, 16(sp) # 4-byte Folded Reload
-; RV32ID-NEXT: lw s3, 12(sp) # 4-byte Folded Reload
+; RV32ID-NEXT: feq.s a1, fs0, fs0
+; RV32ID-NEXT: neg a4, a1
+; RV32ID-NEXT: and a1, a4, a2
+; RV32ID-NEXT: neg a2, s0
+; RV32ID-NEXT: and a0, a2, a0
+; RV32ID-NEXT: neg a2, a3
+; RV32ID-NEXT: or a0, a2, a0
+; RV32ID-NEXT: and a0, a4, a0
+; RV32ID-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
+; RV32ID-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; RV32ID-NEXT: fld fs0, 0(sp) # 8-byte Folded Reload
-; RV32ID-NEXT: addi sp, sp, 32
+; RV32ID-NEXT: addi sp, sp, 16
; RV32ID-NEXT: ret
;
; RV64ID-LABEL: fcvt_l_h_sat:
@@ -2574,47 +2556,41 @@ define i64 @fcvt_l_h_sat(half %a) nounwind {
;
; RV32IFZFHMIN-LABEL: fcvt_l_h_sat:
; RV32IFZFHMIN: # %bb.0: # %start
-; RV32IFZFHMIN-NEXT: addi sp, sp, -32
-; RV32IFZFHMIN-NEXT: sw ra, 28(sp) # 4-byte Folded Spill
-; RV32IFZFHMIN-NEXT: sw s0, 24(sp) # 4-byte Folded Spill
-; RV32IFZFHMIN-NEXT: sw s1, 20(sp) # 4-byte Folded Spill
-; RV32IFZFHMIN-NEXT: sw s2, 16(sp) # 4-byte Folded Spill
-; RV32IFZFHMIN-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
-; RV32IFZFHMIN-NEXT: fsw fs0, 8(sp) # 4-byte Folded Spill
-; RV32IFZFHMIN-NEXT: lui a0, %hi(.LCPI10_0)
-; RV32IFZFHMIN-NEXT: flw fa5, %lo(.LCPI10_0)(a0)
+; RV32IFZFHMIN-NEXT: addi sp, sp, -16
+; RV32IFZFHMIN-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
+; RV32IFZFHMIN-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
+; RV32IFZFHMIN-NEXT: fsw fs0, 4(sp) # 4-byte Folded Spill
; RV32IFZFHMIN-NEXT: fcvt.s.h fs0, fa0
-; RV32IFZFHMIN-NEXT: flt.s s0, fa5, fs0
-; RV32IFZFHMIN-NEXT: neg s1, s0
; RV32IFZFHMIN-NEXT: lui a0, 913408
; RV32IFZFHMIN-NEXT: fmv.w.x fa5, a0
-; RV32IFZFHMIN-NEXT: fle.s s2, fa5, fs0
-; RV32IFZFHMIN-NEXT: neg s3, s2
+; RV32IFZFHMIN-NEXT: fle.s s0, fa5, fs0
; RV32IFZFHMIN-NEXT: fmv.s fa0, fs0
; RV32IFZFHMIN-NEXT: call __fixsfdi
-; RV32IFZFHMIN-NEXT: and a0, s3, a0
-; RV32IFZFHMIN-NEXT: or a0, s1, a0
-; RV32IFZFHMIN-NEXT: feq.s a2, fs0, fs0
-; RV32IFZFHMIN-NEXT: neg a2, a2
; RV32IFZFHMIN-NEXT: lui a4, 524288
-; RV32IFZFHMIN-NEXT: lui a3, 524288
-; RV32IFZFHMIN-NEXT: beqz s2, .LBB10_2
+; RV32IFZFHMIN-NEXT: lui a2, 524288
+; RV32IFZFHMIN-NEXT: beqz s0, .LBB10_2
; RV32IFZFHMIN-NEXT: # %bb.1: # %start
-; RV32IFZFHMIN-NEXT: mv a3, a1
+; RV32IFZFHMIN-NEXT: mv a2, a1
; RV32IFZFHMIN-NEXT: .LBB10_2: # %start
-; RV32IFZFHMIN-NEXT: and a0, a2, a0
-; RV32IFZFHMIN-NEXT: beqz s0, .LBB10_4
+; RV32IFZFHMIN-NEXT: lui a1, %hi(.LCPI10_0)
+; RV32IFZFHMIN-NEXT: flw fa5, %lo(.LCPI10_0)(a1)
+; RV32IFZFHMIN-NEXT: flt.s a3, fa5, fs0
+; RV32IFZFHMIN-NEXT: beqz a3, .LBB10_4
; RV32IFZFHMIN-NEXT: # %bb.3:
-; RV32IFZFHMIN-NEXT: addi a3, a4, -1
+; RV32IFZFHMIN-NEXT: addi a2, a4, -1
; RV32IFZFHMIN-NEXT: .LBB10_4: # %start
-; RV32IFZFHMIN-NEXT: and a1, a2, a3
-; RV32IFZFHMIN-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
-; RV32IFZFHMIN-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
-; RV32IFZFHMIN-NEXT: lw s1, 20(sp) # 4-byte Folded Reload
-; RV32IFZFHMIN-NEXT: lw s2, 16(sp) # 4-byte Folded Reload
-; RV32IFZFHMIN-NEXT: lw s3, 12(sp) # 4-byte Folded Reload
-; RV32IFZFHMIN-NEXT: flw fs0, 8(sp) # 4-byte Folded Reload
-; RV32IFZFHMIN-NEXT: addi sp, sp, 32
+; RV32IFZFHMIN-NEXT: feq.s a1, fs0, fs0
+; RV32IFZFHMIN-NEXT: neg a4, a1
+; RV32IFZFHMIN-NEXT: and a1, a4, a2
+; RV32IFZFHMIN-NEXT: neg a2, a3
+; RV32IFZFHMIN-NEXT: neg a3, s0
+; RV32IFZFHMIN-NEXT: and a0, a3, a0
+; RV32IFZFHMIN-NEXT: or a0, a2, a0
+; RV32IFZFHMIN-NEXT: and a0, a4, a0
+; RV32IFZFHMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
+; RV32IFZFHMIN-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
+; RV32IFZFHMIN-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload
+; RV32IFZFHMIN-NEXT: addi sp, sp, 16
; RV32IFZFHMIN-NEXT: ret
;
; CHECK64-IZFHMIN-LABEL: fcvt_l_h_sat:
@@ -2629,47 +2605,41 @@ define i64 @fcvt_l_h_sat(half %a) nounwind {
;
; RV32IDZFHMIN-LABEL: fcvt_l_h_sat:
; RV32IDZFHMIN: # %bb.0: # %start
-; RV32IDZFHMIN-NEXT: addi sp, sp, -32
-; RV32IDZFHMIN-NEXT: sw ra, 28(sp) # 4-byte Folded Spill
-; RV32IDZFHMIN-NEXT: sw s0, 24(sp) # 4-byte Folded Spill
-; RV32IDZFHMIN-NEXT: sw s1, 20(sp) # 4-byte Folded Spill
-; RV32IDZFHMIN-NEXT: sw s2, 16(sp) # 4-byte Folded Spill
-; RV32IDZFHMIN-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
+; RV32IDZFHMIN-NEXT: addi sp, sp, -16
+; RV32IDZFHMIN-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
+; RV32IDZFHMIN-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
; RV32IDZFHMIN-NEXT: fsd fs0, 0(sp) # 8-byte Folded Spill
-; RV32IDZFHMIN-NEXT: lui a0, %hi(.LCPI10_0)
-; RV32IDZFHMIN-NEXT: flw fa5, %lo(.LCPI10_0)(a0)
; RV32IDZFHMIN-NEXT: fcvt.s.h fs0, fa0
-; RV32IDZFHMIN-NEXT: flt.s s0, fa5, fs0
-; RV32IDZFHMIN-NEXT: neg s1, s0
; RV32IDZFHMIN-NEXT: lui a0, 913408
; RV32IDZFHMIN-NEXT: fmv.w.x fa5, a0
-; RV32IDZFHMIN-NEXT: fle.s s2, fa5, fs0
-; RV32IDZFHMIN-NEXT: neg s3, s2
+; RV32IDZFHMIN-NEXT: fle.s s0, fa5, fs0
; RV32IDZFHMIN-NEXT: fmv.s fa0, fs0
; RV32IDZFHMIN-NEXT: call __fixsfdi
-; RV32IDZFHMIN-NEXT: and a0, s3, a0
-; RV32IDZFHMIN-NEXT: or a0, s1, a0
-; RV32IDZFHMIN-NEXT: feq.s a2, fs0, fs0
-; RV32IDZFHMIN-NEXT: neg a2, a2
; RV32IDZFHMIN-NEXT: lui a4, 524288
-; RV32IDZFHMIN-NEXT: lui a3, 524288
-; RV32IDZFHMIN-NEXT: beqz s2, .LBB10_2
+; RV32IDZFHMIN-NEXT: lui a2, 524288
+; RV32IDZFHMIN-NEXT: beqz s0, .LBB10_2
; RV32IDZFHMIN-NEXT: # %bb.1: # %start
-; RV32IDZFHMIN-NEXT: mv a3, a1
+; RV32IDZFHMIN-NEXT: mv a2, a1
; RV32IDZFHMIN-NEXT: .LBB10_2: # %start
-; RV32IDZFHMIN-NEXT: and a0, a2, a0
-; RV32IDZFHMIN-NEXT: beqz s0, .LBB10_4
+; RV32IDZFHMIN-NEXT: lui a1, %hi(.LCPI10_0)
+; RV32IDZFHMIN-NEXT: flw fa5, %lo(.LCPI10_0)(a1)
+; RV32IDZFHMIN-NEXT: flt.s a3, fa5, fs0
+; RV32IDZFHMIN-NEXT: beqz a3, .LBB10_4
; RV32IDZFHMIN-NEXT: # %bb.3:
-; RV32IDZFHMIN-NEXT: addi a3, a4, -1
+; RV32IDZFHMIN-NEXT: addi a2, a4, -1
; RV32IDZFHMIN-NEXT: .LBB10_4: # %start
-; RV32IDZFHMIN-NEXT: and a1, a2, a3
-; RV32IDZFHMIN-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
-; RV32IDZFHMIN-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
-; RV32IDZFHMIN-NEXT: lw s1, 20(sp) # 4-byte Folded Reload
-; RV32IDZFHMIN-NEXT: lw s2, 16(sp) # 4-byte Folded Reload
-; RV32IDZFHMIN-NEXT: lw s3, 12(sp) # 4-byte Folded Reload
+; RV32IDZFHMIN-NEXT: feq.s a1, fs0, fs0
+; RV32IDZFHMIN-NEXT: neg a4, a1
+; RV32IDZFHMIN-NEXT: and a1, a4, a2
+; RV32IDZFHMIN-NEXT: neg a2, a3
+; RV32IDZFHMIN-NEXT: neg a3, s0
+; RV32IDZFHMIN-NEXT: and a0, a3, a0
+; RV32IDZFHMIN-NEXT: or a0, a2, a0
+; RV32IDZFHMIN-NEXT: and a0, a4, a0
+; RV32IDZFHMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
+; RV32IDZFHMIN-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
; RV32IDZFHMIN-NEXT: fld fs0, 0(sp) # 8-byte Folded Reload
-; RV32IDZFHMIN-NEXT: addi sp, sp, 32
+; RV32IDZFHMIN-NEXT: addi sp, sp, 16
; RV32IDZFHMIN-NEXT: ret
;
; CHECK32-IZHINXMIN-LABEL: fcvt_l_h_sat:
diff --git a/llvm/test/CodeGen/RISCV/half-round-conv-sat.ll b/llvm/test/CodeGen/RISCV/half-round-conv-sat.ll
index 9c95210bfa7c..04a8a66f4459 100644
--- a/llvm/test/CodeGen/RISCV/half-round-conv-sat.ll
+++ b/llvm/test/CodeGen/RISCV/half-round-conv-sat.ll
@@ -108,40 +108,38 @@ define i64 @test_floor_si64(half %x) nounwind {
; RV32IZFH-NEXT: addi sp, sp, -16
; RV32IZFH-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32IZFH-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
-; RV32IZFH-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
-; RV32IZFH-NEXT: fsw fs0, 0(sp) # 4-byte Folded Spill
+; RV32IZFH-NEXT: fsw fs0, 4(sp) # 4-byte Folded Spill
; RV32IZFH-NEXT: fcvt.s.h fs0, fa0
; RV32IZFH-NEXT: lui a0, 913408
; RV32IZFH-NEXT: fmv.w.x fa5, a0
; RV32IZFH-NEXT: fle.s s0, fa5, fs0
-; RV32IZFH-NEXT: neg s1, s0
; RV32IZFH-NEXT: fmv.s fa0, fs0
; RV32IZFH-NEXT: call __fixsfdi
-; RV32IZFH-NEXT: lui a2, %hi(.LCPI1_1)
-; RV32IZFH-NEXT: flw fa5, %lo(.LCPI1_1)(a2)
-; RV32IZFH-NEXT: and a0, s1, a0
-; RV32IZFH-NEXT: flt.s a3, fa5, fs0
-; RV32IZFH-NEXT: neg a2, a3
-; RV32IZFH-NEXT: or a0, a2, a0
-; RV32IZFH-NEXT: feq.s a2, fs0, fs0
-; RV32IZFH-NEXT: neg a2, a2
-; RV32IZFH-NEXT: lui a5, 524288
; RV32IZFH-NEXT: lui a4, 524288
+; RV32IZFH-NEXT: lui a2, 524288
; RV32IZFH-NEXT: beqz s0, .LBB1_4
; RV32IZFH-NEXT: # %bb.3:
-; RV32IZFH-NEXT: mv a4, a1
+; RV32IZFH-NEXT: mv a2, a1
; RV32IZFH-NEXT: .LBB1_4:
+; RV32IZFH-NEXT: lui a1, %hi(.LCPI1_1)
+; RV32IZFH-NEXT: flw fa5, %lo(.LCPI1_1)(a1)
+; RV32IZFH-NEXT: flt.s a3, fa5, fs0
+; RV32IZFH-NEXT: beqz a3, .LBB1_6
+; RV32IZFH-NEXT: # %bb.5:
+; RV32IZFH-NEXT: addi a2, a4, -1
+; RV32IZFH-NEXT: .LBB1_6:
+; RV32IZFH-NEXT: feq.s a1, fs0, fs0
+; RV32IZFH-NEXT: neg a4, a1
+; RV32IZFH-NEXT: and a1, a4, a2
+; RV32IZFH-NEXT: neg a2, s0
; RV32IZFH-NEXT: and a0, a2, a0
+; RV32IZFH-NEXT: neg a2, a3
+; RV32IZFH-NEXT: or a0, a2, a0
+; RV32IZFH-NEXT: and a0, a4, a0
; RV32IZFH-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IZFH-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
-; RV32IZFH-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
-; RV32IZFH-NEXT: flw fs0, 0(sp) # 4-byte Folded Reload
+; RV32IZFH-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload
; RV32IZFH-NEXT: addi sp, sp, 16
-; RV32IZFH-NEXT: beqz a3, .LBB1_6
-; RV32IZFH-NEXT: # %bb.5:
-; RV32IZFH-NEXT: addi a4, a5, -1
-; RV32IZFH-NEXT: .LBB1_6:
-; RV32IZFH-NEXT: and a1, a2, a4
; RV32IZFH-NEXT: ret
;
; RV64IZFH-LABEL: test_floor_si64:
@@ -179,16 +177,16 @@ define i64 @test_floor_si64(half %x) nounwind {
; RV32IZHINX-NEXT: lui a2, %hi(.LCPI1_1)
; RV32IZHINX-NEXT: lw a2, %lo(.LCPI1_1)(a2)
; RV32IZHINX-NEXT: and a0, s2, a0
-; RV32IZHINX-NEXT: flt.s a3, a2, s0
-; RV32IZHINX-NEXT: neg a2, a3
+; RV32IZHINX-NEXT: flt.s a4, a2, s0
+; RV32IZHINX-NEXT: neg a2, a4
; RV32IZHINX-NEXT: or a0, a2, a0
; RV32IZHINX-NEXT: feq.s a2, s0, s0
; RV32IZHINX-NEXT: neg a2, a2
; RV32IZHINX-NEXT: lui a5, 524288
-; RV32IZHINX-NEXT: lui a4, 524288
+; RV32IZHINX-NEXT: lui a3, 524288
; RV32IZHINX-NEXT: beqz s1, .LBB1_4
; RV32IZHINX-NEXT: # %bb.3:
-; RV32IZHINX-NEXT: mv a4, a1
+; RV32IZHINX-NEXT: mv a3, a1
; RV32IZHINX-NEXT: .LBB1_4:
; RV32IZHINX-NEXT: and a0, a2, a0
; RV32IZHINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
@@ -196,11 +194,11 @@ define i64 @test_floor_si64(half %x) nounwind {
; RV32IZHINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
; RV32IZHINX-NEXT: lw s2, 0(sp) # 4-byte Folded Reload
; RV32IZHINX-NEXT: addi sp, sp, 16
-; RV32IZHINX-NEXT: beqz a3, .LBB1_6
+; RV32IZHINX-NEXT: beqz a4, .LBB1_6
; RV32IZHINX-NEXT: # %bb.5:
-; RV32IZHINX-NEXT: addi a4, a5, -1
+; RV32IZHINX-NEXT: addi a3, a5, -1
; RV32IZHINX-NEXT: .LBB1_6:
-; RV32IZHINX-NEXT: and a1, a2, a4
+; RV32IZHINX-NEXT: and a1, a2, a3
; RV32IZHINX-NEXT: ret
;
; RV64IZHINX-LABEL: test_floor_si64:
@@ -238,41 +236,39 @@ define i64 @test_floor_si64(half %x) nounwind {
; RV32IZFHMIN-NEXT: addi sp, sp, -16
; RV32IZFHMIN-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32IZFHMIN-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
-; RV32IZFHMIN-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
-; RV32IZFHMIN-NEXT: fsw fs0, 0(sp) # 4-byte Folded Spill
+; RV32IZFHMIN-NEXT: fsw fs0, 4(sp) # 4-byte Folded Spill
; RV32IZFHMIN-NEXT: fcvt.h.s fa5, fa5
; RV32IZFHMIN-NEXT: fcvt.s.h fs0, fa5
; RV32IZFHMIN-NEXT: lui a0, 913408
; RV32IZFHMIN-NEXT: fmv.w.x fa5, a0
; RV32IZFHMIN-NEXT: fle.s s0, fa5, fs0
-; RV32IZFHMIN-NEXT: neg s1, s0
; RV32IZFHMIN-NEXT: fmv.s fa0, fs0
; RV32IZFHMIN-NEXT: call __fixsfdi
-; RV32IZFHMIN-NEXT: lui a2, %hi(.LCPI1_0)
-; RV32IZFHMIN-NEXT: flw fa5, %lo(.LCPI1_0)(a2)
-; RV32IZFHMIN-NEXT: and a0, s1, a0
-; RV32IZFHMIN-NEXT: flt.s a3, fa5, fs0
-; RV32IZFHMIN-NEXT: neg a2, a3
-; RV32IZFHMIN-NEXT: or a0, a2, a0
-; RV32IZFHMIN-NEXT: feq.s a2, fs0, fs0
-; RV32IZFHMIN-NEXT: neg a2, a2
-; RV32IZFHMIN-NEXT: lui a5, 524288
; RV32IZFHMIN-NEXT: lui a4, 524288
+; RV32IZFHMIN-NEXT: lui a2, 524288
; RV32IZFHMIN-NEXT: beqz s0, .LBB1_4
; RV32IZFHMIN-NEXT: # %bb.3:
-; RV32IZFHMIN-NEXT: mv a4, a1
+; RV32IZFHMIN-NEXT: mv a2, a1
; RV32IZFHMIN-NEXT: .LBB1_4:
+; RV32IZFHMIN-NEXT: lui a1, %hi(.LCPI1_0)
+; RV32IZFHMIN-NEXT: flw fa5, %lo(.LCPI1_0)(a1)
+; RV32IZFHMIN-NEXT: flt.s a3, fa5, fs0
+; RV32IZFHMIN-NEXT: beqz a3, .LBB1_6
+; RV32IZFHMIN-NEXT: # %bb.5:
+; RV32IZFHMIN-NEXT: addi a2, a4, -1
+; RV32IZFHMIN-NEXT: .LBB1_6:
+; RV32IZFHMIN-NEXT: feq.s a1, fs0, fs0
+; RV32IZFHMIN-NEXT: neg a4, a1
+; RV32IZFHMIN-NEXT: and a1, a4, a2
+; RV32IZFHMIN-NEXT: neg a2, s0
; RV32IZFHMIN-NEXT: and a0, a2, a0
+; RV32IZFHMIN-NEXT: neg a2, a3
+; RV32IZFHMIN-NEXT: or a0, a2, a0
+; RV32IZFHMIN-NEXT: and a0, a4, a0
; RV32IZFHMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IZFHMIN-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
-; RV32IZFHMIN-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
-; RV32IZFHMIN-NEXT: flw fs0, 0(sp) # 4-byte Folded Reload
+; RV32IZFHMIN-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload
; RV32IZFHMIN-NEXT: addi sp, sp, 16
-; RV32IZFHMIN-NEXT: beqz a3, .LBB1_6
-; RV32IZFHMIN-NEXT: # %bb.5:
-; RV32IZFHMIN-NEXT: addi a4, a5, -1
-; RV32IZFHMIN-NEXT: .LBB1_6:
-; RV32IZFHMIN-NEXT: and a1, a2, a4
; RV32IZFHMIN-NEXT: ret
;
; RV64IZFHMIN-LABEL: test_floor_si64:
@@ -324,16 +320,16 @@ define i64 @test_floor_si64(half %x) nounwind {
; RV32IZHINXMIN-NEXT: lui a2, %hi(.LCPI1_0)
; RV32IZHINXMIN-NEXT: lw a2, %lo(.LCPI1_0)(a2)
; RV32IZHINXMIN-NEXT: and a0, s2, a0
-; RV32IZHINXMIN-NEXT: flt.s a3, a2, s0
-; RV32IZHINXMIN-NEXT: neg a2, a3
+; RV32IZHINXMIN-NEXT: flt.s a4, a2, s0
+; RV32IZHINXMIN-NEXT: neg a2, a4
; RV32IZHINXMIN-NEXT: or a0, a2, a0
; RV32IZHINXMIN-NEXT: feq.s a2, s0, s0
; RV32IZHINXMIN-NEXT: neg a2, a2
; RV32IZHINXMIN-NEXT: lui a5, 524288
-; RV32IZHINXMIN-NEXT: lui a4, 524288
+; RV32IZHINXMIN-NEXT: lui a3, 524288
; RV32IZHINXMIN-NEXT: beqz s1, .LBB1_4
; RV32IZHINXMIN-NEXT: # %bb.3:
-; RV32IZHINXMIN-NEXT: mv a4, a1
+; RV32IZHINXMIN-NEXT: mv a3, a1
; RV32IZHINXMIN-NEXT: .LBB1_4:
; RV32IZHINXMIN-NEXT: and a0, a2, a0
; RV32IZHINXMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
@@ -341,11 +337,11 @@ define i64 @test_floor_si64(half %x) nounwind {
; RV32IZHINXMIN-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
; RV32IZHINXMIN-NEXT: lw s2, 0(sp) # 4-byte Folded Reload
; RV32IZHINXMIN-NEXT: addi sp, sp, 16
-; RV32IZHINXMIN-NEXT: beqz a3, .LBB1_6
+; RV32IZHINXMIN-NEXT: beqz a4, .LBB1_6
; RV32IZHINXMIN-NEXT: # %bb.5:
-; RV32IZHINXMIN-NEXT: addi a4, a5, -1
+; RV32IZHINXMIN-NEXT: addi a3, a5, -1
; RV32IZHINXMIN-NEXT: .LBB1_6:
-; RV32IZHINXMIN-NEXT: and a1, a2, a4
+; RV32IZHINXMIN-NEXT: and a1, a2, a3
; RV32IZHINXMIN-NEXT: ret
;
; RV64IZHINXMIN-LABEL: test_floor_si64:
@@ -824,40 +820,38 @@ define i64 @test_ceil_si64(half %x) nounwind {
; RV32IZFH-NEXT: addi sp, sp, -16
; RV32IZFH-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32IZFH-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
-; RV32IZFH-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
-; RV32IZFH-NEXT: fsw fs0, 0(sp) # 4-byte Folded Spill
+; RV32IZFH-NEXT: fsw fs0, 4(sp) # 4-byte Folded Spill
; RV32IZFH-NEXT: fcvt.s.h fs0, fa0
; RV32IZFH-NEXT: lui a0, 913408
; RV32IZFH-NEXT: fmv.w.x fa5, a0
; RV32IZFH-NEXT: fle.s s0, fa5, fs0
-; RV32IZFH-NEXT: neg s1, s0
; RV32IZFH-NEXT: fmv.s fa0, fs0
; RV32IZFH-NEXT: call __fixsfdi
-; RV32IZFH-NEXT: lui a2, %hi(.LCPI5_1)
-; RV32IZFH-NEXT: flw fa5, %lo(.LCPI5_1)(a2)
-; RV32IZFH-NEXT: and a0, s1, a0
-; RV32IZFH-NEXT: flt.s a3, fa5, fs0
-; RV32IZFH-NEXT: neg a2, a3
-; RV32IZFH-NEXT: or a0, a2, a0
-; RV32IZFH-NEXT: feq.s a2, fs0, fs0
-; RV32IZFH-NEXT: neg a2, a2
-; RV32IZFH-NEXT: lui a5, 524288
; RV32IZFH-NEXT: lui a4, 524288
+; RV32IZFH-NEXT: lui a2, 524288
; RV32IZFH-NEXT: beqz s0, .LBB5_4
; RV32IZFH-NEXT: # %bb.3:
-; RV32IZFH-NEXT: mv a4, a1
+; RV32IZFH-NEXT: mv a2, a1
; RV32IZFH-NEXT: .LBB5_4:
+; RV32IZFH-NEXT: lui a1, %hi(.LCPI5_1)
+; RV32IZFH-NEXT: flw fa5, %lo(.LCPI5_1)(a1)
+; RV32IZFH-NEXT: flt.s a3, fa5, fs0
+; RV32IZFH-NEXT: beqz a3, .LBB5_6
+; RV32IZFH-NEXT: # %bb.5:
+; RV32IZFH-NEXT: addi a2, a4, -1
+; RV32IZFH-NEXT: .LBB5_6:
+; RV32IZFH-NEXT: feq.s a1, fs0, fs0
+; RV32IZFH-NEXT: neg a4, a1
+; RV32IZFH-NEXT: and a1, a4, a2
+; RV32IZFH-NEXT: neg a2, s0
; RV32IZFH-NEXT: and a0, a2, a0
+; RV32IZFH-NEXT: neg a2, a3
+; RV32IZFH-NEXT: or a0, a2, a0
+; RV32IZFH-NEXT: and a0, a4, a0
; RV32IZFH-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IZFH-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
-; RV32IZFH-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
-; RV32IZFH-NEXT: flw fs0, 0(sp) # 4-byte Folded Reload
+; RV32IZFH-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload
; RV32IZFH-NEXT: addi sp, sp, 16
-; RV32IZFH-NEXT: beqz a3, .LBB5_6
-; RV32IZFH-NEXT: # %bb.5:
-; RV32IZFH-NEXT: addi a4, a5, -1
-; RV32IZFH-NEXT: .LBB5_6:
-; RV32IZFH-NEXT: and a1, a2, a4
; RV32IZFH-NEXT: ret
;
; RV64IZFH-LABEL: test_ceil_si64:
@@ -895,16 +889,16 @@ define i64 @test_ceil_si64(half %x) nounwind {
; RV32IZHINX-NEXT: lui a2, %hi(.LCPI5_1)
; RV32IZHINX-NEXT: lw a2, %lo(.LCPI5_1)(a2)
; RV32IZHINX-NEXT: and a0, s2, a0
-; RV32IZHINX-NEXT: flt.s a3, a2, s0
-; RV32IZHINX-NEXT: neg a2, a3
+; RV32IZHINX-NEXT: flt.s a4, a2, s0
+; RV32IZHINX-NEXT: neg a2, a4
; RV32IZHINX-NEXT: or a0, a2, a0
; RV32IZHINX-NEXT: feq.s a2, s0, s0
; RV32IZHINX-NEXT: neg a2, a2
; RV32IZHINX-NEXT: lui a5, 524288
-; RV32IZHINX-NEXT: lui a4, 524288
+; RV32IZHINX-NEXT: lui a3, 524288
; RV32IZHINX-NEXT: beqz s1, .LBB5_4
; RV32IZHINX-NEXT: # %bb.3:
-; RV32IZHINX-NEXT: mv a4, a1
+; RV32IZHINX-NEXT: mv a3, a1
; RV32IZHINX-NEXT: .LBB5_4:
; RV32IZHINX-NEXT: and a0, a2, a0
; RV32IZHINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
@@ -912,11 +906,11 @@ define i64 @test_ceil_si64(half %x) nounwind {
; RV32IZHINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
; RV32IZHINX-NEXT: lw s2, 0(sp) # 4-byte Folded Reload
; RV32IZHINX-NEXT: addi sp, sp, 16
-; RV32IZHINX-NEXT: beqz a3, .LBB5_6
+; RV32IZHINX-NEXT: beqz a4, .LBB5_6
; RV32IZHINX-NEXT: # %bb.5:
-; RV32IZHINX-NEXT: addi a4, a5, -1
+; RV32IZHINX-NEXT: addi a3, a5, -1
; RV32IZHINX-NEXT: .LBB5_6:
-; RV32IZHINX-NEXT: and a1, a2, a4
+; RV32IZHINX-NEXT: and a1, a2, a3
; RV32IZHINX-NEXT: ret
;
; RV64IZHINX-LABEL: test_ceil_si64:
@@ -954,41 +948,39 @@ define i64 @test_ceil_si64(half %x) nounwind {
; RV32IZFHMIN-NEXT: addi sp, sp, -16
; RV32IZFHMIN-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32IZFHMIN-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
-; RV32IZFHMIN-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
-; RV32IZFHMIN-NEXT: fsw fs0, 0(sp) # 4-byte Folded Spill
+; RV32IZFHMIN-NEXT: fsw fs0, 4(sp) # 4-byte Folded Spill
; RV32IZFHMIN-NEXT: fcvt.h.s fa5, fa5
; RV32IZFHMIN-NEXT: fcvt.s.h fs0, fa5
; RV32IZFHMIN-NEXT: lui a0, 913408
; RV32IZFHMIN-NEXT: fmv.w.x fa5, a0
; RV32IZFHMIN-NEXT: fle.s s0, fa5, fs0
-; RV32IZFHMIN-NEXT: neg s1, s0
; RV32IZFHMIN-NEXT: fmv.s fa0, fs0
; RV32IZFHMIN-NEXT: call __fixsfdi
-; RV32IZFHMIN-NEXT: lui a2, %hi(.LCPI5_0)
-; RV32IZFHMIN-NEXT: flw fa5, %lo(.LCPI5_0)(a2)
-; RV32IZFHMIN-NEXT: and a0, s1, a0
-; RV32IZFHMIN-NEXT: flt.s a3, fa5, fs0
-; RV32IZFHMIN-NEXT: neg a2, a3
-; RV32IZFHMIN-NEXT: or a0, a2, a0
-; RV32IZFHMIN-NEXT: feq.s a2, fs0, fs0
-; RV32IZFHMIN-NEXT: neg a2, a2
-; RV32IZFHMIN-NEXT: lui a5, 524288
; RV32IZFHMIN-NEXT: lui a4, 524288
+; RV32IZFHMIN-NEXT: lui a2, 524288
; RV32IZFHMIN-NEXT: beqz s0, .LBB5_4
; RV32IZFHMIN-NEXT: # %bb.3:
-; RV32IZFHMIN-NEXT: mv a4, a1
+; RV32IZFHMIN-NEXT: mv a2, a1
; RV32IZFHMIN-NEXT: .LBB5_4:
+; RV32IZFHMIN-NEXT: lui a1, %hi(.LCPI5_0)
+; RV32IZFHMIN-NEXT: flw fa5, %lo(.LCPI5_0)(a1)
+; RV32IZFHMIN-NEXT: flt.s a3, fa5, fs0
+; RV32IZFHMIN-NEXT: beqz a3, .LBB5_6
+; RV32IZFHMIN-NEXT: # %bb.5:
+; RV32IZFHMIN-NEXT: addi a2, a4, -1
+; RV32IZFHMIN-NEXT: .LBB5_6:
+; RV32IZFHMIN-NEXT: feq.s a1, fs0, fs0
+; RV32IZFHMIN-NEXT: neg a4, a1
+; RV32IZFHMIN-NEXT: and a1, a4, a2
+; RV32IZFHMIN-NEXT: neg a2, s0
; RV32IZFHMIN-NEXT: and a0, a2, a0
+; RV32IZFHMIN-NEXT: neg a2, a3
+; RV32IZFHMIN-NEXT: or a0, a2, a0
+; RV32IZFHMIN-NEXT: and a0, a4, a0
; RV32IZFHMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IZFHMIN-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
-; RV32IZFHMIN-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
-; RV32IZFHMIN-NEXT: flw fs0, 0(sp) # 4-byte Folded Reload
+; RV32IZFHMIN-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload
; RV32IZFHMIN-NEXT: addi sp, sp, 16
-; RV32IZFHMIN-NEXT: beqz a3, .LBB5_6
-; RV32IZFHMIN-NEXT: # %bb.5:
-; RV32IZFHMIN-NEXT: addi a4, a5, -1
-; RV32IZFHMIN-NEXT: .LBB5_6:
-; RV32IZFHMIN-NEXT: and a1, a2, a4
; RV32IZFHMIN-NEXT: ret
;
; RV64IZFHMIN-LABEL: test_ceil_si64:
@@ -1040,16 +1032,16 @@ define i64 @test_ceil_si64(half %x) nounwind {
; RV32IZHINXMIN-NEXT: lui a2, %hi(.LCPI5_0)
; RV32IZHINXMIN-NEXT: lw a2, %lo(.LCPI5_0)(a2)
; RV32IZHINXMIN-NEXT: and a0, s2, a0
-; RV32IZHINXMIN-NEXT: flt.s a3, a2, s0
-; RV32IZHINXMIN-NEXT: neg a2, a3
+; RV32IZHINXMIN-NEXT: flt.s a4, a2, s0
+; RV32IZHINXMIN-NEXT: neg a2, a4
; RV32IZHINXMIN-NEXT: or a0, a2, a0
; RV32IZHINXMIN-NEXT: feq.s a2, s0, s0
; RV32IZHINXMIN-NEXT: neg a2, a2
; RV32IZHINXMIN-NEXT: lui a5, 524288
-; RV32IZHINXMIN-NEXT: lui a4, 524288
+; RV32IZHINXMIN-NEXT: lui a3, 524288
; RV32IZHINXMIN-NEXT: beqz s1, .LBB5_4
; RV32IZHINXMIN-NEXT: # %bb.3:
-; RV32IZHINXMIN-NEXT: mv a4, a1
+; RV32IZHINXMIN-NEXT: mv a3, a1
; RV32IZHINXMIN-NEXT: .LBB5_4:
; RV32IZHINXMIN-NEXT: and a0, a2, a0
; RV32IZHINXMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
@@ -1057,11 +1049,11 @@ define i64 @test_ceil_si64(half %x) nounwind {
; RV32IZHINXMIN-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
; RV32IZHINXMIN-NEXT: lw s2, 0(sp) # 4-byte Folded Reload
; RV32IZHINXMIN-NEXT: addi sp, sp, 16
-; RV32IZHINXMIN-NEXT: beqz a3, .LBB5_6
+; RV32IZHINXMIN-NEXT: beqz a4, .LBB5_6
; RV32IZHINXMIN-NEXT: # %bb.5:
-; RV32IZHINXMIN-NEXT: addi a4, a5, -1
+; RV32IZHINXMIN-NEXT: addi a3, a5, -1
; RV32IZHINXMIN-NEXT: .LBB5_6:
-; RV32IZHINXMIN-NEXT: and a1, a2, a4
+; RV32IZHINXMIN-NEXT: and a1, a2, a3
; RV32IZHINXMIN-NEXT: ret
;
; RV64IZHINXMIN-LABEL: test_ceil_si64:
@@ -1540,40 +1532,38 @@ define i64 @test_trunc_si64(half %x) nounwind {
; RV32IZFH-NEXT: addi sp, sp, -16
; RV32IZFH-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32IZFH-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
-; RV32IZFH-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
-; RV32IZFH-NEXT: fsw fs0, 0(sp) # 4-byte Folded Spill
+; RV32IZFH-NEXT: fsw fs0, 4(sp) # 4-byte Folded Spill
; RV32IZFH-NEXT: fcvt.s.h fs0, fa0
; RV32IZFH-NEXT: lui a0, 913408
; RV32IZFH-NEXT: fmv.w.x fa5, a0
; RV32IZFH-NEXT: fle.s s0, fa5, fs0
-; RV32IZFH-NEXT: neg s1, s0
; RV32IZFH-NEXT: fmv.s fa0, fs0
; RV32IZFH-NEXT: call __fixsfdi
-; RV32IZFH-NEXT: lui a2, %hi(.LCPI9_1)
-; RV32IZFH-NEXT: flw fa5, %lo(.LCPI9_1)(a2)
-; RV32IZFH-NEXT: and a0, s1, a0
-; RV32IZFH-NEXT: flt.s a3, fa5, fs0
-; RV32IZFH-NEXT: neg a2, a3
-; RV32IZFH-NEXT: or a0, a2, a0
-; RV32IZFH-NEXT: feq.s a2, fs0, fs0
-; RV32IZFH-NEXT: neg a2, a2
-; RV32IZFH-NEXT: lui a5, 524288
; RV32IZFH-NEXT: lui a4, 524288
+; RV32IZFH-NEXT: lui a2, 524288
; RV32IZFH-NEXT: beqz s0, .LBB9_4
; RV32IZFH-NEXT: # %bb.3:
-; RV32IZFH-NEXT: mv a4, a1
+; RV32IZFH-NEXT: mv a2, a1
; RV32IZFH-NEXT: .LBB9_4:
+; RV32IZFH-NEXT: lui a1, %hi(.LCPI9_1)
+; RV32IZFH-NEXT: flw fa5, %lo(.LCPI9_1)(a1)
+; RV32IZFH-NEXT: flt.s a3, fa5, fs0
+; RV32IZFH-NEXT: beqz a3, .LBB9_6
+; RV32IZFH-NEXT: # %bb.5:
+; RV32IZFH-NEXT: addi a2, a4, -1
+; RV32IZFH-NEXT: .LBB9_6:
+; RV32IZFH-NEXT: feq.s a1, fs0, fs0
+; RV32IZFH-NEXT: neg a4, a1
+; RV32IZFH-NEXT: and a1, a4, a2
+; RV32IZFH-NEXT: neg a2, s0
; RV32IZFH-NEXT: and a0, a2, a0
+; RV32IZFH-NEXT: neg a2, a3
+; RV32IZFH-NEXT: or a0, a2, a0
+; RV32IZFH-NEXT: and a0, a4, a0
; RV32IZFH-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IZFH-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
-; RV32IZFH-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
-; RV32IZFH-NEXT: flw fs0, 0(sp) # 4-byte Folded Reload
+; RV32IZFH-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload
; RV32IZFH-NEXT: addi sp, sp, 16
-; RV32IZFH-NEXT: beqz a3, .LBB9_6
-; RV32IZFH-NEXT: # %bb.5:
-; RV32IZFH-NEXT: addi a4, a5, -1
-; RV32IZFH-NEXT: .LBB9_6:
-; RV32IZFH-NEXT: and a1, a2, a4
; RV32IZFH-NEXT: ret
;
; RV64IZFH-LABEL: test_trunc_si64:
@@ -1611,16 +1601,16 @@ define i64 @test_trunc_si64(half %x) nounwind {
; RV32IZHINX-NEXT: lui a2, %hi(.LCPI9_1)
; RV32IZHINX-NEXT: lw a2, %lo(.LCPI9_1)(a2)
; RV32IZHINX-NEXT: and a0, s2, a0
-; RV32IZHINX-NEXT: flt.s a3, a2, s0
-; RV32IZHINX-NEXT: neg a2, a3
+; RV32IZHINX-NEXT: flt.s a4, a2, s0
+; RV32IZHINX-NEXT: neg a2, a4
; RV32IZHINX-NEXT: or a0, a2, a0
; RV32IZHINX-NEXT: feq.s a2, s0, s0
; RV32IZHINX-NEXT: neg a2, a2
; RV32IZHINX-NEXT: lui a5, 524288
-; RV32IZHINX-NEXT: lui a4, 524288
+; RV32IZHINX-NEXT: lui a3, 524288
; RV32IZHINX-NEXT: beqz s1, .LBB9_4
; RV32IZHINX-NEXT: # %bb.3:
-; RV32IZHINX-NEXT: mv a4, a1
+; RV32IZHINX-NEXT: mv a3, a1
; RV32IZHINX-NEXT: .LBB9_4:
; RV32IZHINX-NEXT: and a0, a2, a0
; RV32IZHINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
@@ -1628,11 +1618,11 @@ define i64 @test_trunc_si64(half %x) nounwind {
; RV32IZHINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
; RV32IZHINX-NEXT: lw s2, 0(sp) # 4-byte Folded Reload
; RV32IZHINX-NEXT: addi sp, sp, 16
-; RV32IZHINX-NEXT: beqz a3, .LBB9_6
+; RV32IZHINX-NEXT: beqz a4, .LBB9_6
; RV32IZHINX-NEXT: # %bb.5:
-; RV32IZHINX-NEXT: addi a4, a5, -1
+; RV32IZHINX-NEXT: addi a3, a5, -1
; RV32IZHINX-NEXT: .LBB9_6:
-; RV32IZHINX-NEXT: and a1, a2, a4
+; RV32IZHINX-NEXT: and a1, a2, a3
; RV32IZHINX-NEXT: ret
;
; RV64IZHINX-LABEL: test_trunc_si64:
@@ -1670,41 +1660,39 @@ define i64 @test_trunc_si64(half %x) nounwind {
; RV32IZFHMIN-NEXT: addi sp, sp, -16
; RV32IZFHMIN-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32IZFHMIN-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
-; RV32IZFHMIN-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
-; RV32IZFHMIN-NEXT: fsw fs0, 0(sp) # 4-byte Folded Spill
+; RV32IZFHMIN-NEXT: fsw fs0, 4(sp) # 4-byte Folded Spill
; RV32IZFHMIN-NEXT: fcvt.h.s fa5, fa5
; RV32IZFHMIN-NEXT: fcvt.s.h fs0, fa5
; RV32IZFHMIN-NEXT: lui a0, 913408
; RV32IZFHMIN-NEXT: fmv.w.x fa5, a0
; RV32IZFHMIN-NEXT: fle.s s0, fa5, fs0
-; RV32IZFHMIN-NEXT: neg s1, s0
; RV32IZFHMIN-NEXT: fmv.s fa0, fs0
; RV32IZFHMIN-NEXT: call __fixsfdi
-; RV32IZFHMIN-NEXT: lui a2, %hi(.LCPI9_0)
-; RV32IZFHMIN-NEXT: flw fa5, %lo(.LCPI9_0)(a2)
-; RV32IZFHMIN-NEXT: and a0, s1, a0
-; RV32IZFHMIN-NEXT: flt.s a3, fa5, fs0
-; RV32IZFHMIN-NEXT: neg a2, a3
-; RV32IZFHMIN-NEXT: or a0, a2, a0
-; RV32IZFHMIN-NEXT: feq.s a2, fs0, fs0
-; RV32IZFHMIN-NEXT: neg a2, a2
-; RV32IZFHMIN-NEXT: lui a5, 524288
; RV32IZFHMIN-NEXT: lui a4, 524288
+; RV32IZFHMIN-NEXT: lui a2, 524288
; RV32IZFHMIN-NEXT: beqz s0, .LBB9_4
; RV32IZFHMIN-NEXT: # %bb.3:
-; RV32IZFHMIN-NEXT: mv a4, a1
+; RV32IZFHMIN-NEXT: mv a2, a1
; RV32IZFHMIN-NEXT: .LBB9_4:
+; RV32IZFHMIN-NEXT: lui a1, %hi(.LCPI9_0)
+; RV32IZFHMIN-NEXT: flw fa5, %lo(.LCPI9_0)(a1)
+; RV32IZFHMIN-NEXT: flt.s a3, fa5, fs0
+; RV32IZFHMIN-NEXT: beqz a3, .LBB9_6
+; RV32IZFHMIN-NEXT: # %bb.5:
+; RV32IZFHMIN-NEXT: addi a2, a4, -1
+; RV32IZFHMIN-NEXT: .LBB9_6:
+; RV32IZFHMIN-NEXT: feq.s a1, fs0, fs0
+; RV32IZFHMIN-NEXT: neg a4, a1
+; RV32IZFHMIN-NEXT: and a1, a4, a2
+; RV32IZFHMIN-NEXT: neg a2, s0
; RV32IZFHMIN-NEXT: and a0, a2, a0
+; RV32IZFHMIN-NEXT: neg a2, a3
+; RV32IZFHMIN-NEXT: or a0, a2, a0
+; RV32IZFHMIN-NEXT: and a0, a4, a0
; RV32IZFHMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IZFHMIN-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
-; RV32IZFHMIN-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
-; RV32IZFHMIN-NEXT: flw fs0, 0(sp) # 4-byte Folded Reload
+; RV32IZFHMIN-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload
; RV32IZFHMIN-NEXT: addi sp, sp, 16
-; RV32IZFHMIN-NEXT: beqz a3, .LBB9_6
-; RV32IZFHMIN-NEXT: # %bb.5:
-; RV32IZFHMIN-NEXT: addi a4, a5, -1
-; RV32IZFHMIN-NEXT: .LBB9_6:
-; RV32IZFHMIN-NEXT: and a1, a2, a4
; RV32IZFHMIN-NEXT: ret
;
; RV64IZFHMIN-LABEL: test_trunc_si64:
@@ -1756,16 +1744,16 @@ define i64 @test_trunc_si64(half %x) nounwind {
; RV32IZHINXMIN-NEXT: lui a2, %hi(.LCPI9_0)
; RV32IZHINXMIN-NEXT: lw a2, %lo(.LCPI9_0)(a2)
; RV32IZHINXMIN-NEXT: and a0, s2, a0
-; RV32IZHINXMIN-NEXT: flt.s a3, a2, s0
-; RV32IZHINXMIN-NEXT: neg a2, a3
+; RV32IZHINXMIN-NEXT: flt.s a4, a2, s0
+; RV32IZHINXMIN-NEXT: neg a2, a4
; RV32IZHINXMIN-NEXT: or a0, a2, a0
; RV32IZHINXMIN-NEXT: feq.s a2, s0, s0
; RV32IZHINXMIN-NEXT: neg a2, a2
; RV32IZHINXMIN-NEXT: lui a5, 524288
-; RV32IZHINXMIN-NEXT: lui a4, 524288
+; RV32IZHINXMIN-NEXT: lui a3, 524288
; RV32IZHINXMIN-NEXT: beqz s1, .LBB9_4
; RV32IZHINXMIN-NEXT: # %bb.3:
-; RV32IZHINXMIN-NEXT: mv a4, a1
+; RV32IZHINXMIN-NEXT: mv a3, a1
; RV32IZHINXMIN-NEXT: .LBB9_4:
; RV32IZHINXMIN-NEXT: and a0, a2, a0
; RV32IZHINXMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
@@ -1773,11 +1761,11 @@ define i64 @test_trunc_si64(half %x) nounwind {
; RV32IZHINXMIN-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
; RV32IZHINXMIN-NEXT: lw s2, 0(sp) # 4-byte Folded Reload
; RV32IZHINXMIN-NEXT: addi sp, sp, 16
-; RV32IZHINXMIN-NEXT: beqz a3, .LBB9_6
+; RV32IZHINXMIN-NEXT: beqz a4, .LBB9_6
; RV32IZHINXMIN-NEXT: # %bb.5:
-; RV32IZHINXMIN-NEXT: addi a4, a5, -1
+; RV32IZHINXMIN-NEXT: addi a3, a5, -1
; RV32IZHINXMIN-NEXT: .LBB9_6:
-; RV32IZHINXMIN-NEXT: and a1, a2, a4
+; RV32IZHINXMIN-NEXT: and a1, a2, a3
; RV32IZHINXMIN-NEXT: ret
;
; RV64IZHINXMIN-LABEL: test_trunc_si64:
@@ -2256,40 +2244,38 @@ define i64 @test_round_si64(half %x) nounwind {
; RV32IZFH-NEXT: addi sp, sp, -16
; RV32IZFH-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32IZFH-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
-; RV32IZFH-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
-; RV32IZFH-NEXT: fsw fs0, 0(sp) # 4-byte Folded Spill
+; RV32IZFH-NEXT: fsw fs0, 4(sp) # 4-byte Folded Spill
; RV32IZFH-NEXT: fcvt.s.h fs0, fa0
; RV32IZFH-NEXT: lui a0, 913408
; RV32IZFH-NEXT: fmv.w.x fa5, a0
; RV32IZFH-NEXT: fle.s s0, fa5, fs0
-; RV32IZFH-NEXT: neg s1, s0
; RV32IZFH-NEXT: fmv.s fa0, fs0
; RV32IZFH-NEXT: call __fixsfdi
-; RV32IZFH-NEXT: lui a2, %hi(.LCPI13_1)
-; RV32IZFH-NEXT: flw fa5, %lo(.LCPI13_1)(a2)
-; RV32IZFH-NEXT: and a0, s1, a0
-; RV32IZFH-NEXT: flt.s a3, fa5, fs0
-; RV32IZFH-NEXT: neg a2, a3
-; RV32IZFH-NEXT: or a0, a2, a0
-; RV32IZFH-NEXT: feq.s a2, fs0, fs0
-; RV32IZFH-NEXT: neg a2, a2
-; RV32IZFH-NEXT: lui a5, 524288
; RV32IZFH-NEXT: lui a4, 524288
+; RV32IZFH-NEXT: lui a2, 524288
; RV32IZFH-NEXT: beqz s0, .LBB13_4
; RV32IZFH-NEXT: # %bb.3:
-; RV32IZFH-NEXT: mv a4, a1
+; RV32IZFH-NEXT: mv a2, a1
; RV32IZFH-NEXT: .LBB13_4:
+; RV32IZFH-NEXT: lui a1, %hi(.LCPI13_1)
+; RV32IZFH-NEXT: flw fa5, %lo(.LCPI13_1)(a1)
+; RV32IZFH-NEXT: flt.s a3, fa5, fs0
+; RV32IZFH-NEXT: beqz a3, .LBB13_6
+; RV32IZFH-NEXT: # %bb.5:
+; RV32IZFH-NEXT: addi a2, a4, -1
+; RV32IZFH-NEXT: .LBB13_6:
+; RV32IZFH-NEXT: feq.s a1, fs0, fs0
+; RV32IZFH-NEXT: neg a4, a1
+; RV32IZFH-NEXT: and a1, a4, a2
+; RV32IZFH-NEXT: neg a2, s0
; RV32IZFH-NEXT: and a0, a2, a0
+; RV32IZFH-NEXT: neg a2, a3
+; RV32IZFH-NEXT: or a0, a2, a0
+; RV32IZFH-NEXT: and a0, a4, a0
; RV32IZFH-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IZFH-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
-; RV32IZFH-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
-; RV32IZFH-NEXT: flw fs0, 0(sp) # 4-byte Folded Reload
+; RV32IZFH-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload
; RV32IZFH-NEXT: addi sp, sp, 16
-; RV32IZFH-NEXT: beqz a3, .LBB13_6
-; RV32IZFH-NEXT: # %bb.5:
-; RV32IZFH-NEXT: addi a4, a5, -1
-; RV32IZFH-NEXT: .LBB13_6:
-; RV32IZFH-NEXT: and a1, a2, a4
; RV32IZFH-NEXT: ret
;
; RV64IZFH-LABEL: test_round_si64:
@@ -2327,16 +2313,16 @@ define i64 @test_round_si64(half %x) nounwind {
; RV32IZHINX-NEXT: lui a2, %hi(.LCPI13_1)
; RV32IZHINX-NEXT: lw a2, %lo(.LCPI13_1)(a2)
; RV32IZHINX-NEXT: and a0, s2, a0
-; RV32IZHINX-NEXT: flt.s a3, a2, s0
-; RV32IZHINX-NEXT: neg a2, a3
+; RV32IZHINX-NEXT: flt.s a4, a2, s0
+; RV32IZHINX-NEXT: neg a2, a4
; RV32IZHINX-NEXT: or a0, a2, a0
; RV32IZHINX-NEXT: feq.s a2, s0, s0
; RV32IZHINX-NEXT: neg a2, a2
; RV32IZHINX-NEXT: lui a5, 524288
-; RV32IZHINX-NEXT: lui a4, 524288
+; RV32IZHINX-NEXT: lui a3, 524288
; RV32IZHINX-NEXT: beqz s1, .LBB13_4
; RV32IZHINX-NEXT: # %bb.3:
-; RV32IZHINX-NEXT: mv a4, a1
+; RV32IZHINX-NEXT: mv a3, a1
; RV32IZHINX-NEXT: .LBB13_4:
; RV32IZHINX-NEXT: and a0, a2, a0
; RV32IZHINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
@@ -2344,11 +2330,11 @@ define i64 @test_round_si64(half %x) nounwind {
; RV32IZHINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
; RV32IZHINX-NEXT: lw s2, 0(sp) # 4-byte Folded Reload
; RV32IZHINX-NEXT: addi sp, sp, 16
-; RV32IZHINX-NEXT: beqz a3, .LBB13_6
+; RV32IZHINX-NEXT: beqz a4, .LBB13_6
; RV32IZHINX-NEXT: # %bb.5:
-; RV32IZHINX-NEXT: addi a4, a5, -1
+; RV32IZHINX-NEXT: addi a3, a5, -1
; RV32IZHINX-NEXT: .LBB13_6:
-; RV32IZHINX-NEXT: and a1, a2, a4
+; RV32IZHINX-NEXT: and a1, a2, a3
; RV32IZHINX-NEXT: ret
;
; RV64IZHINX-LABEL: test_round_si64:
@@ -2386,41 +2372,39 @@ define i64 @test_round_si64(half %x) nounwind {
; RV32IZFHMIN-NEXT: addi sp, sp, -16
; RV32IZFHMIN-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32IZFHMIN-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
-; RV32IZFHMIN-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
-; RV32IZFHMIN-NEXT: fsw fs0, 0(sp) # 4-byte Folded Spill
+; RV32IZFHMIN-NEXT: fsw fs0, 4(sp) # 4-byte Folded Spill
; RV32IZFHMIN-NEXT: fcvt.h.s fa5, fa5
; RV32IZFHMIN-NEXT: fcvt.s.h fs0, fa5
; RV32IZFHMIN-NEXT: lui a0, 913408
; RV32IZFHMIN-NEXT: fmv.w.x fa5, a0
; RV32IZFHMIN-NEXT: fle.s s0, fa5, fs0
-; RV32IZFHMIN-NEXT: neg s1, s0
; RV32IZFHMIN-NEXT: fmv.s fa0, fs0
; RV32IZFHMIN-NEXT: call __fixsfdi
-; RV32IZFHMIN-NEXT: lui a2, %hi(.LCPI13_0)
-; RV32IZFHMIN-NEXT: flw fa5, %lo(.LCPI13_0)(a2)
-; RV32IZFHMIN-NEXT: and a0, s1, a0
-; RV32IZFHMIN-NEXT: flt.s a3, fa5, fs0
-; RV32IZFHMIN-NEXT: neg a2, a3
-; RV32IZFHMIN-NEXT: or a0, a2, a0
-; RV32IZFHMIN-NEXT: feq.s a2, fs0, fs0
-; RV32IZFHMIN-NEXT: neg a2, a2
-; RV32IZFHMIN-NEXT: lui a5, 524288
; RV32IZFHMIN-NEXT: lui a4, 524288
+; RV32IZFHMIN-NEXT: lui a2, 524288
; RV32IZFHMIN-NEXT: beqz s0, .LBB13_4
; RV32IZFHMIN-NEXT: # %bb.3:
-; RV32IZFHMIN-NEXT: mv a4, a1
+; RV32IZFHMIN-NEXT: mv a2, a1
; RV32IZFHMIN-NEXT: .LBB13_4:
+; RV32IZFHMIN-NEXT: lui a1, %hi(.LCPI13_0)
+; RV32IZFHMIN-NEXT: flw fa5, %lo(.LCPI13_0)(a1)
+; RV32IZFHMIN-NEXT: flt.s a3, fa5, fs0
+; RV32IZFHMIN-NEXT: beqz a3, .LBB13_6
+; RV32IZFHMIN-NEXT: # %bb.5:
+; RV32IZFHMIN-NEXT: addi a2, a4, -1
+; RV32IZFHMIN-NEXT: .LBB13_6:
+; RV32IZFHMIN-NEXT: feq.s a1, fs0, fs0
+; RV32IZFHMIN-NEXT: neg a4, a1
+; RV32IZFHMIN-NEXT: and a1, a4, a2
+; RV32IZFHMIN-NEXT: neg a2, s0
; RV32IZFHMIN-NEXT: and a0, a2, a0
+; RV32IZFHMIN-NEXT: neg a2, a3
+; RV32IZFHMIN-NEXT: or a0, a2, a0
+; RV32IZFHMIN-NEXT: and a0, a4, a0
; RV32IZFHMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IZFHMIN-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
-; RV32IZFHMIN-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
-; RV32IZFHMIN-NEXT: flw fs0, 0(sp) # 4-byte Folded Reload
+; RV32IZFHMIN-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload
; RV32IZFHMIN-NEXT: addi sp, sp, 16
-; RV32IZFHMIN-NEXT: beqz a3, .LBB13_6
-; RV32IZFHMIN-NEXT: # %bb.5:
-; RV32IZFHMIN-NEXT: addi a4, a5, -1
-; RV32IZFHMIN-NEXT: .LBB13_6:
-; RV32IZFHMIN-NEXT: and a1, a2, a4
; RV32IZFHMIN-NEXT: ret
;
; RV64IZFHMIN-LABEL: test_round_si64:
@@ -2472,16 +2456,16 @@ define i64 @test_round_si64(half %x) nounwind {
; RV32IZHINXMIN-NEXT: lui a2, %hi(.LCPI13_0)
; RV32IZHINXMIN-NEXT: lw a2, %lo(.LCPI13_0)(a2)
; RV32IZHINXMIN-NEXT: and a0, s2, a0
-; RV32IZHINXMIN-NEXT: flt.s a3, a2, s0
-; RV32IZHINXMIN-NEXT: neg a2, a3
+; RV32IZHINXMIN-NEXT: flt.s a4, a2, s0
+; RV32IZHINXMIN-NEXT: neg a2, a4
; RV32IZHINXMIN-NEXT: or a0, a2, a0
; RV32IZHINXMIN-NEXT: feq.s a2, s0, s0
; RV32IZHINXMIN-NEXT: neg a2, a2
; RV32IZHINXMIN-NEXT: lui a5, 524288
-; RV32IZHINXMIN-NEXT: lui a4, 524288
+; RV32IZHINXMIN-NEXT: lui a3, 524288
; RV32IZHINXMIN-NEXT: beqz s1, .LBB13_4
; RV32IZHINXMIN-NEXT: # %bb.3:
-; RV32IZHINXMIN-NEXT: mv a4, a1
+; RV32IZHINXMIN-NEXT: mv a3, a1
; RV32IZHINXMIN-NEXT: .LBB13_4:
; RV32IZHINXMIN-NEXT: and a0, a2, a0
; RV32IZHINXMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
@@ -2489,11 +2473,11 @@ define i64 @test_round_si64(half %x) nounwind {
; RV32IZHINXMIN-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
; RV32IZHINXMIN-NEXT: lw s2, 0(sp) # 4-byte Folded Reload
; RV32IZHINXMIN-NEXT: addi sp, sp, 16
-; RV32IZHINXMIN-NEXT: beqz a3, .LBB13_6
+; RV32IZHINXMIN-NEXT: beqz a4, .LBB13_6
; RV32IZHINXMIN-NEXT: # %bb.5:
-; RV32IZHINXMIN-NEXT: addi a4, a5, -1
+; RV32IZHINXMIN-NEXT: addi a3, a5, -1
; RV32IZHINXMIN-NEXT: .LBB13_6:
-; RV32IZHINXMIN-NEXT: and a1, a2, a4
+; RV32IZHINXMIN-NEXT: and a1, a2, a3
; RV32IZHINXMIN-NEXT: ret
;
; RV64IZHINXMIN-LABEL: test_round_si64:
@@ -2972,40 +2956,38 @@ define i64 @test_roundeven_si64(half %x) nounwind {
; RV32IZFH-NEXT: addi sp, sp, -16
; RV32IZFH-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32IZFH-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
-; RV32IZFH-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
-; RV32IZFH-NEXT: fsw fs0, 0(sp) # 4-byte Folded Spill
+; RV32IZFH-NEXT: fsw fs0, 4(sp) # 4-byte Folded Spill
; RV32IZFH-NEXT: fcvt.s.h fs0, fa0
; RV32IZFH-NEXT: lui a0, 913408
; RV32IZFH-NEXT: fmv.w.x fa5, a0
; RV32IZFH-NEXT: fle.s s0, fa5, fs0
-; RV32IZFH-NEXT: neg s1, s0
; RV32IZFH-NEXT: fmv.s fa0, fs0
; RV32IZFH-NEXT: call __fixsfdi
-; RV32IZFH-NEXT: lui a2, %hi(.LCPI17_1)
-; RV32IZFH-NEXT: flw fa5, %lo(.LCPI17_1)(a2)
-; RV32IZFH-NEXT: and a0, s1, a0
-; RV32IZFH-NEXT: flt.s a3, fa5, fs0
-; RV32IZFH-NEXT: neg a2, a3
-; RV32IZFH-NEXT: or a0, a2, a0
-; RV32IZFH-NEXT: feq.s a2, fs0, fs0
-; RV32IZFH-NEXT: neg a2, a2
-; RV32IZFH-NEXT: lui a5, 524288
; RV32IZFH-NEXT: lui a4, 524288
+; RV32IZFH-NEXT: lui a2, 524288
; RV32IZFH-NEXT: beqz s0, .LBB17_4
; RV32IZFH-NEXT: # %bb.3:
-; RV32IZFH-NEXT: mv a4, a1
+; RV32IZFH-NEXT: mv a2, a1
; RV32IZFH-NEXT: .LBB17_4:
+; RV32IZFH-NEXT: lui a1, %hi(.LCPI17_1)
+; RV32IZFH-NEXT: flw fa5, %lo(.LCPI17_1)(a1)
+; RV32IZFH-NEXT: flt.s a3, fa5, fs0
+; RV32IZFH-NEXT: beqz a3, .LBB17_6
+; RV32IZFH-NEXT: # %bb.5:
+; RV32IZFH-NEXT: addi a2, a4, -1
+; RV32IZFH-NEXT: .LBB17_6:
+; RV32IZFH-NEXT: feq.s a1, fs0, fs0
+; RV32IZFH-NEXT: neg a4, a1
+; RV32IZFH-NEXT: and a1, a4, a2
+; RV32IZFH-NEXT: neg a2, s0
; RV32IZFH-NEXT: and a0, a2, a0
+; RV32IZFH-NEXT: neg a2, a3
+; RV32IZFH-NEXT: or a0, a2, a0
+; RV32IZFH-NEXT: and a0, a4, a0
; RV32IZFH-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IZFH-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
-; RV32IZFH-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
-; RV32IZFH-NEXT: flw fs0, 0(sp) # 4-byte Folded Reload
+; RV32IZFH-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload
; RV32IZFH-NEXT: addi sp, sp, 16
-; RV32IZFH-NEXT: beqz a3, .LBB17_6
-; RV32IZFH-NEXT: # %bb.5:
-; RV32IZFH-NEXT: addi a4, a5, -1
-; RV32IZFH-NEXT: .LBB17_6:
-; RV32IZFH-NEXT: and a1, a2, a4
; RV32IZFH-NEXT: ret
;
; RV64IZFH-LABEL: test_roundeven_si64:
@@ -3043,16 +3025,16 @@ define i64 @test_roundeven_si64(half %x) nounwind {
; RV32IZHINX-NEXT: lui a2, %hi(.LCPI17_1)
; RV32IZHINX-NEXT: lw a2, %lo(.LCPI17_1)(a2)
; RV32IZHINX-NEXT: and a0, s2, a0
-; RV32IZHINX-NEXT: flt.s a3, a2, s0
-; RV32IZHINX-NEXT: neg a2, a3
+; RV32IZHINX-NEXT: flt.s a4, a2, s0
+; RV32IZHINX-NEXT: neg a2, a4
; RV32IZHINX-NEXT: or a0, a2, a0
; RV32IZHINX-NEXT: feq.s a2, s0, s0
; RV32IZHINX-NEXT: neg a2, a2
; RV32IZHINX-NEXT: lui a5, 524288
-; RV32IZHINX-NEXT: lui a4, 524288
+; RV32IZHINX-NEXT: lui a3, 524288
; RV32IZHINX-NEXT: beqz s1, .LBB17_4
; RV32IZHINX-NEXT: # %bb.3:
-; RV32IZHINX-NEXT: mv a4, a1
+; RV32IZHINX-NEXT: mv a3, a1
; RV32IZHINX-NEXT: .LBB17_4:
; RV32IZHINX-NEXT: and a0, a2, a0
; RV32IZHINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
@@ -3060,11 +3042,11 @@ define i64 @test_roundeven_si64(half %x) nounwind {
; RV32IZHINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
; RV32IZHINX-NEXT: lw s2, 0(sp) # 4-byte Folded Reload
; RV32IZHINX-NEXT: addi sp, sp, 16
-; RV32IZHINX-NEXT: beqz a3, .LBB17_6
+; RV32IZHINX-NEXT: beqz a4, .LBB17_6
; RV32IZHINX-NEXT: # %bb.5:
-; RV32IZHINX-NEXT: addi a4, a5, -1
+; RV32IZHINX-NEXT: addi a3, a5, -1
; RV32IZHINX-NEXT: .LBB17_6:
-; RV32IZHINX-NEXT: and a1, a2, a4
+; RV32IZHINX-NEXT: and a1, a2, a3
; RV32IZHINX-NEXT: ret
;
; RV64IZHINX-LABEL: test_roundeven_si64:
@@ -3102,41 +3084,39 @@ define i64 @test_roundeven_si64(half %x) nounwind {
; RV32IZFHMIN-NEXT: addi sp, sp, -16
; RV32IZFHMIN-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32IZFHMIN-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
-; RV32IZFHMIN-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
-; RV32IZFHMIN-NEXT: fsw fs0, 0(sp) # 4-byte Folded Spill
+; RV32IZFHMIN-NEXT: fsw fs0, 4(sp) # 4-byte Folded Spill
; RV32IZFHMIN-NEXT: fcvt.h.s fa5, fa5
; RV32IZFHMIN-NEXT: fcvt.s.h fs0, fa5
; RV32IZFHMIN-NEXT: lui a0, 913408
; RV32IZFHMIN-NEXT: fmv.w.x fa5, a0
; RV32IZFHMIN-NEXT: fle.s s0, fa5, fs0
-; RV32IZFHMIN-NEXT: neg s1, s0
; RV32IZFHMIN-NEXT: fmv.s fa0, fs0
; RV32IZFHMIN-NEXT: call __fixsfdi
-; RV32IZFHMIN-NEXT: lui a2, %hi(.LCPI17_0)
-; RV32IZFHMIN-NEXT: flw fa5, %lo(.LCPI17_0)(a2)
-; RV32IZFHMIN-NEXT: and a0, s1, a0
-; RV32IZFHMIN-NEXT: flt.s a3, fa5, fs0
-; RV32IZFHMIN-NEXT: neg a2, a3
-; RV32IZFHMIN-NEXT: or a0, a2, a0
-; RV32IZFHMIN-NEXT: feq.s a2, fs0, fs0
-; RV32IZFHMIN-NEXT: neg a2, a2
-; RV32IZFHMIN-NEXT: lui a5, 524288
; RV32IZFHMIN-NEXT: lui a4, 524288
+; RV32IZFHMIN-NEXT: lui a2, 524288
; RV32IZFHMIN-NEXT: beqz s0, .LBB17_4
; RV32IZFHMIN-NEXT: # %bb.3:
-; RV32IZFHMIN-NEXT: mv a4, a1
+; RV32IZFHMIN-NEXT: mv a2, a1
; RV32IZFHMIN-NEXT: .LBB17_4:
+; RV32IZFHMIN-NEXT: lui a1, %hi(.LCPI17_0)
+; RV32IZFHMIN-NEXT: flw fa5, %lo(.LCPI17_0)(a1)
+; RV32IZFHMIN-NEXT: flt.s a3, fa5, fs0
+; RV32IZFHMIN-NEXT: beqz a3, .LBB17_6
+; RV32IZFHMIN-NEXT: # %bb.5:
+; RV32IZFHMIN-NEXT: addi a2, a4, -1
+; RV32IZFHMIN-NEXT: .LBB17_6:
+; RV32IZFHMIN-NEXT: feq.s a1, fs0, fs0
+; RV32IZFHMIN-NEXT: neg a4, a1
+; RV32IZFHMIN-NEXT: and a1, a4, a2
+; RV32IZFHMIN-NEXT: neg a2, s0
; RV32IZFHMIN-NEXT: and a0, a2, a0
+; RV32IZFHMIN-NEXT: neg a2, a3
+; RV32IZFHMIN-NEXT: or a0, a2, a0
+; RV32IZFHMIN-NEXT: and a0, a4, a0
; RV32IZFHMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IZFHMIN-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
-; RV32IZFHMIN-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
-; RV32IZFHMIN-NEXT: flw fs0, 0(sp) # 4-byte Folded Reload
+; RV32IZFHMIN-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload
; RV32IZFHMIN-NEXT: addi sp, sp, 16
-; RV32IZFHMIN-NEXT: beqz a3, .LBB17_6
-; RV32IZFHMIN-NEXT: # %bb.5:
-; RV32IZFHMIN-NEXT: addi a4, a5, -1
-; RV32IZFHMIN-NEXT: .LBB17_6:
-; RV32IZFHMIN-NEXT: and a1, a2, a4
; RV32IZFHMIN-NEXT: ret
;
; RV64IZFHMIN-LABEL: test_roundeven_si64:
@@ -3188,16 +3168,16 @@ define i64 @test_roundeven_si64(half %x) nounwind {
; RV32IZHINXMIN-NEXT: lui a2, %hi(.LCPI17_0)
; RV32IZHINXMIN-NEXT: lw a2, %lo(.LCPI17_0)(a2)
; RV32IZHINXMIN-NEXT: and a0, s2, a0
-; RV32IZHINXMIN-NEXT: flt.s a3, a2, s0
-; RV32IZHINXMIN-NEXT: neg a2, a3
+; RV32IZHINXMIN-NEXT: flt.s a4, a2, s0
+; RV32IZHINXMIN-NEXT: neg a2, a4
; RV32IZHINXMIN-NEXT: or a0, a2, a0
; RV32IZHINXMIN-NEXT: feq.s a2, s0, s0
; RV32IZHINXMIN-NEXT: neg a2, a2
; RV32IZHINXMIN-NEXT: lui a5, 524288
-; RV32IZHINXMIN-NEXT: lui a4, 524288
+; RV32IZHINXMIN-NEXT: lui a3, 524288
; RV32IZHINXMIN-NEXT: beqz s1, .LBB17_4
; RV32IZHINXMIN-NEXT: # %bb.3:
-; RV32IZHINXMIN-NEXT: mv a4, a1
+; RV32IZHINXMIN-NEXT: mv a3, a1
; RV32IZHINXMIN-NEXT: .LBB17_4:
; RV32IZHINXMIN-NEXT: and a0, a2, a0
; RV32IZHINXMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
@@ -3205,11 +3185,11 @@ define i64 @test_roundeven_si64(half %x) nounwind {
; RV32IZHINXMIN-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
; RV32IZHINXMIN-NEXT: lw s2, 0(sp) # 4-byte Folded Reload
; RV32IZHINXMIN-NEXT: addi sp, sp, 16
-; RV32IZHINXMIN-NEXT: beqz a3, .LBB17_6
+; RV32IZHINXMIN-NEXT: beqz a4, .LBB17_6
; RV32IZHINXMIN-NEXT: # %bb.5:
-; RV32IZHINXMIN-NEXT: addi a4, a5, -1
+; RV32IZHINXMIN-NEXT: addi a3, a5, -1
; RV32IZHINXMIN-NEXT: .LBB17_6:
-; RV32IZHINXMIN-NEXT: and a1, a2, a4
+; RV32IZHINXMIN-NEXT: and a1, a2, a3
; RV32IZHINXMIN-NEXT: ret
;
; RV64IZHINXMIN-LABEL: test_roundeven_si64:
@@ -3688,40 +3668,38 @@ define i64 @test_rint_si64(half %x) nounwind {
; RV32IZFH-NEXT: addi sp, sp, -16
; RV32IZFH-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32IZFH-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
-; RV32IZFH-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
-; RV32IZFH-NEXT: fsw fs0, 0(sp) # 4-byte Folded Spill
+; RV32IZFH-NEXT: fsw fs0, 4(sp) # 4-byte Folded Spill
; RV32IZFH-NEXT: fcvt.s.h fs0, fa0
; RV32IZFH-NEXT: lui a0, 913408
; RV32IZFH-NEXT: fmv.w.x fa5, a0
; RV32IZFH-NEXT: fle.s s0, fa5, fs0
-; RV32IZFH-NEXT: neg s1, s0
; RV32IZFH-NEXT: fmv.s fa0, fs0
; RV32IZFH-NEXT: call __fixsfdi
-; RV32IZFH-NEXT: lui a2, %hi(.LCPI21_1)
-; RV32IZFH-NEXT: flw fa5, %lo(.LCPI21_1)(a2)
-; RV32IZFH-NEXT: and a0, s1, a0
-; RV32IZFH-NEXT: flt.s a3, fa5, fs0
-; RV32IZFH-NEXT: neg a2, a3
-; RV32IZFH-NEXT: or a0, a2, a0
-; RV32IZFH-NEXT: feq.s a2, fs0, fs0
-; RV32IZFH-NEXT: neg a2, a2
-; RV32IZFH-NEXT: lui a5, 524288
; RV32IZFH-NEXT: lui a4, 524288
+; RV32IZFH-NEXT: lui a2, 524288
; RV32IZFH-NEXT: beqz s0, .LBB21_4
; RV32IZFH-NEXT: # %bb.3:
-; RV32IZFH-NEXT: mv a4, a1
+; RV32IZFH-NEXT: mv a2, a1
; RV32IZFH-NEXT: .LBB21_4:
+; RV32IZFH-NEXT: lui a1, %hi(.LCPI21_1)
+; RV32IZFH-NEXT: flw fa5, %lo(.LCPI21_1)(a1)
+; RV32IZFH-NEXT: flt.s a3, fa5, fs0
+; RV32IZFH-NEXT: beqz a3, .LBB21_6
+; RV32IZFH-NEXT: # %bb.5:
+; RV32IZFH-NEXT: addi a2, a4, -1
+; RV32IZFH-NEXT: .LBB21_6:
+; RV32IZFH-NEXT: feq.s a1, fs0, fs0
+; RV32IZFH-NEXT: neg a4, a1
+; RV32IZFH-NEXT: and a1, a4, a2
+; RV32IZFH-NEXT: neg a2, s0
; RV32IZFH-NEXT: and a0, a2, a0
+; RV32IZFH-NEXT: neg a2, a3
+; RV32IZFH-NEXT: or a0, a2, a0
+; RV32IZFH-NEXT: and a0, a4, a0
; RV32IZFH-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IZFH-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
-; RV32IZFH-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
-; RV32IZFH-NEXT: flw fs0, 0(sp) # 4-byte Folded Reload
+; RV32IZFH-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload
; RV32IZFH-NEXT: addi sp, sp, 16
-; RV32IZFH-NEXT: beqz a3, .LBB21_6
-; RV32IZFH-NEXT: # %bb.5:
-; RV32IZFH-NEXT: addi a4, a5, -1
-; RV32IZFH-NEXT: .LBB21_6:
-; RV32IZFH-NEXT: and a1, a2, a4
; RV32IZFH-NEXT: ret
;
; RV64IZFH-LABEL: test_rint_si64:
@@ -3759,16 +3737,16 @@ define i64 @test_rint_si64(half %x) nounwind {
; RV32IZHINX-NEXT: lui a2, %hi(.LCPI21_1)
; RV32IZHINX-NEXT: lw a2, %lo(.LCPI21_1)(a2)
; RV32IZHINX-NEXT: and a0, s2, a0
-; RV32IZHINX-NEXT: flt.s a3, a2, s0
-; RV32IZHINX-NEXT: neg a2, a3
+; RV32IZHINX-NEXT: flt.s a4, a2, s0
+; RV32IZHINX-NEXT: neg a2, a4
; RV32IZHINX-NEXT: or a0, a2, a0
; RV32IZHINX-NEXT: feq.s a2, s0, s0
; RV32IZHINX-NEXT: neg a2, a2
; RV32IZHINX-NEXT: lui a5, 524288
-; RV32IZHINX-NEXT: lui a4, 524288
+; RV32IZHINX-NEXT: lui a3, 524288
; RV32IZHINX-NEXT: beqz s1, .LBB21_4
; RV32IZHINX-NEXT: # %bb.3:
-; RV32IZHINX-NEXT: mv a4, a1
+; RV32IZHINX-NEXT: mv a3, a1
; RV32IZHINX-NEXT: .LBB21_4:
; RV32IZHINX-NEXT: and a0, a2, a0
; RV32IZHINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
@@ -3776,11 +3754,11 @@ define i64 @test_rint_si64(half %x) nounwind {
; RV32IZHINX-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
; RV32IZHINX-NEXT: lw s2, 0(sp) # 4-byte Folded Reload
; RV32IZHINX-NEXT: addi sp, sp, 16
-; RV32IZHINX-NEXT: beqz a3, .LBB21_6
+; RV32IZHINX-NEXT: beqz a4, .LBB21_6
; RV32IZHINX-NEXT: # %bb.5:
-; RV32IZHINX-NEXT: addi a4, a5, -1
+; RV32IZHINX-NEXT: addi a3, a5, -1
; RV32IZHINX-NEXT: .LBB21_6:
-; RV32IZHINX-NEXT: and a1, a2, a4
+; RV32IZHINX-NEXT: and a1, a2, a3
; RV32IZHINX-NEXT: ret
;
; RV64IZHINX-LABEL: test_rint_si64:
@@ -3818,41 +3796,39 @@ define i64 @test_rint_si64(half %x) nounwind {
; RV32IZFHMIN-NEXT: addi sp, sp, -16
; RV32IZFHMIN-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32IZFHMIN-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
-; RV32IZFHMIN-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
-; RV32IZFHMIN-NEXT: fsw fs0, 0(sp) # 4-byte Folded Spill
+; RV32IZFHMIN-NEXT: fsw fs0, 4(sp) # 4-byte Folded Spill
; RV32IZFHMIN-NEXT: fcvt.h.s fa5, fa5
; RV32IZFHMIN-NEXT: fcvt.s.h fs0, fa5
; RV32IZFHMIN-NEXT: lui a0, 913408
; RV32IZFHMIN-NEXT: fmv.w.x fa5, a0
; RV32IZFHMIN-NEXT: fle.s s0, fa5, fs0
-; RV32IZFHMIN-NEXT: neg s1, s0
; RV32IZFHMIN-NEXT: fmv.s fa0, fs0
; RV32IZFHMIN-NEXT: call __fixsfdi
-; RV32IZFHMIN-NEXT: lui a2, %hi(.LCPI21_0)
-; RV32IZFHMIN-NEXT: flw fa5, %lo(.LCPI21_0)(a2)
-; RV32IZFHMIN-NEXT: and a0, s1, a0
-; RV32IZFHMIN-NEXT: flt.s a3, fa5, fs0
-; RV32IZFHMIN-NEXT: neg a2, a3
-; RV32IZFHMIN-NEXT: or a0, a2, a0
-; RV32IZFHMIN-NEXT: feq.s a2, fs0, fs0
-; RV32IZFHMIN-NEXT: neg a2, a2
-; RV32IZFHMIN-NEXT: lui a5, 524288
; RV32IZFHMIN-NEXT: lui a4, 524288
+; RV32IZFHMIN-NEXT: lui a2, 524288
; RV32IZFHMIN-NEXT: beqz s0, .LBB21_4
; RV32IZFHMIN-NEXT: # %bb.3:
-; RV32IZFHMIN-NEXT: mv a4, a1
+; RV32IZFHMIN-NEXT: mv a2, a1
; RV32IZFHMIN-NEXT: .LBB21_4:
+; RV32IZFHMIN-NEXT: lui a1, %hi(.LCPI21_0)
+; RV32IZFHMIN-NEXT: flw fa5, %lo(.LCPI21_0)(a1)
+; RV32IZFHMIN-NEXT: flt.s a3, fa5, fs0
+; RV32IZFHMIN-NEXT: beqz a3, .LBB21_6
+; RV32IZFHMIN-NEXT: # %bb.5:
+; RV32IZFHMIN-NEXT: addi a2, a4, -1
+; RV32IZFHMIN-NEXT: .LBB21_6:
+; RV32IZFHMIN-NEXT: feq.s a1, fs0, fs0
+; RV32IZFHMIN-NEXT: neg a4, a1
+; RV32IZFHMIN-NEXT: and a1, a4, a2
+; RV32IZFHMIN-NEXT: neg a2, s0
; RV32IZFHMIN-NEXT: and a0, a2, a0
+; RV32IZFHMIN-NEXT: neg a2, a3
+; RV32IZFHMIN-NEXT: or a0, a2, a0
+; RV32IZFHMIN-NEXT: and a0, a4, a0
; RV32IZFHMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IZFHMIN-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
-; RV32IZFHMIN-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
-; RV32IZFHMIN-NEXT: flw fs0, 0(sp) # 4-byte Folded Reload
+; RV32IZFHMIN-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload
; RV32IZFHMIN-NEXT: addi sp, sp, 16
-; RV32IZFHMIN-NEXT: beqz a3, .LBB21_6
-; RV32IZFHMIN-NEXT: # %bb.5:
-; RV32IZFHMIN-NEXT: addi a4, a5, -1
-; RV32IZFHMIN-NEXT: .LBB21_6:
-; RV32IZFHMIN-NEXT: and a1, a2, a4
; RV32IZFHMIN-NEXT: ret
;
; RV64IZFHMIN-LABEL: test_rint_si64:
@@ -3904,16 +3880,16 @@ define i64 @test_rint_si64(half %x) nounwind {
; RV32IZHINXMIN-NEXT: lui a2, %hi(.LCPI21_0)
; RV32IZHINXMIN-NEXT: lw a2, %lo(.LCPI21_0)(a2)
; RV32IZHINXMIN-NEXT: and a0, s2, a0
-; RV32IZHINXMIN-NEXT: flt.s a3, a2, s0
-; RV32IZHINXMIN-NEXT: neg a2, a3
+; RV32IZHINXMIN-NEXT: flt.s a4, a2, s0
+; RV32IZHINXMIN-NEXT: neg a2, a4
; RV32IZHINXMIN-NEXT: or a0, a2, a0
; RV32IZHINXMIN-NEXT: feq.s a2, s0, s0
; RV32IZHINXMIN-NEXT: neg a2, a2
; RV32IZHINXMIN-NEXT: lui a5, 524288
-; RV32IZHINXMIN-NEXT: lui a4, 524288
+; RV32IZHINXMIN-NEXT: lui a3, 524288
; RV32IZHINXMIN-NEXT: beqz s1, .LBB21_4
; RV32IZHINXMIN-NEXT: # %bb.3:
-; RV32IZHINXMIN-NEXT: mv a4, a1
+; RV32IZHINXMIN-NEXT: mv a3, a1
; RV32IZHINXMIN-NEXT: .LBB21_4:
; RV32IZHINXMIN-NEXT: and a0, a2, a0
; RV32IZHINXMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
@@ -3921,11 +3897,11 @@ define i64 @test_rint_si64(half %x) nounwind {
; RV32IZHINXMIN-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
; RV32IZHINXMIN-NEXT: lw s2, 0(sp) # 4-byte Folded Reload
; RV32IZHINXMIN-NEXT: addi sp, sp, 16
-; RV32IZHINXMIN-NEXT: beqz a3, .LBB21_6
+; RV32IZHINXMIN-NEXT: beqz a4, .LBB21_6
; RV32IZHINXMIN-NEXT: # %bb.5:
-; RV32IZHINXMIN-NEXT: addi a4, a5, -1
+; RV32IZHINXMIN-NEXT: addi a3, a5, -1
; RV32IZHINXMIN-NEXT: .LBB21_6:
-; RV32IZHINXMIN-NEXT: and a1, a2, a4
+; RV32IZHINXMIN-NEXT: and a1, a2, a3
; RV32IZHINXMIN-NEXT: ret
;
; RV64IZHINXMIN-LABEL: test_rint_si64:
diff --git a/llvm/test/CodeGen/RISCV/iabs.ll b/llvm/test/CodeGen/RISCV/iabs.ll
index 98c886333d69..a0c85ab4dca7 100644
--- a/llvm/test/CodeGen/RISCV/iabs.ll
+++ b/llvm/test/CodeGen/RISCV/iabs.ll
@@ -630,8 +630,8 @@ define void @zext16_abs8(i8 %x, ptr %p) {
; RV32I-LABEL: zext16_abs8:
; RV32I: # %bb.0:
; RV32I-NEXT: slli a0, a0, 24
-; RV32I-NEXT: srai a0, a0, 24
; RV32I-NEXT: srai a2, a0, 31
+; RV32I-NEXT: srai a0, a0, 24
; RV32I-NEXT: xor a0, a0, a2
; RV32I-NEXT: sub a0, a0, a2
; RV32I-NEXT: sh a0, 0(a1)
@@ -648,8 +648,8 @@ define void @zext16_abs8(i8 %x, ptr %p) {
; RV64I-LABEL: zext16_abs8:
; RV64I: # %bb.0:
; RV64I-NEXT: slli a0, a0, 56
-; RV64I-NEXT: srai a0, a0, 56
; RV64I-NEXT: srai a2, a0, 63
+; RV64I-NEXT: srai a0, a0, 56
; RV64I-NEXT: xor a0, a0, a2
; RV64I-NEXT: subw a0, a0, a2
; RV64I-NEXT: sh a0, 0(a1)
diff --git a/llvm/test/CodeGen/RISCV/pr84653_pr85190.ll b/llvm/test/CodeGen/RISCV/pr84653_pr85190.ll
new file mode 100644
index 000000000000..3fa494e1a57d
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/pr84653_pr85190.ll
@@ -0,0 +1,95 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s -mtriple=riscv64 | FileCheck %s --check-prefixes=CHECK-NOZBB
+; RUN: llc < %s -mtriple=riscv64 -mattr=+zbb | FileCheck %s --check-prefixes=CHECK-ZBB
+
+; This test case was miscompiled for ZBB (DAGCombiner turned a SELECT into
+; a more poisonous AND operation).
+define i1 @pr84653(i32 %x) {
+; CHECK-NOZBB-LABEL: pr84653:
+; CHECK-NOZBB: # %bb.0:
+; CHECK-NOZBB-NEXT: sext.w a1, a0
+; CHECK-NOZBB-NEXT: sgtz a2, a1
+; CHECK-NOZBB-NEXT: lui a3, 524288
+; CHECK-NOZBB-NEXT: addi a3, a3, -1
+; CHECK-NOZBB-NEXT: xor a0, a0, a3
+; CHECK-NOZBB-NEXT: sext.w a0, a0
+; CHECK-NOZBB-NEXT: slt a0, a0, a1
+; CHECK-NOZBB-NEXT: and a0, a2, a0
+; CHECK-NOZBB-NEXT: ret
+;
+; CHECK-ZBB-LABEL: pr84653:
+; CHECK-ZBB: # %bb.0:
+; CHECK-ZBB-NEXT: sext.w a1, a0
+; CHECK-ZBB-NEXT: lui a2, 524288
+; CHECK-ZBB-NEXT: addi a2, a2, -1
+; CHECK-ZBB-NEXT: xor a0, a0, a2
+; CHECK-ZBB-NEXT: sext.w a0, a0
+; CHECK-ZBB-NEXT: max a0, a0, zero
+; CHECK-ZBB-NEXT: slt a0, a0, a1
+; CHECK-ZBB-NEXT: ret
+ %cmp1 = icmp sgt i32 %x, 0
+ %sub = sub nsw i32 2147483647, %x ; 0x7fffffff
+ %cmp2 = icmp sgt i32 %x, %sub
+ %r = select i1 %cmp1, i1 %cmp2, i1 false
+ ret i1 %r
+}
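+
+; A sketch of why the AND form is more poisonous, assuming standard LangRef
+; poison semantics: select only propagates poison from the arm it picks, so
+; with %cmp1 == false
+;   %r = select i1 false, i1 poison, i1 false   ; well-defined false
+; whereas the combined form
+;   %r = and i1 false, poison                   ; poison
+; lets a poisonous %cmp2 reach %r unconditionally.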
+
+; This test case was miscompiled for ZBB (DAGCombiner turned a SELECT into
+; a more poisonous AND operation).
+define i1 @pr85190(i64 %a) {
+; CHECK-NOZBB-LABEL: pr85190:
+; CHECK-NOZBB: # %bb.0:
+; CHECK-NOZBB-NEXT: ori a1, a0, 7
+; CHECK-NOZBB-NEXT: slti a2, a0, 0
+; CHECK-NOZBB-NEXT: li a3, -1
+; CHECK-NOZBB-NEXT: slli a3, a3, 63
+; CHECK-NOZBB-NEXT: sub a3, a3, a1
+; CHECK-NOZBB-NEXT: slt a0, a0, a3
+; CHECK-NOZBB-NEXT: and a0, a2, a0
+; CHECK-NOZBB-NEXT: ret
+;
+; CHECK-ZBB-LABEL: pr85190:
+; CHECK-ZBB: # %bb.0:
+; CHECK-ZBB-NEXT: ori a1, a0, 7
+; CHECK-ZBB-NEXT: li a2, -1
+; CHECK-ZBB-NEXT: slli a2, a2, 63
+; CHECK-ZBB-NEXT: sub a2, a2, a1
+; CHECK-ZBB-NEXT: slt a0, a0, a2
+; CHECK-ZBB-NEXT: ret
+ %or = or i64 %a, 7
+ %cmp1 = icmp slt i64 %a, 0
+ %sub = sub nsw i64 -9223372036854775808, %or ; 0x8000000000000000
+ %cmp2 = icmp sgt i64 %sub, %a
+ %res = select i1 %cmp1, i1 %cmp2, i1 false
+ ret i1 %res
+}
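+
+; Sketch of the poison source here (assuming LangRef semantics): %sub carries
+; nsw, and for any %a >= 0 we have %or >= 7, so INT64_MIN - %or wraps and
+; %sub is poison. The select only observes %cmp2 when %cmp1 (%a < 0) is true,
+; so rewriting it to an unconditional AND lets that poison reach the result.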
+
+define i1 @select_to_or(i32 %x) {
+; CHECK-NOZBB-LABEL: select_to_or:
+; CHECK-NOZBB: # %bb.0:
+; CHECK-NOZBB-NEXT: sext.w a1, a0
+; CHECK-NOZBB-NEXT: sgtz a2, a1
+; CHECK-NOZBB-NEXT: lui a3, 524288
+; CHECK-NOZBB-NEXT: addi a3, a3, -1
+; CHECK-NOZBB-NEXT: xor a0, a0, a3
+; CHECK-NOZBB-NEXT: sext.w a0, a0
+; CHECK-NOZBB-NEXT: slt a0, a0, a1
+; CHECK-NOZBB-NEXT: or a0, a2, a0
+; CHECK-NOZBB-NEXT: ret
+;
+; CHECK-ZBB-LABEL: select_to_or:
+; CHECK-ZBB: # %bb.0:
+; CHECK-ZBB-NEXT: sext.w a1, a0
+; CHECK-ZBB-NEXT: lui a2, 524288
+; CHECK-ZBB-NEXT: addi a2, a2, -1
+; CHECK-ZBB-NEXT: xor a0, a0, a2
+; CHECK-ZBB-NEXT: sext.w a0, a0
+; CHECK-ZBB-NEXT: min a0, a0, zero
+; CHECK-ZBB-NEXT: slt a0, a0, a1
+; CHECK-ZBB-NEXT: ret
+ %cmp1 = icmp sgt i32 %x, 0
+ %sub = sub nsw i32 2147483647, %x ; 0x7fffffff
+ %cmp2 = icmp sgt i32 %x, %sub
+ %r = select i1 %cmp1, i1 true, i1 %cmp2
+ ret i1 %r
+}
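+
+; This OR-form counterpart is presumably safe to fold (an inference, not
+; stated in the test): %sub is poison only for %x == INT32_MIN, where %cmp1
+; is already false and the select returns the poisonous %cmp2 anyway, so
+; `or %cmp1, %cmp2` introduces no new poison.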
diff --git a/llvm/test/CodeGen/RISCV/rvv/commutable.ll b/llvm/test/CodeGen/RISCV/rvv/commutable.ll
index b59df3b743cd..06a6327d3892 100644
--- a/llvm/test/CodeGen/RISCV/rvv/commutable.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/commutable.ll
@@ -649,3 +649,176 @@ entry:
ret <vscale x 1 x i64> %ret
}
+; vsadd.vv
+declare <vscale x 1 x i64> @llvm.riscv.vsadd.nxv1i64.nxv1i64(<vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, iXLen);
+define <vscale x 1 x i64> @commutable_vsadd_vv(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, iXLen %2) nounwind {
+; CHECK-LABEL: commutable_vsadd_vv:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: vsadd.vv v10, v8, v9
+; CHECK-NEXT: vsadd.vv v8, v9, v8
+; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; CHECK-NEXT: vadd.vv v8, v10, v8
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x i64> @llvm.riscv.vsadd.nxv1i64.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i64> %0, <vscale x 1 x i64> %1, iXLen %2)
+ %b = call <vscale x 1 x i64> @llvm.riscv.vsadd.nxv1i64.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i64> %1, <vscale x 1 x i64> %0, iXLen %2)
+ %ret = add <vscale x 1 x i64> %a, %b
+ ret <vscale x 1 x i64> %ret
+}
+
+declare <vscale x 1 x i64> @llvm.riscv.vsadd.mask.nxv1i64.nxv1i64(<vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i1>, iXLen, iXLen);
+define <vscale x 1 x i64> @commutable_vsadd_vv_masked(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, <vscale x 1 x i1> %mask, iXLen %2) {
+; CHECK-LABEL: commutable_vsadd_vv_masked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: vsadd.vv v10, v8, v9, v0.t
+; CHECK-NEXT: vsadd.vv v8, v9, v8, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; CHECK-NEXT: vadd.vv v8, v10, v8
+; CHECK-NEXT: ret
+ %a = call <vscale x 1 x i64> @llvm.riscv.vsadd.mask.nxv1i64.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i64> %0, <vscale x 1 x i64> %1, <vscale x 1 x i1> %mask, iXLen %2, iXLen 1)
+ %b = call <vscale x 1 x i64> @llvm.riscv.vsadd.mask.nxv1i64.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i64> %1, <vscale x 1 x i64> %0, <vscale x 1 x i1> %mask, iXLen %2, iXLen 1)
+ %ret = add <vscale x 1 x i64> %a, %b
+ ret <vscale x 1 x i64> %ret
+}
+
+; vsaddu.vv
+declare <vscale x 1 x i64> @llvm.riscv.vsaddu.nxv1i64.nxv1i64(<vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, iXLen);
+define <vscale x 1 x i64> @commutable_vsaddu_vv(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, iXLen %2) nounwind {
+; CHECK-LABEL: commutable_vsaddu_vv:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: vsaddu.vv v10, v8, v9
+; CHECK-NEXT: vsaddu.vv v8, v9, v8
+; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; CHECK-NEXT: vadd.vv v8, v10, v8
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x i64> @llvm.riscv.vsaddu.nxv1i64.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i64> %0, <vscale x 1 x i64> %1, iXLen %2)
+ %b = call <vscale x 1 x i64> @llvm.riscv.vsaddu.nxv1i64.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i64> %1, <vscale x 1 x i64> %0, iXLen %2)
+ %ret = add <vscale x 1 x i64> %a, %b
+ ret <vscale x 1 x i64> %ret
+}
+
+declare <vscale x 1 x i64> @llvm.riscv.vsaddu.mask.nxv1i64.nxv1i64(<vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i1>, iXLen, iXLen);
+define <vscale x 1 x i64> @commutable_vsaddu_vv_masked(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, <vscale x 1 x i1> %mask, iXLen %2) {
+; CHECK-LABEL: commutable_vsaddu_vv_masked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: vsaddu.vv v10, v8, v9, v0.t
+; CHECK-NEXT: vsaddu.vv v8, v9, v8, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; CHECK-NEXT: vadd.vv v8, v10, v8
+; CHECK-NEXT: ret
+ %a = call <vscale x 1 x i64> @llvm.riscv.vsaddu.mask.nxv1i64.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i64> %0, <vscale x 1 x i64> %1, <vscale x 1 x i1> %mask, iXLen %2, iXLen 1)
+ %b = call <vscale x 1 x i64> @llvm.riscv.vsaddu.mask.nxv1i64.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i64> %1, <vscale x 1 x i64> %0, <vscale x 1 x i1> %mask, iXLen %2, iXLen 1)
+ %ret = add <vscale x 1 x i64> %a, %b
+ ret <vscale x 1 x i64> %ret
+}
+
+; vaadd.vv
+declare <vscale x 1 x i64> @llvm.riscv.vaadd.nxv1i64.nxv1i64(<vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, iXLen, iXLen);
+define <vscale x 1 x i64> @commutable_vaadd_vv(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, iXLen %2) nounwind {
+; CHECK-LABEL: commutable_vaadd_vv:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: csrwi vxrm, 0
+; CHECK-NEXT: vaadd.vv v8, v8, v9
+; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; CHECK-NEXT: vadd.vv v8, v8, v8
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x i64> @llvm.riscv.vaadd.nxv1i64.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i64> %0, <vscale x 1 x i64> %1, iXLen 0, iXLen %2)
+ %b = call <vscale x 1 x i64> @llvm.riscv.vaadd.nxv1i64.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i64> %1, <vscale x 1 x i64> %0, iXLen 0, iXLen %2)
+ %ret = add <vscale x 1 x i64> %a, %b
+ ret <vscale x 1 x i64> %ret
+}
+
+declare <vscale x 1 x i64> @llvm.riscv.vaadd.mask.nxv1i64.nxv1i64(<vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i1>, iXLen, iXLen, iXLen);
+define <vscale x 1 x i64> @commutable_vaadd_vv_masked(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, <vscale x 1 x i1> %mask, iXLen %2) {
+; CHECK-LABEL: commutable_vaadd_vv_masked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: csrwi vxrm, 0
+; CHECK-NEXT: vaadd.vv v10, v8, v9, v0.t
+; CHECK-NEXT: vaadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; CHECK-NEXT: vadd.vv v8, v10, v8
+; CHECK-NEXT: ret
+ %a = call <vscale x 1 x i64> @llvm.riscv.vaadd.mask.nxv1i64.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i64> %0, <vscale x 1 x i64> %1, <vscale x 1 x i1> %mask, iXLen 0, iXLen %2, iXLen 1)
+ %b = call <vscale x 1 x i64> @llvm.riscv.vaadd.mask.nxv1i64.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i64> %1, <vscale x 1 x i64> %0, <vscale x 1 x i1> %mask, iXLen 0, iXLen %2, iXLen 1)
+ %ret = add <vscale x 1 x i64> %a, %b
+ ret <vscale x 1 x i64> %ret
+}
+
+; vaaddu.vv
+declare <vscale x 1 x i64> @llvm.riscv.vaaddu.nxv1i64.nxv1i64(<vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, iXLen, iXLen);
+define <vscale x 1 x i64> @commutable_vaaddu_vv(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, iXLen %2) nounwind {
+; CHECK-LABEL: commutable_vaaddu_vv:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: csrwi vxrm, 0
+; CHECK-NEXT: vaaddu.vv v8, v8, v9
+; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; CHECK-NEXT: vadd.vv v8, v8, v8
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x i64> @llvm.riscv.vaaddu.nxv1i64.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i64> %0, <vscale x 1 x i64> %1, iXLen 0, iXLen %2)
+ %b = call <vscale x 1 x i64> @llvm.riscv.vaaddu.nxv1i64.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i64> %1, <vscale x 1 x i64> %0, iXLen 0, iXLen %2)
+ %ret = add <vscale x 1 x i64> %a, %b
+ ret <vscale x 1 x i64> %ret
+}
+
+declare <vscale x 1 x i64> @llvm.riscv.vaaddu.mask.nxv1i64.nxv1i64(<vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i1>, iXLen, iXLen, iXLen);
+define <vscale x 1 x i64> @commutable_vaaddu_vv_masked(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, <vscale x 1 x i1> %mask, iXLen %2) {
+; CHECK-LABEL: commutable_vaaddu_vv_masked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: csrwi vxrm, 0
+; CHECK-NEXT: vaaddu.vv v10, v8, v9, v0.t
+; CHECK-NEXT: vaaddu.vv v8, v8, v9, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; CHECK-NEXT: vadd.vv v8, v10, v8
+; CHECK-NEXT: ret
+ %a = call <vscale x 1 x i64> @llvm.riscv.vaaddu.mask.nxv1i64.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i64> %0, <vscale x 1 x i64> %1, <vscale x 1 x i1> %mask, iXLen 0, iXLen %2, iXLen 1)
+ %b = call <vscale x 1 x i64> @llvm.riscv.vaaddu.mask.nxv1i64.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i64> %1, <vscale x 1 x i64> %0, <vscale x 1 x i1> %mask, iXLen 0, iXLen %2, iXLen 1)
+ %ret = add <vscale x 1 x i64> %a, %b
+ ret <vscale x 1 x i64> %ret
+}
+
+; vsmul.vv
+declare <vscale x 1 x i64> @llvm.riscv.vsmul.nxv1i64.nxv1i64(<vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, iXLen, iXLen);
+define <vscale x 1 x i64> @commutable_vsmul_vv(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, iXLen %2) nounwind {
+; CHECK-LABEL: commutable_vsmul_vv:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: csrwi vxrm, 0
+; CHECK-NEXT: vsmul.vv v10, v8, v9
+; CHECK-NEXT: vsmul.vv v8, v9, v8
+; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; CHECK-NEXT: vadd.vv v8, v10, v8
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x i64> @llvm.riscv.vsmul.nxv1i64.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i64> %0, <vscale x 1 x i64> %1, iXLen 0, iXLen %2)
+ %b = call <vscale x 1 x i64> @llvm.riscv.vsmul.nxv1i64.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i64> %1, <vscale x 1 x i64> %0, iXLen 0, iXLen %2)
+ %ret = add <vscale x 1 x i64> %a, %b
+ ret <vscale x 1 x i64> %ret
+}
+
+declare <vscale x 1 x i64> @llvm.riscv.vsmul.mask.nxv1i64.nxv1i64(<vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i1>, iXLen, iXLen, iXLen);
+define <vscale x 1 x i64> @commutable_vsmul_vv_masked(<vscale x 1 x i64> %0, <vscale x 1 x i64> %1, <vscale x 1 x i1> %mask, iXLen %2) {
+; CHECK-LABEL: commutable_vsmul_vv_masked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: csrwi vxrm, 0
+; CHECK-NEXT: vsmul.vv v10, v8, v9, v0.t
+; CHECK-NEXT: vsmul.vv v8, v9, v8, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; CHECK-NEXT: vadd.vv v8, v10, v8
+; CHECK-NEXT: ret
+ %a = call <vscale x 1 x i64> @llvm.riscv.vsmul.mask.nxv1i64.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i64> %0, <vscale x 1 x i64> %1, <vscale x 1 x i1> %mask, iXLen 0, iXLen %2, iXLen 1)
+ %b = call <vscale x 1 x i64> @llvm.riscv.vsmul.mask.nxv1i64.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i64> %1, <vscale x 1 x i64> %0, <vscale x 1 x i1> %mask, iXLen 0, iXLen %2, iXLen 1)
+ %ret = add <vscale x 1 x i64> %a, %b
+ ret <vscale x 1 x i64> %ret
+}
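+
+; Observation on the checks above (a reading of the generated output, not a
+; generated check): the two commuted unmasked vaadd/vaaddu calls are CSE'd
+; into a single instruction (then doubled via `vadd.vv v8, v8, v8`), the
+; masked vaadd/vaaddu calls are canonicalized to one operand order but not
+; merged, and vsadd/vsaddu/vsmul still emit both operand orders.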
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll
index 83edd49bc963..1587f770f87c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll
@@ -35,7 +35,7 @@ define {<16 x i1>, <16 x i1>} @vector_deinterleave_load_v16i1_v32i1(ptr %p) {
; CHECK-NEXT: vmv.v.v v0, v9
; CHECK-NEXT: ret
%vec = load <32 x i1>, ptr %p
- %retval = call {<16 x i1>, <16 x i1>} @llvm.experimental.vector.deinterleave2.v32i1(<32 x i1> %vec)
+ %retval = call {<16 x i1>, <16 x i1>} @llvm.vector.deinterleave2.v32i1(<32 x i1> %vec)
ret {<16 x i1>, <16 x i1>} %retval
}
@@ -46,7 +46,7 @@ define {<16 x i8>, <16 x i8>} @vector_deinterleave_load_v16i8_v32i8(ptr %p) {
; CHECK-NEXT: vlseg2e8.v v8, (a0)
; CHECK-NEXT: ret
%vec = load <32 x i8>, ptr %p
- %retval = call {<16 x i8>, <16 x i8>} @llvm.experimental.vector.deinterleave2.v32i8(<32 x i8> %vec)
+ %retval = call {<16 x i8>, <16 x i8>} @llvm.vector.deinterleave2.v32i8(<32 x i8> %vec)
ret {<16 x i8>, <16 x i8>} %retval
}
@@ -62,7 +62,7 @@ define {<8 x i16>, <8 x i16>} @vector_deinterleave_load_v8i16_v16i16_align1(ptr
; CHECK-NEXT: vnsrl.wi v9, v10, 16
; CHECK-NEXT: ret
%vec = load <16 x i16>, ptr %p, align 1
- %retval = call {<8 x i16>, <8 x i16>} @llvm.experimental.vector.deinterleave2.v16i16(<16 x i16> %vec)
+ %retval = call {<8 x i16>, <8 x i16>} @llvm.vector.deinterleave2.v16i16(<16 x i16> %vec)
ret {<8 x i16>, <8 x i16>} %retval
}
@@ -73,7 +73,7 @@ define {<8 x i16>, <8 x i16>} @vector_deinterleave_load_v8i16_v16i16(ptr %p) {
; CHECK-NEXT: vlseg2e16.v v8, (a0)
; CHECK-NEXT: ret
%vec = load <16 x i16>, ptr %p
- %retval = call {<8 x i16>, <8 x i16>} @llvm.experimental.vector.deinterleave2.v16i16(<16 x i16> %vec)
+ %retval = call {<8 x i16>, <8 x i16>} @llvm.vector.deinterleave2.v16i16(<16 x i16> %vec)
ret {<8 x i16>, <8 x i16>} %retval
}
@@ -84,7 +84,7 @@ define {<4 x i32>, <4 x i32>} @vector_deinterleave_load_v4i32_vv8i32(ptr %p) {
; CHECK-NEXT: vlseg2e32.v v8, (a0)
; CHECK-NEXT: ret
%vec = load <8 x i32>, ptr %p
- %retval = call {<4 x i32>, <4 x i32>} @llvm.experimental.vector.deinterleave2.v8i32(<8 x i32> %vec)
+ %retval = call {<4 x i32>, <4 x i32>} @llvm.vector.deinterleave2.v8i32(<8 x i32> %vec)
ret {<4 x i32>, <4 x i32>} %retval
}
@@ -95,15 +95,15 @@ define {<2 x i64>, <2 x i64>} @vector_deinterleave_load_v2i64_v4i64(ptr %p) {
; CHECK-NEXT: vlseg2e64.v v8, (a0)
; CHECK-NEXT: ret
%vec = load <4 x i64>, ptr %p
- %retval = call {<2 x i64>, <2 x i64>} @llvm.experimental.vector.deinterleave2.v4i64(<4 x i64> %vec)
+ %retval = call {<2 x i64>, <2 x i64>} @llvm.vector.deinterleave2.v4i64(<4 x i64> %vec)
ret {<2 x i64>, <2 x i64>} %retval
}
-declare {<16 x i1>, <16 x i1>} @llvm.experimental.vector.deinterleave2.v32i1(<32 x i1>)
-declare {<16 x i8>, <16 x i8>} @llvm.experimental.vector.deinterleave2.v32i8(<32 x i8>)
-declare {<8 x i16>, <8 x i16>} @llvm.experimental.vector.deinterleave2.v16i16(<16 x i16>)
-declare {<4 x i32>, <4 x i32>} @llvm.experimental.vector.deinterleave2.v8i32(<8 x i32>)
-declare {<2 x i64>, <2 x i64>} @llvm.experimental.vector.deinterleave2.v4i64(<4 x i64>)
+declare {<16 x i1>, <16 x i1>} @llvm.vector.deinterleave2.v32i1(<32 x i1>)
+declare {<16 x i8>, <16 x i8>} @llvm.vector.deinterleave2.v32i8(<32 x i8>)
+declare {<8 x i16>, <8 x i16>} @llvm.vector.deinterleave2.v16i16(<16 x i16>)
+declare {<4 x i32>, <4 x i32>} @llvm.vector.deinterleave2.v8i32(<8 x i32>)
+declare {<2 x i64>, <2 x i64>} @llvm.vector.deinterleave2.v4i64(<4 x i64>)
; Floats
@@ -114,7 +114,7 @@ define {<2 x half>, <2 x half>} @vector_deinterleave_load_v2f16_v4f16(ptr %p) {
; CHECK-NEXT: vlseg2e16.v v8, (a0)
; CHECK-NEXT: ret
%vec = load <4 x half>, ptr %p
- %retval = call {<2 x half>, <2 x half>} @llvm.experimental.vector.deinterleave2.v4f16(<4 x half> %vec)
+ %retval = call {<2 x half>, <2 x half>} @llvm.vector.deinterleave2.v4f16(<4 x half> %vec)
ret {<2 x half>, <2 x half>} %retval
}
@@ -125,7 +125,7 @@ define {<4 x half>, <4 x half>} @vector_deinterleave_load_v4f16_v8f16(ptr %p) {
; CHECK-NEXT: vlseg2e16.v v8, (a0)
; CHECK-NEXT: ret
%vec = load <8 x half>, ptr %p
- %retval = call {<4 x half>, <4 x half>} @llvm.experimental.vector.deinterleave2.v8f16(<8 x half> %vec)
+ %retval = call {<4 x half>, <4 x half>} @llvm.vector.deinterleave2.v8f16(<8 x half> %vec)
ret {<4 x half>, <4 x half>} %retval
}
@@ -136,7 +136,7 @@ define {<2 x float>, <2 x float>} @vector_deinterleave_load_v2f32_v4f32(ptr %p)
; CHECK-NEXT: vlseg2e32.v v8, (a0)
; CHECK-NEXT: ret
%vec = load <4 x float>, ptr %p
- %retval = call {<2 x float>, <2 x float>} @llvm.experimental.vector.deinterleave2.v4f32(<4 x float> %vec)
+ %retval = call {<2 x float>, <2 x float>} @llvm.vector.deinterleave2.v4f32(<4 x float> %vec)
ret {<2 x float>, <2 x float>} %retval
}
@@ -147,7 +147,7 @@ define {<8 x half>, <8 x half>} @vector_deinterleave_load_v8f16_v16f16(ptr %p) {
; CHECK-NEXT: vlseg2e16.v v8, (a0)
; CHECK-NEXT: ret
%vec = load <16 x half>, ptr %p
- %retval = call {<8 x half>, <8 x half>} @llvm.experimental.vector.deinterleave2.v16f16(<16 x half> %vec)
+ %retval = call {<8 x half>, <8 x half>} @llvm.vector.deinterleave2.v16f16(<16 x half> %vec)
ret {<8 x half>, <8 x half>} %retval
}
@@ -158,7 +158,7 @@ define {<4 x float>, <4 x float>} @vector_deinterleave_load_v4f32_v8f32(ptr %p)
; CHECK-NEXT: vlseg2e32.v v8, (a0)
; CHECK-NEXT: ret
%vec = load <8 x float>, ptr %p
- %retval = call {<4 x float>, <4 x float>} @llvm.experimental.vector.deinterleave2.v8f32(<8 x float> %vec)
+ %retval = call {<4 x float>, <4 x float>} @llvm.vector.deinterleave2.v8f32(<8 x float> %vec)
ret {<4 x float>, <4 x float>} %retval
}
@@ -169,13 +169,13 @@ define {<2 x double>, <2 x double>} @vector_deinterleave_load_v2f64_v4f64(ptr %p
; CHECK-NEXT: vlseg2e64.v v8, (a0)
; CHECK-NEXT: ret
%vec = load <4 x double>, ptr %p
- %retval = call {<2 x double>, <2 x double>} @llvm.experimental.vector.deinterleave2.v4f64(<4 x double> %vec)
+ %retval = call {<2 x double>, <2 x double>} @llvm.vector.deinterleave2.v4f64(<4 x double> %vec)
ret {<2 x double>, <2 x double>} %retval
}
-declare {<2 x half>,<2 x half>} @llvm.experimental.vector.deinterleave2.v4f16(<4 x half>)
-declare {<4 x half>, <4 x half>} @llvm.experimental.vector.deinterleave2.v8f16(<8 x half>)
-declare {<2 x float>, <2 x float>} @llvm.experimental.vector.deinterleave2.v4f32(<4 x float>)
-declare {<8 x half>, <8 x half>} @llvm.experimental.vector.deinterleave2.v16f16(<16 x half>)
-declare {<4 x float>, <4 x float>} @llvm.experimental.vector.deinterleave2.v8f32(<8 x float>)
-declare {<2 x double>, <2 x double>} @llvm.experimental.vector.deinterleave2.v4f64(<4 x double>)
+declare {<2 x half>,<2 x half>} @llvm.vector.deinterleave2.v4f16(<4 x half>)
+declare {<4 x half>, <4 x half>} @llvm.vector.deinterleave2.v8f16(<8 x half>)
+declare {<2 x float>, <2 x float>} @llvm.vector.deinterleave2.v4f32(<4 x float>)
+declare {<8 x half>, <8 x half>} @llvm.vector.deinterleave2.v16f16(<16 x half>)
+declare {<4 x float>, <4 x float>} @llvm.vector.deinterleave2.v8f32(<8 x float>)
+declare {<2 x double>, <2 x double>} @llvm.vector.deinterleave2.v4f64(<4 x double>)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleave-store.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleave-store.ll
index 9161cedd58e3..8de9cc25ae09 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleave-store.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleave-store.ll
@@ -23,7 +23,7 @@ define void @vector_interleave_store_v32i1_v16i1(<16 x i1> %a, <16 x i1> %b, ptr
; CHECK-NEXT: vmsne.vi v8, v12, 0
; CHECK-NEXT: vsm.v v8, (a0)
; CHECK-NEXT: ret
- %res = call <32 x i1> @llvm.experimental.vector.interleave2.v32i1(<16 x i1> %a, <16 x i1> %b)
+ %res = call <32 x i1> @llvm.vector.interleave2.v32i1(<16 x i1> %a, <16 x i1> %b)
store <32 x i1> %res, ptr %p
ret void
}
@@ -40,7 +40,7 @@ define void @vector_interleave_store_v16i16_v8i16_align1(<8 x i16> %a, <8 x i16>
; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma
; CHECK-NEXT: vse8.v v10, (a0)
; CHECK-NEXT: ret
- %res = call <16 x i16> @llvm.experimental.vector.interleave2.v16i16(<8 x i16> %a, <8 x i16> %b)
+ %res = call <16 x i16> @llvm.vector.interleave2.v16i16(<8 x i16> %a, <8 x i16> %b)
store <16 x i16> %res, ptr %p, align 1
ret void
}
@@ -51,7 +51,7 @@ define void @vector_interleave_store_v16i16_v8i16(<8 x i16> %a, <8 x i16> %b, pt
; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; CHECK-NEXT: vsseg2e16.v v8, (a0)
; CHECK-NEXT: ret
- %res = call <16 x i16> @llvm.experimental.vector.interleave2.v16i16(<8 x i16> %a, <8 x i16> %b)
+ %res = call <16 x i16> @llvm.vector.interleave2.v16i16(<8 x i16> %a, <8 x i16> %b)
store <16 x i16> %res, ptr %p
ret void
}
@@ -62,7 +62,7 @@ define void @vector_interleave_store_v8i32_v4i32(<4 x i32> %a, <4 x i32> %b, ptr
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: vsseg2e32.v v8, (a0)
; CHECK-NEXT: ret
- %res = call <8 x i32> @llvm.experimental.vector.interleave2.v8i32(<4 x i32> %a, <4 x i32> %b)
+ %res = call <8 x i32> @llvm.vector.interleave2.v8i32(<4 x i32> %a, <4 x i32> %b)
store <8 x i32> %res, ptr %p
ret void
}
@@ -73,15 +73,15 @@ define void @vector_interleave_store_v4i64_v2i64(<2 x i64> %a, <2 x i64> %b, ptr
; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; CHECK-NEXT: vsseg2e64.v v8, (a0)
; CHECK-NEXT: ret
- %res = call <4 x i64> @llvm.experimental.vector.interleave2.v4i64(<2 x i64> %a, <2 x i64> %b)
+ %res = call <4 x i64> @llvm.vector.interleave2.v4i64(<2 x i64> %a, <2 x i64> %b)
store <4 x i64> %res, ptr %p
ret void
}
-declare <32 x i1> @llvm.experimental.vector.interleave2.v32i1(<16 x i1>, <16 x i1>)
-declare <16 x i16> @llvm.experimental.vector.interleave2.v16i16(<8 x i16>, <8 x i16>)
-declare <8 x i32> @llvm.experimental.vector.interleave2.v8i32(<4 x i32>, <4 x i32>)
-declare <4 x i64> @llvm.experimental.vector.interleave2.v4i64(<2 x i64>, <2 x i64>)
+declare <32 x i1> @llvm.vector.interleave2.v32i1(<16 x i1>, <16 x i1>)
+declare <16 x i16> @llvm.vector.interleave2.v16i16(<8 x i16>, <8 x i16>)
+declare <8 x i32> @llvm.vector.interleave2.v8i32(<4 x i32>, <4 x i32>)
+declare <4 x i64> @llvm.vector.interleave2.v4i64(<2 x i64>, <2 x i64>)
; Floats
@@ -91,7 +91,7 @@ define void @vector_interleave_store_v4f16_v2f16(<2 x half> %a, <2 x half> %b, p
; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
; CHECK-NEXT: vsseg2e16.v v8, (a0)
; CHECK-NEXT: ret
- %res = call <4 x half> @llvm.experimental.vector.interleave2.v4f16(<2 x half> %a, <2 x half> %b)
+ %res = call <4 x half> @llvm.vector.interleave2.v4f16(<2 x half> %a, <2 x half> %b)
store <4 x half> %res, ptr %p
ret void
}
@@ -102,7 +102,7 @@ define void @vector_interleave_store_v8f16_v4f16(<4 x half> %a, <4 x half> %b, p
; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
; CHECK-NEXT: vsseg2e16.v v8, (a0)
; CHECK-NEXT: ret
- %res = call <8 x half> @llvm.experimental.vector.interleave2.v8f16(<4 x half> %a, <4 x half> %b)
+ %res = call <8 x half> @llvm.vector.interleave2.v8f16(<4 x half> %a, <4 x half> %b)
store <8 x half> %res, ptr %p
ret void
}
@@ -113,7 +113,7 @@ define void @vector_interleave_store_v4f32_v2f32(<2 x float> %a, <2 x float> %b,
; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT: vsseg2e32.v v8, (a0)
; CHECK-NEXT: ret
- %res = call <4 x float> @llvm.experimental.vector.interleave2.v4f32(<2 x float> %a, <2 x float> %b)
+ %res = call <4 x float> @llvm.vector.interleave2.v4f32(<2 x float> %a, <2 x float> %b)
store <4 x float> %res, ptr %p
ret void
}
@@ -124,7 +124,7 @@ define void @vector_interleave_store_v16f16_v8f16(<8 x half> %a, <8 x half> %b,
; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; CHECK-NEXT: vsseg2e16.v v8, (a0)
; CHECK-NEXT: ret
- %res = call <16 x half> @llvm.experimental.vector.interleave2.v16f16(<8 x half> %a, <8 x half> %b)
+ %res = call <16 x half> @llvm.vector.interleave2.v16f16(<8 x half> %a, <8 x half> %b)
store <16 x half> %res, ptr %p
ret void
}
@@ -135,7 +135,7 @@ define void @vector_interleave_store_v8f32_v4f32(<4 x float> %a, <4 x float> %b,
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: vsseg2e32.v v8, (a0)
; CHECK-NEXT: ret
- %res = call <8 x float> @llvm.experimental.vector.interleave2.v8f32(<4 x float> %a, <4 x float> %b)
+ %res = call <8 x float> @llvm.vector.interleave2.v8f32(<4 x float> %a, <4 x float> %b)
store <8 x float> %res, ptr %p
ret void
}
@@ -146,15 +146,15 @@ define void @vector_interleave_store_v4f64_v2f64(<2 x double> %a, <2 x double> %
; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; CHECK-NEXT: vsseg2e64.v v8, (a0)
; CHECK-NEXT: ret
- %res = call <4 x double> @llvm.experimental.vector.interleave2.v4f64(<2 x double> %a, <2 x double> %b)
+ %res = call <4 x double> @llvm.vector.interleave2.v4f64(<2 x double> %a, <2 x double> %b)
store <4 x double> %res, ptr %p
ret void
}
-declare <4 x half> @llvm.experimental.vector.interleave2.v4f16(<2 x half>, <2 x half>)
-declare <8 x half> @llvm.experimental.vector.interleave2.v8f16(<4 x half>, <4 x half>)
-declare <4 x float> @llvm.experimental.vector.interleave2.v4f32(<2 x float>, <2 x float>)
-declare <16 x half> @llvm.experimental.vector.interleave2.v16f16(<8 x half>, <8 x half>)
-declare <8 x float> @llvm.experimental.vector.interleave2.v8f32(<4 x float>, <4 x float>)
-declare <4 x double> @llvm.experimental.vector.interleave2.v4f64(<2 x double>, <2 x double>)
+declare <4 x half> @llvm.vector.interleave2.v4f16(<2 x half>, <2 x half>)
+declare <8 x half> @llvm.vector.interleave2.v8f16(<4 x half>, <4 x half>)
+declare <4 x float> @llvm.vector.interleave2.v4f32(<2 x float>, <2 x float>)
+declare <16 x half> @llvm.vector.interleave2.v16f16(<8 x half>, <8 x half>)
+declare <8 x float> @llvm.vector.interleave2.v8f32(<4 x float>, <4 x float>)
+declare <4 x double> @llvm.vector.interleave2.v4f64(<2 x double>, <2 x double>)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse-bitrotate.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse-bitrotate.ll
index d4c0477408fd..a81f740f1739 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse-bitrotate.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse-bitrotate.ll
@@ -16,8 +16,8 @@ define <256 x i1> @reverse_v256i1(<256 x i1> %a) {
; CHECK-NEXT: vrgather.vv v12, v8, v10
; CHECK-NEXT: vmsne.vi v0, v12, 0
; CHECK-NEXT: ret
- %res = call <256 x i1> @llvm.experimental.vector.reverse.v256i1(<256 x i1> %a)
+ %res = call <256 x i1> @llvm.vector.reverse.v256i1(<256 x i1> %a)
ret <256 x i1> %res
}
-declare <256 x i1> @llvm.experimental.vector.reverse.v256i1(<256 x i1>)
+declare <256 x i1> @llvm.vector.reverse.v256i1(<256 x i1>)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll
index 8f9f1c2729fc..47d7baade8b4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll
@@ -29,7 +29,7 @@ define <2 x i1> @reverse_v2i1(<2 x i1> %a) {
; ZVBB-NEXT: vbrev.v v8, v0
; ZVBB-NEXT: vsrl.vi v0, v8, 6
; ZVBB-NEXT: ret
- %res = call <2 x i1> @llvm.experimental.vector.reverse.v2i1(<2 x i1> %a)
+ %res = call <2 x i1> @llvm.vector.reverse.v2i1(<2 x i1> %a)
ret <2 x i1> %res
}
@@ -51,7 +51,7 @@ define <4 x i1> @reverse_v4i1(<4 x i1> %a) {
; ZVBB-NEXT: vbrev.v v8, v0
; ZVBB-NEXT: vsrl.vi v0, v8, 4
; ZVBB-NEXT: ret
- %res = call <4 x i1> @llvm.experimental.vector.reverse.v4i1(<4 x i1> %a)
+ %res = call <4 x i1> @llvm.vector.reverse.v4i1(<4 x i1> %a)
ret <4 x i1> %res
}
@@ -72,7 +72,7 @@ define <8 x i1> @reverse_v8i1(<8 x i1> %a) {
; ZVBB-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
; ZVBB-NEXT: vbrev.v v0, v0
; ZVBB-NEXT: ret
- %res = call <8 x i1> @llvm.experimental.vector.reverse.v8i1(<8 x i1> %a)
+ %res = call <8 x i1> @llvm.vector.reverse.v8i1(<8 x i1> %a)
ret <8 x i1> %res
}
@@ -93,7 +93,7 @@ define <16 x i1> @reverse_v16i1(<16 x i1> %a) {
; ZVBB-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
; ZVBB-NEXT: vbrev.v v0, v0
; ZVBB-NEXT: ret
- %res = call <16 x i1> @llvm.experimental.vector.reverse.v16i1(<16 x i1> %a)
+ %res = call <16 x i1> @llvm.vector.reverse.v16i1(<16 x i1> %a)
ret <16 x i1> %res
}
@@ -116,7 +116,7 @@ define <32 x i1> @reverse_v32i1(<32 x i1> %a) {
; ZVBB-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
; ZVBB-NEXT: vbrev.v v0, v0
; ZVBB-NEXT: ret
- %res = call <32 x i1> @llvm.experimental.vector.reverse.v32i1(<32 x i1> %a)
+ %res = call <32 x i1> @llvm.vector.reverse.v32i1(<32 x i1> %a)
ret <32 x i1> %res
}
@@ -139,7 +139,7 @@ define <64 x i1> @reverse_v64i1(<64 x i1> %a) {
; ZVBB-NEXT: vsetivli zero, 1, e64, m1, ta, ma
; ZVBB-NEXT: vbrev.v v0, v0
; ZVBB-NEXT: ret
- %res = call <64 x i1> @llvm.experimental.vector.reverse.v64i1(<64 x i1> %a)
+ %res = call <64 x i1> @llvm.vector.reverse.v64i1(<64 x i1> %a)
ret <64 x i1> %res
}
@@ -156,7 +156,7 @@ define <128 x i1> @reverse_v128i1(<128 x i1> %a) {
; CHECK-NEXT: vrgather.vv v24, v16, v8
; CHECK-NEXT: vmsne.vi v0, v24, 0
; CHECK-NEXT: ret
- %res = call <128 x i1> @llvm.experimental.vector.reverse.v128i1(<128 x i1> %a)
+ %res = call <128 x i1> @llvm.vector.reverse.v128i1(<128 x i1> %a)
ret <128 x i1> %res
}
@@ -164,7 +164,7 @@ define <1 x i8> @reverse_v1i8(<1 x i8> %a) {
; CHECK-LABEL: reverse_v1i8:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
- %res = call <1 x i8> @llvm.experimental.vector.reverse.v1i8(<1 x i8> %a)
+ %res = call <1 x i8> @llvm.vector.reverse.v1i8(<1 x i8> %a)
ret <1 x i8> %res
}
@@ -182,7 +182,7 @@ define <2 x i8> @reverse_v2i8(<2 x i8> %a) {
; ZVBB-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
; ZVBB-NEXT: vrev8.v v8, v8
; ZVBB-NEXT: ret
- %res = call <2 x i8> @llvm.experimental.vector.reverse.v2i8(<2 x i8> %a)
+ %res = call <2 x i8> @llvm.vector.reverse.v2i8(<2 x i8> %a)
ret <2 x i8> %res
}
@@ -195,7 +195,7 @@ define <4 x i8> @reverse_v4i8(<4 x i8> %a) {
; CHECK-NEXT: vrgather.vv v9, v8, v10
; CHECK-NEXT: vmv1r.v v8, v9
; CHECK-NEXT: ret
- %res = call <4 x i8> @llvm.experimental.vector.reverse.v4i8(<4 x i8> %a)
+ %res = call <4 x i8> @llvm.vector.reverse.v4i8(<4 x i8> %a)
ret <4 x i8> %res
}
@@ -208,7 +208,7 @@ define <8 x i8> @reverse_v8i8(<8 x i8> %a) {
; CHECK-NEXT: vrgather.vv v9, v8, v10
; CHECK-NEXT: vmv1r.v v8, v9
; CHECK-NEXT: ret
- %res = call <8 x i8> @llvm.experimental.vector.reverse.v8i8(<8 x i8> %a)
+ %res = call <8 x i8> @llvm.vector.reverse.v8i8(<8 x i8> %a)
ret <8 x i8> %res
}
@@ -221,7 +221,7 @@ define <16 x i8> @reverse_v16i8(<16 x i8> %a) {
; CHECK-NEXT: vrgather.vv v9, v8, v10
; CHECK-NEXT: vmv.v.v v8, v9
; CHECK-NEXT: ret
- %res = call <16 x i8> @llvm.experimental.vector.reverse.v16i8(<16 x i8> %a)
+ %res = call <16 x i8> @llvm.vector.reverse.v16i8(<16 x i8> %a)
ret <16 x i8> %res
}
@@ -236,7 +236,7 @@ define <32 x i8> @reverse_v32i8(<32 x i8> %a) {
; CHECK-NEXT: vrgather.vv v10, v8, v12
; CHECK-NEXT: vmv.v.v v8, v10
; CHECK-NEXT: ret
- %res = call <32 x i8> @llvm.experimental.vector.reverse.v32i8(<32 x i8> %a)
+ %res = call <32 x i8> @llvm.vector.reverse.v32i8(<32 x i8> %a)
ret <32 x i8> %res
}
@@ -251,7 +251,7 @@ define <64 x i8> @reverse_v64i8(<64 x i8> %a) {
; CHECK-NEXT: vrgather.vv v12, v8, v16
; CHECK-NEXT: vmv.v.v v8, v12
; CHECK-NEXT: ret
- %res = call <64 x i8> @llvm.experimental.vector.reverse.v64i8(<64 x i8> %a)
+ %res = call <64 x i8> @llvm.vector.reverse.v64i8(<64 x i8> %a)
ret <64 x i8> %res
}
@@ -259,7 +259,7 @@ define <1 x i16> @reverse_v1i16(<1 x i16> %a) {
; CHECK-LABEL: reverse_v1i16:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
- %res = call <1 x i16> @llvm.experimental.vector.reverse.v1i16(<1 x i16> %a)
+ %res = call <1 x i16> @llvm.vector.reverse.v1i16(<1 x i16> %a)
ret <1 x i16> %res
}
@@ -277,7 +277,7 @@ define <2 x i16> @reverse_v2i16(<2 x i16> %a) {
; ZVBB-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
; ZVBB-NEXT: vror.vi v8, v8, 16
; ZVBB-NEXT: ret
- %res = call <2 x i16> @llvm.experimental.vector.reverse.v2i16(<2 x i16> %a)
+ %res = call <2 x i16> @llvm.vector.reverse.v2i16(<2 x i16> %a)
ret <2 x i16> %res
}
@@ -290,7 +290,7 @@ define <4 x i16> @reverse_v4i16(<4 x i16> %a) {
; CHECK-NEXT: vrgather.vv v9, v8, v10
; CHECK-NEXT: vmv1r.v v8, v9
; CHECK-NEXT: ret
- %res = call <4 x i16> @llvm.experimental.vector.reverse.v4i16(<4 x i16> %a)
+ %res = call <4 x i16> @llvm.vector.reverse.v4i16(<4 x i16> %a)
ret <4 x i16> %res
}
@@ -303,7 +303,7 @@ define <8 x i16> @reverse_v8i16(<8 x i16> %a) {
; CHECK-NEXT: vrgather.vv v9, v8, v10
; CHECK-NEXT: vmv.v.v v8, v9
; CHECK-NEXT: ret
- %res = call <8 x i16> @llvm.experimental.vector.reverse.v8i16(<8 x i16> %a)
+ %res = call <8 x i16> @llvm.vector.reverse.v8i16(<8 x i16> %a)
ret <8 x i16> %res
}
@@ -316,7 +316,7 @@ define <16 x i16> @reverse_v16i16(<16 x i16> %a) {
; CHECK-NEXT: vrgather.vv v10, v8, v12
; CHECK-NEXT: vmv.v.v v8, v10
; CHECK-NEXT: ret
- %res = call <16 x i16> @llvm.experimental.vector.reverse.v16i16(<16 x i16> %a)
+ %res = call <16 x i16> @llvm.vector.reverse.v16i16(<16 x i16> %a)
ret <16 x i16> %res
}
@@ -332,7 +332,7 @@ define <32 x i16> @reverse_v32i16(<32 x i16> %a) {
; CHECK-NEXT: vrgather.vv v12, v8, v16
; CHECK-NEXT: vmv.v.v v8, v12
; CHECK-NEXT: ret
- %res = call <32 x i16> @llvm.experimental.vector.reverse.v32i16(<32 x i16> %a)
+ %res = call <32 x i16> @llvm.vector.reverse.v32i16(<32 x i16> %a)
ret <32 x i16> %res
}
@@ -340,7 +340,7 @@ define <1 x i32> @reverse_v1i32(<1 x i32> %a) {
; CHECK-LABEL: reverse_v1i32:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
- %res = call <1 x i32> @llvm.experimental.vector.reverse.v1i32(<1 x i32> %a)
+ %res = call <1 x i32> @llvm.vector.reverse.v1i32(<1 x i32> %a)
ret <1 x i32> %res
}
@@ -358,7 +358,7 @@ define <2 x i32> @reverse_v2i32(<2 x i32> %a) {
; ZVBB-NEXT: vsetivli zero, 1, e64, m1, ta, ma
; ZVBB-NEXT: vror.vi v8, v8, 32
; ZVBB-NEXT: ret
- %res = call <2 x i32> @llvm.experimental.vector.reverse.v2i32(<2 x i32> %a)
+ %res = call <2 x i32> @llvm.vector.reverse.v2i32(<2 x i32> %a)
ret <2 x i32> %res
}
@@ -371,7 +371,7 @@ define <4 x i32> @reverse_v4i32(<4 x i32> %a) {
; CHECK-NEXT: vrgather.vv v9, v8, v10
; CHECK-NEXT: vmv.v.v v8, v9
; CHECK-NEXT: ret
- %res = call <4 x i32> @llvm.experimental.vector.reverse.v4i32(<4 x i32> %a)
+ %res = call <4 x i32> @llvm.vector.reverse.v4i32(<4 x i32> %a)
ret <4 x i32> %res
}
@@ -385,7 +385,7 @@ define <8 x i32> @reverse_v8i32(<8 x i32> %a) {
; CHECK-NEXT: vrgatherei16.vv v10, v8, v12
; CHECK-NEXT: vmv.v.v v8, v10
; CHECK-NEXT: ret
- %res = call <8 x i32> @llvm.experimental.vector.reverse.v8i32(<8 x i32> %a)
+ %res = call <8 x i32> @llvm.vector.reverse.v8i32(<8 x i32> %a)
ret <8 x i32> %res
}
@@ -399,7 +399,7 @@ define <16 x i32> @reverse_v16i32(<16 x i32> %a) {
; CHECK-NEXT: vrgatherei16.vv v12, v8, v16
; CHECK-NEXT: vmv.v.v v8, v12
; CHECK-NEXT: ret
- %res = call <16 x i32> @llvm.experimental.vector.reverse.v16i32(<16 x i32> %a)
+ %res = call <16 x i32> @llvm.vector.reverse.v16i32(<16 x i32> %a)
ret <16 x i32> %res
}
@@ -407,7 +407,7 @@ define <1 x i64> @reverse_v1i64(<1 x i64> %a) {
; CHECK-LABEL: reverse_v1i64:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
- %res = call <1 x i64> @llvm.experimental.vector.reverse.v1i64(<1 x i64> %a)
+ %res = call <1 x i64> @llvm.vector.reverse.v1i64(<1 x i64> %a)
ret <1 x i64> %res
}
@@ -419,7 +419,7 @@ define <2 x i64> @reverse_v2i64(<2 x i64> %a) {
; CHECK-NEXT: vslideup.vi v9, v8, 1
; CHECK-NEXT: vmv.v.v v8, v9
; CHECK-NEXT: ret
- %res = call <2 x i64> @llvm.experimental.vector.reverse.v2i64(<2 x i64> %a)
+ %res = call <2 x i64> @llvm.vector.reverse.v2i64(<2 x i64> %a)
ret <2 x i64> %res
}
@@ -433,7 +433,7 @@ define <4 x i64> @reverse_v4i64(<4 x i64> %a) {
; CHECK-NEXT: vrgatherei16.vv v10, v8, v12
; CHECK-NEXT: vmv.v.v v8, v10
; CHECK-NEXT: ret
- %res = call <4 x i64> @llvm.experimental.vector.reverse.v4i64(<4 x i64> %a)
+ %res = call <4 x i64> @llvm.vector.reverse.v4i64(<4 x i64> %a)
ret <4 x i64> %res
}
@@ -447,7 +447,7 @@ define <8 x i64> @reverse_v8i64(<8 x i64> %a) {
; CHECK-NEXT: vrgatherei16.vv v12, v8, v16
; CHECK-NEXT: vmv.v.v v8, v12
; CHECK-NEXT: ret
- %res = call <8 x i64> @llvm.experimental.vector.reverse.v8i64(<8 x i64> %a)
+ %res = call <8 x i64> @llvm.vector.reverse.v8i64(<8 x i64> %a)
ret <8 x i64> %res
}
@@ -456,7 +456,7 @@ define <1 x half> @reverse_v1f16(<1 x half> %a) {
; CHECK-LABEL: reverse_v1f16:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
- %res = call <1 x half> @llvm.experimental.vector.reverse.v1f16(<1 x half> %a)
+ %res = call <1 x half> @llvm.vector.reverse.v1f16(<1 x half> %a)
ret <1 x half> %res
}
@@ -474,7 +474,7 @@ define <2 x half> @reverse_v2f16(<2 x half> %a) {
; ZVBB-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
; ZVBB-NEXT: vror.vi v8, v8, 16
; ZVBB-NEXT: ret
- %res = call <2 x half> @llvm.experimental.vector.reverse.v2f16(<2 x half> %a)
+ %res = call <2 x half> @llvm.vector.reverse.v2f16(<2 x half> %a)
ret <2 x half> %res
}
@@ -487,7 +487,7 @@ define <4 x half> @reverse_v4f16(<4 x half> %a) {
; CHECK-NEXT: vrgather.vv v9, v8, v10
; CHECK-NEXT: vmv1r.v v8, v9
; CHECK-NEXT: ret
- %res = call <4 x half> @llvm.experimental.vector.reverse.v4f16(<4 x half> %a)
+ %res = call <4 x half> @llvm.vector.reverse.v4f16(<4 x half> %a)
ret <4 x half> %res
}
@@ -500,7 +500,7 @@ define <8 x half> @reverse_v8f16(<8 x half> %a) {
; CHECK-NEXT: vrgather.vv v9, v8, v10
; CHECK-NEXT: vmv.v.v v8, v9
; CHECK-NEXT: ret
- %res = call <8 x half> @llvm.experimental.vector.reverse.v8f16(<8 x half> %a)
+ %res = call <8 x half> @llvm.vector.reverse.v8f16(<8 x half> %a)
ret <8 x half> %res
}
@@ -513,7 +513,7 @@ define <16 x half> @reverse_v16f16(<16 x half> %a) {
; CHECK-NEXT: vrgather.vv v10, v8, v12
; CHECK-NEXT: vmv.v.v v8, v10
; CHECK-NEXT: ret
- %res = call <16 x half> @llvm.experimental.vector.reverse.v16f16(<16 x half> %a)
+ %res = call <16 x half> @llvm.vector.reverse.v16f16(<16 x half> %a)
ret <16 x half> %res
}
@@ -529,7 +529,7 @@ define <32 x half> @reverse_v32f16(<32 x half> %a) {
; CHECK-NEXT: vrgather.vv v12, v8, v16
; CHECK-NEXT: vmv.v.v v8, v12
; CHECK-NEXT: ret
- %res = call <32 x half> @llvm.experimental.vector.reverse.v32f16(<32 x half> %a)
+ %res = call <32 x half> @llvm.vector.reverse.v32f16(<32 x half> %a)
ret <32 x half> %res
}
@@ -537,7 +537,7 @@ define <1 x float> @reverse_v1f32(<1 x float> %a) {
; CHECK-LABEL: reverse_v1f32:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
- %res = call <1 x float> @llvm.experimental.vector.reverse.v1f32(<1 x float> %a)
+ %res = call <1 x float> @llvm.vector.reverse.v1f32(<1 x float> %a)
ret <1 x float> %res
}
@@ -555,7 +555,7 @@ define <2 x float> @reverse_v2f32(<2 x float> %a) {
; ZVBB-NEXT: vsetivli zero, 1, e64, m1, ta, ma
; ZVBB-NEXT: vror.vi v8, v8, 32
; ZVBB-NEXT: ret
- %res = call <2 x float> @llvm.experimental.vector.reverse.v2f32(<2 x float> %a)
+ %res = call <2 x float> @llvm.vector.reverse.v2f32(<2 x float> %a)
ret <2 x float> %res
}
@@ -568,7 +568,7 @@ define <4 x float> @reverse_v4f32(<4 x float> %a) {
; CHECK-NEXT: vrgather.vv v9, v8, v10
; CHECK-NEXT: vmv.v.v v8, v9
; CHECK-NEXT: ret
- %res = call <4 x float> @llvm.experimental.vector.reverse.v4f32(<4 x float> %a)
+ %res = call <4 x float> @llvm.vector.reverse.v4f32(<4 x float> %a)
ret <4 x float> %res
}
@@ -582,7 +582,7 @@ define <8 x float> @reverse_v8f32(<8 x float> %a) {
; CHECK-NEXT: vrgatherei16.vv v10, v8, v12
; CHECK-NEXT: vmv.v.v v8, v10
; CHECK-NEXT: ret
- %res = call <8 x float> @llvm.experimental.vector.reverse.v8f32(<8 x float> %a)
+ %res = call <8 x float> @llvm.vector.reverse.v8f32(<8 x float> %a)
ret <8 x float> %res
}
@@ -596,7 +596,7 @@ define <16 x float> @reverse_v16f32(<16 x float> %a) {
; CHECK-NEXT: vrgatherei16.vv v12, v8, v16
; CHECK-NEXT: vmv.v.v v8, v12
; CHECK-NEXT: ret
- %res = call <16 x float> @llvm.experimental.vector.reverse.v16f32(<16 x float> %a)
+ %res = call <16 x float> @llvm.vector.reverse.v16f32(<16 x float> %a)
ret <16 x float> %res
}
@@ -604,7 +604,7 @@ define <1 x double> @reverse_v1f64(<1 x double> %a) {
; CHECK-LABEL: reverse_v1f64:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
- %res = call <1 x double> @llvm.experimental.vector.reverse.v1f64(<1 x double> %a)
+ %res = call <1 x double> @llvm.vector.reverse.v1f64(<1 x double> %a)
ret <1 x double> %res
}
@@ -616,7 +616,7 @@ define <2 x double> @reverse_v2f64(<2 x double> %a) {
; CHECK-NEXT: vslideup.vi v9, v8, 1
; CHECK-NEXT: vmv.v.v v8, v9
; CHECK-NEXT: ret
- %res = call <2 x double> @llvm.experimental.vector.reverse.v2f64(<2 x double> %a)
+ %res = call <2 x double> @llvm.vector.reverse.v2f64(<2 x double> %a)
ret <2 x double> %res
}
@@ -630,7 +630,7 @@ define <4 x double> @reverse_v4f64(<4 x double> %a) {
; CHECK-NEXT: vrgatherei16.vv v10, v8, v12
; CHECK-NEXT: vmv.v.v v8, v10
; CHECK-NEXT: ret
- %res = call <4 x double> @llvm.experimental.vector.reverse.v4f64(<4 x double> %a)
+ %res = call <4 x double> @llvm.vector.reverse.v4f64(<4 x double> %a)
ret <4 x double> %res
}
@@ -644,7 +644,7 @@ define <8 x double> @reverse_v8f64(<8 x double> %a) {
; CHECK-NEXT: vrgatherei16.vv v12, v8, v16
; CHECK-NEXT: vmv.v.v v8, v12
; CHECK-NEXT: ret
- %res = call <8 x double> @llvm.experimental.vector.reverse.v8f64(<8 x double> %a)
+ %res = call <8 x double> @llvm.vector.reverse.v8f64(<8 x double> %a)
ret <8 x double> %res
}
@@ -729,7 +729,7 @@ define <3 x i64> @reverse_v3i64(<3 x i64> %a) {
; RV64-ZVBB-NEXT: vrgatherei16.vv v10, v8, v12
; RV64-ZVBB-NEXT: vmv.v.v v8, v10
; RV64-ZVBB-NEXT: ret
- %res = call <3 x i64> @llvm.experimental.vector.reverse.v3i64(<3 x i64> %a)
+ %res = call <3 x i64> @llvm.vector.reverse.v3i64(<3 x i64> %a)
ret <3 x i64> %res
}
@@ -813,7 +813,7 @@ define <6 x i64> @reverse_v6i64(<6 x i64> %a) {
; RV64-ZVBB-NEXT: vrgatherei16.vv v12, v8, v16
; RV64-ZVBB-NEXT: vmv.v.v v8, v12
; RV64-ZVBB-NEXT: ret
- %res = call <6 x i64> @llvm.experimental.vector.reverse.v6i64(<6 x i64> %a)
+ %res = call <6 x i64> @llvm.vector.reverse.v6i64(<6 x i64> %a)
ret <6 x i64> %res
}
@@ -901,54 +901,54 @@ define <12 x i64> @reverse_v12i64(<12 x i64> %a) {
; RV64-ZVBB-NEXT: vrgatherei16.vv v16, v8, v24
; RV64-ZVBB-NEXT: vmv.v.v v8, v16
; RV64-ZVBB-NEXT: ret
- %res = call <12 x i64> @llvm.experimental.vector.reverse.v12i64(<12 x i64> %a)
+ %res = call <12 x i64> @llvm.vector.reverse.v12i64(<12 x i64> %a)
ret <12 x i64> %res
}
-declare <2 x i1> @llvm.experimental.vector.reverse.v2i1(<2 x i1>)
-declare <4 x i1> @llvm.experimental.vector.reverse.v4i1(<4 x i1>)
-declare <8 x i1> @llvm.experimental.vector.reverse.v8i1(<8 x i1>)
-declare <16 x i1> @llvm.experimental.vector.reverse.v16i1(<16 x i1>)
-declare <32 x i1> @llvm.experimental.vector.reverse.v32i1(<32 x i1>)
-declare <64 x i1> @llvm.experimental.vector.reverse.v64i1(<64 x i1>)
-declare <128 x i1> @llvm.experimental.vector.reverse.v128i1(<128 x i1>)
-declare <1 x i8> @llvm.experimental.vector.reverse.v1i8(<1 x i8>)
-declare <2 x i8> @llvm.experimental.vector.reverse.v2i8(<2 x i8>)
-declare <4 x i8> @llvm.experimental.vector.reverse.v4i8(<4 x i8>)
-declare <8 x i8> @llvm.experimental.vector.reverse.v8i8(<8 x i8>)
-declare <16 x i8> @llvm.experimental.vector.reverse.v16i8(<16 x i8>)
-declare <32 x i8> @llvm.experimental.vector.reverse.v32i8(<32 x i8>)
-declare <64 x i8> @llvm.experimental.vector.reverse.v64i8(<64 x i8>)
-declare <1 x i16> @llvm.experimental.vector.reverse.v1i16(<1 x i16>)
-declare <2 x i16> @llvm.experimental.vector.reverse.v2i16(<2 x i16>)
-declare <4 x i16> @llvm.experimental.vector.reverse.v4i16(<4 x i16>)
-declare <8 x i16> @llvm.experimental.vector.reverse.v8i16(<8 x i16>)
-declare <16 x i16> @llvm.experimental.vector.reverse.v16i16(<16 x i16>)
-declare <32 x i16> @llvm.experimental.vector.reverse.v32i16(<32 x i16>)
-declare <1 x i32> @llvm.experimental.vector.reverse.v1i32(<1 x i32>)
-declare <2 x i32> @llvm.experimental.vector.reverse.v2i32(<2 x i32>)
-declare <4 x i32> @llvm.experimental.vector.reverse.v4i32(<4 x i32>)
-declare <8 x i32> @llvm.experimental.vector.reverse.v8i32(<8 x i32>)
-declare <16 x i32> @llvm.experimental.vector.reverse.v16i32(<16 x i32>)
-declare <1 x i64> @llvm.experimental.vector.reverse.v1i64(<1 x i64>)
-declare <2 x i64> @llvm.experimental.vector.reverse.v2i64(<2 x i64>)
-declare <4 x i64> @llvm.experimental.vector.reverse.v4i64(<4 x i64>)
-declare <8 x i64> @llvm.experimental.vector.reverse.v8i64(<8 x i64>)
-declare <1 x half> @llvm.experimental.vector.reverse.v1f16(<1 x half>)
-declare <2 x half> @llvm.experimental.vector.reverse.v2f16(<2 x half>)
-declare <4 x half> @llvm.experimental.vector.reverse.v4f16(<4 x half>)
-declare <8 x half> @llvm.experimental.vector.reverse.v8f16(<8 x half>)
-declare <16 x half> @llvm.experimental.vector.reverse.v16f16(<16 x half>)
-declare <32 x half> @llvm.experimental.vector.reverse.v32f16(<32 x half>)
-declare <1 x float> @llvm.experimental.vector.reverse.v1f32(<1 x float>)
-declare <2 x float> @llvm.experimental.vector.reverse.v2f32(<2 x float>)
-declare <4 x float> @llvm.experimental.vector.reverse.v4f32(<4 x float>)
-declare <8 x float> @llvm.experimental.vector.reverse.v8f32(<8 x float>)
-declare <16 x float> @llvm.experimental.vector.reverse.v16f32(<16 x float>)
-declare <1 x double> @llvm.experimental.vector.reverse.v1f64(<1 x double>)
-declare <2 x double> @llvm.experimental.vector.reverse.v2f64(<2 x double>)
-declare <4 x double> @llvm.experimental.vector.reverse.v4f64(<4 x double>)
-declare <8 x double> @llvm.experimental.vector.reverse.v8f64(<8 x double>)
-declare <3 x i64> @llvm.experimental.vector.reverse.v3i64(<3 x i64>)
-declare <6 x i64> @llvm.experimental.vector.reverse.v6i64(<6 x i64>)
-declare <12 x i64> @llvm.experimental.vector.reverse.v12i64(<12 x i64>)
+declare <2 x i1> @llvm.vector.reverse.v2i1(<2 x i1>)
+declare <4 x i1> @llvm.vector.reverse.v4i1(<4 x i1>)
+declare <8 x i1> @llvm.vector.reverse.v8i1(<8 x i1>)
+declare <16 x i1> @llvm.vector.reverse.v16i1(<16 x i1>)
+declare <32 x i1> @llvm.vector.reverse.v32i1(<32 x i1>)
+declare <64 x i1> @llvm.vector.reverse.v64i1(<64 x i1>)
+declare <128 x i1> @llvm.vector.reverse.v128i1(<128 x i1>)
+declare <1 x i8> @llvm.vector.reverse.v1i8(<1 x i8>)
+declare <2 x i8> @llvm.vector.reverse.v2i8(<2 x i8>)
+declare <4 x i8> @llvm.vector.reverse.v4i8(<4 x i8>)
+declare <8 x i8> @llvm.vector.reverse.v8i8(<8 x i8>)
+declare <16 x i8> @llvm.vector.reverse.v16i8(<16 x i8>)
+declare <32 x i8> @llvm.vector.reverse.v32i8(<32 x i8>)
+declare <64 x i8> @llvm.vector.reverse.v64i8(<64 x i8>)
+declare <1 x i16> @llvm.vector.reverse.v1i16(<1 x i16>)
+declare <2 x i16> @llvm.vector.reverse.v2i16(<2 x i16>)
+declare <4 x i16> @llvm.vector.reverse.v4i16(<4 x i16>)
+declare <8 x i16> @llvm.vector.reverse.v8i16(<8 x i16>)
+declare <16 x i16> @llvm.vector.reverse.v16i16(<16 x i16>)
+declare <32 x i16> @llvm.vector.reverse.v32i16(<32 x i16>)
+declare <1 x i32> @llvm.vector.reverse.v1i32(<1 x i32>)
+declare <2 x i32> @llvm.vector.reverse.v2i32(<2 x i32>)
+declare <4 x i32> @llvm.vector.reverse.v4i32(<4 x i32>)
+declare <8 x i32> @llvm.vector.reverse.v8i32(<8 x i32>)
+declare <16 x i32> @llvm.vector.reverse.v16i32(<16 x i32>)
+declare <1 x i64> @llvm.vector.reverse.v1i64(<1 x i64>)
+declare <2 x i64> @llvm.vector.reverse.v2i64(<2 x i64>)
+declare <4 x i64> @llvm.vector.reverse.v4i64(<4 x i64>)
+declare <8 x i64> @llvm.vector.reverse.v8i64(<8 x i64>)
+declare <1 x half> @llvm.vector.reverse.v1f16(<1 x half>)
+declare <2 x half> @llvm.vector.reverse.v2f16(<2 x half>)
+declare <4 x half> @llvm.vector.reverse.v4f16(<4 x half>)
+declare <8 x half> @llvm.vector.reverse.v8f16(<8 x half>)
+declare <16 x half> @llvm.vector.reverse.v16f16(<16 x half>)
+declare <32 x half> @llvm.vector.reverse.v32f16(<32 x half>)
+declare <1 x float> @llvm.vector.reverse.v1f32(<1 x float>)
+declare <2 x float> @llvm.vector.reverse.v2f32(<2 x float>)
+declare <4 x float> @llvm.vector.reverse.v4f32(<4 x float>)
+declare <8 x float> @llvm.vector.reverse.v8f32(<8 x float>)
+declare <16 x float> @llvm.vector.reverse.v16f32(<16 x float>)
+declare <1 x double> @llvm.vector.reverse.v1f64(<1 x double>)
+declare <2 x double> @llvm.vector.reverse.v2f64(<2 x double>)
+declare <4 x double> @llvm.vector.reverse.v4f64(<4 x double>)
+declare <8 x double> @llvm.vector.reverse.v8f64(<8 x double>)
+declare <3 x i64> @llvm.vector.reverse.v3i64(<3 x i64>)
+declare <6 x i64> @llvm.vector.reverse.v6i64(<6 x i64>)
+declare <12 x i64> @llvm.vector.reverse.v12i64(<12 x i64>)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll b/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll
index b3bda5973eb8..a6b2d3141f22 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll
@@ -2190,65 +2190,66 @@ define <2 x i64> @stest_f64i64(<2 x double> %x) {
; CHECK-NOV-NEXT: .cfi_offset s0, -16
; CHECK-NOV-NEXT: .cfi_offset s1, -24
; CHECK-NOV-NEXT: .cfi_offset fs0, -32
-; CHECK-NOV-NEXT: fmv.d fs0, fa0
-; CHECK-NOV-NEXT: fmv.d fa0, fa1
+; CHECK-NOV-NEXT: fmv.d fs0, fa1
; CHECK-NOV-NEXT: call __fixdfti
; CHECK-NOV-NEXT: mv s0, a0
; CHECK-NOV-NEXT: mv s1, a1
; CHECK-NOV-NEXT: fmv.d fa0, fs0
; CHECK-NOV-NEXT: call __fixdfti
-; CHECK-NOV-NEXT: li a2, -1
-; CHECK-NOV-NEXT: srli a3, a2, 1
-; CHECK-NOV-NEXT: beqz s1, .LBB18_3
+; CHECK-NOV-NEXT: mv a2, a0
+; CHECK-NOV-NEXT: li a0, -1
+; CHECK-NOV-NEXT: srli a3, a0, 1
+; CHECK-NOV-NEXT: beqz a1, .LBB18_3
; CHECK-NOV-NEXT: # %bb.1: # %entry
-; CHECK-NOV-NEXT: slti a4, s1, 0
-; CHECK-NOV-NEXT: bnez a1, .LBB18_4
+; CHECK-NOV-NEXT: slti a4, a1, 0
+; CHECK-NOV-NEXT: bnez s1, .LBB18_4
; CHECK-NOV-NEXT: .LBB18_2:
-; CHECK-NOV-NEXT: sltu a5, a0, a3
+; CHECK-NOV-NEXT: sltu a5, s0, a3
; CHECK-NOV-NEXT: beqz a5, .LBB18_5
; CHECK-NOV-NEXT: j .LBB18_6
; CHECK-NOV-NEXT: .LBB18_3:
-; CHECK-NOV-NEXT: sltu a4, s0, a3
-; CHECK-NOV-NEXT: beqz a1, .LBB18_2
+; CHECK-NOV-NEXT: sltu a4, a2, a3
+; CHECK-NOV-NEXT: beqz s1, .LBB18_2
; CHECK-NOV-NEXT: .LBB18_4: # %entry
-; CHECK-NOV-NEXT: slti a5, a1, 0
+; CHECK-NOV-NEXT: slti a5, s1, 0
; CHECK-NOV-NEXT: bnez a5, .LBB18_6
; CHECK-NOV-NEXT: .LBB18_5: # %entry
-; CHECK-NOV-NEXT: mv a0, a3
+; CHECK-NOV-NEXT: mv s0, a3
; CHECK-NOV-NEXT: .LBB18_6: # %entry
; CHECK-NOV-NEXT: neg a6, a5
; CHECK-NOV-NEXT: neg a5, a4
-; CHECK-NOV-NEXT: and a5, a5, s1
+; CHECK-NOV-NEXT: and a5, a5, a1
; CHECK-NOV-NEXT: bnez a4, .LBB18_8
; CHECK-NOV-NEXT: # %bb.7: # %entry
-; CHECK-NOV-NEXT: mv s0, a3
+; CHECK-NOV-NEXT: mv a2, a3
; CHECK-NOV-NEXT: .LBB18_8: # %entry
-; CHECK-NOV-NEXT: and a4, a6, a1
-; CHECK-NOV-NEXT: slli a1, a2, 63
-; CHECK-NOV-NEXT: beq a5, a2, .LBB18_11
+; CHECK-NOV-NEXT: and a4, a6, s1
+; CHECK-NOV-NEXT: slli a1, a0, 63
+; CHECK-NOV-NEXT: beq a5, a0, .LBB18_11
; CHECK-NOV-NEXT: # %bb.9: # %entry
; CHECK-NOV-NEXT: slti a3, a5, 0
; CHECK-NOV-NEXT: xori a3, a3, 1
-; CHECK-NOV-NEXT: bne a4, a2, .LBB18_12
+; CHECK-NOV-NEXT: bne a4, a0, .LBB18_12
; CHECK-NOV-NEXT: .LBB18_10:
-; CHECK-NOV-NEXT: sltu a2, a1, a0
-; CHECK-NOV-NEXT: beqz a2, .LBB18_13
+; CHECK-NOV-NEXT: sltu a0, a1, s0
+; CHECK-NOV-NEXT: beqz a0, .LBB18_13
; CHECK-NOV-NEXT: j .LBB18_14
; CHECK-NOV-NEXT: .LBB18_11:
-; CHECK-NOV-NEXT: sltu a3, a1, s0
-; CHECK-NOV-NEXT: beq a4, a2, .LBB18_10
+; CHECK-NOV-NEXT: sltu a3, a1, a2
+; CHECK-NOV-NEXT: beq a4, a0, .LBB18_10
; CHECK-NOV-NEXT: .LBB18_12: # %entry
-; CHECK-NOV-NEXT: slti a2, a4, 0
-; CHECK-NOV-NEXT: xori a2, a2, 1
-; CHECK-NOV-NEXT: bnez a2, .LBB18_14
+; CHECK-NOV-NEXT: slti a0, a4, 0
+; CHECK-NOV-NEXT: xori a0, a0, 1
+; CHECK-NOV-NEXT: bnez a0, .LBB18_14
; CHECK-NOV-NEXT: .LBB18_13: # %entry
-; CHECK-NOV-NEXT: mv a0, a1
+; CHECK-NOV-NEXT: mv s0, a1
; CHECK-NOV-NEXT: .LBB18_14: # %entry
; CHECK-NOV-NEXT: bnez a3, .LBB18_16
; CHECK-NOV-NEXT: # %bb.15: # %entry
-; CHECK-NOV-NEXT: mv s0, a1
+; CHECK-NOV-NEXT: mv a2, a1
; CHECK-NOV-NEXT: .LBB18_16: # %entry
-; CHECK-NOV-NEXT: mv a1, s0
+; CHECK-NOV-NEXT: mv a0, s0
+; CHECK-NOV-NEXT: mv a1, a2
; CHECK-NOV-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
; CHECK-NOV-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
; CHECK-NOV-NEXT: ld s1, 8(sp) # 8-byte Folded Reload
@@ -2273,43 +2274,43 @@ define <2 x i64> @stest_f64i64(<2 x double> %x) {
; CHECK-V-NEXT: addi a0, sp, 32
; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill
; CHECK-V-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-V-NEXT: vfmv.f.s fa0, v8
+; CHECK-V-NEXT: vslidedown.vi v9, v8, 1
+; CHECK-V-NEXT: vfmv.f.s fa0, v9
; CHECK-V-NEXT: call __fixdfti
; CHECK-V-NEXT: mv s0, a0
; CHECK-V-NEXT: mv s1, a1
; CHECK-V-NEXT: vsetivli zero, 1, e64, m1, ta, ma
; CHECK-V-NEXT: addi a0, sp, 32
; CHECK-V-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT: vslidedown.vi v8, v8, 1
; CHECK-V-NEXT: vfmv.f.s fa0, v8
; CHECK-V-NEXT: call __fixdfti
; CHECK-V-NEXT: li a2, -1
; CHECK-V-NEXT: srli a3, a2, 1
-; CHECK-V-NEXT: beqz s1, .LBB18_3
+; CHECK-V-NEXT: beqz a1, .LBB18_3
; CHECK-V-NEXT: # %bb.1: # %entry
-; CHECK-V-NEXT: slti a4, s1, 0
-; CHECK-V-NEXT: bnez a1, .LBB18_4
+; CHECK-V-NEXT: slti a4, a1, 0
+; CHECK-V-NEXT: bnez s1, .LBB18_4
; CHECK-V-NEXT: .LBB18_2:
-; CHECK-V-NEXT: sltu a5, a0, a3
+; CHECK-V-NEXT: sltu a5, s0, a3
; CHECK-V-NEXT: beqz a5, .LBB18_5
; CHECK-V-NEXT: j .LBB18_6
; CHECK-V-NEXT: .LBB18_3:
-; CHECK-V-NEXT: sltu a4, s0, a3
-; CHECK-V-NEXT: beqz a1, .LBB18_2
+; CHECK-V-NEXT: sltu a4, a0, a3
+; CHECK-V-NEXT: beqz s1, .LBB18_2
; CHECK-V-NEXT: .LBB18_4: # %entry
-; CHECK-V-NEXT: slti a5, a1, 0
+; CHECK-V-NEXT: slti a5, s1, 0
; CHECK-V-NEXT: bnez a5, .LBB18_6
; CHECK-V-NEXT: .LBB18_5: # %entry
-; CHECK-V-NEXT: mv a0, a3
+; CHECK-V-NEXT: mv s0, a3
; CHECK-V-NEXT: .LBB18_6: # %entry
; CHECK-V-NEXT: neg a6, a5
; CHECK-V-NEXT: neg a5, a4
-; CHECK-V-NEXT: and a5, a5, s1
+; CHECK-V-NEXT: and a5, a5, a1
; CHECK-V-NEXT: bnez a4, .LBB18_8
; CHECK-V-NEXT: # %bb.7: # %entry
-; CHECK-V-NEXT: mv s0, a3
+; CHECK-V-NEXT: mv a0, a3
; CHECK-V-NEXT: .LBB18_8: # %entry
-; CHECK-V-NEXT: and a4, a6, a1
+; CHECK-V-NEXT: and a4, a6, s1
; CHECK-V-NEXT: slli a1, a2, 63
; CHECK-V-NEXT: beq a5, a2, .LBB18_11
; CHECK-V-NEXT: # %bb.9: # %entry
@@ -2317,26 +2318,26 @@ define <2 x i64> @stest_f64i64(<2 x double> %x) {
; CHECK-V-NEXT: xori a3, a3, 1
; CHECK-V-NEXT: bne a4, a2, .LBB18_12
; CHECK-V-NEXT: .LBB18_10:
-; CHECK-V-NEXT: sltu a2, a1, a0
+; CHECK-V-NEXT: sltu a2, a1, s0
; CHECK-V-NEXT: beqz a2, .LBB18_13
; CHECK-V-NEXT: j .LBB18_14
; CHECK-V-NEXT: .LBB18_11:
-; CHECK-V-NEXT: sltu a3, a1, s0
+; CHECK-V-NEXT: sltu a3, a1, a0
; CHECK-V-NEXT: beq a4, a2, .LBB18_10
; CHECK-V-NEXT: .LBB18_12: # %entry
; CHECK-V-NEXT: slti a2, a4, 0
; CHECK-V-NEXT: xori a2, a2, 1
; CHECK-V-NEXT: bnez a2, .LBB18_14
; CHECK-V-NEXT: .LBB18_13: # %entry
-; CHECK-V-NEXT: mv a0, a1
+; CHECK-V-NEXT: mv s0, a1
; CHECK-V-NEXT: .LBB18_14: # %entry
; CHECK-V-NEXT: bnez a3, .LBB18_16
; CHECK-V-NEXT: # %bb.15: # %entry
-; CHECK-V-NEXT: mv s0, a1
+; CHECK-V-NEXT: mv a0, a1
; CHECK-V-NEXT: .LBB18_16: # %entry
; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-V-NEXT: vmv.s.x v8, s0
-; CHECK-V-NEXT: vmv.s.x v9, a0
+; CHECK-V-NEXT: vmv.s.x v8, a0
+; CHECK-V-NEXT: vmv.s.x v9, s0
; CHECK-V-NEXT: vslideup.vi v8, v9, 1
; CHECK-V-NEXT: csrr a0, vlenb
; CHECK-V-NEXT: slli a0, a0, 1
@@ -2369,19 +2370,19 @@ define <2 x i64> @utest_f64i64(<2 x double> %x) {
; CHECK-NOV-NEXT: .cfi_offset s0, -16
; CHECK-NOV-NEXT: .cfi_offset s1, -24
; CHECK-NOV-NEXT: .cfi_offset fs0, -32
-; CHECK-NOV-NEXT: fmv.d fs0, fa0
-; CHECK-NOV-NEXT: fmv.d fa0, fa1
+; CHECK-NOV-NEXT: fmv.d fs0, fa1
; CHECK-NOV-NEXT: call __fixunsdfti
; CHECK-NOV-NEXT: mv s0, a0
; CHECK-NOV-NEXT: mv s1, a1
; CHECK-NOV-NEXT: fmv.d fa0, fs0
; CHECK-NOV-NEXT: call __fixunsdfti
-; CHECK-NOV-NEXT: snez a2, s1
; CHECK-NOV-NEXT: snez a1, a1
+; CHECK-NOV-NEXT: snez a2, s1
+; CHECK-NOV-NEXT: addi a2, a2, -1
+; CHECK-NOV-NEXT: and a2, a2, s0
; CHECK-NOV-NEXT: addi a1, a1, -1
-; CHECK-NOV-NEXT: and a0, a1, a0
-; CHECK-NOV-NEXT: addi a1, a2, -1
-; CHECK-NOV-NEXT: and a1, a1, s0
+; CHECK-NOV-NEXT: and a1, a1, a0
+; CHECK-NOV-NEXT: mv a0, a2
; CHECK-NOV-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
; CHECK-NOV-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
; CHECK-NOV-NEXT: ld s1, 8(sp) # 8-byte Folded Reload
@@ -2406,25 +2407,25 @@ define <2 x i64> @utest_f64i64(<2 x double> %x) {
; CHECK-V-NEXT: addi a0, sp, 32
; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill
; CHECK-V-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-V-NEXT: vfmv.f.s fa0, v8
+; CHECK-V-NEXT: vslidedown.vi v9, v8, 1
+; CHECK-V-NEXT: vfmv.f.s fa0, v9
; CHECK-V-NEXT: call __fixunsdfti
; CHECK-V-NEXT: mv s0, a0
; CHECK-V-NEXT: mv s1, a1
; CHECK-V-NEXT: vsetivli zero, 1, e64, m1, ta, ma
; CHECK-V-NEXT: addi a0, sp, 32
; CHECK-V-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT: vslidedown.vi v8, v8, 1
; CHECK-V-NEXT: vfmv.f.s fa0, v8
; CHECK-V-NEXT: call __fixunsdfti
-; CHECK-V-NEXT: snez a2, s1
; CHECK-V-NEXT: snez a1, a1
-; CHECK-V-NEXT: addi a1, a1, -1
-; CHECK-V-NEXT: and a0, a1, a0
+; CHECK-V-NEXT: snez a2, s1
; CHECK-V-NEXT: addi a2, a2, -1
; CHECK-V-NEXT: and a2, a2, s0
+; CHECK-V-NEXT: addi a1, a1, -1
+; CHECK-V-NEXT: and a0, a1, a0
; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-V-NEXT: vmv.s.x v8, a2
-; CHECK-V-NEXT: vmv.s.x v9, a0
+; CHECK-V-NEXT: vmv.s.x v8, a0
+; CHECK-V-NEXT: vmv.s.x v9, a2
; CHECK-V-NEXT: vslideup.vi v8, v9, 1
; CHECK-V-NEXT: csrr a0, vlenb
; CHECK-V-NEXT: slli a0, a0, 1
@@ -2466,32 +2467,32 @@ define <2 x i64> @ustest_f64i64(<2 x double> %x) {
; CHECK-NOV-NEXT: # %bb.1: # %entry
; CHECK-NOV-NEXT: li a2, 1
; CHECK-NOV-NEXT: .LBB20_2: # %entry
-; CHECK-NOV-NEXT: slti a4, s1, 1
; CHECK-NOV-NEXT: slti a3, a1, 1
+; CHECK-NOV-NEXT: slti a4, s1, 1
; CHECK-NOV-NEXT: blez a1, .LBB20_4
; CHECK-NOV-NEXT: # %bb.3: # %entry
; CHECK-NOV-NEXT: li a1, 1
; CHECK-NOV-NEXT: .LBB20_4: # %entry
+; CHECK-NOV-NEXT: neg a4, a4
; CHECK-NOV-NEXT: neg a3, a3
; CHECK-NOV-NEXT: and a3, a3, a0
-; CHECK-NOV-NEXT: neg a0, a4
; CHECK-NOV-NEXT: beqz a1, .LBB20_7
; CHECK-NOV-NEXT: # %bb.5: # %entry
; CHECK-NOV-NEXT: sgtz a1, a1
-; CHECK-NOV-NEXT: and a0, a0, s0
+; CHECK-NOV-NEXT: and a4, a4, s0
; CHECK-NOV-NEXT: bnez a2, .LBB20_8
; CHECK-NOV-NEXT: .LBB20_6:
-; CHECK-NOV-NEXT: snez a2, a0
+; CHECK-NOV-NEXT: snez a0, a4
; CHECK-NOV-NEXT: j .LBB20_9
; CHECK-NOV-NEXT: .LBB20_7:
; CHECK-NOV-NEXT: snez a1, a3
-; CHECK-NOV-NEXT: and a0, a0, s0
+; CHECK-NOV-NEXT: and a4, a4, s0
; CHECK-NOV-NEXT: beqz a2, .LBB20_6
; CHECK-NOV-NEXT: .LBB20_8: # %entry
-; CHECK-NOV-NEXT: sgtz a2, a2
+; CHECK-NOV-NEXT: sgtz a0, a2
; CHECK-NOV-NEXT: .LBB20_9: # %entry
-; CHECK-NOV-NEXT: neg a2, a2
-; CHECK-NOV-NEXT: and a0, a2, a0
+; CHECK-NOV-NEXT: neg a0, a0
+; CHECK-NOV-NEXT: and a0, a0, a4
; CHECK-NOV-NEXT: neg a1, a1
; CHECK-NOV-NEXT: and a1, a1, a3
; CHECK-NOV-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
@@ -2533,15 +2534,15 @@ define <2 x i64> @ustest_f64i64(<2 x double> %x) {
; CHECK-V-NEXT: # %bb.1: # %entry
; CHECK-V-NEXT: li a2, 1
; CHECK-V-NEXT: .LBB20_2: # %entry
-; CHECK-V-NEXT: slti a3, s1, 1
; CHECK-V-NEXT: slti a4, a1, 1
+; CHECK-V-NEXT: slti a3, s1, 1
; CHECK-V-NEXT: blez a1, .LBB20_4
; CHECK-V-NEXT: # %bb.3: # %entry
; CHECK-V-NEXT: li a1, 1
; CHECK-V-NEXT: .LBB20_4: # %entry
+; CHECK-V-NEXT: neg a3, a3
; CHECK-V-NEXT: neg a4, a4
; CHECK-V-NEXT: and a0, a4, a0
-; CHECK-V-NEXT: neg a3, a3
; CHECK-V-NEXT: beqz a1, .LBB20_7
; CHECK-V-NEXT: # %bb.5: # %entry
; CHECK-V-NEXT: sgtz a1, a1
@@ -2596,65 +2597,66 @@ define <2 x i64> @stest_f32i64(<2 x float> %x) {
; CHECK-NOV-NEXT: .cfi_offset s0, -16
; CHECK-NOV-NEXT: .cfi_offset s1, -24
; CHECK-NOV-NEXT: .cfi_offset fs0, -32
-; CHECK-NOV-NEXT: fmv.s fs0, fa0
-; CHECK-NOV-NEXT: fmv.s fa0, fa1
+; CHECK-NOV-NEXT: fmv.s fs0, fa1
; CHECK-NOV-NEXT: call __fixsfti
; CHECK-NOV-NEXT: mv s0, a0
; CHECK-NOV-NEXT: mv s1, a1
; CHECK-NOV-NEXT: fmv.s fa0, fs0
; CHECK-NOV-NEXT: call __fixsfti
-; CHECK-NOV-NEXT: li a2, -1
-; CHECK-NOV-NEXT: srli a3, a2, 1
-; CHECK-NOV-NEXT: beqz s1, .LBB21_3
+; CHECK-NOV-NEXT: mv a2, a0
+; CHECK-NOV-NEXT: li a0, -1
+; CHECK-NOV-NEXT: srli a3, a0, 1
+; CHECK-NOV-NEXT: beqz a1, .LBB21_3
; CHECK-NOV-NEXT: # %bb.1: # %entry
-; CHECK-NOV-NEXT: slti a4, s1, 0
-; CHECK-NOV-NEXT: bnez a1, .LBB21_4
+; CHECK-NOV-NEXT: slti a4, a1, 0
+; CHECK-NOV-NEXT: bnez s1, .LBB21_4
; CHECK-NOV-NEXT: .LBB21_2:
-; CHECK-NOV-NEXT: sltu a5, a0, a3
+; CHECK-NOV-NEXT: sltu a5, s0, a3
; CHECK-NOV-NEXT: beqz a5, .LBB21_5
; CHECK-NOV-NEXT: j .LBB21_6
; CHECK-NOV-NEXT: .LBB21_3:
-; CHECK-NOV-NEXT: sltu a4, s0, a3
-; CHECK-NOV-NEXT: beqz a1, .LBB21_2
+; CHECK-NOV-NEXT: sltu a4, a2, a3
+; CHECK-NOV-NEXT: beqz s1, .LBB21_2
; CHECK-NOV-NEXT: .LBB21_4: # %entry
-; CHECK-NOV-NEXT: slti a5, a1, 0
+; CHECK-NOV-NEXT: slti a5, s1, 0
; CHECK-NOV-NEXT: bnez a5, .LBB21_6
; CHECK-NOV-NEXT: .LBB21_5: # %entry
-; CHECK-NOV-NEXT: mv a0, a3
+; CHECK-NOV-NEXT: mv s0, a3
; CHECK-NOV-NEXT: .LBB21_6: # %entry
; CHECK-NOV-NEXT: neg a6, a5
; CHECK-NOV-NEXT: neg a5, a4
-; CHECK-NOV-NEXT: and a5, a5, s1
+; CHECK-NOV-NEXT: and a5, a5, a1
; CHECK-NOV-NEXT: bnez a4, .LBB21_8
; CHECK-NOV-NEXT: # %bb.7: # %entry
-; CHECK-NOV-NEXT: mv s0, a3
+; CHECK-NOV-NEXT: mv a2, a3
; CHECK-NOV-NEXT: .LBB21_8: # %entry
-; CHECK-NOV-NEXT: and a4, a6, a1
-; CHECK-NOV-NEXT: slli a1, a2, 63
-; CHECK-NOV-NEXT: beq a5, a2, .LBB21_11
+; CHECK-NOV-NEXT: and a4, a6, s1
+; CHECK-NOV-NEXT: slli a1, a0, 63
+; CHECK-NOV-NEXT: beq a5, a0, .LBB21_11
; CHECK-NOV-NEXT: # %bb.9: # %entry
; CHECK-NOV-NEXT: slti a3, a5, 0
; CHECK-NOV-NEXT: xori a3, a3, 1
-; CHECK-NOV-NEXT: bne a4, a2, .LBB21_12
+; CHECK-NOV-NEXT: bne a4, a0, .LBB21_12
; CHECK-NOV-NEXT: .LBB21_10:
-; CHECK-NOV-NEXT: sltu a2, a1, a0
-; CHECK-NOV-NEXT: beqz a2, .LBB21_13
+; CHECK-NOV-NEXT: sltu a0, a1, s0
+; CHECK-NOV-NEXT: beqz a0, .LBB21_13
; CHECK-NOV-NEXT: j .LBB21_14
; CHECK-NOV-NEXT: .LBB21_11:
-; CHECK-NOV-NEXT: sltu a3, a1, s0
-; CHECK-NOV-NEXT: beq a4, a2, .LBB21_10
+; CHECK-NOV-NEXT: sltu a3, a1, a2
+; CHECK-NOV-NEXT: beq a4, a0, .LBB21_10
; CHECK-NOV-NEXT: .LBB21_12: # %entry
-; CHECK-NOV-NEXT: slti a2, a4, 0
-; CHECK-NOV-NEXT: xori a2, a2, 1
-; CHECK-NOV-NEXT: bnez a2, .LBB21_14
+; CHECK-NOV-NEXT: slti a0, a4, 0
+; CHECK-NOV-NEXT: xori a0, a0, 1
+; CHECK-NOV-NEXT: bnez a0, .LBB21_14
; CHECK-NOV-NEXT: .LBB21_13: # %entry
-; CHECK-NOV-NEXT: mv a0, a1
+; CHECK-NOV-NEXT: mv s0, a1
; CHECK-NOV-NEXT: .LBB21_14: # %entry
; CHECK-NOV-NEXT: bnez a3, .LBB21_16
; CHECK-NOV-NEXT: # %bb.15: # %entry
-; CHECK-NOV-NEXT: mv s0, a1
+; CHECK-NOV-NEXT: mv a2, a1
; CHECK-NOV-NEXT: .LBB21_16: # %entry
-; CHECK-NOV-NEXT: mv a1, s0
+; CHECK-NOV-NEXT: mv a0, s0
+; CHECK-NOV-NEXT: mv a1, a2
; CHECK-NOV-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
; CHECK-NOV-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
; CHECK-NOV-NEXT: ld s1, 8(sp) # 8-byte Folded Reload
@@ -2679,43 +2681,43 @@ define <2 x i64> @stest_f32i64(<2 x float> %x) {
; CHECK-V-NEXT: addi a0, sp, 32
; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill
; CHECK-V-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-V-NEXT: vfmv.f.s fa0, v8
+; CHECK-V-NEXT: vslidedown.vi v9, v8, 1
+; CHECK-V-NEXT: vfmv.f.s fa0, v9
; CHECK-V-NEXT: call __fixsfti
; CHECK-V-NEXT: mv s0, a0
; CHECK-V-NEXT: mv s1, a1
; CHECK-V-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
; CHECK-V-NEXT: addi a0, sp, 32
; CHECK-V-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT: vslidedown.vi v8, v8, 1
; CHECK-V-NEXT: vfmv.f.s fa0, v8
; CHECK-V-NEXT: call __fixsfti
; CHECK-V-NEXT: li a2, -1
; CHECK-V-NEXT: srli a3, a2, 1
-; CHECK-V-NEXT: beqz s1, .LBB21_3
+; CHECK-V-NEXT: beqz a1, .LBB21_3
; CHECK-V-NEXT: # %bb.1: # %entry
-; CHECK-V-NEXT: slti a4, s1, 0
-; CHECK-V-NEXT: bnez a1, .LBB21_4
+; CHECK-V-NEXT: slti a4, a1, 0
+; CHECK-V-NEXT: bnez s1, .LBB21_4
; CHECK-V-NEXT: .LBB21_2:
-; CHECK-V-NEXT: sltu a5, a0, a3
+; CHECK-V-NEXT: sltu a5, s0, a3
; CHECK-V-NEXT: beqz a5, .LBB21_5
; CHECK-V-NEXT: j .LBB21_6
; CHECK-V-NEXT: .LBB21_3:
-; CHECK-V-NEXT: sltu a4, s0, a3
-; CHECK-V-NEXT: beqz a1, .LBB21_2
+; CHECK-V-NEXT: sltu a4, a0, a3
+; CHECK-V-NEXT: beqz s1, .LBB21_2
; CHECK-V-NEXT: .LBB21_4: # %entry
-; CHECK-V-NEXT: slti a5, a1, 0
+; CHECK-V-NEXT: slti a5, s1, 0
; CHECK-V-NEXT: bnez a5, .LBB21_6
; CHECK-V-NEXT: .LBB21_5: # %entry
-; CHECK-V-NEXT: mv a0, a3
+; CHECK-V-NEXT: mv s0, a3
; CHECK-V-NEXT: .LBB21_6: # %entry
; CHECK-V-NEXT: neg a6, a5
; CHECK-V-NEXT: neg a5, a4
-; CHECK-V-NEXT: and a5, a5, s1
+; CHECK-V-NEXT: and a5, a5, a1
; CHECK-V-NEXT: bnez a4, .LBB21_8
; CHECK-V-NEXT: # %bb.7: # %entry
-; CHECK-V-NEXT: mv s0, a3
+; CHECK-V-NEXT: mv a0, a3
; CHECK-V-NEXT: .LBB21_8: # %entry
-; CHECK-V-NEXT: and a4, a6, a1
+; CHECK-V-NEXT: and a4, a6, s1
; CHECK-V-NEXT: slli a1, a2, 63
; CHECK-V-NEXT: beq a5, a2, .LBB21_11
; CHECK-V-NEXT: # %bb.9: # %entry
@@ -2723,26 +2725,26 @@ define <2 x i64> @stest_f32i64(<2 x float> %x) {
; CHECK-V-NEXT: xori a3, a3, 1
; CHECK-V-NEXT: bne a4, a2, .LBB21_12
; CHECK-V-NEXT: .LBB21_10:
-; CHECK-V-NEXT: sltu a2, a1, a0
+; CHECK-V-NEXT: sltu a2, a1, s0
; CHECK-V-NEXT: beqz a2, .LBB21_13
; CHECK-V-NEXT: j .LBB21_14
; CHECK-V-NEXT: .LBB21_11:
-; CHECK-V-NEXT: sltu a3, a1, s0
+; CHECK-V-NEXT: sltu a3, a1, a0
; CHECK-V-NEXT: beq a4, a2, .LBB21_10
; CHECK-V-NEXT: .LBB21_12: # %entry
; CHECK-V-NEXT: slti a2, a4, 0
; CHECK-V-NEXT: xori a2, a2, 1
; CHECK-V-NEXT: bnez a2, .LBB21_14
; CHECK-V-NEXT: .LBB21_13: # %entry
-; CHECK-V-NEXT: mv a0, a1
+; CHECK-V-NEXT: mv s0, a1
; CHECK-V-NEXT: .LBB21_14: # %entry
; CHECK-V-NEXT: bnez a3, .LBB21_16
; CHECK-V-NEXT: # %bb.15: # %entry
-; CHECK-V-NEXT: mv s0, a1
+; CHECK-V-NEXT: mv a0, a1
; CHECK-V-NEXT: .LBB21_16: # %entry
; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-V-NEXT: vmv.s.x v8, s0
-; CHECK-V-NEXT: vmv.s.x v9, a0
+; CHECK-V-NEXT: vmv.s.x v8, a0
+; CHECK-V-NEXT: vmv.s.x v9, s0
; CHECK-V-NEXT: vslideup.vi v8, v9, 1
; CHECK-V-NEXT: csrr a0, vlenb
; CHECK-V-NEXT: slli a0, a0, 1
@@ -2775,19 +2777,19 @@ define <2 x i64> @utest_f32i64(<2 x float> %x) {
; CHECK-NOV-NEXT: .cfi_offset s0, -16
; CHECK-NOV-NEXT: .cfi_offset s1, -24
; CHECK-NOV-NEXT: .cfi_offset fs0, -32
-; CHECK-NOV-NEXT: fmv.s fs0, fa0
-; CHECK-NOV-NEXT: fmv.s fa0, fa1
+; CHECK-NOV-NEXT: fmv.s fs0, fa1
; CHECK-NOV-NEXT: call __fixunssfti
; CHECK-NOV-NEXT: mv s0, a0
; CHECK-NOV-NEXT: mv s1, a1
; CHECK-NOV-NEXT: fmv.s fa0, fs0
; CHECK-NOV-NEXT: call __fixunssfti
-; CHECK-NOV-NEXT: snez a2, s1
; CHECK-NOV-NEXT: snez a1, a1
+; CHECK-NOV-NEXT: snez a2, s1
+; CHECK-NOV-NEXT: addi a2, a2, -1
+; CHECK-NOV-NEXT: and a2, a2, s0
; CHECK-NOV-NEXT: addi a1, a1, -1
-; CHECK-NOV-NEXT: and a0, a1, a0
-; CHECK-NOV-NEXT: addi a1, a2, -1
-; CHECK-NOV-NEXT: and a1, a1, s0
+; CHECK-NOV-NEXT: and a1, a1, a0
+; CHECK-NOV-NEXT: mv a0, a2
; CHECK-NOV-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
; CHECK-NOV-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
; CHECK-NOV-NEXT: ld s1, 8(sp) # 8-byte Folded Reload
@@ -2812,25 +2814,25 @@ define <2 x i64> @utest_f32i64(<2 x float> %x) {
; CHECK-V-NEXT: addi a0, sp, 32
; CHECK-V-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill
; CHECK-V-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-V-NEXT: vfmv.f.s fa0, v8
+; CHECK-V-NEXT: vslidedown.vi v9, v8, 1
+; CHECK-V-NEXT: vfmv.f.s fa0, v9
; CHECK-V-NEXT: call __fixunssfti
; CHECK-V-NEXT: mv s0, a0
; CHECK-V-NEXT: mv s1, a1
; CHECK-V-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
; CHECK-V-NEXT: addi a0, sp, 32
; CHECK-V-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT: vslidedown.vi v8, v8, 1
; CHECK-V-NEXT: vfmv.f.s fa0, v8
; CHECK-V-NEXT: call __fixunssfti
-; CHECK-V-NEXT: snez a2, s1
; CHECK-V-NEXT: snez a1, a1
-; CHECK-V-NEXT: addi a1, a1, -1
-; CHECK-V-NEXT: and a0, a1, a0
+; CHECK-V-NEXT: snez a2, s1
; CHECK-V-NEXT: addi a2, a2, -1
; CHECK-V-NEXT: and a2, a2, s0
+; CHECK-V-NEXT: addi a1, a1, -1
+; CHECK-V-NEXT: and a0, a1, a0
; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-V-NEXT: vmv.s.x v8, a2
-; CHECK-V-NEXT: vmv.s.x v9, a0
+; CHECK-V-NEXT: vmv.s.x v8, a0
+; CHECK-V-NEXT: vmv.s.x v9, a2
; CHECK-V-NEXT: vslideup.vi v8, v9, 1
; CHECK-V-NEXT: csrr a0, vlenb
; CHECK-V-NEXT: slli a0, a0, 1
@@ -2872,32 +2874,32 @@ define <2 x i64> @ustest_f32i64(<2 x float> %x) {
; CHECK-NOV-NEXT: # %bb.1: # %entry
; CHECK-NOV-NEXT: li a2, 1
; CHECK-NOV-NEXT: .LBB23_2: # %entry
-; CHECK-NOV-NEXT: slti a4, s1, 1
; CHECK-NOV-NEXT: slti a3, a1, 1
+; CHECK-NOV-NEXT: slti a4, s1, 1
; CHECK-NOV-NEXT: blez a1, .LBB23_4
; CHECK-NOV-NEXT: # %bb.3: # %entry
; CHECK-NOV-NEXT: li a1, 1
; CHECK-NOV-NEXT: .LBB23_4: # %entry
+; CHECK-NOV-NEXT: neg a4, a4
; CHECK-NOV-NEXT: neg a3, a3
; CHECK-NOV-NEXT: and a3, a3, a0
-; CHECK-NOV-NEXT: neg a0, a4
; CHECK-NOV-NEXT: beqz a1, .LBB23_7
; CHECK-NOV-NEXT: # %bb.5: # %entry
; CHECK-NOV-NEXT: sgtz a1, a1
-; CHECK-NOV-NEXT: and a0, a0, s0
+; CHECK-NOV-NEXT: and a4, a4, s0
; CHECK-NOV-NEXT: bnez a2, .LBB23_8
; CHECK-NOV-NEXT: .LBB23_6:
-; CHECK-NOV-NEXT: snez a2, a0
+; CHECK-NOV-NEXT: snez a0, a4
; CHECK-NOV-NEXT: j .LBB23_9
; CHECK-NOV-NEXT: .LBB23_7:
; CHECK-NOV-NEXT: snez a1, a3
-; CHECK-NOV-NEXT: and a0, a0, s0
+; CHECK-NOV-NEXT: and a4, a4, s0
; CHECK-NOV-NEXT: beqz a2, .LBB23_6
; CHECK-NOV-NEXT: .LBB23_8: # %entry
-; CHECK-NOV-NEXT: sgtz a2, a2
+; CHECK-NOV-NEXT: sgtz a0, a2
; CHECK-NOV-NEXT: .LBB23_9: # %entry
-; CHECK-NOV-NEXT: neg a2, a2
-; CHECK-NOV-NEXT: and a0, a2, a0
+; CHECK-NOV-NEXT: neg a0, a0
+; CHECK-NOV-NEXT: and a0, a0, a4
; CHECK-NOV-NEXT: neg a1, a1
; CHECK-NOV-NEXT: and a1, a1, a3
; CHECK-NOV-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
@@ -2939,15 +2941,15 @@ define <2 x i64> @ustest_f32i64(<2 x float> %x) {
; CHECK-V-NEXT: # %bb.1: # %entry
; CHECK-V-NEXT: li a2, 1
; CHECK-V-NEXT: .LBB23_2: # %entry
-; CHECK-V-NEXT: slti a3, s1, 1
; CHECK-V-NEXT: slti a4, a1, 1
+; CHECK-V-NEXT: slti a3, s1, 1
; CHECK-V-NEXT: blez a1, .LBB23_4
; CHECK-V-NEXT: # %bb.3: # %entry
; CHECK-V-NEXT: li a1, 1
; CHECK-V-NEXT: .LBB23_4: # %entry
+; CHECK-V-NEXT: neg a3, a3
; CHECK-V-NEXT: neg a4, a4
; CHECK-V-NEXT: and a0, a4, a0
-; CHECK-V-NEXT: neg a3, a3
; CHECK-V-NEXT: beqz a1, .LBB23_7
; CHECK-V-NEXT: # %bb.5: # %entry
; CHECK-V-NEXT: sgtz a1, a1
@@ -3002,8 +3004,8 @@ define <2 x i64> @stest_f16i64(<2 x half> %x) {
; CHECK-NOV-NEXT: .cfi_offset s0, -16
; CHECK-NOV-NEXT: .cfi_offset s1, -24
; CHECK-NOV-NEXT: .cfi_offset s2, -32
-; CHECK-NOV-NEXT: mv s2, a0
-; CHECK-NOV-NEXT: fmv.w.x fa0, a1
+; CHECK-NOV-NEXT: mv s2, a1
+; CHECK-NOV-NEXT: fmv.w.x fa0, a0
; CHECK-NOV-NEXT: call __extendhfsf2
; CHECK-NOV-NEXT: call __fixsfti
; CHECK-NOV-NEXT: mv s0, a0
@@ -3011,58 +3013,60 @@ define <2 x i64> @stest_f16i64(<2 x half> %x) {
; CHECK-NOV-NEXT: fmv.w.x fa0, s2
; CHECK-NOV-NEXT: call __extendhfsf2
; CHECK-NOV-NEXT: call __fixsfti
-; CHECK-NOV-NEXT: li a2, -1
-; CHECK-NOV-NEXT: srli a3, a2, 1
-; CHECK-NOV-NEXT: beqz s1, .LBB24_3
+; CHECK-NOV-NEXT: mv a2, a0
+; CHECK-NOV-NEXT: li a0, -1
+; CHECK-NOV-NEXT: srli a3, a0, 1
+; CHECK-NOV-NEXT: beqz a1, .LBB24_3
; CHECK-NOV-NEXT: # %bb.1: # %entry
-; CHECK-NOV-NEXT: slti a4, s1, 0
-; CHECK-NOV-NEXT: bnez a1, .LBB24_4
+; CHECK-NOV-NEXT: slti a4, a1, 0
+; CHECK-NOV-NEXT: bnez s1, .LBB24_4
; CHECK-NOV-NEXT: .LBB24_2:
-; CHECK-NOV-NEXT: sltu a5, a0, a3
+; CHECK-NOV-NEXT: sltu a5, s0, a3
; CHECK-NOV-NEXT: beqz a5, .LBB24_5
; CHECK-NOV-NEXT: j .LBB24_6
; CHECK-NOV-NEXT: .LBB24_3:
-; CHECK-NOV-NEXT: sltu a4, s0, a3
-; CHECK-NOV-NEXT: beqz a1, .LBB24_2
+; CHECK-NOV-NEXT: sltu a4, a2, a3
+; CHECK-NOV-NEXT: beqz s1, .LBB24_2
; CHECK-NOV-NEXT: .LBB24_4: # %entry
-; CHECK-NOV-NEXT: slti a5, a1, 0
+; CHECK-NOV-NEXT: slti a5, s1, 0
; CHECK-NOV-NEXT: bnez a5, .LBB24_6
; CHECK-NOV-NEXT: .LBB24_5: # %entry
-; CHECK-NOV-NEXT: mv a0, a3
+; CHECK-NOV-NEXT: mv s0, a3
; CHECK-NOV-NEXT: .LBB24_6: # %entry
; CHECK-NOV-NEXT: neg a6, a5
; CHECK-NOV-NEXT: neg a5, a4
-; CHECK-NOV-NEXT: and a5, a5, s1
+; CHECK-NOV-NEXT: and a5, a5, a1
; CHECK-NOV-NEXT: bnez a4, .LBB24_8
; CHECK-NOV-NEXT: # %bb.7: # %entry
-; CHECK-NOV-NEXT: mv s0, a3
+; CHECK-NOV-NEXT: mv a2, a3
; CHECK-NOV-NEXT: .LBB24_8: # %entry
-; CHECK-NOV-NEXT: and a4, a6, a1
-; CHECK-NOV-NEXT: slli a1, a2, 63
-; CHECK-NOV-NEXT: beq a5, a2, .LBB24_11
+; CHECK-NOV-NEXT: and a4, a6, s1
+; CHECK-NOV-NEXT: slli a1, a0, 63
+; CHECK-NOV-NEXT: beq a5, a0, .LBB24_11
; CHECK-NOV-NEXT: # %bb.9: # %entry
; CHECK-NOV-NEXT: slti a3, a5, 0
; CHECK-NOV-NEXT: xori a3, a3, 1
-; CHECK-NOV-NEXT: bne a4, a2, .LBB24_12
+; CHECK-NOV-NEXT: bne a4, a0, .LBB24_12
; CHECK-NOV-NEXT: .LBB24_10:
-; CHECK-NOV-NEXT: sltu a2, a1, a0
-; CHECK-NOV-NEXT: beqz a2, .LBB24_13
+; CHECK-NOV-NEXT: sltu a0, a1, s0
+; CHECK-NOV-NEXT: beqz a0, .LBB24_13
; CHECK-NOV-NEXT: j .LBB24_14
; CHECK-NOV-NEXT: .LBB24_11:
-; CHECK-NOV-NEXT: sltu a3, a1, s0
-; CHECK-NOV-NEXT: beq a4, a2, .LBB24_10
+; CHECK-NOV-NEXT: sltu a3, a1, a2
+; CHECK-NOV-NEXT: beq a4, a0, .LBB24_10
; CHECK-NOV-NEXT: .LBB24_12: # %entry
-; CHECK-NOV-NEXT: slti a2, a4, 0
-; CHECK-NOV-NEXT: xori a2, a2, 1
-; CHECK-NOV-NEXT: bnez a2, .LBB24_14
+; CHECK-NOV-NEXT: slti a0, a4, 0
+; CHECK-NOV-NEXT: xori a0, a0, 1
+; CHECK-NOV-NEXT: bnez a0, .LBB24_14
; CHECK-NOV-NEXT: .LBB24_13: # %entry
-; CHECK-NOV-NEXT: mv a0, a1
+; CHECK-NOV-NEXT: mv s0, a1
; CHECK-NOV-NEXT: .LBB24_14: # %entry
; CHECK-NOV-NEXT: bnez a3, .LBB24_16
; CHECK-NOV-NEXT: # %bb.15: # %entry
-; CHECK-NOV-NEXT: mv s0, a1
+; CHECK-NOV-NEXT: mv a2, a1
; CHECK-NOV-NEXT: .LBB24_16: # %entry
-; CHECK-NOV-NEXT: mv a1, s0
+; CHECK-NOV-NEXT: mv a0, s0
+; CHECK-NOV-NEXT: mv a1, a2
; CHECK-NOV-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
; CHECK-NOV-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
; CHECK-NOV-NEXT: ld s1, 8(sp) # 8-byte Folded Reload
@@ -3082,8 +3086,8 @@ define <2 x i64> @stest_f16i64(<2 x half> %x) {
; CHECK-V-NEXT: .cfi_offset s0, -16
; CHECK-V-NEXT: .cfi_offset s1, -24
; CHECK-V-NEXT: .cfi_offset s2, -32
-; CHECK-V-NEXT: mv s2, a0
-; CHECK-V-NEXT: fmv.w.x fa0, a1
+; CHECK-V-NEXT: mv s2, a1
+; CHECK-V-NEXT: fmv.w.x fa0, a0
; CHECK-V-NEXT: call __extendhfsf2
; CHECK-V-NEXT: call __fixsfti
; CHECK-V-NEXT: mv s0, a0
@@ -3093,31 +3097,31 @@ define <2 x i64> @stest_f16i64(<2 x half> %x) {
; CHECK-V-NEXT: call __fixsfti
; CHECK-V-NEXT: li a2, -1
; CHECK-V-NEXT: srli a3, a2, 1
-; CHECK-V-NEXT: beqz s1, .LBB24_3
+; CHECK-V-NEXT: beqz a1, .LBB24_3
; CHECK-V-NEXT: # %bb.1: # %entry
-; CHECK-V-NEXT: slti a4, s1, 0
-; CHECK-V-NEXT: bnez a1, .LBB24_4
+; CHECK-V-NEXT: slti a4, a1, 0
+; CHECK-V-NEXT: bnez s1, .LBB24_4
; CHECK-V-NEXT: .LBB24_2:
-; CHECK-V-NEXT: sltu a5, a0, a3
+; CHECK-V-NEXT: sltu a5, s0, a3
; CHECK-V-NEXT: beqz a5, .LBB24_5
; CHECK-V-NEXT: j .LBB24_6
; CHECK-V-NEXT: .LBB24_3:
-; CHECK-V-NEXT: sltu a4, s0, a3
-; CHECK-V-NEXT: beqz a1, .LBB24_2
+; CHECK-V-NEXT: sltu a4, a0, a3
+; CHECK-V-NEXT: beqz s1, .LBB24_2
; CHECK-V-NEXT: .LBB24_4: # %entry
-; CHECK-V-NEXT: slti a5, a1, 0
+; CHECK-V-NEXT: slti a5, s1, 0
; CHECK-V-NEXT: bnez a5, .LBB24_6
; CHECK-V-NEXT: .LBB24_5: # %entry
-; CHECK-V-NEXT: mv a0, a3
+; CHECK-V-NEXT: mv s0, a3
; CHECK-V-NEXT: .LBB24_6: # %entry
; CHECK-V-NEXT: neg a6, a5
; CHECK-V-NEXT: neg a5, a4
-; CHECK-V-NEXT: and a5, a5, s1
+; CHECK-V-NEXT: and a5, a5, a1
; CHECK-V-NEXT: bnez a4, .LBB24_8
; CHECK-V-NEXT: # %bb.7: # %entry
-; CHECK-V-NEXT: mv s0, a3
+; CHECK-V-NEXT: mv a0, a3
; CHECK-V-NEXT: .LBB24_8: # %entry
-; CHECK-V-NEXT: and a4, a6, a1
+; CHECK-V-NEXT: and a4, a6, s1
; CHECK-V-NEXT: slli a1, a2, 63
; CHECK-V-NEXT: beq a5, a2, .LBB24_11
; CHECK-V-NEXT: # %bb.9: # %entry
@@ -3125,26 +3129,26 @@ define <2 x i64> @stest_f16i64(<2 x half> %x) {
; CHECK-V-NEXT: xori a3, a3, 1
; CHECK-V-NEXT: bne a4, a2, .LBB24_12
; CHECK-V-NEXT: .LBB24_10:
-; CHECK-V-NEXT: sltu a2, a1, a0
+; CHECK-V-NEXT: sltu a2, a1, s0
; CHECK-V-NEXT: beqz a2, .LBB24_13
; CHECK-V-NEXT: j .LBB24_14
; CHECK-V-NEXT: .LBB24_11:
-; CHECK-V-NEXT: sltu a3, a1, s0
+; CHECK-V-NEXT: sltu a3, a1, a0
; CHECK-V-NEXT: beq a4, a2, .LBB24_10
; CHECK-V-NEXT: .LBB24_12: # %entry
; CHECK-V-NEXT: slti a2, a4, 0
; CHECK-V-NEXT: xori a2, a2, 1
; CHECK-V-NEXT: bnez a2, .LBB24_14
; CHECK-V-NEXT: .LBB24_13: # %entry
-; CHECK-V-NEXT: mv a0, a1
+; CHECK-V-NEXT: mv s0, a1
; CHECK-V-NEXT: .LBB24_14: # %entry
; CHECK-V-NEXT: bnez a3, .LBB24_16
; CHECK-V-NEXT: # %bb.15: # %entry
-; CHECK-V-NEXT: mv s0, a1
+; CHECK-V-NEXT: mv a0, a1
; CHECK-V-NEXT: .LBB24_16: # %entry
; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-V-NEXT: vmv.s.x v9, s0
-; CHECK-V-NEXT: vmv.s.x v8, a0
+; CHECK-V-NEXT: vmv.s.x v9, a0
+; CHECK-V-NEXT: vmv.s.x v8, s0
; CHECK-V-NEXT: vslideup.vi v8, v9, 1
; CHECK-V-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
; CHECK-V-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
@@ -3175,8 +3179,8 @@ define <2 x i64> @utesth_f16i64(<2 x half> %x) {
; CHECK-NOV-NEXT: .cfi_offset s0, -16
; CHECK-NOV-NEXT: .cfi_offset s1, -24
; CHECK-NOV-NEXT: .cfi_offset s2, -32
-; CHECK-NOV-NEXT: mv s0, a0
-; CHECK-NOV-NEXT: fmv.w.x fa0, a1
+; CHECK-NOV-NEXT: mv s0, a1
+; CHECK-NOV-NEXT: fmv.w.x fa0, a0
; CHECK-NOV-NEXT: call __extendhfsf2
; CHECK-NOV-NEXT: call __fixunssfti
; CHECK-NOV-NEXT: mv s1, a0
@@ -3184,12 +3188,13 @@ define <2 x i64> @utesth_f16i64(<2 x half> %x) {
; CHECK-NOV-NEXT: fmv.w.x fa0, s0
; CHECK-NOV-NEXT: call __extendhfsf2
; CHECK-NOV-NEXT: call __fixunssfti
-; CHECK-NOV-NEXT: snez a2, s2
; CHECK-NOV-NEXT: snez a1, a1
+; CHECK-NOV-NEXT: snez a2, s2
+; CHECK-NOV-NEXT: addi a2, a2, -1
+; CHECK-NOV-NEXT: and a2, a2, s1
; CHECK-NOV-NEXT: addi a1, a1, -1
-; CHECK-NOV-NEXT: and a0, a1, a0
-; CHECK-NOV-NEXT: addi a1, a2, -1
-; CHECK-NOV-NEXT: and a1, a1, s1
+; CHECK-NOV-NEXT: and a1, a1, a0
+; CHECK-NOV-NEXT: mv a0, a2
; CHECK-NOV-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
; CHECK-NOV-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
; CHECK-NOV-NEXT: ld s1, 8(sp) # 8-byte Folded Reload
@@ -3209,8 +3214,8 @@ define <2 x i64> @utesth_f16i64(<2 x half> %x) {
; CHECK-V-NEXT: .cfi_offset s0, -16
; CHECK-V-NEXT: .cfi_offset s1, -24
; CHECK-V-NEXT: .cfi_offset s2, -32
-; CHECK-V-NEXT: mv s0, a0
-; CHECK-V-NEXT: fmv.w.x fa0, a1
+; CHECK-V-NEXT: mv s0, a1
+; CHECK-V-NEXT: fmv.w.x fa0, a0
; CHECK-V-NEXT: call __extendhfsf2
; CHECK-V-NEXT: call __fixunssfti
; CHECK-V-NEXT: mv s1, a0
@@ -3218,15 +3223,15 @@ define <2 x i64> @utesth_f16i64(<2 x half> %x) {
; CHECK-V-NEXT: fmv.w.x fa0, s0
; CHECK-V-NEXT: call __extendhfsf2
; CHECK-V-NEXT: call __fixunssfti
-; CHECK-V-NEXT: snez a2, s2
; CHECK-V-NEXT: snez a1, a1
-; CHECK-V-NEXT: addi a1, a1, -1
-; CHECK-V-NEXT: and a0, a1, a0
+; CHECK-V-NEXT: snez a2, s2
; CHECK-V-NEXT: addi a2, a2, -1
; CHECK-V-NEXT: and a2, a2, s1
+; CHECK-V-NEXT: addi a1, a1, -1
+; CHECK-V-NEXT: and a0, a1, a0
; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-V-NEXT: vmv.s.x v9, a2
-; CHECK-V-NEXT: vmv.s.x v8, a0
+; CHECK-V-NEXT: vmv.s.x v9, a0
+; CHECK-V-NEXT: vmv.s.x v8, a2
; CHECK-V-NEXT: vslideup.vi v8, v9, 1
; CHECK-V-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
; CHECK-V-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
@@ -3269,32 +3274,32 @@ define <2 x i64> @ustest_f16i64(<2 x half> %x) {
; CHECK-NOV-NEXT: # %bb.1: # %entry
; CHECK-NOV-NEXT: li a2, 1
; CHECK-NOV-NEXT: .LBB26_2: # %entry
-; CHECK-NOV-NEXT: slti a4, s1, 1
; CHECK-NOV-NEXT: slti a3, a1, 1
+; CHECK-NOV-NEXT: slti a4, s1, 1
; CHECK-NOV-NEXT: blez a1, .LBB26_4
; CHECK-NOV-NEXT: # %bb.3: # %entry
; CHECK-NOV-NEXT: li a1, 1
; CHECK-NOV-NEXT: .LBB26_4: # %entry
+; CHECK-NOV-NEXT: neg a4, a4
; CHECK-NOV-NEXT: neg a3, a3
; CHECK-NOV-NEXT: and a3, a3, a0
-; CHECK-NOV-NEXT: neg a0, a4
; CHECK-NOV-NEXT: beqz a1, .LBB26_7
; CHECK-NOV-NEXT: # %bb.5: # %entry
; CHECK-NOV-NEXT: sgtz a1, a1
-; CHECK-NOV-NEXT: and a0, a0, s0
+; CHECK-NOV-NEXT: and a4, a4, s0
; CHECK-NOV-NEXT: bnez a2, .LBB26_8
; CHECK-NOV-NEXT: .LBB26_6:
-; CHECK-NOV-NEXT: snez a2, a0
+; CHECK-NOV-NEXT: snez a0, a4
; CHECK-NOV-NEXT: j .LBB26_9
; CHECK-NOV-NEXT: .LBB26_7:
; CHECK-NOV-NEXT: snez a1, a3
-; CHECK-NOV-NEXT: and a0, a0, s0
+; CHECK-NOV-NEXT: and a4, a4, s0
; CHECK-NOV-NEXT: beqz a2, .LBB26_6
; CHECK-NOV-NEXT: .LBB26_8: # %entry
-; CHECK-NOV-NEXT: sgtz a2, a2
+; CHECK-NOV-NEXT: sgtz a0, a2
; CHECK-NOV-NEXT: .LBB26_9: # %entry
-; CHECK-NOV-NEXT: neg a2, a2
-; CHECK-NOV-NEXT: and a0, a2, a0
+; CHECK-NOV-NEXT: neg a0, a0
+; CHECK-NOV-NEXT: and a0, a0, a4
; CHECK-NOV-NEXT: neg a1, a1
; CHECK-NOV-NEXT: and a1, a1, a3
; CHECK-NOV-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
@@ -3330,15 +3335,15 @@ define <2 x i64> @ustest_f16i64(<2 x half> %x) {
; CHECK-V-NEXT: # %bb.1: # %entry
; CHECK-V-NEXT: li a2, 1
; CHECK-V-NEXT: .LBB26_2: # %entry
-; CHECK-V-NEXT: slti a3, s1, 1
; CHECK-V-NEXT: slti a4, a1, 1
+; CHECK-V-NEXT: slti a3, s1, 1
; CHECK-V-NEXT: blez a1, .LBB26_4
; CHECK-V-NEXT: # %bb.3: # %entry
; CHECK-V-NEXT: li a1, 1
; CHECK-V-NEXT: .LBB26_4: # %entry
+; CHECK-V-NEXT: neg a3, a3
; CHECK-V-NEXT: neg a4, a4
; CHECK-V-NEXT: and a0, a4, a0
-; CHECK-V-NEXT: neg a3, a3
; CHECK-V-NEXT: beqz a1, .LBB26_7
; CHECK-V-NEXT: # %bb.5: # %entry
; CHECK-V-NEXT: sgtz a1, a1
@@ -5811,15 +5816,15 @@ define <2 x i64> @ustest_f64i64_mm(<2 x double> %x) {
; CHECK-NOV-NEXT: mv s1, a1
; CHECK-NOV-NEXT: fmv.d fa0, fs0
; CHECK-NOV-NEXT: call __fixdfti
-; CHECK-NOV-NEXT: mv a2, s1
-; CHECK-NOV-NEXT: mv a3, a1
+; CHECK-NOV-NEXT: mv a2, a1
; CHECK-NOV-NEXT: blez a1, .LBB47_2
; CHECK-NOV-NEXT: # %bb.1: # %entry
-; CHECK-NOV-NEXT: li a3, 1
+; CHECK-NOV-NEXT: li a2, 1
; CHECK-NOV-NEXT: .LBB47_2: # %entry
-; CHECK-NOV-NEXT: blez a2, .LBB47_4
+; CHECK-NOV-NEXT: mv a3, s1
+; CHECK-NOV-NEXT: blez s1, .LBB47_4
; CHECK-NOV-NEXT: # %bb.3: # %entry
-; CHECK-NOV-NEXT: li a2, 1
+; CHECK-NOV-NEXT: li a3, 1
; CHECK-NOV-NEXT: .LBB47_4: # %entry
; CHECK-NOV-NEXT: slti a1, a1, 1
; CHECK-NOV-NEXT: neg a1, a1
@@ -5827,11 +5832,11 @@ define <2 x i64> @ustest_f64i64_mm(<2 x double> %x) {
; CHECK-NOV-NEXT: slti a0, s1, 1
; CHECK-NOV-NEXT: neg a0, a0
; CHECK-NOV-NEXT: and a0, a0, s0
+; CHECK-NOV-NEXT: slti a3, a3, 0
+; CHECK-NOV-NEXT: addi a3, a3, -1
+; CHECK-NOV-NEXT: and a0, a3, a0
; CHECK-NOV-NEXT: slti a2, a2, 0
; CHECK-NOV-NEXT: addi a2, a2, -1
-; CHECK-NOV-NEXT: and a0, a2, a0
-; CHECK-NOV-NEXT: slti a2, a3, 0
-; CHECK-NOV-NEXT: addi a2, a2, -1
; CHECK-NOV-NEXT: and a1, a2, a1
; CHECK-NOV-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
; CHECK-NOV-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
@@ -5867,15 +5872,15 @@ define <2 x i64> @ustest_f64i64_mm(<2 x double> %x) {
; CHECK-V-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload
; CHECK-V-NEXT: vfmv.f.s fa0, v8
; CHECK-V-NEXT: call __fixdfti
-; CHECK-V-NEXT: mv a2, s1
-; CHECK-V-NEXT: mv a3, a1
+; CHECK-V-NEXT: mv a2, a1
; CHECK-V-NEXT: blez a1, .LBB47_2
; CHECK-V-NEXT: # %bb.1: # %entry
-; CHECK-V-NEXT: li a3, 1
+; CHECK-V-NEXT: li a2, 1
; CHECK-V-NEXT: .LBB47_2: # %entry
-; CHECK-V-NEXT: blez a2, .LBB47_4
+; CHECK-V-NEXT: mv a3, s1
+; CHECK-V-NEXT: blez s1, .LBB47_4
; CHECK-V-NEXT: # %bb.3: # %entry
-; CHECK-V-NEXT: li a2, 1
+; CHECK-V-NEXT: li a3, 1
; CHECK-V-NEXT: .LBB47_4: # %entry
; CHECK-V-NEXT: slti a1, a1, 1
; CHECK-V-NEXT: neg a1, a1
@@ -5883,11 +5888,11 @@ define <2 x i64> @ustest_f64i64_mm(<2 x double> %x) {
; CHECK-V-NEXT: slti a1, s1, 1
; CHECK-V-NEXT: neg a1, a1
; CHECK-V-NEXT: and a1, a1, s0
+; CHECK-V-NEXT: slti a3, a3, 0
+; CHECK-V-NEXT: addi a3, a3, -1
+; CHECK-V-NEXT: and a1, a3, a1
; CHECK-V-NEXT: slti a2, a2, 0
; CHECK-V-NEXT: addi a2, a2, -1
-; CHECK-V-NEXT: and a1, a2, a1
-; CHECK-V-NEXT: slti a2, a3, 0
-; CHECK-V-NEXT: addi a2, a2, -1
; CHECK-V-NEXT: and a0, a2, a0
; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
@@ -6197,15 +6202,15 @@ define <2 x i64> @ustest_f32i64_mm(<2 x float> %x) {
; CHECK-NOV-NEXT: mv s1, a1
; CHECK-NOV-NEXT: fmv.s fa0, fs0
; CHECK-NOV-NEXT: call __fixsfti
-; CHECK-NOV-NEXT: mv a2, s1
-; CHECK-NOV-NEXT: mv a3, a1
+; CHECK-NOV-NEXT: mv a2, a1
; CHECK-NOV-NEXT: blez a1, .LBB50_2
; CHECK-NOV-NEXT: # %bb.1: # %entry
-; CHECK-NOV-NEXT: li a3, 1
+; CHECK-NOV-NEXT: li a2, 1
; CHECK-NOV-NEXT: .LBB50_2: # %entry
-; CHECK-NOV-NEXT: blez a2, .LBB50_4
+; CHECK-NOV-NEXT: mv a3, s1
+; CHECK-NOV-NEXT: blez s1, .LBB50_4
; CHECK-NOV-NEXT: # %bb.3: # %entry
-; CHECK-NOV-NEXT: li a2, 1
+; CHECK-NOV-NEXT: li a3, 1
; CHECK-NOV-NEXT: .LBB50_4: # %entry
; CHECK-NOV-NEXT: slti a1, a1, 1
; CHECK-NOV-NEXT: neg a1, a1
@@ -6213,11 +6218,11 @@ define <2 x i64> @ustest_f32i64_mm(<2 x float> %x) {
; CHECK-NOV-NEXT: slti a0, s1, 1
; CHECK-NOV-NEXT: neg a0, a0
; CHECK-NOV-NEXT: and a0, a0, s0
+; CHECK-NOV-NEXT: slti a3, a3, 0
+; CHECK-NOV-NEXT: addi a3, a3, -1
+; CHECK-NOV-NEXT: and a0, a3, a0
; CHECK-NOV-NEXT: slti a2, a2, 0
; CHECK-NOV-NEXT: addi a2, a2, -1
-; CHECK-NOV-NEXT: and a0, a2, a0
-; CHECK-NOV-NEXT: slti a2, a3, 0
-; CHECK-NOV-NEXT: addi a2, a2, -1
; CHECK-NOV-NEXT: and a1, a2, a1
; CHECK-NOV-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
; CHECK-NOV-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
@@ -6253,15 +6258,15 @@ define <2 x i64> @ustest_f32i64_mm(<2 x float> %x) {
; CHECK-V-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload
; CHECK-V-NEXT: vfmv.f.s fa0, v8
; CHECK-V-NEXT: call __fixsfti
-; CHECK-V-NEXT: mv a2, s1
-; CHECK-V-NEXT: mv a3, a1
+; CHECK-V-NEXT: mv a2, a1
; CHECK-V-NEXT: blez a1, .LBB50_2
; CHECK-V-NEXT: # %bb.1: # %entry
-; CHECK-V-NEXT: li a3, 1
+; CHECK-V-NEXT: li a2, 1
; CHECK-V-NEXT: .LBB50_2: # %entry
-; CHECK-V-NEXT: blez a2, .LBB50_4
+; CHECK-V-NEXT: mv a3, s1
+; CHECK-V-NEXT: blez s1, .LBB50_4
; CHECK-V-NEXT: # %bb.3: # %entry
-; CHECK-V-NEXT: li a2, 1
+; CHECK-V-NEXT: li a3, 1
; CHECK-V-NEXT: .LBB50_4: # %entry
; CHECK-V-NEXT: slti a1, a1, 1
; CHECK-V-NEXT: neg a1, a1
@@ -6269,11 +6274,11 @@ define <2 x i64> @ustest_f32i64_mm(<2 x float> %x) {
; CHECK-V-NEXT: slti a1, s1, 1
; CHECK-V-NEXT: neg a1, a1
; CHECK-V-NEXT: and a1, a1, s0
+; CHECK-V-NEXT: slti a3, a3, 0
+; CHECK-V-NEXT: addi a3, a3, -1
+; CHECK-V-NEXT: and a1, a3, a1
; CHECK-V-NEXT: slti a2, a2, 0
; CHECK-V-NEXT: addi a2, a2, -1
-; CHECK-V-NEXT: and a1, a2, a1
-; CHECK-V-NEXT: slti a2, a3, 0
-; CHECK-V-NEXT: addi a2, a2, -1
; CHECK-V-NEXT: and a0, a2, a0
; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; CHECK-V-NEXT: vmv.s.x v8, a0
@@ -6575,15 +6580,15 @@ define <2 x i64> @ustest_f16i64_mm(<2 x half> %x) {
; CHECK-NOV-NEXT: fmv.w.x fa0, s2
; CHECK-NOV-NEXT: call __extendhfsf2
; CHECK-NOV-NEXT: call __fixsfti
-; CHECK-NOV-NEXT: mv a2, s1
-; CHECK-NOV-NEXT: mv a3, a1
+; CHECK-NOV-NEXT: mv a2, a1
; CHECK-NOV-NEXT: blez a1, .LBB53_2
; CHECK-NOV-NEXT: # %bb.1: # %entry
-; CHECK-NOV-NEXT: li a3, 1
+; CHECK-NOV-NEXT: li a2, 1
; CHECK-NOV-NEXT: .LBB53_2: # %entry
-; CHECK-NOV-NEXT: blez a2, .LBB53_4
+; CHECK-NOV-NEXT: mv a3, s1
+; CHECK-NOV-NEXT: blez s1, .LBB53_4
; CHECK-NOV-NEXT: # %bb.3: # %entry
-; CHECK-NOV-NEXT: li a2, 1
+; CHECK-NOV-NEXT: li a3, 1
; CHECK-NOV-NEXT: .LBB53_4: # %entry
; CHECK-NOV-NEXT: slti a1, a1, 1
; CHECK-NOV-NEXT: neg a1, a1
@@ -6591,11 +6596,11 @@ define <2 x i64> @ustest_f16i64_mm(<2 x half> %x) {
; CHECK-NOV-NEXT: slti a0, s1, 1
; CHECK-NOV-NEXT: neg a0, a0
; CHECK-NOV-NEXT: and a0, a0, s0
+; CHECK-NOV-NEXT: slti a3, a3, 0
+; CHECK-NOV-NEXT: addi a3, a3, -1
+; CHECK-NOV-NEXT: and a0, a3, a0
; CHECK-NOV-NEXT: slti a2, a2, 0
; CHECK-NOV-NEXT: addi a2, a2, -1
-; CHECK-NOV-NEXT: and a0, a2, a0
-; CHECK-NOV-NEXT: slti a2, a3, 0
-; CHECK-NOV-NEXT: addi a2, a2, -1
; CHECK-NOV-NEXT: and a1, a2, a1
; CHECK-NOV-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
; CHECK-NOV-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
@@ -6625,15 +6630,15 @@ define <2 x i64> @ustest_f16i64_mm(<2 x half> %x) {
; CHECK-V-NEXT: fmv.w.x fa0, s2
; CHECK-V-NEXT: call __extendhfsf2
; CHECK-V-NEXT: call __fixsfti
-; CHECK-V-NEXT: mv a2, s1
-; CHECK-V-NEXT: mv a3, a1
+; CHECK-V-NEXT: mv a2, a1
; CHECK-V-NEXT: blez a1, .LBB53_2
; CHECK-V-NEXT: # %bb.1: # %entry
-; CHECK-V-NEXT: li a3, 1
+; CHECK-V-NEXT: li a2, 1
; CHECK-V-NEXT: .LBB53_2: # %entry
-; CHECK-V-NEXT: blez a2, .LBB53_4
+; CHECK-V-NEXT: mv a3, s1
+; CHECK-V-NEXT: blez s1, .LBB53_4
; CHECK-V-NEXT: # %bb.3: # %entry
-; CHECK-V-NEXT: li a2, 1
+; CHECK-V-NEXT: li a3, 1
; CHECK-V-NEXT: .LBB53_4: # %entry
; CHECK-V-NEXT: slti a1, a1, 1
; CHECK-V-NEXT: neg a1, a1
@@ -6641,11 +6646,11 @@ define <2 x i64> @ustest_f16i64_mm(<2 x half> %x) {
; CHECK-V-NEXT: slti a1, s1, 1
; CHECK-V-NEXT: neg a1, a1
; CHECK-V-NEXT: and a1, a1, s0
+; CHECK-V-NEXT: slti a3, a3, 0
+; CHECK-V-NEXT: addi a3, a3, -1
+; CHECK-V-NEXT: and a1, a3, a1
; CHECK-V-NEXT: slti a2, a2, 0
; CHECK-V-NEXT: addi a2, a2, -1
-; CHECK-V-NEXT: and a1, a2, a1
-; CHECK-V-NEXT: slti a2, a3, 0
-; CHECK-V-NEXT: addi a2, a2, -1
; CHECK-V-NEXT: and a0, a2, a0
; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; CHECK-V-NEXT: vmv.s.x v9, a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/named-vector-shuffle-reverse.ll b/llvm/test/CodeGen/RISCV/rvv/named-vector-shuffle-reverse.ll
index 4e08f401ca4e..96094eea631b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/named-vector-shuffle-reverse.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/named-vector-shuffle-reverse.ll
@@ -104,7 +104,7 @@ define <vscale x 2 x i1> @reverse_nxv2i1(<vscale x 2 x i1> %a) {
; RV64-BITS-512-NEXT: vand.vi v8, v10, 1
; RV64-BITS-512-NEXT: vmsne.vi v0, v8, 0
; RV64-BITS-512-NEXT: ret
- %res = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> %a)
+ %res = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> %a)
ret <vscale x 2 x i1> %res
}
@@ -202,7 +202,7 @@ define <vscale x 4 x i1> @reverse_nxv4i1(<vscale x 4 x i1> %a) {
; RV64-BITS-512-NEXT: vand.vi v8, v10, 1
; RV64-BITS-512-NEXT: vmsne.vi v0, v8, 0
; RV64-BITS-512-NEXT: ret
- %res = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
+ %res = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
ret <vscale x 4 x i1> %res
}
@@ -294,7 +294,7 @@ define <vscale x 8 x i1> @reverse_nxv8i1(<vscale x 8 x i1> %a) {
; RV64-BITS-512-NEXT: vand.vi v8, v10, 1
; RV64-BITS-512-NEXT: vmsne.vi v0, v8, 0
; RV64-BITS-512-NEXT: ret
- %res = call <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1> %a)
+ %res = call <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1> %a)
ret <vscale x 8 x i1> %res
}
@@ -392,7 +392,7 @@ define <vscale x 16 x i1> @reverse_nxv16i1(<vscale x 16 x i1> %a) {
; RV64-BITS-512-NEXT: vand.vi v8, v12, 1
; RV64-BITS-512-NEXT: vmsne.vi v0, v8, 0
; RV64-BITS-512-NEXT: ret
- %res = call <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1> %a)
+ %res = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> %a)
ret <vscale x 16 x i1> %res
}
@@ -490,7 +490,7 @@ define <vscale x 32 x i1> @reverse_nxv32i1(<vscale x 32 x i1> %a) {
; RV64-BITS-512-NEXT: vand.vi v8, v16, 1
; RV64-BITS-512-NEXT: vmsne.vi v0, v8, 0
; RV64-BITS-512-NEXT: ret
- %res = call <vscale x 32 x i1> @llvm.experimental.vector.reverse.nxv32i1(<vscale x 32 x i1> %a)
+ %res = call <vscale x 32 x i1> @llvm.vector.reverse.nxv32i1(<vscale x 32 x i1> %a)
ret <vscale x 32 x i1> %res
}
@@ -600,7 +600,7 @@ define <vscale x 64 x i1> @reverse_nxv64i1(<vscale x 64 x i1> %a) {
; RV64-BITS-512-NEXT: vand.vi v8, v24, 1
; RV64-BITS-512-NEXT: vmsne.vi v0, v8, 0
; RV64-BITS-512-NEXT: ret
- %res = call <vscale x 64 x i1> @llvm.experimental.vector.reverse.nxv64i1(<vscale x 64 x i1> %a)
+ %res = call <vscale x 64 x i1> @llvm.vector.reverse.nxv64i1(<vscale x 64 x i1> %a)
ret <vscale x 64 x i1> %res
}
@@ -682,7 +682,7 @@ define <vscale x 1 x i8> @reverse_nxv1i8(<vscale x 1 x i8> %a) {
; RV64-BITS-512-NEXT: vrgather.vv v9, v8, v10
; RV64-BITS-512-NEXT: vmv1r.v v8, v9
; RV64-BITS-512-NEXT: ret
- %res = call <vscale x 1 x i8> @llvm.experimental.vector.reverse.nxv1i8(<vscale x 1 x i8> %a)
+ %res = call <vscale x 1 x i8> @llvm.vector.reverse.nxv1i8(<vscale x 1 x i8> %a)
ret <vscale x 1 x i8> %res
}
@@ -760,7 +760,7 @@ define <vscale x 2 x i8> @reverse_nxv2i8(<vscale x 2 x i8> %a) {
; RV64-BITS-512-NEXT: vrgather.vv v9, v8, v10
; RV64-BITS-512-NEXT: vmv1r.v v8, v9
; RV64-BITS-512-NEXT: ret
- %res = call <vscale x 2 x i8> @llvm.experimental.vector.reverse.nxv2i8(<vscale x 2 x i8> %a)
+ %res = call <vscale x 2 x i8> @llvm.vector.reverse.nxv2i8(<vscale x 2 x i8> %a)
ret <vscale x 2 x i8> %res
}
@@ -838,7 +838,7 @@ define <vscale x 4 x i8> @reverse_nxv4i8(<vscale x 4 x i8> %a) {
; RV64-BITS-512-NEXT: vrgather.vv v9, v8, v10
; RV64-BITS-512-NEXT: vmv1r.v v8, v9
; RV64-BITS-512-NEXT: ret
- %res = call <vscale x 4 x i8> @llvm.experimental.vector.reverse.nxv4i8(<vscale x 4 x i8> %a)
+ %res = call <vscale x 4 x i8> @llvm.vector.reverse.nxv4i8(<vscale x 4 x i8> %a)
ret <vscale x 4 x i8> %res
}
@@ -910,7 +910,7 @@ define <vscale x 8 x i8> @reverse_nxv8i8(<vscale x 8 x i8> %a) {
; RV64-BITS-512-NEXT: vrgather.vv v9, v8, v10
; RV64-BITS-512-NEXT: vmv.v.v v8, v9
; RV64-BITS-512-NEXT: ret
- %res = call <vscale x 8 x i8> @llvm.experimental.vector.reverse.nxv8i8(<vscale x 8 x i8> %a)
+ %res = call <vscale x 8 x i8> @llvm.vector.reverse.nxv8i8(<vscale x 8 x i8> %a)
ret <vscale x 8 x i8> %res
}
@@ -988,7 +988,7 @@ define <vscale x 16 x i8> @reverse_nxv16i8(<vscale x 16 x i8> %a) {
; RV64-BITS-512-NEXT: vrgather.vv v10, v8, v12
; RV64-BITS-512-NEXT: vmv.v.v v8, v10
; RV64-BITS-512-NEXT: ret
- %res = call <vscale x 16 x i8> @llvm.experimental.vector.reverse.nxv16i8(<vscale x 16 x i8> %a)
+ %res = call <vscale x 16 x i8> @llvm.vector.reverse.nxv16i8(<vscale x 16 x i8> %a)
ret <vscale x 16 x i8> %res
}
@@ -1066,7 +1066,7 @@ define <vscale x 32 x i8> @reverse_nxv32i8(<vscale x 32 x i8> %a) {
; RV64-BITS-512-NEXT: vrgather.vv v12, v8, v16
; RV64-BITS-512-NEXT: vmv.v.v v8, v12
; RV64-BITS-512-NEXT: ret
- %res = call <vscale x 32 x i8> @llvm.experimental.vector.reverse.nxv32i8(<vscale x 32 x i8> %a)
+ %res = call <vscale x 32 x i8> @llvm.vector.reverse.nxv32i8(<vscale x 32 x i8> %a)
ret <vscale x 32 x i8> %res
}
@@ -1148,7 +1148,7 @@ define <vscale x 64 x i8> @reverse_nxv64i8(<vscale x 64 x i8> %a) {
; RV64-BITS-512-NEXT: vrgather.vv v16, v12, v24
; RV64-BITS-512-NEXT: vmv8r.v v8, v16
; RV64-BITS-512-NEXT: ret
- %res = call <vscale x 64 x i8> @llvm.experimental.vector.reverse.nxv64i8(<vscale x 64 x i8> %a)
+ %res = call <vscale x 64 x i8> @llvm.vector.reverse.nxv64i8(<vscale x 64 x i8> %a)
ret <vscale x 64 x i8> %res
}
@@ -1164,7 +1164,7 @@ define <vscale x 1 x i16> @reverse_nxv1i16(<vscale x 1 x i16> %a) {
; CHECK-NEXT: vrgather.vv v9, v8, v10
; CHECK-NEXT: vmv1r.v v8, v9
; CHECK-NEXT: ret
- %res = call <vscale x 1 x i16> @llvm.experimental.vector.reverse.nxv1i16(<vscale x 1 x i16> %a)
+ %res = call <vscale x 1 x i16> @llvm.vector.reverse.nxv1i16(<vscale x 1 x i16> %a)
ret <vscale x 1 x i16> %res
}
@@ -1180,7 +1180,7 @@ define <vscale x 2 x i16> @reverse_nxv2i16(<vscale x 2 x i16> %a) {
; CHECK-NEXT: vrgather.vv v9, v8, v10
; CHECK-NEXT: vmv1r.v v8, v9
; CHECK-NEXT: ret
- %res = call <vscale x 2 x i16> @llvm.experimental.vector.reverse.nxv2i16(<vscale x 2 x i16> %a)
+ %res = call <vscale x 2 x i16> @llvm.vector.reverse.nxv2i16(<vscale x 2 x i16> %a)
ret <vscale x 2 x i16> %res
}
@@ -1196,7 +1196,7 @@ define <vscale x 4 x i16> @reverse_nxv4i16(<vscale x 4 x i16> %a) {
; CHECK-NEXT: vrgather.vv v9, v8, v10
; CHECK-NEXT: vmv.v.v v8, v9
; CHECK-NEXT: ret
- %res = call <vscale x 4 x i16> @llvm.experimental.vector.reverse.nxv4i16(<vscale x 4 x i16> %a)
+ %res = call <vscale x 4 x i16> @llvm.vector.reverse.nxv4i16(<vscale x 4 x i16> %a)
ret <vscale x 4 x i16> %res
}
@@ -1211,7 +1211,7 @@ define <vscale x 8 x i16> @reverse_nxv8i16(<vscale x 8 x i16> %a) {
; CHECK-NEXT: vrgather.vv v10, v8, v12
; CHECK-NEXT: vmv.v.v v8, v10
; CHECK-NEXT: ret
- %res = call <vscale x 8 x i16> @llvm.experimental.vector.reverse.nxv8i16(<vscale x 8 x i16> %a)
+ %res = call <vscale x 8 x i16> @llvm.vector.reverse.nxv8i16(<vscale x 8 x i16> %a)
ret <vscale x 8 x i16> %res
}
@@ -1227,7 +1227,7 @@ define <vscale x 16 x i16> @reverse_nxv16i16(<vscale x 16 x i16> %a) {
; CHECK-NEXT: vrgather.vv v12, v8, v16
; CHECK-NEXT: vmv.v.v v8, v12
; CHECK-NEXT: ret
- %res = call <vscale x 16 x i16> @llvm.experimental.vector.reverse.nxv16i16(<vscale x 16 x i16> %a)
+ %res = call <vscale x 16 x i16> @llvm.vector.reverse.nxv16i16(<vscale x 16 x i16> %a)
ret <vscale x 16 x i16> %res
}
@@ -1243,7 +1243,7 @@ define <vscale x 32 x i16> @reverse_nxv32i16(<vscale x 32 x i16> %a) {
; CHECK-NEXT: vrgather.vv v16, v8, v24
; CHECK-NEXT: vmv.v.v v8, v16
; CHECK-NEXT: ret
- %res = call <vscale x 32 x i16> @llvm.experimental.vector.reverse.nxv32i16(<vscale x 32 x i16> %a)
+ %res = call <vscale x 32 x i16> @llvm.vector.reverse.nxv32i16(<vscale x 32 x i16> %a)
ret <vscale x 32 x i16> %res
}
@@ -1259,7 +1259,7 @@ define <vscale x 1 x i32> @reverse_nxv1i32(<vscale x 1 x i32> %a) {
; CHECK-NEXT: vrgather.vv v9, v8, v10
; CHECK-NEXT: vmv1r.v v8, v9
; CHECK-NEXT: ret
- %res = call <vscale x 1 x i32> @llvm.experimental.vector.reverse.nxv1i32(<vscale x 1 x i32> %a)
+ %res = call <vscale x 1 x i32> @llvm.vector.reverse.nxv1i32(<vscale x 1 x i32> %a)
ret <vscale x 1 x i32> %res
}
@@ -1275,7 +1275,7 @@ define <vscale x 2 x i32> @reverse_nxv2i32(<vscale x 2 x i32> %a) {
; CHECK-NEXT: vrgather.vv v9, v8, v10
; CHECK-NEXT: vmv.v.v v8, v9
; CHECK-NEXT: ret
- %res = call <vscale x 2 x i32> @llvm.experimental.vector.reverse.nxv2i32(<vscale x 2 x i32> %a)
+ %res = call <vscale x 2 x i32> @llvm.vector.reverse.nxv2i32(<vscale x 2 x i32> %a)
ret <vscale x 2 x i32> %res
}
@@ -1291,7 +1291,7 @@ define <vscale x 4 x i32> @reverse_nxv4i32(<vscale x 4 x i32> %a) {
; CHECK-NEXT: vrgather.vv v10, v8, v12
; CHECK-NEXT: vmv.v.v v8, v10
; CHECK-NEXT: ret
- %res = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
+ %res = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
ret <vscale x 4 x i32> %res
}
@@ -1306,7 +1306,7 @@ define <vscale x 8 x i32> @reverse_nxv8i32(<vscale x 8 x i32> %a) {
; CHECK-NEXT: vrgather.vv v12, v8, v16
; CHECK-NEXT: vmv.v.v v8, v12
; CHECK-NEXT: ret
- %res = call <vscale x 8 x i32> @llvm.experimental.vector.reverse.nxv8i32(<vscale x 8 x i32> %a)
+ %res = call <vscale x 8 x i32> @llvm.vector.reverse.nxv8i32(<vscale x 8 x i32> %a)
ret <vscale x 8 x i32> %res
}
@@ -1322,7 +1322,7 @@ define <vscale x 16 x i32> @reverse_nxv16i32(<vscale x 16 x i32> %a) {
; CHECK-NEXT: vrgather.vv v16, v8, v24
; CHECK-NEXT: vmv.v.v v8, v16
; CHECK-NEXT: ret
- %res = call <vscale x 16 x i32> @llvm.experimental.vector.reverse.nxv16i32(<vscale x 16 x i32> %a)
+ %res = call <vscale x 16 x i32> @llvm.vector.reverse.nxv16i32(<vscale x 16 x i32> %a)
ret <vscale x 16 x i32> %res
}
@@ -1338,7 +1338,7 @@ define <vscale x 1 x i64> @reverse_nxv1i64(<vscale x 1 x i64> %a) {
; CHECK-NEXT: vrgather.vv v9, v8, v10
; CHECK-NEXT: vmv.v.v v8, v9
; CHECK-NEXT: ret
- %res = call <vscale x 1 x i64> @llvm.experimental.vector.reverse.nxv1i64(<vscale x 1 x i64> %a)
+ %res = call <vscale x 1 x i64> @llvm.vector.reverse.nxv1i64(<vscale x 1 x i64> %a)
ret <vscale x 1 x i64> %res
}
@@ -1354,7 +1354,7 @@ define <vscale x 2 x i64> @reverse_nxv2i64(<vscale x 2 x i64> %a) {
; CHECK-NEXT: vrgather.vv v10, v8, v12
; CHECK-NEXT: vmv.v.v v8, v10
; CHECK-NEXT: ret
- %res = call <vscale x 2 x i64> @llvm.experimental.vector.reverse.nxv2i64(<vscale x 2 x i64> %a)
+ %res = call <vscale x 2 x i64> @llvm.vector.reverse.nxv2i64(<vscale x 2 x i64> %a)
ret <vscale x 2 x i64> %res
}
@@ -1370,7 +1370,7 @@ define <vscale x 4 x i64> @reverse_nxv4i64(<vscale x 4 x i64> %a) {
; CHECK-NEXT: vrgather.vv v12, v8, v16
; CHECK-NEXT: vmv.v.v v8, v12
; CHECK-NEXT: ret
- %res = call <vscale x 4 x i64> @llvm.experimental.vector.reverse.nxv4i64(<vscale x 4 x i64> %a)
+ %res = call <vscale x 4 x i64> @llvm.vector.reverse.nxv4i64(<vscale x 4 x i64> %a)
ret <vscale x 4 x i64> %res
}
@@ -1385,7 +1385,7 @@ define <vscale x 8 x i64> @reverse_nxv8i64(<vscale x 8 x i64> %a) {
; CHECK-NEXT: vrgather.vv v16, v8, v24
; CHECK-NEXT: vmv.v.v v8, v16
; CHECK-NEXT: ret
- %res = call <vscale x 8 x i64> @llvm.experimental.vector.reverse.nxv8i64(<vscale x 8 x i64> %a)
+ %res = call <vscale x 8 x i64> @llvm.vector.reverse.nxv8i64(<vscale x 8 x i64> %a)
ret <vscale x 8 x i64> %res
}
@@ -1405,7 +1405,7 @@ define <vscale x 1 x half> @reverse_nxv1f16(<vscale x 1 x half> %a) {
; CHECK-NEXT: vrgather.vv v9, v8, v10
; CHECK-NEXT: vmv1r.v v8, v9
; CHECK-NEXT: ret
- %res = call <vscale x 1 x half> @llvm.experimental.vector.reverse.nxv1f16(<vscale x 1 x half> %a)
+ %res = call <vscale x 1 x half> @llvm.vector.reverse.nxv1f16(<vscale x 1 x half> %a)
ret <vscale x 1 x half> %res
}
@@ -1421,7 +1421,7 @@ define <vscale x 2 x half> @reverse_nxv2f16(<vscale x 2 x half> %a) {
; CHECK-NEXT: vrgather.vv v9, v8, v10
; CHECK-NEXT: vmv1r.v v8, v9
; CHECK-NEXT: ret
- %res = call <vscale x 2 x half> @llvm.experimental.vector.reverse.nxv2f16(<vscale x 2 x half> %a)
+ %res = call <vscale x 2 x half> @llvm.vector.reverse.nxv2f16(<vscale x 2 x half> %a)
ret <vscale x 2 x half> %res
}
@@ -1437,7 +1437,7 @@ define <vscale x 4 x half> @reverse_nxv4f16(<vscale x 4 x half> %a) {
; CHECK-NEXT: vrgather.vv v9, v8, v10
; CHECK-NEXT: vmv.v.v v8, v9
; CHECK-NEXT: ret
- %res = call <vscale x 4 x half> @llvm.experimental.vector.reverse.nxv4f16(<vscale x 4 x half> %a)
+ %res = call <vscale x 4 x half> @llvm.vector.reverse.nxv4f16(<vscale x 4 x half> %a)
ret <vscale x 4 x half> %res
}
@@ -1452,7 +1452,7 @@ define <vscale x 8 x half> @reverse_nxv8f16(<vscale x 8 x half> %a) {
; CHECK-NEXT: vrgather.vv v10, v8, v12
; CHECK-NEXT: vmv.v.v v8, v10
; CHECK-NEXT: ret
- %res = call <vscale x 8 x half> @llvm.experimental.vector.reverse.nxv8f16(<vscale x 8 x half> %a)
+ %res = call <vscale x 8 x half> @llvm.vector.reverse.nxv8f16(<vscale x 8 x half> %a)
ret <vscale x 8 x half> %res
}
@@ -1468,7 +1468,7 @@ define <vscale x 16 x half> @reverse_nxv16f16(<vscale x 16 x half> %a) {
; CHECK-NEXT: vrgather.vv v12, v8, v16
; CHECK-NEXT: vmv.v.v v8, v12
; CHECK-NEXT: ret
- %res = call <vscale x 16 x half> @llvm.experimental.vector.reverse.nxv16f16(<vscale x 16 x half> %a)
+ %res = call <vscale x 16 x half> @llvm.vector.reverse.nxv16f16(<vscale x 16 x half> %a)
ret <vscale x 16 x half> %res
}
@@ -1484,7 +1484,7 @@ define <vscale x 32 x half> @reverse_nxv32f16(<vscale x 32 x half> %a) {
; CHECK-NEXT: vrgather.vv v16, v8, v24
; CHECK-NEXT: vmv.v.v v8, v16
; CHECK-NEXT: ret
- %res = call <vscale x 32 x half> @llvm.experimental.vector.reverse.nxv32f16(<vscale x 32 x half> %a)
+ %res = call <vscale x 32 x half> @llvm.vector.reverse.nxv32f16(<vscale x 32 x half> %a)
ret <vscale x 32 x half> %res
}
@@ -1500,7 +1500,7 @@ define <vscale x 1 x float> @reverse_nxv1f32(<vscale x 1 x float> %a) {
; CHECK-NEXT: vrgather.vv v9, v8, v10
; CHECK-NEXT: vmv1r.v v8, v9
; CHECK-NEXT: ret
- %res = call <vscale x 1 x float> @llvm.experimental.vector.reverse.nxv1f32(<vscale x 1 x float> %a)
+ %res = call <vscale x 1 x float> @llvm.vector.reverse.nxv1f32(<vscale x 1 x float> %a)
ret <vscale x 1 x float> %res
}
@@ -1516,7 +1516,7 @@ define <vscale x 2 x float> @reverse_nxv2f32(<vscale x 2 x float> %a) {
; CHECK-NEXT: vrgather.vv v9, v8, v10
; CHECK-NEXT: vmv.v.v v8, v9
; CHECK-NEXT: ret
- %res = call <vscale x 2 x float> @llvm.experimental.vector.reverse.nxv2f32(<vscale x 2 x float> %a)
+ %res = call <vscale x 2 x float> @llvm.vector.reverse.nxv2f32(<vscale x 2 x float> %a)
ret <vscale x 2 x float> %res
}
@@ -1532,7 +1532,7 @@ define <vscale x 4 x float> @reverse_nxv4f32(<vscale x 4 x float> %a) {
; CHECK-NEXT: vrgather.vv v10, v8, v12
; CHECK-NEXT: vmv.v.v v8, v10
; CHECK-NEXT: ret
- %res = call <vscale x 4 x float> @llvm.experimental.vector.reverse.nxv4f32(<vscale x 4 x float> %a)
+ %res = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %a)
ret <vscale x 4 x float> %res
}
@@ -1547,7 +1547,7 @@ define <vscale x 8 x float> @reverse_nxv8f32(<vscale x 8 x float> %a) {
; CHECK-NEXT: vrgather.vv v12, v8, v16
; CHECK-NEXT: vmv.v.v v8, v12
; CHECK-NEXT: ret
- %res = call <vscale x 8 x float> @llvm.experimental.vector.reverse.nxv8f32(<vscale x 8 x float> %a)
+ %res = call <vscale x 8 x float> @llvm.vector.reverse.nxv8f32(<vscale x 8 x float> %a)
ret <vscale x 8 x float> %res
}
@@ -1563,7 +1563,7 @@ define <vscale x 16 x float> @reverse_nxv16f32(<vscale x 16 x float> %a) {
; CHECK-NEXT: vrgather.vv v16, v8, v24
; CHECK-NEXT: vmv.v.v v8, v16
; CHECK-NEXT: ret
- %res = call <vscale x 16 x float> @llvm.experimental.vector.reverse.nxv16f32(<vscale x 16 x float> %a)
+ %res = call <vscale x 16 x float> @llvm.vector.reverse.nxv16f32(<vscale x 16 x float> %a)
ret <vscale x 16 x float> %res
}
@@ -1579,7 +1579,7 @@ define <vscale x 1 x double> @reverse_nxv1f64(<vscale x 1 x double> %a) {
; CHECK-NEXT: vrgather.vv v9, v8, v10
; CHECK-NEXT: vmv.v.v v8, v9
; CHECK-NEXT: ret
- %res = call <vscale x 1 x double> @llvm.experimental.vector.reverse.nxv1f64(<vscale x 1 x double> %a)
+ %res = call <vscale x 1 x double> @llvm.vector.reverse.nxv1f64(<vscale x 1 x double> %a)
ret <vscale x 1 x double> %res
}
@@ -1595,7 +1595,7 @@ define <vscale x 2 x double> @reverse_nxv2f64(<vscale x 2 x double> %a) {
; CHECK-NEXT: vrgather.vv v10, v8, v12
; CHECK-NEXT: vmv.v.v v8, v10
; CHECK-NEXT: ret
- %res = call <vscale x 2 x double> @llvm.experimental.vector.reverse.nxv2f64(<vscale x 2 x double> %a)
+ %res = call <vscale x 2 x double> @llvm.vector.reverse.nxv2f64(<vscale x 2 x double> %a)
ret <vscale x 2 x double> %res
}
@@ -1611,7 +1611,7 @@ define <vscale x 4 x double> @reverse_nxv4f64(<vscale x 4 x double> %a) {
; CHECK-NEXT: vrgather.vv v12, v8, v16
; CHECK-NEXT: vmv.v.v v8, v12
; CHECK-NEXT: ret
- %res = call <vscale x 4 x double> @llvm.experimental.vector.reverse.nxv4f64(<vscale x 4 x double> %a)
+ %res = call <vscale x 4 x double> @llvm.vector.reverse.nxv4f64(<vscale x 4 x double> %a)
ret <vscale x 4 x double> %res
}
@@ -1626,7 +1626,7 @@ define <vscale x 8 x double> @reverse_nxv8f64(<vscale x 8 x double> %a) {
; CHECK-NEXT: vrgather.vv v16, v8, v24
; CHECK-NEXT: vmv.v.v v8, v16
; CHECK-NEXT: ret
- %res = call <vscale x 8 x double> @llvm.experimental.vector.reverse.nxv8f64(<vscale x 8 x double> %a)
+ %res = call <vscale x 8 x double> @llvm.vector.reverse.nxv8f64(<vscale x 8 x double> %a)
ret <vscale x 8 x double> %res
}
@@ -1646,7 +1646,7 @@ define <vscale x 3 x i64> @reverse_nxv3i64(<vscale x 3 x i64> %a) {
; CHECK-NEXT: vmv1r.v v9, v18
; CHECK-NEXT: vmv1r.v v10, v19
; CHECK-NEXT: ret
- %res = call <vscale x 3 x i64> @llvm.experimental.vector.reverse.nxv3i64(<vscale x 3 x i64> %a)
+ %res = call <vscale x 3 x i64> @llvm.vector.reverse.nxv3i64(<vscale x 3 x i64> %a)
ret <vscale x 3 x i64> %res
}
@@ -1663,7 +1663,7 @@ define <vscale x 6 x i64> @reverse_nxv6i64(<vscale x 6 x i64> %a) {
; CHECK-NEXT: vmv2r.v v10, v28
; CHECK-NEXT: vmv2r.v v12, v30
; CHECK-NEXT: ret
- %res = call <vscale x 6 x i64> @llvm.experimental.vector.reverse.nxv6i64(<vscale x 6 x i64> %a)
+ %res = call <vscale x 6 x i64> @llvm.vector.reverse.nxv6i64(<vscale x 6 x i64> %a)
ret <vscale x 6 x i64> %res
}
@@ -1739,53 +1739,53 @@ define <vscale x 12 x i64> @reverse_nxv12i64(<vscale x 12 x i64> %a) {
; RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
; RV64-NEXT: addi sp, sp, 80
; RV64-NEXT: ret
- %res = call <vscale x 12 x i64> @llvm.experimental.vector.reverse.nxv12i64(<vscale x 12 x i64> %a)
+ %res = call <vscale x 12 x i64> @llvm.vector.reverse.nxv12i64(<vscale x 12 x i64> %a)
ret <vscale x 12 x i64> %res
}
-declare <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1>)
-declare <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1>)
-declare <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1>)
-declare <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1>)
-declare <vscale x 32 x i1> @llvm.experimental.vector.reverse.nxv32i1(<vscale x 32 x i1>)
-declare <vscale x 64 x i1> @llvm.experimental.vector.reverse.nxv64i1(<vscale x 64 x i1>)
-declare <vscale x 1 x i8> @llvm.experimental.vector.reverse.nxv1i8(<vscale x 1 x i8>)
-declare <vscale x 2 x i8> @llvm.experimental.vector.reverse.nxv2i8(<vscale x 2 x i8>)
-declare <vscale x 4 x i8> @llvm.experimental.vector.reverse.nxv4i8(<vscale x 4 x i8>)
-declare <vscale x 8 x i8> @llvm.experimental.vector.reverse.nxv8i8(<vscale x 8 x i8>)
-declare <vscale x 16 x i8> @llvm.experimental.vector.reverse.nxv16i8(<vscale x 16 x i8>)
-declare <vscale x 32 x i8> @llvm.experimental.vector.reverse.nxv32i8(<vscale x 32 x i8>)
-declare <vscale x 64 x i8> @llvm.experimental.vector.reverse.nxv64i8(<vscale x 64 x i8>)
-declare <vscale x 1 x i16> @llvm.experimental.vector.reverse.nxv1i16(<vscale x 1 x i16>)
-declare <vscale x 2 x i16> @llvm.experimental.vector.reverse.nxv2i16(<vscale x 2 x i16>)
-declare <vscale x 4 x i16> @llvm.experimental.vector.reverse.nxv4i16(<vscale x 4 x i16>)
-declare <vscale x 8 x i16> @llvm.experimental.vector.reverse.nxv8i16(<vscale x 8 x i16>)
-declare <vscale x 16 x i16> @llvm.experimental.vector.reverse.nxv16i16(<vscale x 16 x i16>)
-declare <vscale x 32 x i16> @llvm.experimental.vector.reverse.nxv32i16(<vscale x 32 x i16>)
-declare <vscale x 1 x i32> @llvm.experimental.vector.reverse.nxv1i32(<vscale x 1 x i32>)
-declare <vscale x 2 x i32> @llvm.experimental.vector.reverse.nxv2i32(<vscale x 2 x i32>)
-declare <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32>)
-declare <vscale x 8 x i32> @llvm.experimental.vector.reverse.nxv8i32(<vscale x 8 x i32>)
-declare <vscale x 16 x i32> @llvm.experimental.vector.reverse.nxv16i32(<vscale x 16 x i32>)
-declare <vscale x 1 x i64> @llvm.experimental.vector.reverse.nxv1i64(<vscale x 1 x i64>)
-declare <vscale x 2 x i64> @llvm.experimental.vector.reverse.nxv2i64(<vscale x 2 x i64>)
-declare <vscale x 4 x i64> @llvm.experimental.vector.reverse.nxv4i64(<vscale x 4 x i64>)
-declare <vscale x 8 x i64> @llvm.experimental.vector.reverse.nxv8i64(<vscale x 8 x i64>)
-declare <vscale x 1 x half> @llvm.experimental.vector.reverse.nxv1f16(<vscale x 1 x half>)
-declare <vscale x 2 x half> @llvm.experimental.vector.reverse.nxv2f16(<vscale x 2 x half>)
-declare <vscale x 4 x half> @llvm.experimental.vector.reverse.nxv4f16(<vscale x 4 x half>)
-declare <vscale x 8 x half> @llvm.experimental.vector.reverse.nxv8f16(<vscale x 8 x half>)
-declare <vscale x 16 x half> @llvm.experimental.vector.reverse.nxv16f16(<vscale x 16 x half>)
-declare <vscale x 32 x half> @llvm.experimental.vector.reverse.nxv32f16(<vscale x 32 x half>)
-declare <vscale x 1 x float> @llvm.experimental.vector.reverse.nxv1f32(<vscale x 1 x float>)
-declare <vscale x 2 x float> @llvm.experimental.vector.reverse.nxv2f32(<vscale x 2 x float>)
-declare <vscale x 4 x float> @llvm.experimental.vector.reverse.nxv4f32(<vscale x 4 x float>)
-declare <vscale x 8 x float> @llvm.experimental.vector.reverse.nxv8f32(<vscale x 8 x float>)
-declare <vscale x 16 x float> @llvm.experimental.vector.reverse.nxv16f32(<vscale x 16 x float>)
-declare <vscale x 1 x double> @llvm.experimental.vector.reverse.nxv1f64(<vscale x 1 x double>)
-declare <vscale x 2 x double> @llvm.experimental.vector.reverse.nxv2f64(<vscale x 2 x double>)
-declare <vscale x 4 x double> @llvm.experimental.vector.reverse.nxv4f64(<vscale x 4 x double>)
-declare <vscale x 8 x double> @llvm.experimental.vector.reverse.nxv8f64(<vscale x 8 x double>)
-declare <vscale x 3 x i64> @llvm.experimental.vector.reverse.nxv3i64(<vscale x 3 x i64>)
-declare <vscale x 6 x i64> @llvm.experimental.vector.reverse.nxv6i64(<vscale x 6 x i64>)
-declare <vscale x 12 x i64> @llvm.experimental.vector.reverse.nxv12i64(<vscale x 12 x i64>)
+declare <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1>)
+declare <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1>)
+declare <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1>)
+declare <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1>)
+declare <vscale x 32 x i1> @llvm.vector.reverse.nxv32i1(<vscale x 32 x i1>)
+declare <vscale x 64 x i1> @llvm.vector.reverse.nxv64i1(<vscale x 64 x i1>)
+declare <vscale x 1 x i8> @llvm.vector.reverse.nxv1i8(<vscale x 1 x i8>)
+declare <vscale x 2 x i8> @llvm.vector.reverse.nxv2i8(<vscale x 2 x i8>)
+declare <vscale x 4 x i8> @llvm.vector.reverse.nxv4i8(<vscale x 4 x i8>)
+declare <vscale x 8 x i8> @llvm.vector.reverse.nxv8i8(<vscale x 8 x i8>)
+declare <vscale x 16 x i8> @llvm.vector.reverse.nxv16i8(<vscale x 16 x i8>)
+declare <vscale x 32 x i8> @llvm.vector.reverse.nxv32i8(<vscale x 32 x i8>)
+declare <vscale x 64 x i8> @llvm.vector.reverse.nxv64i8(<vscale x 64 x i8>)
+declare <vscale x 1 x i16> @llvm.vector.reverse.nxv1i16(<vscale x 1 x i16>)
+declare <vscale x 2 x i16> @llvm.vector.reverse.nxv2i16(<vscale x 2 x i16>)
+declare <vscale x 4 x i16> @llvm.vector.reverse.nxv4i16(<vscale x 4 x i16>)
+declare <vscale x 8 x i16> @llvm.vector.reverse.nxv8i16(<vscale x 8 x i16>)
+declare <vscale x 16 x i16> @llvm.vector.reverse.nxv16i16(<vscale x 16 x i16>)
+declare <vscale x 32 x i16> @llvm.vector.reverse.nxv32i16(<vscale x 32 x i16>)
+declare <vscale x 1 x i32> @llvm.vector.reverse.nxv1i32(<vscale x 1 x i32>)
+declare <vscale x 2 x i32> @llvm.vector.reverse.nxv2i32(<vscale x 2 x i32>)
+declare <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32>)
+declare <vscale x 8 x i32> @llvm.vector.reverse.nxv8i32(<vscale x 8 x i32>)
+declare <vscale x 16 x i32> @llvm.vector.reverse.nxv16i32(<vscale x 16 x i32>)
+declare <vscale x 1 x i64> @llvm.vector.reverse.nxv1i64(<vscale x 1 x i64>)
+declare <vscale x 2 x i64> @llvm.vector.reverse.nxv2i64(<vscale x 2 x i64>)
+declare <vscale x 4 x i64> @llvm.vector.reverse.nxv4i64(<vscale x 4 x i64>)
+declare <vscale x 8 x i64> @llvm.vector.reverse.nxv8i64(<vscale x 8 x i64>)
+declare <vscale x 1 x half> @llvm.vector.reverse.nxv1f16(<vscale x 1 x half>)
+declare <vscale x 2 x half> @llvm.vector.reverse.nxv2f16(<vscale x 2 x half>)
+declare <vscale x 4 x half> @llvm.vector.reverse.nxv4f16(<vscale x 4 x half>)
+declare <vscale x 8 x half> @llvm.vector.reverse.nxv8f16(<vscale x 8 x half>)
+declare <vscale x 16 x half> @llvm.vector.reverse.nxv16f16(<vscale x 16 x half>)
+declare <vscale x 32 x half> @llvm.vector.reverse.nxv32f16(<vscale x 32 x half>)
+declare <vscale x 1 x float> @llvm.vector.reverse.nxv1f32(<vscale x 1 x float>)
+declare <vscale x 2 x float> @llvm.vector.reverse.nxv2f32(<vscale x 2 x float>)
+declare <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float>)
+declare <vscale x 8 x float> @llvm.vector.reverse.nxv8f32(<vscale x 8 x float>)
+declare <vscale x 16 x float> @llvm.vector.reverse.nxv16f32(<vscale x 16 x float>)
+declare <vscale x 1 x double> @llvm.vector.reverse.nxv1f64(<vscale x 1 x double>)
+declare <vscale x 2 x double> @llvm.vector.reverse.nxv2f64(<vscale x 2 x double>)
+declare <vscale x 4 x double> @llvm.vector.reverse.nxv4f64(<vscale x 4 x double>)
+declare <vscale x 8 x double> @llvm.vector.reverse.nxv8f64(<vscale x 8 x double>)
+declare <vscale x 3 x i64> @llvm.vector.reverse.nxv3i64(<vscale x 3 x i64>)
+declare <vscale x 6 x i64> @llvm.vector.reverse.nxv6i64(<vscale x 6 x i64>)
+declare <vscale x 12 x i64> @llvm.vector.reverse.nxv12i64(<vscale x 12 x i64>)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
index f3c70ed78c74..d02fe5b205f7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
@@ -31,7 +31,7 @@ define {<16 x i1>, <16 x i1>} @vector_deinterleave_v16i1_v32i1(<32 x i1> %vec) {
; CHECK-NEXT: vmsne.vi v8, v13, 0
; CHECK-NEXT: vmv.v.v v0, v9
; CHECK-NEXT: ret
-%retval = call {<16 x i1>, <16 x i1>} @llvm.experimental.vector.deinterleave2.v32i1(<32 x i1> %vec)
+%retval = call {<16 x i1>, <16 x i1>} @llvm.vector.deinterleave2.v32i1(<32 x i1> %vec)
ret {<16 x i1>, <16 x i1>} %retval
}
@@ -44,7 +44,7 @@ define {<16 x i8>, <16 x i8>} @vector_deinterleave_v16i8_v32i8(<32 x i8> %vec) {
; CHECK-NEXT: vmv.v.v v8, v10
; CHECK-NEXT: vmv.v.v v9, v11
; CHECK-NEXT: ret
-%retval = call {<16 x i8>, <16 x i8>} @llvm.experimental.vector.deinterleave2.v32i8(<32 x i8> %vec)
+%retval = call {<16 x i8>, <16 x i8>} @llvm.vector.deinterleave2.v32i8(<32 x i8> %vec)
ret {<16 x i8>, <16 x i8>} %retval
}
@@ -57,7 +57,7 @@ define {<8 x i16>, <8 x i16>} @vector_deinterleave_v8i16_v16i16(<16 x i16> %vec)
; CHECK-NEXT: vmv.v.v v8, v10
; CHECK-NEXT: vmv.v.v v9, v11
; CHECK-NEXT: ret
-%retval = call {<8 x i16>, <8 x i16>} @llvm.experimental.vector.deinterleave2.v16i16(<16 x i16> %vec)
+%retval = call {<8 x i16>, <8 x i16>} @llvm.vector.deinterleave2.v16i16(<16 x i16> %vec)
ret {<8 x i16>, <8 x i16>} %retval
}
@@ -71,7 +71,7 @@ define {<4 x i32>, <4 x i32>} @vector_deinterleave_v4i32_vv8i32(<8 x i32> %vec)
; CHECK-NEXT: vmv.v.v v8, v11
; CHECK-NEXT: vmv.v.v v9, v10
; CHECK-NEXT: ret
-%retval = call {<4 x i32>, <4 x i32>} @llvm.experimental.vector.deinterleave2.v8i32(<8 x i32> %vec)
+%retval = call {<4 x i32>, <4 x i32>} @llvm.vector.deinterleave2.v8i32(<8 x i32> %vec)
ret {<4 x i32>, <4 x i32>} %retval
}
@@ -87,15 +87,15 @@ define {<2 x i64>, <2 x i64>} @vector_deinterleave_v2i64_v4i64(<4 x i64> %vec) {
; CHECK-NEXT: vmerge.vvm v9, v9, v10, v0
; CHECK-NEXT: vslideup.vi v8, v10, 1
; CHECK-NEXT: ret
-%retval = call {<2 x i64>, <2 x i64>} @llvm.experimental.vector.deinterleave2.v4i64(<4 x i64> %vec)
+%retval = call {<2 x i64>, <2 x i64>} @llvm.vector.deinterleave2.v4i64(<4 x i64> %vec)
ret {<2 x i64>, <2 x i64>} %retval
}
-declare {<16 x i1>, <16 x i1>} @llvm.experimental.vector.deinterleave2.v32i1(<32 x i1>)
-declare {<16 x i8>, <16 x i8>} @llvm.experimental.vector.deinterleave2.v32i8(<32 x i8>)
-declare {<8 x i16>, <8 x i16>} @llvm.experimental.vector.deinterleave2.v16i16(<16 x i16>)
-declare {<4 x i32>, <4 x i32>} @llvm.experimental.vector.deinterleave2.v8i32(<8 x i32>)
-declare {<2 x i64>, <2 x i64>} @llvm.experimental.vector.deinterleave2.v4i64(<4 x i64>)
+declare {<16 x i1>, <16 x i1>} @llvm.vector.deinterleave2.v32i1(<32 x i1>)
+declare {<16 x i8>, <16 x i8>} @llvm.vector.deinterleave2.v32i8(<32 x i8>)
+declare {<8 x i16>, <8 x i16>} @llvm.vector.deinterleave2.v16i16(<16 x i16>)
+declare {<4 x i32>, <4 x i32>} @llvm.vector.deinterleave2.v8i32(<8 x i32>)
+declare {<2 x i64>, <2 x i64>} @llvm.vector.deinterleave2.v4i64(<4 x i64>)
; Floats
@@ -107,7 +107,7 @@ define {<2 x half>, <2 x half>} @vector_deinterleave_v2f16_v4f16(<4 x half> %vec
; CHECK-NEXT: vnsrl.wi v9, v8, 16
; CHECK-NEXT: vmv1r.v v8, v10
; CHECK-NEXT: ret
-%retval = call {<2 x half>, <2 x half>} @llvm.experimental.vector.deinterleave2.v4f16(<4 x half> %vec)
+%retval = call {<2 x half>, <2 x half>} @llvm.vector.deinterleave2.v4f16(<4 x half> %vec)
ret {<2 x half>, <2 x half>} %retval
}
@@ -119,7 +119,7 @@ define {<4 x half>, <4 x half>} @vector_deinterleave_v4f16_v8f16(<8 x half> %vec
; CHECK-NEXT: vnsrl.wi v9, v8, 16
; CHECK-NEXT: vmv1r.v v8, v10
; CHECK-NEXT: ret
-%retval = call {<4 x half>, <4 x half>} @llvm.experimental.vector.deinterleave2.v8f16(<8 x half> %vec)
+%retval = call {<4 x half>, <4 x half>} @llvm.vector.deinterleave2.v8f16(<8 x half> %vec)
ret {<4 x half>, <4 x half>} %retval
}
@@ -131,7 +131,7 @@ define {<2 x float>, <2 x float>} @vector_deinterleave_v2f32_v4f32(<4 x float> %
; CHECK-NEXT: vnsrl.wx v9, v8, a0
; CHECK-NEXT: vnsrl.wi v8, v8, 0
; CHECK-NEXT: ret
-%retval = call {<2 x float>, <2 x float>} @llvm.experimental.vector.deinterleave2.v4f32(<4 x float> %vec)
+%retval = call {<2 x float>, <2 x float>} @llvm.vector.deinterleave2.v4f32(<4 x float> %vec)
ret {<2 x float>, <2 x float>} %retval
}
@@ -144,7 +144,7 @@ define {<8 x half>, <8 x half>} @vector_deinterleave_v8f16_v16f16(<16 x half> %v
; CHECK-NEXT: vmv.v.v v8, v10
; CHECK-NEXT: vmv.v.v v9, v11
; CHECK-NEXT: ret
-%retval = call {<8 x half>, <8 x half>} @llvm.experimental.vector.deinterleave2.v16f16(<16 x half> %vec)
+%retval = call {<8 x half>, <8 x half>} @llvm.vector.deinterleave2.v16f16(<16 x half> %vec)
ret {<8 x half>, <8 x half>} %retval
}
@@ -158,7 +158,7 @@ define {<4 x float>, <4 x float>} @vector_deinterleave_v4f32_v8f32(<8 x float> %
; CHECK-NEXT: vmv.v.v v8, v11
; CHECK-NEXT: vmv.v.v v9, v10
; CHECK-NEXT: ret
-%retval = call {<4 x float>, <4 x float>} @llvm.experimental.vector.deinterleave2.v8f32(<8 x float> %vec)
+%retval = call {<4 x float>, <4 x float>} @llvm.vector.deinterleave2.v8f32(<8 x float> %vec)
ret {<4 x float>, <4 x float>} %retval
}
@@ -174,13 +174,13 @@ define {<2 x double>, <2 x double>} @vector_deinterleave_v2f64_v4f64(<4 x double
; CHECK-NEXT: vmerge.vvm v9, v9, v10, v0
; CHECK-NEXT: vslideup.vi v8, v10, 1
; CHECK-NEXT: ret
-%retval = call {<2 x double>, <2 x double>} @llvm.experimental.vector.deinterleave2.v4f64(<4 x double> %vec)
+%retval = call {<2 x double>, <2 x double>} @llvm.vector.deinterleave2.v4f64(<4 x double> %vec)
ret {<2 x double>, <2 x double>} %retval
}
-declare {<2 x half>,<2 x half>} @llvm.experimental.vector.deinterleave2.v4f16(<4 x half>)
-declare {<4 x half>, <4 x half>} @llvm.experimental.vector.deinterleave2.v8f16(<8 x half>)
-declare {<2 x float>, <2 x float>} @llvm.experimental.vector.deinterleave2.v4f32(<4 x float>)
-declare {<8 x half>, <8 x half>} @llvm.experimental.vector.deinterleave2.v16f16(<16 x half>)
-declare {<4 x float>, <4 x float>} @llvm.experimental.vector.deinterleave2.v8f32(<8 x float>)
-declare {<2 x double>, <2 x double>} @llvm.experimental.vector.deinterleave2.v4f64(<4 x double>)
+declare {<2 x half>,<2 x half>} @llvm.vector.deinterleave2.v4f16(<4 x half>)
+declare {<4 x half>, <4 x half>} @llvm.vector.deinterleave2.v8f16(<8 x half>)
+declare {<2 x float>, <2 x float>} @llvm.vector.deinterleave2.v4f32(<4 x float>)
+declare {<8 x half>, <8 x half>} @llvm.vector.deinterleave2.v16f16(<16 x half>)
+declare {<4 x float>, <4 x float>} @llvm.vector.deinterleave2.v8f32(<8 x float>)
+declare {<2 x double>, <2 x double>} @llvm.vector.deinterleave2.v4f64(<4 x double>)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll
index 6a712080fda7..8f4ff37fffb0 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll
@@ -24,7 +24,7 @@ define {<vscale x 16 x i1>, <vscale x 16 x i1>} @vector_deinterleave_load_nxv16i
; CHECK-NEXT: vmsne.vi v9, v10, 0
; CHECK-NEXT: ret
%vec = load <vscale x 32 x i1>, ptr %p
- %retval = call {<vscale x 16 x i1>, <vscale x 16 x i1>} @llvm.experimental.vector.deinterleave2.nxv32i1(<vscale x 32 x i1> %vec)
+ %retval = call {<vscale x 16 x i1>, <vscale x 16 x i1>} @llvm.vector.deinterleave2.nxv32i1(<vscale x 32 x i1> %vec)
ret {<vscale x 16 x i1>, <vscale x 16 x i1>} %retval
}
@@ -35,7 +35,7 @@ define {<vscale x 16 x i8>, <vscale x 16 x i8>} @vector_deinterleave_load_nxv16i
; CHECK-NEXT: vlseg2e8.v v8, (a0)
; CHECK-NEXT: ret
%vec = load <vscale x 32 x i8>, ptr %p
- %retval = call {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.experimental.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %vec)
+ %retval = call {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %vec)
ret {<vscale x 16 x i8>, <vscale x 16 x i8>} %retval
}
@@ -49,7 +49,7 @@ define {<vscale x 8 x i16>, <vscale x 8 x i16>} @vector_deinterleave_load_nxv8i1
; CHECK-NEXT: vnsrl.wi v10, v12, 16
; CHECK-NEXT: ret
%vec = load <vscale x 16 x i16>, ptr %p, align 1
- %retval = call {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.experimental.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> %vec)
+ %retval = call {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> %vec)
ret {<vscale x 8 x i16>, <vscale x 8 x i16>} %retval
}
@@ -60,7 +60,7 @@ define {<vscale x 8 x i16>, <vscale x 8 x i16>} @vector_deinterleave_load_nxv8i1
; CHECK-NEXT: vlseg2e16.v v8, (a0)
; CHECK-NEXT: ret
%vec = load <vscale x 16 x i16>, ptr %p
- %retval = call {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.experimental.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> %vec)
+ %retval = call {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> %vec)
ret {<vscale x 8 x i16>, <vscale x 8 x i16>} %retval
}
@@ -71,7 +71,7 @@ define {<vscale x 4 x i32>, <vscale x 4 x i32>} @vector_deinterleave_load_nxv4i3
; CHECK-NEXT: vlseg2e32.v v8, (a0)
; CHECK-NEXT: ret
%vec = load <vscale x 8 x i32>, ptr %p
- %retval = call {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %vec)
+ %retval = call {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %vec)
ret {<vscale x 4 x i32>, <vscale x 4 x i32>} %retval
}
@@ -82,7 +82,7 @@ define {<vscale x 2 x i64>, <vscale x 2 x i64>} @vector_deinterleave_load_nxv2i6
; CHECK-NEXT: vlseg2e64.v v8, (a0)
; CHECK-NEXT: ret
%vec = load <vscale x 4 x i64>, ptr %p
- %retval = call {<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.experimental.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> %vec)
+ %retval = call {<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> %vec)
ret {<vscale x 2 x i64>, <vscale x 2 x i64>} %retval
}
@@ -93,7 +93,7 @@ define {<vscale x 4 x i64>, <vscale x 4 x i64>} @vector_deinterleave_load_nxv4i6
; CHECK-NEXT: vlseg2e64.v v8, (a0)
; CHECK-NEXT: ret
%vec = load <vscale x 8 x i64>, ptr %p
- %retval = call {<vscale x 4 x i64>, <vscale x 4 x i64>} @llvm.experimental.vector.deinterleave2.nxv8i64(<vscale x 8 x i64> %vec)
+ %retval = call {<vscale x 4 x i64>, <vscale x 4 x i64>} @llvm.vector.deinterleave2.nxv8i64(<vscale x 8 x i64> %vec)
ret {<vscale x 4 x i64>, <vscale x 4 x i64>} %retval
}
@@ -171,17 +171,17 @@ define {<vscale x 8 x i64>, <vscale x 8 x i64>} @vector_deinterleave_load_nxv8i6
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
%vec = load <vscale x 16 x i64>, ptr %p
- %retval = call {<vscale x 8 x i64>, <vscale x 8 x i64>} @llvm.experimental.vector.deinterleave2.nxv16i64(<vscale x 16 x i64> %vec)
+ %retval = call {<vscale x 8 x i64>, <vscale x 8 x i64>} @llvm.vector.deinterleave2.nxv16i64(<vscale x 16 x i64> %vec)
ret {<vscale x 8 x i64>, <vscale x 8 x i64>} %retval
}
-declare {<vscale x 16 x i1>, <vscale x 16 x i1>} @llvm.experimental.vector.deinterleave2.nxv32i1(<vscale x 32 x i1>)
-declare {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.experimental.vector.deinterleave2.nxv32i8(<vscale x 32 x i8>)
-declare {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.experimental.vector.deinterleave2.nxv16i16(<vscale x 16 x i16>)
-declare {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32>)
-declare {<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.experimental.vector.deinterleave2.nxv4i64(<vscale x 4 x i64>)
-declare {<vscale x 4 x i64>, <vscale x 4 x i64>} @llvm.experimental.vector.deinterleave2.nxv8i64(<vscale x 8 x i64>)
-declare {<vscale x 8 x i64>, <vscale x 8 x i64>} @llvm.experimental.vector.deinterleave2.nxv16i64(<vscale x 16 x i64>)
+declare {<vscale x 16 x i1>, <vscale x 16 x i1>} @llvm.vector.deinterleave2.nxv32i1(<vscale x 32 x i1>)
+declare {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8>)
+declare {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16>)
+declare {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32>)
+declare {<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64>)
+declare {<vscale x 4 x i64>, <vscale x 4 x i64>} @llvm.vector.deinterleave2.nxv8i64(<vscale x 8 x i64>)
+declare {<vscale x 8 x i64>, <vscale x 8 x i64>} @llvm.vector.deinterleave2.nxv16i64(<vscale x 16 x i64>)
; Floats
@@ -192,7 +192,7 @@ define {<vscale x 2 x half>, <vscale x 2 x half>} @vector_deinterleave_load_nxv2
; CHECK-NEXT: vlseg2e16.v v8, (a0)
; CHECK-NEXT: ret
%vec = load <vscale x 4 x half>, ptr %p
- %retval = call {<vscale x 2 x half>, <vscale x 2 x half>} @llvm.experimental.vector.deinterleave2.nxv4f16(<vscale x 4 x half> %vec)
+ %retval = call {<vscale x 2 x half>, <vscale x 2 x half>} @llvm.vector.deinterleave2.nxv4f16(<vscale x 4 x half> %vec)
ret {<vscale x 2 x half>, <vscale x 2 x half>} %retval
}
@@ -203,7 +203,7 @@ define {<vscale x 4 x half>, <vscale x 4 x half>} @vector_deinterleave_load_nxv4
; CHECK-NEXT: vlseg2e16.v v8, (a0)
; CHECK-NEXT: ret
%vec = load <vscale x 8 x half>, ptr %p
- %retval = call {<vscale x 4 x half>, <vscale x 4 x half>} @llvm.experimental.vector.deinterleave2.nxv8f16(<vscale x 8 x half> %vec)
+ %retval = call {<vscale x 4 x half>, <vscale x 4 x half>} @llvm.vector.deinterleave2.nxv8f16(<vscale x 8 x half> %vec)
ret {<vscale x 4 x half>, <vscale x 4 x half>} %retval
}
@@ -214,7 +214,7 @@ define {<vscale x 2 x float>, <vscale x 2 x float>} @vector_deinterleave_load_nx
; CHECK-NEXT: vlseg2e32.v v8, (a0)
; CHECK-NEXT: ret
%vec = load <vscale x 4 x float>, ptr %p
- %retval = call {<vscale x 2 x float>, <vscale x 2 x float>} @llvm.experimental.vector.deinterleave2.nxv4f32(<vscale x 4 x float> %vec)
+ %retval = call {<vscale x 2 x float>, <vscale x 2 x float>} @llvm.vector.deinterleave2.nxv4f32(<vscale x 4 x float> %vec)
ret {<vscale x 2 x float>, <vscale x 2 x float>} %retval
}
@@ -225,7 +225,7 @@ define {<vscale x 8 x half>, <vscale x 8 x half>} @vector_deinterleave_load_nxv8
; CHECK-NEXT: vlseg2e16.v v8, (a0)
; CHECK-NEXT: ret
%vec = load <vscale x 16 x half>, ptr %p
- %retval = call {<vscale x 8 x half>, <vscale x 8 x half>} @llvm.experimental.vector.deinterleave2.nxv16f16(<vscale x 16 x half> %vec)
+ %retval = call {<vscale x 8 x half>, <vscale x 8 x half>} @llvm.vector.deinterleave2.nxv16f16(<vscale x 16 x half> %vec)
ret {<vscale x 8 x half>, <vscale x 8 x half>} %retval
}
@@ -236,7 +236,7 @@ define {<vscale x 4 x float>, <vscale x 4 x float>} @vector_deinterleave_load_nx
; CHECK-NEXT: vlseg2e32.v v8, (a0)
; CHECK-NEXT: ret
%vec = load <vscale x 8 x float>, ptr %p
- %retval = call {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.experimental.vector.deinterleave2.nxv8f32(<vscale x 8 x float> %vec)
+ %retval = call {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.vector.deinterleave2.nxv8f32(<vscale x 8 x float> %vec)
ret {<vscale x 4 x float>, <vscale x 4 x float>} %retval
}
@@ -247,13 +247,13 @@ define {<vscale x 2 x double>, <vscale x 2 x double>} @vector_deinterleave_load_
; CHECK-NEXT: vlseg2e64.v v8, (a0)
; CHECK-NEXT: ret
%vec = load <vscale x 4 x double>, ptr %p
- %retval = call {<vscale x 2 x double>, <vscale x 2 x double>} @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %vec)
+ %retval = call {<vscale x 2 x double>, <vscale x 2 x double>} @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %vec)
ret {<vscale x 2 x double>, <vscale x 2 x double>} %retval
}
-declare {<vscale x 2 x half>,<vscale x 2 x half>} @llvm.experimental.vector.deinterleave2.nxv4f16(<vscale x 4 x half>)
-declare {<vscale x 4 x half>, <vscale x 4 x half>} @llvm.experimental.vector.deinterleave2.nxv8f16(<vscale x 8 x half>)
-declare {<vscale x 2 x float>, <vscale x 2 x float>} @llvm.experimental.vector.deinterleave2.nxv4f32(<vscale x 4 x float>)
-declare {<vscale x 8 x half>, <vscale x 8 x half>} @llvm.experimental.vector.deinterleave2.nxv16f16(<vscale x 16 x half>)
-declare {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.experimental.vector.deinterleave2.nxv8f32(<vscale x 8 x float>)
-declare {<vscale x 2 x double>, <vscale x 2 x double>} @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
+declare {<vscale x 2 x half>,<vscale x 2 x half>} @llvm.vector.deinterleave2.nxv4f16(<vscale x 4 x half>)
+declare {<vscale x 4 x half>, <vscale x 4 x half>} @llvm.vector.deinterleave2.nxv8f16(<vscale x 8 x half>)
+declare {<vscale x 2 x float>, <vscale x 2 x float>} @llvm.vector.deinterleave2.nxv4f32(<vscale x 4 x float>)
+declare {<vscale x 8 x half>, <vscale x 8 x half>} @llvm.vector.deinterleave2.nxv16f16(<vscale x 16 x half>)
+declare {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.vector.deinterleave2.nxv8f32(<vscale x 8 x float>)
+declare {<vscale x 2 x double>, <vscale x 2 x double>} @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll
index d98597fabcd9..7797577362c9 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll
@@ -21,7 +21,7 @@ define {<vscale x 16 x i1>, <vscale x 16 x i1>} @vector_deinterleave_nxv16i1_nxv
; CHECK-NEXT: vnsrl.wi v10, v12, 8
; CHECK-NEXT: vmsne.vi v9, v10, 0
; CHECK-NEXT: ret
-%retval = call {<vscale x 16 x i1>, <vscale x 16 x i1>} @llvm.experimental.vector.deinterleave2.nxv32i1(<vscale x 32 x i1> %vec)
+%retval = call {<vscale x 16 x i1>, <vscale x 16 x i1>} @llvm.vector.deinterleave2.nxv32i1(<vscale x 32 x i1> %vec)
ret {<vscale x 16 x i1>, <vscale x 16 x i1>} %retval
}
@@ -34,7 +34,7 @@ define {<vscale x 16 x i8>, <vscale x 16 x i8>} @vector_deinterleave_nxv16i8_nxv
; CHECK-NEXT: vmv.v.v v8, v12
; CHECK-NEXT: vmv.v.v v10, v14
; CHECK-NEXT: ret
-%retval = call {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.experimental.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %vec)
+%retval = call {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %vec)
ret {<vscale x 16 x i8>, <vscale x 16 x i8>} %retval
}
@@ -47,7 +47,7 @@ define {<vscale x 8 x i16>, <vscale x 8 x i16>} @vector_deinterleave_nxv8i16_nxv
; CHECK-NEXT: vmv.v.v v8, v12
; CHECK-NEXT: vmv.v.v v10, v14
; CHECK-NEXT: ret
-%retval = call {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.experimental.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> %vec)
+%retval = call {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> %vec)
ret {<vscale x 8 x i16>, <vscale x 8 x i16>} %retval
}
@@ -61,7 +61,7 @@ define {<vscale x 4 x i32>, <vscale x 4 x i32>} @vector_deinterleave_nxv4i32_nxv
; CHECK-NEXT: vmv.v.v v8, v14
; CHECK-NEXT: vmv.v.v v10, v12
; CHECK-NEXT: ret
-%retval = call {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %vec)
+%retval = call {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %vec)
ret {<vscale x 4 x i32>, <vscale x 4 x i32>} %retval
}
@@ -77,15 +77,15 @@ define {<vscale x 2 x i64>, <vscale x 2 x i64>} @vector_deinterleave_nxv2i64_nxv
; CHECK-NEXT: vmv2r.v v8, v12
; CHECK-NEXT: vmv2r.v v10, v20
; CHECK-NEXT: ret
-%retval = call {<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.experimental.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> %vec)
+%retval = call {<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> %vec)
ret {<vscale x 2 x i64>, <vscale x 2 x i64>} %retval
}
-declare {<vscale x 16 x i1>, <vscale x 16 x i1>} @llvm.experimental.vector.deinterleave2.nxv32i1(<vscale x 32 x i1>)
-declare {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.experimental.vector.deinterleave2.nxv32i8(<vscale x 32 x i8>)
-declare {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.experimental.vector.deinterleave2.nxv16i16(<vscale x 16 x i16>)
-declare {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32>)
-declare {<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.experimental.vector.deinterleave2.nxv4i64(<vscale x 4 x i64>)
+declare {<vscale x 16 x i1>, <vscale x 16 x i1>} @llvm.vector.deinterleave2.nxv32i1(<vscale x 32 x i1>)
+declare {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8>)
+declare {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16>)
+declare {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32>)
+declare {<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64>)
define {<vscale x 64 x i1>, <vscale x 64 x i1>} @vector_deinterleave_nxv64i1_nxv128i1(<vscale x 128 x i1> %vec) {
; CHECK-LABEL: vector_deinterleave_nxv64i1_nxv128i1:
@@ -110,7 +110,7 @@ define {<vscale x 64 x i1>, <vscale x 64 x i1>} @vector_deinterleave_nxv64i1_nxv
; CHECK-NEXT: vmsne.vi v9, v24, 0
; CHECK-NEXT: vmv1r.v v8, v7
; CHECK-NEXT: ret
-%retval = call {<vscale x 64 x i1>, <vscale x 64 x i1>} @llvm.experimental.vector.deinterleave2.nxv128i1(<vscale x 128 x i1> %vec)
+%retval = call {<vscale x 64 x i1>, <vscale x 64 x i1>} @llvm.vector.deinterleave2.nxv128i1(<vscale x 128 x i1> %vec)
ret {<vscale x 64 x i1>, <vscale x 64 x i1>} %retval
}
@@ -125,7 +125,7 @@ define {<vscale x 64 x i8>, <vscale x 64 x i8>} @vector_deinterleave_nxv64i8_nxv
; CHECK-NEXT: vnsrl.wi v4, v16, 8
; CHECK-NEXT: vmv8r.v v16, v0
; CHECK-NEXT: ret
-%retval = call {<vscale x 64 x i8>, <vscale x 64 x i8>} @llvm.experimental.vector.deinterleave2.nxv128i8(<vscale x 128 x i8> %vec)
+%retval = call {<vscale x 64 x i8>, <vscale x 64 x i8>} @llvm.vector.deinterleave2.nxv128i8(<vscale x 128 x i8> %vec)
ret {<vscale x 64 x i8>, <vscale x 64 x i8>} %retval
}
@@ -140,7 +140,7 @@ define {<vscale x 32 x i16>, <vscale x 32 x i16>} @vector_deinterleave_nxv32i16_
; CHECK-NEXT: vnsrl.wi v4, v16, 16
; CHECK-NEXT: vmv8r.v v16, v0
; CHECK-NEXT: ret
-%retval = call {<vscale x 32 x i16>, <vscale x 32 x i16>} @llvm.experimental.vector.deinterleave2.nxv64i16(<vscale x 64 x i16> %vec)
+%retval = call {<vscale x 32 x i16>, <vscale x 32 x i16>} @llvm.vector.deinterleave2.nxv64i16(<vscale x 64 x i16> %vec)
ret {<vscale x 32 x i16>, <vscale x 32 x i16>} %retval
}
@@ -156,7 +156,7 @@ define {<vscale x 16 x i32>, <vscale x 16 x i32>} @vector_deinterleave_nxv16i32_
; CHECK-NEXT: vnsrl.wi v4, v24, 0
; CHECK-NEXT: vmv8r.v v8, v0
; CHECK-NEXT: ret
-%retval = call {<vscale x 16 x i32>, <vscale x 16 x i32>} @llvm.experimental.vector.deinterleave2.nxv32i32(<vscale x 32 x i32> %vec)
+%retval = call {<vscale x 16 x i32>, <vscale x 16 x i32>} @llvm.vector.deinterleave2.nxv32i32(<vscale x 32 x i32> %vec)
ret {<vscale x 16 x i32>, <vscale x 16 x i32>} %retval
}
@@ -229,15 +229,15 @@ define {<vscale x 8 x i64>, <vscale x 8 x i64>} @vector_deinterleave_nxv8i64_nxv
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
-%retval = call {<vscale x 8 x i64>, <vscale x 8 x i64>} @llvm.experimental.vector.deinterleave2.nxv16i64(<vscale x 16 x i64> %vec)
+%retval = call {<vscale x 8 x i64>, <vscale x 8 x i64>} @llvm.vector.deinterleave2.nxv16i64(<vscale x 16 x i64> %vec)
ret {<vscale x 8 x i64>, <vscale x 8 x i64>} %retval
}
-declare {<vscale x 64 x i1>, <vscale x 64 x i1>} @llvm.experimental.vector.deinterleave2.nxv128i1(<vscale x 128 x i1>)
-declare {<vscale x 64 x i8>, <vscale x 64 x i8>} @llvm.experimental.vector.deinterleave2.nxv128i8(<vscale x 128 x i8>)
-declare {<vscale x 32 x i16>, <vscale x 32 x i16>} @llvm.experimental.vector.deinterleave2.nxv64i16(<vscale x 64 x i16>)
-declare {<vscale x 16 x i32>, <vscale x 16 x i32>} @llvm.experimental.vector.deinterleave2.nxv32i32(<vscale x 32 x i32>)
-declare {<vscale x 8 x i64>, <vscale x 8 x i64>} @llvm.experimental.vector.deinterleave2.nxv16i64(<vscale x 16 x i64>)
+declare {<vscale x 64 x i1>, <vscale x 64 x i1>} @llvm.vector.deinterleave2.nxv128i1(<vscale x 128 x i1>)
+declare {<vscale x 64 x i8>, <vscale x 64 x i8>} @llvm.vector.deinterleave2.nxv128i8(<vscale x 128 x i8>)
+declare {<vscale x 32 x i16>, <vscale x 32 x i16>} @llvm.vector.deinterleave2.nxv64i16(<vscale x 64 x i16>)
+declare {<vscale x 16 x i32>, <vscale x 16 x i32>} @llvm.vector.deinterleave2.nxv32i32(<vscale x 32 x i32>)
+declare {<vscale x 8 x i64>, <vscale x 8 x i64>} @llvm.vector.deinterleave2.nxv16i64(<vscale x 16 x i64>)
; Floats
@@ -249,7 +249,7 @@ define {<vscale x 2 x half>, <vscale x 2 x half>} @vector_deinterleave_nxv2f16_n
; CHECK-NEXT: vnsrl.wi v9, v8, 16
; CHECK-NEXT: vmv1r.v v8, v10
; CHECK-NEXT: ret
-%retval = call {<vscale x 2 x half>, <vscale x 2 x half>} @llvm.experimental.vector.deinterleave2.nxv4f16(<vscale x 4 x half> %vec)
+%retval = call {<vscale x 2 x half>, <vscale x 2 x half>} @llvm.vector.deinterleave2.nxv4f16(<vscale x 4 x half> %vec)
ret {<vscale x 2 x half>, <vscale x 2 x half>} %retval
}
@@ -262,7 +262,7 @@ define {<vscale x 4 x half>, <vscale x 4 x half>} @vector_deinterleave_nxv4f16_n
; CHECK-NEXT: vmv.v.v v8, v10
; CHECK-NEXT: vmv.v.v v9, v11
; CHECK-NEXT: ret
-%retval = call {<vscale x 4 x half>, <vscale x 4 x half>} @llvm.experimental.vector.deinterleave2.nxv8f16(<vscale x 8 x half> %vec)
+%retval = call {<vscale x 4 x half>, <vscale x 4 x half>} @llvm.vector.deinterleave2.nxv8f16(<vscale x 8 x half> %vec)
ret {<vscale x 4 x half>, <vscale x 4 x half>} %retval
}
@@ -276,7 +276,7 @@ define {<vscale x 2 x float>, <vscale x 2 x float>} @vector_deinterleave_nxv2f32
; CHECK-NEXT: vmv.v.v v8, v11
; CHECK-NEXT: vmv.v.v v9, v10
; CHECK-NEXT: ret
-%retval = call {<vscale x 2 x float>, <vscale x 2 x float>} @llvm.experimental.vector.deinterleave2.nxv4f32(<vscale x 4 x float> %vec)
+%retval = call {<vscale x 2 x float>, <vscale x 2 x float>} @llvm.vector.deinterleave2.nxv4f32(<vscale x 4 x float> %vec)
ret {<vscale x 2 x float>, <vscale x 2 x float>} %retval
}
@@ -289,7 +289,7 @@ define {<vscale x 8 x half>, <vscale x 8 x half>} @vector_deinterleave_nxv8f16_n
; CHECK-NEXT: vmv.v.v v8, v12
; CHECK-NEXT: vmv.v.v v10, v14
; CHECK-NEXT: ret
-%retval = call {<vscale x 8 x half>, <vscale x 8 x half>} @llvm.experimental.vector.deinterleave2.nxv16f16(<vscale x 16 x half> %vec)
+%retval = call {<vscale x 8 x half>, <vscale x 8 x half>} @llvm.vector.deinterleave2.nxv16f16(<vscale x 16 x half> %vec)
ret {<vscale x 8 x half>, <vscale x 8 x half>} %retval
}
@@ -303,7 +303,7 @@ define {<vscale x 4 x float>, <vscale x 4 x float>} @vector_deinterleave_nxv4f32
; CHECK-NEXT: vmv.v.v v8, v14
; CHECK-NEXT: vmv.v.v v10, v12
; CHECK-NEXT: ret
-%retval = call {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.experimental.vector.deinterleave2.nxv8f32(<vscale x 8 x float> %vec)
+%retval = call {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.vector.deinterleave2.nxv8f32(<vscale x 8 x float> %vec)
ret {<vscale x 4 x float>, <vscale x 4 x float>} %retval
}
@@ -319,16 +319,16 @@ define {<vscale x 2 x double>, <vscale x 2 x double>} @vector_deinterleave_nxv2f
; CHECK-NEXT: vmv2r.v v8, v12
; CHECK-NEXT: vmv2r.v v10, v20
; CHECK-NEXT: ret
-%retval = call {<vscale x 2 x double>, <vscale x 2 x double>} @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %vec)
+%retval = call {<vscale x 2 x double>, <vscale x 2 x double>} @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %vec)
ret {<vscale x 2 x double>, <vscale x 2 x double>} %retval
}
-declare {<vscale x 2 x half>,<vscale x 2 x half>} @llvm.experimental.vector.deinterleave2.nxv4f16(<vscale x 4 x half>)
-declare {<vscale x 4 x half>, <vscale x 4 x half>} @llvm.experimental.vector.deinterleave2.nxv8f16(<vscale x 8 x half>)
-declare {<vscale x 2 x float>, <vscale x 2 x float>} @llvm.experimental.vector.deinterleave2.nxv4f32(<vscale x 4 x float>)
-declare {<vscale x 8 x half>, <vscale x 8 x half>} @llvm.experimental.vector.deinterleave2.nxv16f16(<vscale x 16 x half>)
-declare {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.experimental.vector.deinterleave2.nxv8f32(<vscale x 8 x float>)
-declare {<vscale x 2 x double>, <vscale x 2 x double>} @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
+declare {<vscale x 2 x half>,<vscale x 2 x half>} @llvm.vector.deinterleave2.nxv4f16(<vscale x 4 x half>)
+declare {<vscale x 4 x half>, <vscale x 4 x half>} @llvm.vector.deinterleave2.nxv8f16(<vscale x 8 x half>)
+declare {<vscale x 2 x float>, <vscale x 2 x float>} @llvm.vector.deinterleave2.nxv4f32(<vscale x 4 x float>)
+declare {<vscale x 8 x half>, <vscale x 8 x half>} @llvm.vector.deinterleave2.nxv16f16(<vscale x 16 x half>)
+declare {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.vector.deinterleave2.nxv8f32(<vscale x 8 x float>)
+declare {<vscale x 2 x double>, <vscale x 2 x double>} @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
define {<vscale x 32 x half>, <vscale x 32 x half>} @vector_deinterleave_nxv32f16_nxv64f16(<vscale x 64 x half> %vec) {
; CHECK-LABEL: vector_deinterleave_nxv32f16_nxv64f16:
@@ -341,7 +341,7 @@ define {<vscale x 32 x half>, <vscale x 32 x half>} @vector_deinterleave_nxv32f1
; CHECK-NEXT: vnsrl.wi v4, v16, 16
; CHECK-NEXT: vmv8r.v v16, v0
; CHECK-NEXT: ret
-%retval = call {<vscale x 32 x half>, <vscale x 32 x half>} @llvm.experimental.vector.deinterleave2.nxv64f16(<vscale x 64 x half> %vec)
+%retval = call {<vscale x 32 x half>, <vscale x 32 x half>} @llvm.vector.deinterleave2.nxv64f16(<vscale x 64 x half> %vec)
ret {<vscale x 32 x half>, <vscale x 32 x half>} %retval
}
@@ -357,7 +357,7 @@ define {<vscale x 16 x float>, <vscale x 16 x float>} @vector_deinterleave_nxv16
; CHECK-NEXT: vnsrl.wi v4, v24, 0
; CHECK-NEXT: vmv8r.v v8, v0
; CHECK-NEXT: ret
-%retval = call {<vscale x 16 x float>, <vscale x 16 x float>} @llvm.experimental.vector.deinterleave2.nxv32f32(<vscale x 32 x float> %vec)
+%retval = call {<vscale x 16 x float>, <vscale x 16 x float>} @llvm.vector.deinterleave2.nxv32f32(<vscale x 32 x float> %vec)
ret {<vscale x 16 x float>, <vscale x 16 x float>} %retval
}
@@ -430,10 +430,10 @@ define {<vscale x 8 x double>, <vscale x 8 x double>} @vector_deinterleave_nxv8f
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
-%retval = call {<vscale x 8 x double>, <vscale x 8 x double>} @llvm.experimental.vector.deinterleave2.nxv16f64(<vscale x 16 x double> %vec)
+%retval = call {<vscale x 8 x double>, <vscale x 8 x double>} @llvm.vector.deinterleave2.nxv16f64(<vscale x 16 x double> %vec)
ret {<vscale x 8 x double>, <vscale x 8 x double>} %retval
}
-declare {<vscale x 32 x half>, <vscale x 32 x half>} @llvm.experimental.vector.deinterleave2.nxv64f16(<vscale x 64 x half>)
-declare {<vscale x 16 x float>, <vscale x 16 x float>} @llvm.experimental.vector.deinterleave2.nxv32f32(<vscale x 32 x float>)
-declare {<vscale x 8 x double>, <vscale x 8 x double>} @llvm.experimental.vector.deinterleave2.nxv16f64(<vscale x 16 x double>)
+declare {<vscale x 32 x half>, <vscale x 32 x half>} @llvm.vector.deinterleave2.nxv64f16(<vscale x 64 x half>)
+declare {<vscale x 16 x float>, <vscale x 16 x float>} @llvm.vector.deinterleave2.nxv32f32(<vscale x 32 x float>)
+declare {<vscale x 8 x double>, <vscale x 8 x double>} @llvm.vector.deinterleave2.nxv16f64(<vscale x 16 x double>)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll
index 6ebe8e095469..99872c199a1e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll
@@ -41,7 +41,7 @@ define <32 x i1> @vector_interleave_v32i1_v16i1(<16 x i1> %a, <16 x i1> %b) {
; ZVBB-NEXT: vsetvli zero, a0, e8, m2, ta, ma
; ZVBB-NEXT: vmsne.vi v0, v12, 0
; ZVBB-NEXT: ret
- %res = call <32 x i1> @llvm.experimental.vector.interleave2.v32i1(<16 x i1> %a, <16 x i1> %b)
+ %res = call <32 x i1> @llvm.vector.interleave2.v32i1(<16 x i1> %a, <16 x i1> %b)
ret <32 x i1> %res
}
@@ -62,7 +62,7 @@ define <16 x i16> @vector_interleave_v16i16_v8i16(<8 x i16> %a, <8 x i16> %b) {
; ZVBB-NEXT: vwaddu.wv v10, v10, v8
; ZVBB-NEXT: vmv2r.v v8, v10
; ZVBB-NEXT: ret
- %res = call <16 x i16> @llvm.experimental.vector.interleave2.v16i16(<8 x i16> %a, <8 x i16> %b)
+ %res = call <16 x i16> @llvm.vector.interleave2.v16i16(<8 x i16> %a, <8 x i16> %b)
ret <16 x i16> %res
}
@@ -84,7 +84,7 @@ define <8 x i32> @vector_interleave_v8i32_v4i32(<4 x i32> %a, <4 x i32> %b) {
; ZVBB-NEXT: vwaddu.wv v10, v10, v8
; ZVBB-NEXT: vmv2r.v v8, v10
; ZVBB-NEXT: ret
- %res = call <8 x i32> @llvm.experimental.vector.interleave2.v8i32(<4 x i32> %a, <4 x i32> %b)
+ %res = call <8 x i32> @llvm.vector.interleave2.v8i32(<4 x i32> %a, <4 x i32> %b)
ret <8 x i32> %res
}
@@ -118,14 +118,14 @@ define <4 x i64> @vector_interleave_v4i64_v2i64(<2 x i64> %a, <2 x i64> %b) {
; ZVBB-NEXT: vrgatherei16.vv v10, v8, v12
; ZVBB-NEXT: vmv.v.v v8, v10
; ZVBB-NEXT: ret
- %res = call <4 x i64> @llvm.experimental.vector.interleave2.v4i64(<2 x i64> %a, <2 x i64> %b)
+ %res = call <4 x i64> @llvm.vector.interleave2.v4i64(<2 x i64> %a, <2 x i64> %b)
ret <4 x i64> %res
}
-declare <32 x i1> @llvm.experimental.vector.interleave2.v32i1(<16 x i1>, <16 x i1>)
-declare <16 x i16> @llvm.experimental.vector.interleave2.v16i16(<8 x i16>, <8 x i16>)
-declare <8 x i32> @llvm.experimental.vector.interleave2.v8i32(<4 x i32>, <4 x i32>)
-declare <4 x i64> @llvm.experimental.vector.interleave2.v4i64(<2 x i64>, <2 x i64>)
+declare <32 x i1> @llvm.vector.interleave2.v32i1(<16 x i1>, <16 x i1>)
+declare <16 x i16> @llvm.vector.interleave2.v16i16(<8 x i16>, <8 x i16>)
+declare <8 x i32> @llvm.vector.interleave2.v8i32(<4 x i32>, <4 x i32>)
+declare <4 x i64> @llvm.vector.interleave2.v4i64(<2 x i64>, <2 x i64>)
; Floats
@@ -146,7 +146,7 @@ define <4 x half> @vector_interleave_v4f16_v2f16(<2 x half> %a, <2 x half> %b) {
; ZVBB-NEXT: vwaddu.wv v10, v10, v8
; ZVBB-NEXT: vmv1r.v v8, v10
; ZVBB-NEXT: ret
- %res = call <4 x half> @llvm.experimental.vector.interleave2.v4f16(<2 x half> %a, <2 x half> %b)
+ %res = call <4 x half> @llvm.vector.interleave2.v4f16(<2 x half> %a, <2 x half> %b)
ret <4 x half> %res
}
@@ -167,7 +167,7 @@ define <8 x half> @vector_interleave_v8f16_v4f16(<4 x half> %a, <4 x half> %b) {
; ZVBB-NEXT: vwaddu.wv v10, v10, v8
; ZVBB-NEXT: vmv1r.v v8, v10
; ZVBB-NEXT: ret
- %res = call <8 x half> @llvm.experimental.vector.interleave2.v8f16(<4 x half> %a, <4 x half> %b)
+ %res = call <8 x half> @llvm.vector.interleave2.v8f16(<4 x half> %a, <4 x half> %b)
ret <8 x half> %res
}
@@ -189,7 +189,7 @@ define <4 x float> @vector_interleave_v4f32_v2f32(<2 x float> %a, <2 x float> %b
; ZVBB-NEXT: vwaddu.wv v10, v10, v8
; ZVBB-NEXT: vmv1r.v v8, v10
; ZVBB-NEXT: ret
- %res = call <4 x float> @llvm.experimental.vector.interleave2.v4f32(<2 x float> %a, <2 x float> %b)
+ %res = call <4 x float> @llvm.vector.interleave2.v4f32(<2 x float> %a, <2 x float> %b)
ret <4 x float> %res
}
@@ -210,7 +210,7 @@ define <16 x half> @vector_interleave_v16f16_v8f16(<8 x half> %a, <8 x half> %b)
; ZVBB-NEXT: vwaddu.wv v10, v10, v8
; ZVBB-NEXT: vmv2r.v v8, v10
; ZVBB-NEXT: ret
- %res = call <16 x half> @llvm.experimental.vector.interleave2.v16f16(<8 x half> %a, <8 x half> %b)
+ %res = call <16 x half> @llvm.vector.interleave2.v16f16(<8 x half> %a, <8 x half> %b)
ret <16 x half> %res
}
@@ -232,7 +232,7 @@ define <8 x float> @vector_interleave_v8f32_v4f32(<4 x float> %a, <4 x float> %b
; ZVBB-NEXT: vwaddu.wv v10, v10, v8
; ZVBB-NEXT: vmv2r.v v8, v10
; ZVBB-NEXT: ret
- %res = call <8 x float> @llvm.experimental.vector.interleave2.v8f32(<4 x float> %a, <4 x float> %b)
+ %res = call <8 x float> @llvm.vector.interleave2.v8f32(<4 x float> %a, <4 x float> %b)
ret <8 x float> %res
}
@@ -266,17 +266,17 @@ define <4 x double> @vector_interleave_v4f64_v2f64(<2 x double> %a, <2 x double>
; ZVBB-NEXT: vrgatherei16.vv v10, v8, v12
; ZVBB-NEXT: vmv.v.v v8, v10
; ZVBB-NEXT: ret
- %res = call <4 x double> @llvm.experimental.vector.interleave2.v4f64(<2 x double> %a, <2 x double> %b)
+ %res = call <4 x double> @llvm.vector.interleave2.v4f64(<2 x double> %a, <2 x double> %b)
ret <4 x double> %res
}
-declare <4 x half> @llvm.experimental.vector.interleave2.v4f16(<2 x half>, <2 x half>)
-declare <8 x half> @llvm.experimental.vector.interleave2.v8f16(<4 x half>, <4 x half>)
-declare <4 x float> @llvm.experimental.vector.interleave2.v4f32(<2 x float>, <2 x float>)
-declare <16 x half> @llvm.experimental.vector.interleave2.v16f16(<8 x half>, <8 x half>)
-declare <8 x float> @llvm.experimental.vector.interleave2.v8f32(<4 x float>, <4 x float>)
-declare <4 x double> @llvm.experimental.vector.interleave2.v4f64(<2 x double>, <2 x double>)
+declare <4 x half> @llvm.vector.interleave2.v4f16(<2 x half>, <2 x half>)
+declare <8 x half> @llvm.vector.interleave2.v8f16(<4 x half>, <4 x half>)
+declare <4 x float> @llvm.vector.interleave2.v4f32(<2 x float>, <2 x float>)
+declare <16 x half> @llvm.vector.interleave2.v16f16(<8 x half>, <8 x half>)
+declare <8 x float> @llvm.vector.interleave2.v8f32(<4 x float>, <4 x float>)
+declare <4 x double> @llvm.vector.interleave2.v4f64(<2 x double>, <2 x double>)
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; RV32: {{.*}}
; RV64: {{.*}}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll
index 922692ed88c9..7ade47e60bc6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll
@@ -27,7 +27,7 @@ define void @vector_interleave_store_nxv32i1_nxv16i1(<vscale x 16 x i1> %a, <vsc
; CHECK-NEXT: vsetvli a1, zero, e8, m4, ta, ma
; CHECK-NEXT: vsm.v v9, (a0)
; CHECK-NEXT: ret
- %res = call <vscale x 32 x i1> @llvm.experimental.vector.interleave2.nxv32i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b)
+ %res = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b)
store <vscale x 32 x i1> %res, ptr %p
ret void
}
@@ -42,7 +42,7 @@ define void @vector_interleave_store_nxv16i16_nxv8i16_align1(<vscale x 8 x i16>
; CHECK-NEXT: vwmaccu.vx v12, a1, v10
; CHECK-NEXT: vs4r.v v12, (a0)
; CHECK-NEXT: ret
- %res = call <vscale x 16 x i16> @llvm.experimental.vector.interleave2.nxv16i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b)
+ %res = call <vscale x 16 x i16> @llvm.vector.interleave2.nxv16i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b)
store <vscale x 16 x i16> %res, ptr %p, align 1
ret void
}
@@ -53,7 +53,7 @@ define void @vector_interleave_store_nxv16i16_nxv8i16(<vscale x 8 x i16> %a, <vs
; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
; CHECK-NEXT: vsseg2e16.v v8, (a0)
; CHECK-NEXT: ret
- %res = call <vscale x 16 x i16> @llvm.experimental.vector.interleave2.nxv16i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b)
+ %res = call <vscale x 16 x i16> @llvm.vector.interleave2.nxv16i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b)
store <vscale x 16 x i16> %res, ptr %p
ret void
}
@@ -64,7 +64,7 @@ define void @vector_interleave_store_nxv8i32_nxv4i32(<vscale x 4 x i32> %a, <vsc
; CHECK-NEXT: vsetvli a1, zero, e32, m2, ta, ma
; CHECK-NEXT: vsseg2e32.v v8, (a0)
; CHECK-NEXT: ret
- %res = call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
+ %res = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
store <vscale x 8 x i32> %res, ptr %p
ret void
}
@@ -75,7 +75,7 @@ define void @vector_interleave_store_nxv4i64_nxv2i64(<vscale x 2 x i64> %a, <vsc
; CHECK-NEXT: vsetvli a1, zero, e64, m2, ta, ma
; CHECK-NEXT: vsseg2e64.v v8, (a0)
; CHECK-NEXT: ret
- %res = call <vscale x 4 x i64> @llvm.experimental.vector.interleave2.nxv4i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b)
+ %res = call <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b)
store <vscale x 4 x i64> %res, ptr %p
ret void
}
@@ -86,7 +86,7 @@ define void @vector_interleave_store_nxv8i64_nxv4i64(<vscale x 4 x i64> %a, <vsc
; CHECK-NEXT: vsetvli a1, zero, e64, m4, ta, ma
; CHECK-NEXT: vsseg2e64.v v8, (a0)
; CHECK-NEXT: ret
- %res = call <vscale x 8 x i64> @llvm.experimental.vector.interleave2.nxv8i64(<vscale x 4 x i64> %a, <vscale x 4 x i64> %b)
+ %res = call <vscale x 8 x i64> @llvm.vector.interleave2.nxv8i64(<vscale x 4 x i64> %a, <vscale x 4 x i64> %b)
store <vscale x 8 x i64> %res, ptr %p
ret void
}
@@ -138,17 +138,17 @@ define void @vector_interleave_store_nxv16i64_nxv8i64(<vscale x 8 x i64> %a, <vs
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
- %res = call <vscale x 16 x i64> @llvm.experimental.vector.interleave2.nxv16i64(<vscale x 8 x i64> %a, <vscale x 8 x i64> %b)
+ %res = call <vscale x 16 x i64> @llvm.vector.interleave2.nxv16i64(<vscale x 8 x i64> %a, <vscale x 8 x i64> %b)
store <vscale x 16 x i64> %res, ptr %p
ret void
}
-declare <vscale x 32 x i1> @llvm.experimental.vector.interleave2.nxv32i1(<vscale x 16 x i1>, <vscale x 16 x i1>)
-declare <vscale x 16 x i16> @llvm.experimental.vector.interleave2.nxv16i16(<vscale x 8 x i16>, <vscale x 8 x i16>)
-declare <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
-declare <vscale x 4 x i64> @llvm.experimental.vector.interleave2.nxv4i64(<vscale x 2 x i64>, <vscale x 2 x i64>)
-declare <vscale x 8 x i64> @llvm.experimental.vector.interleave2.nxv8i64(<vscale x 4 x i64>, <vscale x 4 x i64>)
-declare <vscale x 16 x i64> @llvm.experimental.vector.interleave2.nxv16i64(<vscale x 8 x i64>, <vscale x 8 x i64>)
+declare <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1>, <vscale x 16 x i1>)
+declare <vscale x 16 x i16> @llvm.vector.interleave2.nxv16i16(<vscale x 8 x i16>, <vscale x 8 x i16>)
+declare <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64>, <vscale x 2 x i64>)
+declare <vscale x 8 x i64> @llvm.vector.interleave2.nxv8i64(<vscale x 4 x i64>, <vscale x 4 x i64>)
+declare <vscale x 16 x i64> @llvm.vector.interleave2.nxv16i64(<vscale x 8 x i64>, <vscale x 8 x i64>)
; Floats
@@ -158,7 +158,7 @@ define void @vector_interleave_store_nxv4f16_nxv2f16(<vscale x 2 x half> %a, <vs
; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
; CHECK-NEXT: vsseg2e16.v v8, (a0)
; CHECK-NEXT: ret
- %res = call <vscale x 4 x half> @llvm.experimental.vector.interleave2.nxv4f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b)
+ %res = call <vscale x 4 x half> @llvm.vector.interleave2.nxv4f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b)
store <vscale x 4 x half> %res, ptr %p
ret void
}
@@ -169,7 +169,7 @@ define void @vector_interleave_store_nxv8f16_nxv4f16(<vscale x 4 x half> %a, <vs
; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
; CHECK-NEXT: vsseg2e16.v v8, (a0)
; CHECK-NEXT: ret
- %res = call <vscale x 8 x half> @llvm.experimental.vector.interleave2.nxv8f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b)
+ %res = call <vscale x 8 x half> @llvm.vector.interleave2.nxv8f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b)
store <vscale x 8 x half> %res, ptr %p
ret void
}
@@ -180,7 +180,7 @@ define void @vector_interleave_store_nxv4f32_nxv2f32(<vscale x 2 x float> %a, <v
; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma
; CHECK-NEXT: vsseg2e32.v v8, (a0)
; CHECK-NEXT: ret
- %res = call <vscale x 4 x float> @llvm.experimental.vector.interleave2.nxv4f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b)
+ %res = call <vscale x 4 x float> @llvm.vector.interleave2.nxv4f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b)
store <vscale x 4 x float> %res, ptr %p
ret void
}
@@ -191,7 +191,7 @@ define void @vector_interleave_store_nxv16f16_nxv8f16(<vscale x 8 x half> %a, <v
; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
; CHECK-NEXT: vsseg2e16.v v8, (a0)
; CHECK-NEXT: ret
- %res = call <vscale x 16 x half> @llvm.experimental.vector.interleave2.nxv16f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b)
+ %res = call <vscale x 16 x half> @llvm.vector.interleave2.nxv16f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b)
store <vscale x 16 x half> %res, ptr %p
ret void
}
@@ -202,7 +202,7 @@ define void @vector_interleave_store_nxv8f32_nxv4f32(<vscale x 4 x float> %a, <v
; CHECK-NEXT: vsetvli a1, zero, e32, m2, ta, ma
; CHECK-NEXT: vsseg2e32.v v8, (a0)
; CHECK-NEXT: ret
- %res = call <vscale x 8 x float> @llvm.experimental.vector.interleave2.nxv8f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b)
+ %res = call <vscale x 8 x float> @llvm.vector.interleave2.nxv8f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b)
store <vscale x 8 x float> %res, ptr %p
ret void
}
@@ -213,15 +213,15 @@ define void @vector_interleave_store_nxv4f64_nxv2f64(<vscale x 2 x double> %a, <
; CHECK-NEXT: vsetvli a1, zero, e64, m2, ta, ma
; CHECK-NEXT: vsseg2e64.v v8, (a0)
; CHECK-NEXT: ret
- %res = call <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b)
+ %res = call <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b)
store <vscale x 4 x double> %res, ptr %p
ret void
}
-declare <vscale x 4 x half> @llvm.experimental.vector.interleave2.nxv4f16(<vscale x 2 x half>, <vscale x 2 x half>)
-declare <vscale x 8 x half> @llvm.experimental.vector.interleave2.nxv8f16(<vscale x 4 x half>, <vscale x 4 x half>)
-declare <vscale x 4 x float> @llvm.experimental.vector.interleave2.nxv4f32(<vscale x 2 x float>, <vscale x 2 x float>)
-declare <vscale x 16 x half> @llvm.experimental.vector.interleave2.nxv16f16(<vscale x 8 x half>, <vscale x 8 x half>)
-declare <vscale x 8 x float> @llvm.experimental.vector.interleave2.nxv8f32(<vscale x 4 x float>, <vscale x 4 x float>)
-declare <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double>, <vscale x 2 x double>)
+declare <vscale x 4 x half> @llvm.vector.interleave2.nxv4f16(<vscale x 2 x half>, <vscale x 2 x half>)
+declare <vscale x 8 x half> @llvm.vector.interleave2.nxv8f16(<vscale x 4 x half>, <vscale x 4 x half>)
+declare <vscale x 4 x float> @llvm.vector.interleave2.nxv4f32(<vscale x 2 x float>, <vscale x 2 x float>)
+declare <vscale x 16 x half> @llvm.vector.interleave2.nxv16f16(<vscale x 8 x half>, <vscale x 8 x half>)
+declare <vscale x 8 x float> @llvm.vector.interleave2.nxv8f32(<vscale x 4 x float>, <vscale x 4 x float>)
+declare <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double>, <vscale x 2 x double>)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll
index 327e18e91381..a7e0ad6ee5f4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll
@@ -47,7 +47,7 @@ define <vscale x 32 x i1> @vector_interleave_nxv32i1_nxv16i1(<vscale x 16 x i1>
; ZVBB-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
; ZVBB-NEXT: vslideup.vx v0, v8, a0
; ZVBB-NEXT: ret
- %res = call <vscale x 32 x i1> @llvm.experimental.vector.interleave2.nxv32i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b)
+ %res = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b)
ret <vscale x 32 x i1> %res
}
@@ -68,7 +68,7 @@ define <vscale x 32 x i8> @vector_interleave_nxv32i8_nxv16i8(<vscale x 16 x i8>
; ZVBB-NEXT: vwaddu.wv v12, v12, v8
; ZVBB-NEXT: vmv4r.v v8, v12
; ZVBB-NEXT: ret
- %res = call <vscale x 32 x i8> @llvm.experimental.vector.interleave2.nxv32i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
+ %res = call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
ret <vscale x 32 x i8> %res
}
@@ -89,7 +89,7 @@ define <vscale x 16 x i16> @vector_interleave_nxv16i16_nxv8i16(<vscale x 8 x i16
; ZVBB-NEXT: vwaddu.wv v12, v12, v8
; ZVBB-NEXT: vmv4r.v v8, v12
; ZVBB-NEXT: ret
- %res = call <vscale x 16 x i16> @llvm.experimental.vector.interleave2.nxv16i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b)
+ %res = call <vscale x 16 x i16> @llvm.vector.interleave2.nxv16i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b)
ret <vscale x 16 x i16> %res
}
@@ -111,7 +111,7 @@ define <vscale x 8 x i32> @vector_interleave_nxv8i32_nxv4i32(<vscale x 4 x i32>
; ZVBB-NEXT: vwaddu.wv v12, v12, v8
; ZVBB-NEXT: vmv4r.v v8, v12
; ZVBB-NEXT: ret
- %res = call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
+ %res = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
ret <vscale x 8 x i32> %res
}
@@ -145,15 +145,15 @@ define <vscale x 4 x i64> @vector_interleave_nxv4i64_nxv2i64(<vscale x 2 x i64>
; ZVBB-NEXT: vrgatherei16.vv v12, v8, v16
; ZVBB-NEXT: vmv.v.v v8, v12
; ZVBB-NEXT: ret
- %res = call <vscale x 4 x i64> @llvm.experimental.vector.interleave2.nxv4i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b)
+ %res = call <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b)
ret <vscale x 4 x i64> %res
}
-declare <vscale x 32 x i1> @llvm.experimental.vector.interleave2.nxv32i1(<vscale x 16 x i1>, <vscale x 16 x i1>)
-declare <vscale x 32 x i8> @llvm.experimental.vector.interleave2.nxv32i8(<vscale x 16 x i8>, <vscale x 16 x i8>)
-declare <vscale x 16 x i16> @llvm.experimental.vector.interleave2.nxv16i16(<vscale x 8 x i16>, <vscale x 8 x i16>)
-declare <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
-declare <vscale x 4 x i64> @llvm.experimental.vector.interleave2.nxv4i64(<vscale x 2 x i64>, <vscale x 2 x i64>)
+declare <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1>, <vscale x 16 x i1>)
+declare <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8>, <vscale x 16 x i8>)
+declare <vscale x 16 x i16> @llvm.vector.interleave2.nxv16i16(<vscale x 8 x i16>, <vscale x 8 x i16>)
+declare <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64>, <vscale x 2 x i64>)
define <vscale x 128 x i1> @vector_interleave_nxv128i1_nxv64i1(<vscale x 64 x i1> %a, <vscale x 64 x i1> %b) {
; CHECK-LABEL: vector_interleave_nxv128i1_nxv64i1:
@@ -196,7 +196,7 @@ define <vscale x 128 x i1> @vector_interleave_nxv128i1_nxv64i1(<vscale x 64 x i1
; ZVBB-NEXT: vsetvli a0, zero, e8, m8, ta, ma
; ZVBB-NEXT: vmsne.vi v8, v24, 0
; ZVBB-NEXT: ret
- %res = call <vscale x 128 x i1> @llvm.experimental.vector.interleave2.nxv128i1(<vscale x 64 x i1> %a, <vscale x 64 x i1> %b)
+ %res = call <vscale x 128 x i1> @llvm.vector.interleave2.nxv128i1(<vscale x 64 x i1> %a, <vscale x 64 x i1> %b)
ret <vscale x 128 x i1> %res
}
@@ -223,7 +223,7 @@ define <vscale x 128 x i8> @vector_interleave_nxv128i8_nxv64i8(<vscale x 64 x i8
; ZVBB-NEXT: vwaddu.wv v0, v0, v28
; ZVBB-NEXT: vmv8r.v v16, v0
; ZVBB-NEXT: ret
- %res = call <vscale x 128 x i8> @llvm.experimental.vector.interleave2.nxv128i8(<vscale x 64 x i8> %a, <vscale x 64 x i8> %b)
+ %res = call <vscale x 128 x i8> @llvm.vector.interleave2.nxv128i8(<vscale x 64 x i8> %a, <vscale x 64 x i8> %b)
ret <vscale x 128 x i8> %res
}
@@ -250,7 +250,7 @@ define <vscale x 64 x i16> @vector_interleave_nxv64i16_nxv32i16(<vscale x 32 x i
; ZVBB-NEXT: vwaddu.wv v0, v0, v28
; ZVBB-NEXT: vmv8r.v v16, v0
; ZVBB-NEXT: ret
- %res = call <vscale x 64 x i16> @llvm.experimental.vector.interleave2.nxv64i16(<vscale x 32 x i16> %a, <vscale x 32 x i16> %b)
+ %res = call <vscale x 64 x i16> @llvm.vector.interleave2.nxv64i16(<vscale x 32 x i16> %a, <vscale x 32 x i16> %b)
ret <vscale x 64 x i16> %res
}
@@ -278,7 +278,7 @@ define <vscale x 32 x i32> @vector_interleave_nxv32i32_nxv16i32(<vscale x 16 x i
; ZVBB-NEXT: vmv8r.v v8, v24
; ZVBB-NEXT: vmv8r.v v16, v0
; ZVBB-NEXT: ret
- %res = call <vscale x 32 x i32> @llvm.experimental.vector.interleave2.nxv32i32(<vscale x 16 x i32> %a, <vscale x 16 x i32> %b)
+ %res = call <vscale x 32 x i32> @llvm.vector.interleave2.nxv32i32(<vscale x 16 x i32> %a, <vscale x 16 x i32> %b)
ret <vscale x 32 x i32> %res
}
@@ -376,15 +376,15 @@ define <vscale x 16 x i64> @vector_interleave_nxv16i64_nxv8i64(<vscale x 8 x i64
; ZVBB-NEXT: add sp, sp, a0
; ZVBB-NEXT: addi sp, sp, 16
; ZVBB-NEXT: ret
- %res = call <vscale x 16 x i64> @llvm.experimental.vector.interleave2.nxv16i64(<vscale x 8 x i64> %a, <vscale x 8 x i64> %b)
+ %res = call <vscale x 16 x i64> @llvm.vector.interleave2.nxv16i64(<vscale x 8 x i64> %a, <vscale x 8 x i64> %b)
ret <vscale x 16 x i64> %res
}
-declare <vscale x 128 x i1> @llvm.experimental.vector.interleave2.nxv128i1(<vscale x 64 x i1>, <vscale x 64 x i1>)
-declare <vscale x 128 x i8> @llvm.experimental.vector.interleave2.nxv128i8(<vscale x 64 x i8>, <vscale x 64 x i8>)
-declare <vscale x 64 x i16> @llvm.experimental.vector.interleave2.nxv64i16(<vscale x 32 x i16>, <vscale x 32 x i16>)
-declare <vscale x 32 x i32> @llvm.experimental.vector.interleave2.nxv32i32(<vscale x 16 x i32>, <vscale x 16 x i32>)
-declare <vscale x 16 x i64> @llvm.experimental.vector.interleave2.nxv16i64(<vscale x 8 x i64>, <vscale x 8 x i64>)
+declare <vscale x 128 x i1> @llvm.vector.interleave2.nxv128i1(<vscale x 64 x i1>, <vscale x 64 x i1>)
+declare <vscale x 128 x i8> @llvm.vector.interleave2.nxv128i8(<vscale x 64 x i8>, <vscale x 64 x i8>)
+declare <vscale x 64 x i16> @llvm.vector.interleave2.nxv64i16(<vscale x 32 x i16>, <vscale x 32 x i16>)
+declare <vscale x 32 x i32> @llvm.vector.interleave2.nxv32i32(<vscale x 16 x i32>, <vscale x 16 x i32>)
+declare <vscale x 16 x i64> @llvm.vector.interleave2.nxv16i64(<vscale x 8 x i64>, <vscale x 8 x i64>)
; Floats
@@ -419,7 +419,7 @@ define <vscale x 4 x half> @vector_interleave_nxv4f16_nxv2f16(<vscale x 2 x half
; ZVBB-NEXT: vslideup.vx v10, v8, a0
; ZVBB-NEXT: vmv.v.v v8, v10
; ZVBB-NEXT: ret
- %res = call <vscale x 4 x half> @llvm.experimental.vector.interleave2.nxv4f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b)
+ %res = call <vscale x 4 x half> @llvm.vector.interleave2.nxv4f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b)
ret <vscale x 4 x half> %res
}
@@ -440,7 +440,7 @@ define <vscale x 8 x half> @vector_interleave_nxv8f16_nxv4f16(<vscale x 4 x half
; ZVBB-NEXT: vwaddu.wv v10, v10, v8
; ZVBB-NEXT: vmv2r.v v8, v10
; ZVBB-NEXT: ret
- %res = call <vscale x 8 x half> @llvm.experimental.vector.interleave2.nxv8f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b)
+ %res = call <vscale x 8 x half> @llvm.vector.interleave2.nxv8f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b)
ret <vscale x 8 x half> %res
}
@@ -462,7 +462,7 @@ define <vscale x 4 x float> @vector_interleave_nxv4f32_nxv2f32(<vscale x 2 x flo
; ZVBB-NEXT: vwaddu.wv v10, v10, v8
; ZVBB-NEXT: vmv2r.v v8, v10
; ZVBB-NEXT: ret
- %res = call <vscale x 4 x float> @llvm.experimental.vector.interleave2.nxv4f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b)
+ %res = call <vscale x 4 x float> @llvm.vector.interleave2.nxv4f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b)
ret <vscale x 4 x float> %res
}
@@ -483,7 +483,7 @@ define <vscale x 16 x half> @vector_interleave_nxv16f16_nxv8f16(<vscale x 8 x ha
; ZVBB-NEXT: vwaddu.wv v12, v12, v8
; ZVBB-NEXT: vmv4r.v v8, v12
; ZVBB-NEXT: ret
- %res = call <vscale x 16 x half> @llvm.experimental.vector.interleave2.nxv16f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b)
+ %res = call <vscale x 16 x half> @llvm.vector.interleave2.nxv16f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b)
ret <vscale x 16 x half> %res
}
@@ -505,7 +505,7 @@ define <vscale x 8 x float> @vector_interleave_nxv8f32_nxv4f32(<vscale x 4 x flo
; ZVBB-NEXT: vwaddu.wv v12, v12, v8
; ZVBB-NEXT: vmv4r.v v8, v12
; ZVBB-NEXT: ret
- %res = call <vscale x 8 x float> @llvm.experimental.vector.interleave2.nxv8f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b)
+ %res = call <vscale x 8 x float> @llvm.vector.interleave2.nxv8f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b)
ret <vscale x 8 x float> %res
}
@@ -539,17 +539,17 @@ define <vscale x 4 x double> @vector_interleave_nxv4f64_nxv2f64(<vscale x 2 x do
; ZVBB-NEXT: vrgatherei16.vv v12, v8, v16
; ZVBB-NEXT: vmv.v.v v8, v12
; ZVBB-NEXT: ret
- %res = call <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b)
+ %res = call <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b)
ret <vscale x 4 x double> %res
}
-declare <vscale x 4 x half> @llvm.experimental.vector.interleave2.nxv4f16(<vscale x 2 x half>, <vscale x 2 x half>)
-declare <vscale x 8 x half> @llvm.experimental.vector.interleave2.nxv8f16(<vscale x 4 x half>, <vscale x 4 x half>)
-declare <vscale x 4 x float> @llvm.experimental.vector.interleave2.nxv4f32(<vscale x 2 x float>, <vscale x 2 x float>)
-declare <vscale x 16 x half> @llvm.experimental.vector.interleave2.nxv16f16(<vscale x 8 x half>, <vscale x 8 x half>)
-declare <vscale x 8 x float> @llvm.experimental.vector.interleave2.nxv8f32(<vscale x 4 x float>, <vscale x 4 x float>)
-declare <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double>, <vscale x 2 x double>)
+declare <vscale x 4 x half> @llvm.vector.interleave2.nxv4f16(<vscale x 2 x half>, <vscale x 2 x half>)
+declare <vscale x 8 x half> @llvm.vector.interleave2.nxv8f16(<vscale x 4 x half>, <vscale x 4 x half>)
+declare <vscale x 4 x float> @llvm.vector.interleave2.nxv4f32(<vscale x 2 x float>, <vscale x 2 x float>)
+declare <vscale x 16 x half> @llvm.vector.interleave2.nxv16f16(<vscale x 8 x half>, <vscale x 8 x half>)
+declare <vscale x 8 x float> @llvm.vector.interleave2.nxv8f32(<vscale x 4 x float>, <vscale x 4 x float>)
+declare <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double>, <vscale x 2 x double>)
define <vscale x 64 x half> @vector_interleave_nxv64f16_nxv32f16(<vscale x 32 x half> %a, <vscale x 32 x half> %b) {
; CHECK-LABEL: vector_interleave_nxv64f16_nxv32f16:
@@ -574,7 +574,7 @@ define <vscale x 64 x half> @vector_interleave_nxv64f16_nxv32f16(<vscale x 32 x
; ZVBB-NEXT: vwaddu.wv v0, v0, v28
; ZVBB-NEXT: vmv8r.v v16, v0
; ZVBB-NEXT: ret
- %res = call <vscale x 64 x half> @llvm.experimental.vector.interleave2.nxv64f16(<vscale x 32 x half> %a, <vscale x 32 x half> %b)
+ %res = call <vscale x 64 x half> @llvm.vector.interleave2.nxv64f16(<vscale x 32 x half> %a, <vscale x 32 x half> %b)
ret <vscale x 64 x half> %res
}
@@ -602,7 +602,7 @@ define <vscale x 32 x float> @vector_interleave_nxv32f32_nxv16f32(<vscale x 16 x
; ZVBB-NEXT: vmv8r.v v8, v24
; ZVBB-NEXT: vmv8r.v v16, v0
; ZVBB-NEXT: ret
- %res = call <vscale x 32 x float> @llvm.experimental.vector.interleave2.nxv32f32(<vscale x 16 x float> %a, <vscale x 16 x float> %b)
+ %res = call <vscale x 32 x float> @llvm.vector.interleave2.nxv32f32(<vscale x 16 x float> %a, <vscale x 16 x float> %b)
ret <vscale x 32 x float> %res
}
@@ -700,7 +700,7 @@ define <vscale x 16 x double> @vector_interleave_nxv16f64_nxv8f64(<vscale x 8 x
; ZVBB-NEXT: add sp, sp, a0
; ZVBB-NEXT: addi sp, sp, 16
; ZVBB-NEXT: ret
- %res = call <vscale x 16 x double> @llvm.experimental.vector.interleave2.nxv16f64(<vscale x 8 x double> %a, <vscale x 8 x double> %b)
+ %res = call <vscale x 16 x double> @llvm.vector.interleave2.nxv16f64(<vscale x 8 x double> %a, <vscale x 8 x double> %b)
ret <vscale x 16 x double> %res
}
@@ -718,7 +718,7 @@ define <vscale x 8 x i32> @vector_interleave_nxv8i32_nxv4i32_poison(<vscale x 4
; ZVBB-NEXT: vzext.vf2 v12, v8
; ZVBB-NEXT: vmv.v.v v8, v12
; ZVBB-NEXT: ret
- %res = call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> poison)
+ %res = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> poison)
ret <vscale x 8 x i32> %res
}
@@ -738,10 +738,10 @@ define <vscale x 8 x i32> @vector_interleave_nxv8i32_nxv4i32_poison2(<vscale x 4
; ZVBB-NEXT: vwsll.vx v12, v8, a0
; ZVBB-NEXT: vmv4r.v v8, v12
; ZVBB-NEXT: ret
- %res = call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a)
+ %res = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a)
ret <vscale x 8 x i32> %res
}
-declare <vscale x 64 x half> @llvm.experimental.vector.interleave2.nxv64f16(<vscale x 32 x half>, <vscale x 32 x half>)
-declare <vscale x 32 x float> @llvm.experimental.vector.interleave2.nxv32f32(<vscale x 16 x float>, <vscale x 16 x float>)
-declare <vscale x 16 x double> @llvm.experimental.vector.interleave2.nxv16f64(<vscale x 8 x double>, <vscale x 8 x double>)
+declare <vscale x 64 x half> @llvm.vector.interleave2.nxv64f16(<vscale x 32 x half>, <vscale x 32 x half>)
+declare <vscale x 32 x float> @llvm.vector.interleave2.nxv32f32(<vscale x 16 x float>, <vscale x 16 x float>)
+declare <vscale x 16 x double> @llvm.vector.interleave2.nxv16f64(<vscale x 8 x double>, <vscale x 8 x double>)
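A short aside on the two poison cases tested above: when one operand of interleave2 is poison, only half of the result lanes are defined, which is what lets the RVV backend replace the full interleave with a single widening op (vzext.vf2 for the even lanes, vwsll.vx for the odd lanes in the CHECK lines above). A hedged sketch of the defined-lanes shape the lowering exploits (illustrative only, not from this patch):

declare <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32>, <vscale x 4 x i32>)

; With the second operand poison, only the even result lanes are defined, so
; any lowering that places element i of %a at lane 2*i is correct.
define <vscale x 8 x i32> @interleave2_even_lanes(<vscale x 4 x i32> %a) {
  %res = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> poison)
  ret <vscale x 8 x i32> %res
}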
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-reassociations.ll b/llvm/test/CodeGen/RISCV/rvv/vector-reassociations.ll
index 3cb6f3c35286..6435c1c14e06 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-reassociations.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-reassociations.ll
@@ -31,7 +31,7 @@ define <vscale x 1 x i8> @simple_vadd_vv(<vscale x 1 x i8> %0, <vscale x 1 x i8>
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
; CHECK-NEXT: vadd.vv v9, v8, v9
-; CHECK-NEXT: vadd.vv v9, v8, v9
+; CHECK-NEXT: vadd.vv v8, v8, v8
; CHECK-NEXT: vadd.vv v8, v8, v9
; CHECK-NEXT: ret
entry:
@@ -61,7 +61,7 @@ define <vscale x 1 x i8> @simple_vadd_vsub_vv(<vscale x 1 x i8> %0, <vscale x 1
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
; CHECK-NEXT: vsub.vv v9, v8, v9
-; CHECK-NEXT: vadd.vv v9, v8, v9
+; CHECK-NEXT: vadd.vv v8, v8, v8
; CHECK-NEXT: vadd.vv v8, v8, v9
; CHECK-NEXT: ret
entry:
@@ -91,7 +91,7 @@ define <vscale x 1 x i8> @simple_vmul_vv(<vscale x 1 x i8> %0, <vscale x 1 x i8>
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
; CHECK-NEXT: vmul.vv v9, v8, v9
-; CHECK-NEXT: vmul.vv v9, v8, v9
+; CHECK-NEXT: vmul.vv v8, v8, v8
; CHECK-NEXT: vmul.vv v8, v8, v9
; CHECK-NEXT: ret
entry:
@@ -124,8 +124,8 @@ define <vscale x 1 x i8> @vadd_vv_passthru(<vscale x 1 x i8> %0, <vscale x 1 x i
; CHECK-NEXT: vmv1r.v v10, v8
; CHECK-NEXT: vadd.vv v10, v8, v9
; CHECK-NEXT: vmv1r.v v9, v8
-; CHECK-NEXT: vadd.vv v9, v8, v10
-; CHECK-NEXT: vadd.vv v8, v8, v9
+; CHECK-NEXT: vadd.vv v9, v8, v8
+; CHECK-NEXT: vadd.vv v8, v9, v10
; CHECK-NEXT: ret
entry:
%a = call <vscale x 1 x i8> @llvm.riscv.vadd.nxv1i8.nxv1i8(
@@ -187,8 +187,8 @@ define <vscale x 1 x i8> @vadd_vv_mask(<vscale x 1 x i8> %0, <vscale x 1 x i8> %
; CHECK-NEXT: vmv1r.v v10, v8
; CHECK-NEXT: vadd.vv v10, v8, v9, v0.t
; CHECK-NEXT: vmv1r.v v9, v8
-; CHECK-NEXT: vadd.vv v9, v8, v10, v0.t
-; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT: vadd.vv v9, v8, v8, v0.t
+; CHECK-NEXT: vadd.vv v8, v9, v10, v0.t
; CHECK-NEXT: ret
entry:
%a = call <vscale x 1 x i8> @llvm.riscv.vadd.mask.nxv1i8.nxv1i8(
@@ -215,15 +215,16 @@ entry:
ret <vscale x 1 x i8> %c
}
-define <vscale x 1 x i8> @vadd_vv_mask_negative(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, i32 %2, <vscale x 1 x i1> %m) nounwind {
+define <vscale x 1 x i8> @vadd_vv_mask_negative(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, i32 %2, <vscale x 1 x i1> %m, <vscale x 1 x i1> %m2) nounwind {
; CHECK-LABEL: vadd_vv_mask_negative:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, mu
-; CHECK-NEXT: vmv1r.v v10, v8
-; CHECK-NEXT: vadd.vv v10, v8, v9, v0.t
+; CHECK-NEXT: vmv1r.v v11, v8
+; CHECK-NEXT: vadd.vv v11, v8, v9, v0.t
; CHECK-NEXT: vmv1r.v v9, v8
-; CHECK-NEXT: vadd.vv v9, v8, v10, v0.t
-; CHECK-NEXT: vadd.vv v8, v8, v9
+; CHECK-NEXT: vadd.vv v9, v8, v11, v0.t
+; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t
; CHECK-NEXT: ret
entry:
%a = call <vscale x 1 x i8> @llvm.riscv.vadd.mask.nxv1i8.nxv1i8(
@@ -240,8 +241,6 @@ entry:
<vscale x 1 x i1> %m,
i32 %2, i32 1)
- %splat = insertelement <vscale x 1 x i1> poison, i1 1, i32 0
- %m2 = shufflevector <vscale x 1 x i1> %splat, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
%c = call <vscale x 1 x i8> @llvm.riscv.vadd.mask.nxv1i8.nxv1i8(
<vscale x 1 x i8> %0,
<vscale x 1 x i8> %0,
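The vector-reassociations.ll updates above reflect a change in how chained RVV arithmetic is reassociated to shorten the critical path. As a hedged scalar-IR illustration of the general transform (illustrative only; the test exercises it on RVV machine instructions, and the function names here are made up):

; Reassociation for ILP: the left-leaning chain ((a + b) + c) + d needs three
; dependent adds, while (a + b) + (c + d) computes two adds in parallel and
; finishes in two dependent steps.
define i32 @chain(i32 %a, i32 %b, i32 %c, i32 %d) {
  %t0 = add i32 %a, %b
  %t1 = add i32 %t0, %c
  %t2 = add i32 %t1, %d
  ret i32 %t2
}

define i32 @reassociated(i32 %a, i32 %b, i32 %c, i32 %d) {
  %t0 = add i32 %a, %b
  %t1 = add i32 %c, %d
  %t2 = add i32 %t0, %t1
  ret i32 %t2
}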
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-splice.ll b/llvm/test/CodeGen/RISCV/rvv/vector-splice.ll
index c98242437f62..be56db52e349 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-splice.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-splice.ll
@@ -4,7 +4,7 @@
; Tests assume VLEN=128 or vscale_range_min=2.
-declare <vscale x 1 x i1> @llvm.experimental.vector.splice.nxv1i1(<vscale x 1 x i1>, <vscale x 1 x i1>, i32)
+declare <vscale x 1 x i1> @llvm.vector.splice.nxv1i1(<vscale x 1 x i1>, <vscale x 1 x i1>, i32)
define <vscale x 1 x i1> @splice_nxv1i1_offset_negone(<vscale x 1 x i1> %a, <vscale x 1 x i1> %b) #0 {
; CHECK-LABEL: splice_nxv1i1_offset_negone:
@@ -24,7 +24,7 @@ define <vscale x 1 x i1> @splice_nxv1i1_offset_negone(<vscale x 1 x i1> %a, <vsc
; CHECK-NEXT: vand.vi v8, v9, 1
; CHECK-NEXT: vmsne.vi v0, v8, 0
; CHECK-NEXT: ret
- %res = call <vscale x 1 x i1> @llvm.experimental.vector.splice.nxv1i1(<vscale x 1 x i1> %a, <vscale x 1 x i1> %b, i32 -1)
+ %res = call <vscale x 1 x i1> @llvm.vector.splice.nxv1i1(<vscale x 1 x i1> %a, <vscale x 1 x i1> %b, i32 -1)
ret <vscale x 1 x i1> %res
}
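For the remainder of this file, a reminder of the (unchanged) splice semantics helps when reading the offsets: llvm.vector.splice concatenates its two operands and extracts a window the size of one input, with a non-negative offset counting from the start of the first operand and a negative offset counting back from its end. A minimal fixed-width sketch (not part of this patch; the function name is illustrative):

declare <4 x i8> @llvm.vector.splice.v4i8(<4 x i8>, <4 x i8>, i32)

define <4 x i8> @splice_sketch(<4 x i8> %a, <4 x i8> %b) {
  ; splice(a, b, 1) takes elements 1..4 of the concatenation a:b, i.e.
  ; <a1, a2, a3, b0>; splice(a, b, -1) would be <a3, b0, b1, b2>.
  %res = call <4 x i8> @llvm.vector.splice.v4i8(<4 x i8> %a, <4 x i8> %b, i32 1)
  ret <4 x i8> %res
}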
@@ -48,11 +48,11 @@ define <vscale x 1 x i1> @splice_nxv1i1_offset_max(<vscale x 1 x i1> %a, <vscale
; CHECK-NEXT: vand.vi v8, v9, 1
; CHECK-NEXT: vmsne.vi v0, v8, 0
; CHECK-NEXT: ret
- %res = call <vscale x 1 x i1> @llvm.experimental.vector.splice.nxv1i1(<vscale x 1 x i1> %a, <vscale x 1 x i1> %b, i32 1)
+ %res = call <vscale x 1 x i1> @llvm.vector.splice.nxv1i1(<vscale x 1 x i1> %a, <vscale x 1 x i1> %b, i32 1)
ret <vscale x 1 x i1> %res
}
-declare <vscale x 2 x i1> @llvm.experimental.vector.splice.nxv2i1(<vscale x 2 x i1>, <vscale x 2 x i1>, i32)
+declare <vscale x 2 x i1> @llvm.vector.splice.nxv2i1(<vscale x 2 x i1>, <vscale x 2 x i1>, i32)
define <vscale x 2 x i1> @splice_nxv2i1_offset_negone(<vscale x 2 x i1> %a, <vscale x 2 x i1> %b) #0 {
; CHECK-LABEL: splice_nxv2i1_offset_negone:
@@ -72,7 +72,7 @@ define <vscale x 2 x i1> @splice_nxv2i1_offset_negone(<vscale x 2 x i1> %a, <vsc
; CHECK-NEXT: vand.vi v8, v9, 1
; CHECK-NEXT: vmsne.vi v0, v8, 0
; CHECK-NEXT: ret
- %res = call <vscale x 2 x i1> @llvm.experimental.vector.splice.nxv2i1(<vscale x 2 x i1> %a, <vscale x 2 x i1> %b, i32 -1)
+ %res = call <vscale x 2 x i1> @llvm.vector.splice.nxv2i1(<vscale x 2 x i1> %a, <vscale x 2 x i1> %b, i32 -1)
ret <vscale x 2 x i1> %res
}
@@ -96,11 +96,11 @@ define <vscale x 2 x i1> @splice_nxv2i1_offset_max(<vscale x 2 x i1> %a, <vscale
; CHECK-NEXT: vand.vi v8, v9, 1
; CHECK-NEXT: vmsne.vi v0, v8, 0
; CHECK-NEXT: ret
- %res = call <vscale x 2 x i1> @llvm.experimental.vector.splice.nxv2i1(<vscale x 2 x i1> %a, <vscale x 2 x i1> %b, i32 3)
+ %res = call <vscale x 2 x i1> @llvm.vector.splice.nxv2i1(<vscale x 2 x i1> %a, <vscale x 2 x i1> %b, i32 3)
ret <vscale x 2 x i1> %res
}
-declare <vscale x 4 x i1> @llvm.experimental.vector.splice.nxv4i1(<vscale x 4 x i1>, <vscale x 4 x i1>, i32)
+declare <vscale x 4 x i1> @llvm.vector.splice.nxv4i1(<vscale x 4 x i1>, <vscale x 4 x i1>, i32)
define <vscale x 4 x i1> @splice_nxv4i1_offset_negone(<vscale x 4 x i1> %a, <vscale x 4 x i1> %b) #0 {
; CHECK-LABEL: splice_nxv4i1_offset_negone:
@@ -120,7 +120,7 @@ define <vscale x 4 x i1> @splice_nxv4i1_offset_negone(<vscale x 4 x i1> %a, <vsc
; CHECK-NEXT: vand.vi v8, v9, 1
; CHECK-NEXT: vmsne.vi v0, v8, 0
; CHECK-NEXT: ret
- %res = call <vscale x 4 x i1> @llvm.experimental.vector.splice.nxv4i1(<vscale x 4 x i1> %a, <vscale x 4 x i1> %b, i32 -1)
+ %res = call <vscale x 4 x i1> @llvm.vector.splice.nxv4i1(<vscale x 4 x i1> %a, <vscale x 4 x i1> %b, i32 -1)
ret <vscale x 4 x i1> %res
}
@@ -144,11 +144,11 @@ define <vscale x 4 x i1> @splice_nxv4i1_offset_max(<vscale x 4 x i1> %a, <vscale
; CHECK-NEXT: vand.vi v8, v9, 1
; CHECK-NEXT: vmsne.vi v0, v8, 0
; CHECK-NEXT: ret
- %res = call <vscale x 4 x i1> @llvm.experimental.vector.splice.nxv4i1(<vscale x 4 x i1> %a, <vscale x 4 x i1> %b, i32 7)
+ %res = call <vscale x 4 x i1> @llvm.vector.splice.nxv4i1(<vscale x 4 x i1> %a, <vscale x 4 x i1> %b, i32 7)
ret <vscale x 4 x i1> %res
}
-declare <vscale x 8 x i1> @llvm.experimental.vector.splice.nxv8i1(<vscale x 8 x i1>, <vscale x 8 x i1>, i32)
+declare <vscale x 8 x i1> @llvm.vector.splice.nxv8i1(<vscale x 8 x i1>, <vscale x 8 x i1>, i32)
define <vscale x 8 x i1> @splice_nxv8i1_offset_negone(<vscale x 8 x i1> %a, <vscale x 8 x i1> %b) #0 {
; CHECK-LABEL: splice_nxv8i1_offset_negone:
@@ -167,7 +167,7 @@ define <vscale x 8 x i1> @splice_nxv8i1_offset_negone(<vscale x 8 x i1> %a, <vsc
; CHECK-NEXT: vand.vi v8, v9, 1
; CHECK-NEXT: vmsne.vi v0, v8, 0
; CHECK-NEXT: ret
- %res = call <vscale x 8 x i1> @llvm.experimental.vector.splice.nxv8i1(<vscale x 8 x i1> %a, <vscale x 8 x i1> %b, i32 -1)
+ %res = call <vscale x 8 x i1> @llvm.vector.splice.nxv8i1(<vscale x 8 x i1> %a, <vscale x 8 x i1> %b, i32 -1)
ret <vscale x 8 x i1> %res
}
@@ -190,11 +190,11 @@ define <vscale x 8 x i1> @splice_nxv8i1_offset_max(<vscale x 8 x i1> %a, <vscale
; CHECK-NEXT: vand.vi v8, v9, 1
; CHECK-NEXT: vmsne.vi v0, v8, 0
; CHECK-NEXT: ret
- %res = call <vscale x 8 x i1> @llvm.experimental.vector.splice.nxv8i1(<vscale x 8 x i1> %a, <vscale x 8 x i1> %b, i32 15)
+ %res = call <vscale x 8 x i1> @llvm.vector.splice.nxv8i1(<vscale x 8 x i1> %a, <vscale x 8 x i1> %b, i32 15)
ret <vscale x 8 x i1> %res
}
-declare <vscale x 16 x i1> @llvm.experimental.vector.splice.nxv16i1(<vscale x 16 x i1>, <vscale x 16 x i1>, i32)
+declare <vscale x 16 x i1> @llvm.vector.splice.nxv16i1(<vscale x 16 x i1>, <vscale x 16 x i1>, i32)
define <vscale x 16 x i1> @splice_nxv16i1_offset_negone(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b) #0 {
; CHECK-LABEL: splice_nxv16i1_offset_negone:
@@ -216,7 +216,7 @@ define <vscale x 16 x i1> @splice_nxv16i1_offset_negone(<vscale x 16 x i1> %a, <
; CHECK-NEXT: vand.vi v8, v8, 1
; CHECK-NEXT: vmsne.vi v0, v8, 0
; CHECK-NEXT: ret
- %res = call <vscale x 16 x i1> @llvm.experimental.vector.splice.nxv16i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b, i32 -1)
+ %res = call <vscale x 16 x i1> @llvm.vector.splice.nxv16i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b, i32 -1)
ret <vscale x 16 x i1> %res
}
@@ -240,11 +240,11 @@ define <vscale x 16 x i1> @splice_nxv16i1_offset_max(<vscale x 16 x i1> %a, <vsc
; CHECK-NEXT: vand.vi v8, v8, 1
; CHECK-NEXT: vmsne.vi v0, v8, 0
; CHECK-NEXT: ret
- %res = call <vscale x 16 x i1> @llvm.experimental.vector.splice.nxv16i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b, i32 31)
+ %res = call <vscale x 16 x i1> @llvm.vector.splice.nxv16i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b, i32 31)
ret <vscale x 16 x i1> %res
}
-declare <vscale x 32 x i1> @llvm.experimental.vector.splice.nxv32i1(<vscale x 32 x i1>, <vscale x 32 x i1>, i32)
+declare <vscale x 32 x i1> @llvm.vector.splice.nxv32i1(<vscale x 32 x i1>, <vscale x 32 x i1>, i32)
define <vscale x 32 x i1> @splice_nxv32i1_offset_negone(<vscale x 32 x i1> %a, <vscale x 32 x i1> %b) #0 {
; CHECK-LABEL: splice_nxv32i1_offset_negone:
@@ -266,7 +266,7 @@ define <vscale x 32 x i1> @splice_nxv32i1_offset_negone(<vscale x 32 x i1> %a, <
; CHECK-NEXT: vand.vi v8, v8, 1
; CHECK-NEXT: vmsne.vi v0, v8, 0
; CHECK-NEXT: ret
- %res = call <vscale x 32 x i1> @llvm.experimental.vector.splice.nxv32i1(<vscale x 32 x i1> %a, <vscale x 32 x i1> %b, i32 -1)
+ %res = call <vscale x 32 x i1> @llvm.vector.splice.nxv32i1(<vscale x 32 x i1> %a, <vscale x 32 x i1> %b, i32 -1)
ret <vscale x 32 x i1> %res
}
@@ -289,11 +289,11 @@ define <vscale x 32 x i1> @splice_nxv32i1_offset_max(<vscale x 32 x i1> %a, <vsc
; CHECK-NEXT: vand.vi v8, v16, 1
; CHECK-NEXT: vmsne.vi v0, v8, 0
; CHECK-NEXT: ret
- %res = call <vscale x 32 x i1> @llvm.experimental.vector.splice.nxv32i1(<vscale x 32 x i1> %a, <vscale x 32 x i1> %b, i32 63)
+ %res = call <vscale x 32 x i1> @llvm.vector.splice.nxv32i1(<vscale x 32 x i1> %a, <vscale x 32 x i1> %b, i32 63)
ret <vscale x 32 x i1> %res
}
-declare <vscale x 64 x i1> @llvm.experimental.vector.splice.nxv64i1(<vscale x 64 x i1>, <vscale x 64 x i1>, i32)
+declare <vscale x 64 x i1> @llvm.vector.splice.nxv64i1(<vscale x 64 x i1>, <vscale x 64 x i1>, i32)
define <vscale x 64 x i1> @splice_nxv64i1_offset_negone(<vscale x 64 x i1> %a, <vscale x 64 x i1> %b) #0 {
; CHECK-LABEL: splice_nxv64i1_offset_negone:
@@ -315,7 +315,7 @@ define <vscale x 64 x i1> @splice_nxv64i1_offset_negone(<vscale x 64 x i1> %a, <
; CHECK-NEXT: vand.vi v8, v8, 1
; CHECK-NEXT: vmsne.vi v0, v8, 0
; CHECK-NEXT: ret
- %res = call <vscale x 64 x i1> @llvm.experimental.vector.splice.nxv64i1(<vscale x 64 x i1> %a, <vscale x 64 x i1> %b, i32 -1)
+ %res = call <vscale x 64 x i1> @llvm.vector.splice.nxv64i1(<vscale x 64 x i1> %a, <vscale x 64 x i1> %b, i32 -1)
ret <vscale x 64 x i1> %res
}
@@ -338,17 +338,17 @@ define <vscale x 64 x i1> @splice_nxv64i1_offset_max(<vscale x 64 x i1> %a, <vsc
; CHECK-NEXT: vand.vi v8, v24, 1
; CHECK-NEXT: vmsne.vi v0, v8, 0
; CHECK-NEXT: ret
- %res = call <vscale x 64 x i1> @llvm.experimental.vector.splice.nxv64i1(<vscale x 64 x i1> %a, <vscale x 64 x i1> %b, i32 127)
+ %res = call <vscale x 64 x i1> @llvm.vector.splice.nxv64i1(<vscale x 64 x i1> %a, <vscale x 64 x i1> %b, i32 127)
ret <vscale x 64 x i1> %res
}
-declare <vscale x 1 x i8> @llvm.experimental.vector.splice.nxv1i8(<vscale x 1 x i8>, <vscale x 1 x i8>, i32)
+declare <vscale x 1 x i8> @llvm.vector.splice.nxv1i8(<vscale x 1 x i8>, <vscale x 1 x i8>, i32)
define <vscale x 1 x i8> @splice_nxv1i8_offset_zero(<vscale x 1 x i8> %a, <vscale x 1 x i8> %b) #0 {
; CHECK-LABEL: splice_nxv1i8_offset_zero:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
- %res = call <vscale x 1 x i8> @llvm.experimental.vector.splice.nxv1i8(<vscale x 1 x i8> %a, <vscale x 1 x i8> %b, i32 0)
+ %res = call <vscale x 1 x i8> @llvm.vector.splice.nxv1i8(<vscale x 1 x i8> %a, <vscale x 1 x i8> %b, i32 0)
ret <vscale x 1 x i8> %res
}
@@ -363,7 +363,7 @@ define <vscale x 1 x i8> @splice_nxv1i8_offset_negone(<vscale x 1 x i8> %a, <vsc
; CHECK-NEXT: vsetvli a0, zero, e8, mf8, ta, ma
; CHECK-NEXT: vslideup.vi v8, v9, 1
; CHECK-NEXT: ret
- %res = call <vscale x 1 x i8> @llvm.experimental.vector.splice.nxv1i8(<vscale x 1 x i8> %a, <vscale x 1 x i8> %b, i32 -1)
+ %res = call <vscale x 1 x i8> @llvm.vector.splice.nxv1i8(<vscale x 1 x i8> %a, <vscale x 1 x i8> %b, i32 -1)
ret <vscale x 1 x i8> %res
}
@@ -378,7 +378,7 @@ define <vscale x 1 x i8> @splice_nxv1i8_offset_min(<vscale x 1 x i8> %a, <vscale
; CHECK-NEXT: vsetvli a0, zero, e8, mf8, ta, ma
; CHECK-NEXT: vslideup.vi v8, v9, 2
; CHECK-NEXT: ret
- %res = call <vscale x 1 x i8> @llvm.experimental.vector.splice.nxv1i8(<vscale x 1 x i8> %a, <vscale x 1 x i8> %b, i32 -2)
+ %res = call <vscale x 1 x i8> @llvm.vector.splice.nxv1i8(<vscale x 1 x i8> %a, <vscale x 1 x i8> %b, i32 -2)
ret <vscale x 1 x i8> %res
}
@@ -393,17 +393,17 @@ define <vscale x 1 x i8> @splice_nxv1i8_offset_max(<vscale x 1 x i8> %a, <vscale
; CHECK-NEXT: vsetvli a1, zero, e8, mf8, ta, ma
; CHECK-NEXT: vslideup.vx v8, v9, a0
; CHECK-NEXT: ret
- %res = call <vscale x 1 x i8> @llvm.experimental.vector.splice.nxv1i8(<vscale x 1 x i8> %a, <vscale x 1 x i8> %b, i32 1)
+ %res = call <vscale x 1 x i8> @llvm.vector.splice.nxv1i8(<vscale x 1 x i8> %a, <vscale x 1 x i8> %b, i32 1)
ret <vscale x 1 x i8> %res
}
-declare <vscale x 2 x i8> @llvm.experimental.vector.splice.nxv2i8(<vscale x 2 x i8>, <vscale x 2 x i8>, i32)
+declare <vscale x 2 x i8> @llvm.vector.splice.nxv2i8(<vscale x 2 x i8>, <vscale x 2 x i8>, i32)
define <vscale x 2 x i8> @splice_nxv2i8_offset_zero(<vscale x 2 x i8> %a, <vscale x 2 x i8> %b) #0 {
; CHECK-LABEL: splice_nxv2i8_offset_zero:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
- %res = call <vscale x 2 x i8> @llvm.experimental.vector.splice.nxv2i8(<vscale x 2 x i8> %a, <vscale x 2 x i8> %b, i32 0)
+ %res = call <vscale x 2 x i8> @llvm.vector.splice.nxv2i8(<vscale x 2 x i8> %a, <vscale x 2 x i8> %b, i32 0)
ret <vscale x 2 x i8> %res
}
@@ -418,7 +418,7 @@ define <vscale x 2 x i8> @splice_nxv2i8_offset_negone(<vscale x 2 x i8> %a, <vsc
; CHECK-NEXT: vsetvli a0, zero, e8, mf4, ta, ma
; CHECK-NEXT: vslideup.vi v8, v9, 1
; CHECK-NEXT: ret
- %res = call <vscale x 2 x i8> @llvm.experimental.vector.splice.nxv2i8(<vscale x 2 x i8> %a, <vscale x 2 x i8> %b, i32 -1)
+ %res = call <vscale x 2 x i8> @llvm.vector.splice.nxv2i8(<vscale x 2 x i8> %a, <vscale x 2 x i8> %b, i32 -1)
ret <vscale x 2 x i8> %res
}
@@ -433,7 +433,7 @@ define <vscale x 2 x i8> @splice_nxv2i8_offset_min(<vscale x 2 x i8> %a, <vscale
; CHECK-NEXT: vsetvli a0, zero, e8, mf4, ta, ma
; CHECK-NEXT: vslideup.vi v8, v9, 4
; CHECK-NEXT: ret
- %res = call <vscale x 2 x i8> @llvm.experimental.vector.splice.nxv2i8(<vscale x 2 x i8> %a, <vscale x 2 x i8> %b, i32 -4)
+ %res = call <vscale x 2 x i8> @llvm.vector.splice.nxv2i8(<vscale x 2 x i8> %a, <vscale x 2 x i8> %b, i32 -4)
ret <vscale x 2 x i8> %res
}
@@ -448,17 +448,17 @@ define <vscale x 2 x i8> @splice_nxv2i8_offset_max(<vscale x 2 x i8> %a, <vscale
; CHECK-NEXT: vsetvli a1, zero, e8, mf4, ta, ma
; CHECK-NEXT: vslideup.vx v8, v9, a0
; CHECK-NEXT: ret
- %res = call <vscale x 2 x i8> @llvm.experimental.vector.splice.nxv2i8(<vscale x 2 x i8> %a, <vscale x 2 x i8> %b, i32 3)
+ %res = call <vscale x 2 x i8> @llvm.vector.splice.nxv2i8(<vscale x 2 x i8> %a, <vscale x 2 x i8> %b, i32 3)
ret <vscale x 2 x i8> %res
}
-declare <vscale x 4 x i8> @llvm.experimental.vector.splice.nxv4i8(<vscale x 4 x i8>, <vscale x 4 x i8>, i32)
+declare <vscale x 4 x i8> @llvm.vector.splice.nxv4i8(<vscale x 4 x i8>, <vscale x 4 x i8>, i32)
define <vscale x 4 x i8> @splice_nxv4i8_offset_zero(<vscale x 4 x i8> %a, <vscale x 4 x i8> %b) #0 {
; CHECK-LABEL: splice_nxv4i8_offset_zero:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
- %res = call <vscale x 4 x i8> @llvm.experimental.vector.splice.nxv4i8(<vscale x 4 x i8> %a, <vscale x 4 x i8> %b, i32 0)
+ %res = call <vscale x 4 x i8> @llvm.vector.splice.nxv4i8(<vscale x 4 x i8> %a, <vscale x 4 x i8> %b, i32 0)
ret <vscale x 4 x i8> %res
}
@@ -473,7 +473,7 @@ define <vscale x 4 x i8> @splice_nxv4i8_offset_negone(<vscale x 4 x i8> %a, <vsc
; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
; CHECK-NEXT: vslideup.vi v8, v9, 1
; CHECK-NEXT: ret
- %res = call <vscale x 4 x i8> @llvm.experimental.vector.splice.nxv4i8(<vscale x 4 x i8> %a, <vscale x 4 x i8> %b, i32 -1)
+ %res = call <vscale x 4 x i8> @llvm.vector.splice.nxv4i8(<vscale x 4 x i8> %a, <vscale x 4 x i8> %b, i32 -1)
ret <vscale x 4 x i8> %res
}
@@ -488,7 +488,7 @@ define <vscale x 4 x i8> @splice_nxv4i8_offset_min(<vscale x 4 x i8> %a, <vscale
; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
; CHECK-NEXT: vslideup.vi v8, v9, 8
; CHECK-NEXT: ret
- %res = call <vscale x 4 x i8> @llvm.experimental.vector.splice.nxv4i8(<vscale x 4 x i8> %a, <vscale x 4 x i8> %b, i32 -8)
+ %res = call <vscale x 4 x i8> @llvm.vector.splice.nxv4i8(<vscale x 4 x i8> %a, <vscale x 4 x i8> %b, i32 -8)
ret <vscale x 4 x i8> %res
}
@@ -503,17 +503,17 @@ define <vscale x 4 x i8> @splice_nxv4i8_offset_max(<vscale x 4 x i8> %a, <vscale
; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma
; CHECK-NEXT: vslideup.vx v8, v9, a0
; CHECK-NEXT: ret
- %res = call <vscale x 4 x i8> @llvm.experimental.vector.splice.nxv4i8(<vscale x 4 x i8> %a, <vscale x 4 x i8> %b, i32 7)
+ %res = call <vscale x 4 x i8> @llvm.vector.splice.nxv4i8(<vscale x 4 x i8> %a, <vscale x 4 x i8> %b, i32 7)
ret <vscale x 4 x i8> %res
}
-declare <vscale x 8 x i8> @llvm.experimental.vector.splice.nxv8i8(<vscale x 8 x i8>, <vscale x 8 x i8>, i32)
+declare <vscale x 8 x i8> @llvm.vector.splice.nxv8i8(<vscale x 8 x i8>, <vscale x 8 x i8>, i32)
define <vscale x 8 x i8> @splice_nxv8i8_offset_zero(<vscale x 8 x i8> %a, <vscale x 8 x i8> %b) #0 {
; CHECK-LABEL: splice_nxv8i8_offset_zero:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
- %res = call <vscale x 8 x i8> @llvm.experimental.vector.splice.nxv8i8(<vscale x 8 x i8> %a, <vscale x 8 x i8> %b, i32 0)
+ %res = call <vscale x 8 x i8> @llvm.vector.splice.nxv8i8(<vscale x 8 x i8> %a, <vscale x 8 x i8> %b, i32 0)
ret <vscale x 8 x i8> %res
}
@@ -527,7 +527,7 @@ define <vscale x 8 x i8> @splice_nxv8i8_offset_negone(<vscale x 8 x i8> %a, <vsc
; CHECK-NEXT: vsetvli a0, zero, e8, m1, ta, ma
; CHECK-NEXT: vslideup.vi v8, v9, 1
; CHECK-NEXT: ret
- %res = call <vscale x 8 x i8> @llvm.experimental.vector.splice.nxv8i8(<vscale x 8 x i8> %a, <vscale x 8 x i8> %b, i32 -1)
+ %res = call <vscale x 8 x i8> @llvm.vector.splice.nxv8i8(<vscale x 8 x i8> %a, <vscale x 8 x i8> %b, i32 -1)
ret <vscale x 8 x i8> %res
}
@@ -541,7 +541,7 @@ define <vscale x 8 x i8> @splice_nxv8i8_offset_min(<vscale x 8 x i8> %a, <vscale
; CHECK-NEXT: vsetvli a0, zero, e8, m1, ta, ma
; CHECK-NEXT: vslideup.vi v8, v9, 16
; CHECK-NEXT: ret
- %res = call <vscale x 8 x i8> @llvm.experimental.vector.splice.nxv8i8(<vscale x 8 x i8> %a, <vscale x 8 x i8> %b, i32 -16)
+ %res = call <vscale x 8 x i8> @llvm.vector.splice.nxv8i8(<vscale x 8 x i8> %a, <vscale x 8 x i8> %b, i32 -16)
ret <vscale x 8 x i8> %res
}
@@ -555,17 +555,17 @@ define <vscale x 8 x i8> @splice_nxv8i8_offset_max(<vscale x 8 x i8> %a, <vscale
; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma
; CHECK-NEXT: vslideup.vx v8, v9, a0
; CHECK-NEXT: ret
- %res = call <vscale x 8 x i8> @llvm.experimental.vector.splice.nxv8i8(<vscale x 8 x i8> %a, <vscale x 8 x i8> %b, i32 15)
+ %res = call <vscale x 8 x i8> @llvm.vector.splice.nxv8i8(<vscale x 8 x i8> %a, <vscale x 8 x i8> %b, i32 15)
ret <vscale x 8 x i8> %res
}
-declare <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, i32)
+declare <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, i32)
define <vscale x 16 x i8> @splice_nxv16i8_offset_zero(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) #0 {
; CHECK-LABEL: splice_nxv16i8_offset_zero:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
- %res = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 0)
+ %res = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 0)
ret <vscale x 16 x i8> %res
}
@@ -580,7 +580,7 @@ define <vscale x 16 x i8> @splice_nxv16i8_offset_negone(<vscale x 16 x i8> %a, <
; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma
; CHECK-NEXT: vslideup.vi v8, v10, 1
; CHECK-NEXT: ret
- %res = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 -1)
+ %res = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 -1)
ret <vscale x 16 x i8> %res
}
@@ -596,7 +596,7 @@ define <vscale x 16 x i8> @splice_nxv16i8_offset_min(<vscale x 16 x i8> %a, <vsc
; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma
; CHECK-NEXT: vslideup.vx v8, v10, a1
; CHECK-NEXT: ret
- %res = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 -32)
+ %res = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 -32)
ret <vscale x 16 x i8> %res
}
@@ -611,17 +611,17 @@ define <vscale x 16 x i8> @splice_nxv16i8_offset_max(<vscale x 16 x i8> %a, <vsc
; CHECK-NEXT: vsetvli a1, zero, e8, m2, ta, ma
; CHECK-NEXT: vslideup.vx v8, v10, a0
; CHECK-NEXT: ret
- %res = call <vscale x 16 x i8> @llvm.experimental.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 31)
+ %res = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 31)
ret <vscale x 16 x i8> %res
}
-declare <vscale x 32 x i8> @llvm.experimental.vector.splice.nxv32i8(<vscale x 32 x i8>, <vscale x 32 x i8>, i32)
+declare <vscale x 32 x i8> @llvm.vector.splice.nxv32i8(<vscale x 32 x i8>, <vscale x 32 x i8>, i32)
define <vscale x 32 x i8> @splice_nxv32i8_offset_zero(<vscale x 32 x i8> %a, <vscale x 32 x i8> %b) #0 {
; CHECK-LABEL: splice_nxv32i8_offset_zero:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
- %res = call <vscale x 32 x i8> @llvm.experimental.vector.splice.nxv32i8(<vscale x 32 x i8> %a, <vscale x 32 x i8> %b, i32 0)
+ %res = call <vscale x 32 x i8> @llvm.vector.splice.nxv32i8(<vscale x 32 x i8> %a, <vscale x 32 x i8> %b, i32 0)
ret <vscale x 32 x i8> %res
}
@@ -636,7 +636,7 @@ define <vscale x 32 x i8> @splice_nxv32i8_offset_negone(<vscale x 32 x i8> %a, <
; CHECK-NEXT: vsetvli a0, zero, e8, m4, ta, ma
; CHECK-NEXT: vslideup.vi v8, v12, 1
; CHECK-NEXT: ret
- %res = call <vscale x 32 x i8> @llvm.experimental.vector.splice.nxv32i8(<vscale x 32 x i8> %a, <vscale x 32 x i8> %b, i32 -1)
+ %res = call <vscale x 32 x i8> @llvm.vector.splice.nxv32i8(<vscale x 32 x i8> %a, <vscale x 32 x i8> %b, i32 -1)
ret <vscale x 32 x i8> %res
}
@@ -652,7 +652,7 @@ define <vscale x 32 x i8> @splice_nxv32i8_offset_min(<vscale x 32 x i8> %a, <vsc
; CHECK-NEXT: vsetvli a0, zero, e8, m4, ta, ma
; CHECK-NEXT: vslideup.vx v8, v12, a1
; CHECK-NEXT: ret
- %res = call <vscale x 32 x i8> @llvm.experimental.vector.splice.nxv32i8(<vscale x 32 x i8> %a, <vscale x 32 x i8> %b, i32 -64)
+ %res = call <vscale x 32 x i8> @llvm.vector.splice.nxv32i8(<vscale x 32 x i8> %a, <vscale x 32 x i8> %b, i32 -64)
ret <vscale x 32 x i8> %res
}
@@ -668,17 +668,17 @@ define <vscale x 32 x i8> @splice_nxv32i8_offset_max(<vscale x 32 x i8> %a, <vsc
; CHECK-NEXT: vsetvli a1, zero, e8, m4, ta, ma
; CHECK-NEXT: vslideup.vx v8, v12, a0
; CHECK-NEXT: ret
- %res = call <vscale x 32 x i8> @llvm.experimental.vector.splice.nxv32i8(<vscale x 32 x i8> %a, <vscale x 32 x i8> %b, i32 63)
+ %res = call <vscale x 32 x i8> @llvm.vector.splice.nxv32i8(<vscale x 32 x i8> %a, <vscale x 32 x i8> %b, i32 63)
ret <vscale x 32 x i8> %res
}
-declare <vscale x 64 x i8> @llvm.experimental.vector.splice.nxv64i8(<vscale x 64 x i8>, <vscale x 64 x i8>, i32)
+declare <vscale x 64 x i8> @llvm.vector.splice.nxv64i8(<vscale x 64 x i8>, <vscale x 64 x i8>, i32)
define <vscale x 64 x i8> @splice_nxv64i8_offset_zero(<vscale x 64 x i8> %a, <vscale x 64 x i8> %b) #0 {
; CHECK-LABEL: splice_nxv64i8_offset_zero:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
- %res = call <vscale x 64 x i8> @llvm.experimental.vector.splice.nxv64i8(<vscale x 64 x i8> %a, <vscale x 64 x i8> %b, i32 0)
+ %res = call <vscale x 64 x i8> @llvm.vector.splice.nxv64i8(<vscale x 64 x i8> %a, <vscale x 64 x i8> %b, i32 0)
ret <vscale x 64 x i8> %res
}
@@ -693,7 +693,7 @@ define <vscale x 64 x i8> @splice_nxv64i8_offset_negone(<vscale x 64 x i8> %a, <
; CHECK-NEXT: vsetvli a0, zero, e8, m8, ta, ma
; CHECK-NEXT: vslideup.vi v8, v16, 1
; CHECK-NEXT: ret
- %res = call <vscale x 64 x i8> @llvm.experimental.vector.splice.nxv64i8(<vscale x 64 x i8> %a, <vscale x 64 x i8> %b, i32 -1)
+ %res = call <vscale x 64 x i8> @llvm.vector.splice.nxv64i8(<vscale x 64 x i8> %a, <vscale x 64 x i8> %b, i32 -1)
ret <vscale x 64 x i8> %res
}
@@ -709,7 +709,7 @@ define <vscale x 64 x i8> @splice_nxv64i8_offset_min(<vscale x 64 x i8> %a, <vsc
; CHECK-NEXT: vsetvli a0, zero, e8, m8, ta, ma
; CHECK-NEXT: vslideup.vx v8, v16, a1
; CHECK-NEXT: ret
- %res = call <vscale x 64 x i8> @llvm.experimental.vector.splice.nxv64i8(<vscale x 64 x i8> %a, <vscale x 64 x i8> %b, i32 -128)
+ %res = call <vscale x 64 x i8> @llvm.vector.splice.nxv64i8(<vscale x 64 x i8> %a, <vscale x 64 x i8> %b, i32 -128)
ret <vscale x 64 x i8> %res
}
@@ -725,17 +725,17 @@ define <vscale x 64 x i8> @splice_nxv64i8_offset_max(<vscale x 64 x i8> %a, <vsc
; CHECK-NEXT: vsetvli a1, zero, e8, m8, ta, ma
; CHECK-NEXT: vslideup.vx v8, v16, a0
; CHECK-NEXT: ret
- %res = call <vscale x 64 x i8> @llvm.experimental.vector.splice.nxv64i8(<vscale x 64 x i8> %a, <vscale x 64 x i8> %b, i32 127)
+ %res = call <vscale x 64 x i8> @llvm.vector.splice.nxv64i8(<vscale x 64 x i8> %a, <vscale x 64 x i8> %b, i32 127)
ret <vscale x 64 x i8> %res
}
-declare <vscale x 1 x i16> @llvm.experimental.vector.splice.nxv1i16(<vscale x 1 x i16>, <vscale x 1 x i16>, i32)
+declare <vscale x 1 x i16> @llvm.vector.splice.nxv1i16(<vscale x 1 x i16>, <vscale x 1 x i16>, i32)
define <vscale x 1 x i16> @splice_nxv1i16_offset_zero(<vscale x 1 x i16> %a, <vscale x 1 x i16> %b) #0 {
; CHECK-LABEL: splice_nxv1i16_offset_zero:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
- %res = call <vscale x 1 x i16> @llvm.experimental.vector.splice.nxv1i16(<vscale x 1 x i16> %a, <vscale x 1 x i16> %b, i32 0)
+ %res = call <vscale x 1 x i16> @llvm.vector.splice.nxv1i16(<vscale x 1 x i16> %a, <vscale x 1 x i16> %b, i32 0)
ret <vscale x 1 x i16> %res
}
@@ -750,7 +750,7 @@ define <vscale x 1 x i16> @splice_nxv1i16_offset_negone(<vscale x 1 x i16> %a, <
; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
; CHECK-NEXT: vslideup.vi v8, v9, 1
; CHECK-NEXT: ret
- %res = call <vscale x 1 x i16> @llvm.experimental.vector.splice.nxv1i16(<vscale x 1 x i16> %a, <vscale x 1 x i16> %b, i32 -1)
+ %res = call <vscale x 1 x i16> @llvm.vector.splice.nxv1i16(<vscale x 1 x i16> %a, <vscale x 1 x i16> %b, i32 -1)
ret <vscale x 1 x i16> %res
}
@@ -765,7 +765,7 @@ define <vscale x 1 x i16> @splice_nxv1i16_offset_min(<vscale x 1 x i16> %a, <vsc
; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
; CHECK-NEXT: vslideup.vi v8, v9, 2
; CHECK-NEXT: ret
- %res = call <vscale x 1 x i16> @llvm.experimental.vector.splice.nxv1i16(<vscale x 1 x i16> %a, <vscale x 1 x i16> %b, i32 -2)
+ %res = call <vscale x 1 x i16> @llvm.vector.splice.nxv1i16(<vscale x 1 x i16> %a, <vscale x 1 x i16> %b, i32 -2)
ret <vscale x 1 x i16> %res
}
@@ -780,17 +780,17 @@ define <vscale x 1 x i16> @splice_nxv1i16_offset_max(<vscale x 1 x i16> %a, <vsc
; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
; CHECK-NEXT: vslideup.vx v8, v9, a0
; CHECK-NEXT: ret
- %res = call <vscale x 1 x i16> @llvm.experimental.vector.splice.nxv1i16(<vscale x 1 x i16> %a, <vscale x 1 x i16> %b, i32 1)
+ %res = call <vscale x 1 x i16> @llvm.vector.splice.nxv1i16(<vscale x 1 x i16> %a, <vscale x 1 x i16> %b, i32 1)
ret <vscale x 1 x i16> %res
}
-declare <vscale x 2 x i16> @llvm.experimental.vector.splice.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i16>, i32)
+declare <vscale x 2 x i16> @llvm.vector.splice.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i16>, i32)
define <vscale x 2 x i16> @splice_nxv2i16_offset_zero(<vscale x 2 x i16> %a, <vscale x 2 x i16> %b) #0 {
; CHECK-LABEL: splice_nxv2i16_offset_zero:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
- %res = call <vscale x 2 x i16> @llvm.experimental.vector.splice.nxv2i16(<vscale x 2 x i16> %a, <vscale x 2 x i16> %b, i32 0)
+ %res = call <vscale x 2 x i16> @llvm.vector.splice.nxv2i16(<vscale x 2 x i16> %a, <vscale x 2 x i16> %b, i32 0)
ret <vscale x 2 x i16> %res
}
@@ -805,7 +805,7 @@ define <vscale x 2 x i16> @splice_nxv2i16_offset_negone(<vscale x 2 x i16> %a, <
; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
; CHECK-NEXT: vslideup.vi v8, v9, 1
; CHECK-NEXT: ret
- %res = call <vscale x 2 x i16> @llvm.experimental.vector.splice.nxv2i16(<vscale x 2 x i16> %a, <vscale x 2 x i16> %b, i32 -1)
+ %res = call <vscale x 2 x i16> @llvm.vector.splice.nxv2i16(<vscale x 2 x i16> %a, <vscale x 2 x i16> %b, i32 -1)
ret <vscale x 2 x i16> %res
}
@@ -820,7 +820,7 @@ define <vscale x 2 x i16> @splice_nxv2i16_offset_min(<vscale x 2 x i16> %a, <vsc
; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
; CHECK-NEXT: vslideup.vi v8, v9, 4
; CHECK-NEXT: ret
- %res = call <vscale x 2 x i16> @llvm.experimental.vector.splice.nxv2i16(<vscale x 2 x i16> %a, <vscale x 2 x i16> %b, i32 -4)
+ %res = call <vscale x 2 x i16> @llvm.vector.splice.nxv2i16(<vscale x 2 x i16> %a, <vscale x 2 x i16> %b, i32 -4)
ret <vscale x 2 x i16> %res
}
@@ -835,17 +835,17 @@ define <vscale x 2 x i16> @splice_nxv2i16_offset_max(<vscale x 2 x i16> %a, <vsc
; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
; CHECK-NEXT: vslideup.vx v8, v9, a0
; CHECK-NEXT: ret
- %res = call <vscale x 2 x i16> @llvm.experimental.vector.splice.nxv2i16(<vscale x 2 x i16> %a, <vscale x 2 x i16> %b, i32 3)
+ %res = call <vscale x 2 x i16> @llvm.vector.splice.nxv2i16(<vscale x 2 x i16> %a, <vscale x 2 x i16> %b, i32 3)
ret <vscale x 2 x i16> %res
}
-declare <vscale x 4 x i16> @llvm.experimental.vector.splice.nxv4i16(<vscale x 4 x i16>, <vscale x 4 x i16>, i32)
+declare <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16>, <vscale x 4 x i16>, i32)
define <vscale x 4 x i16> @splice_nxv4i16_offset_zero(<vscale x 4 x i16> %a, <vscale x 4 x i16> %b) #0 {
; CHECK-LABEL: splice_nxv4i16_offset_zero:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
- %res = call <vscale x 4 x i16> @llvm.experimental.vector.splice.nxv4i16(<vscale x 4 x i16> %a, <vscale x 4 x i16> %b, i32 0)
+ %res = call <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16> %a, <vscale x 4 x i16> %b, i32 0)
ret <vscale x 4 x i16> %res
}
@@ -860,7 +860,7 @@ define <vscale x 4 x i16> @splice_nxv4i16_offset_negone(<vscale x 4 x i16> %a, <
; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
; CHECK-NEXT: vslideup.vi v8, v9, 1
; CHECK-NEXT: ret
- %res = call <vscale x 4 x i16> @llvm.experimental.vector.splice.nxv4i16(<vscale x 4 x i16> %a, <vscale x 4 x i16> %b, i32 -1)
+ %res = call <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16> %a, <vscale x 4 x i16> %b, i32 -1)
ret <vscale x 4 x i16> %res
}
@@ -875,7 +875,7 @@ define <vscale x 4 x i16> @splice_nxv4i16_offset_min(<vscale x 4 x i16> %a, <vsc
; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
; CHECK-NEXT: vslideup.vi v8, v9, 8
; CHECK-NEXT: ret
- %res = call <vscale x 4 x i16> @llvm.experimental.vector.splice.nxv4i16(<vscale x 4 x i16> %a, <vscale x 4 x i16> %b, i32 -8)
+ %res = call <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16> %a, <vscale x 4 x i16> %b, i32 -8)
ret <vscale x 4 x i16> %res
}
@@ -890,17 +890,17 @@ define <vscale x 4 x i16> @splice_nxv4i16_offset_max(<vscale x 4 x i16> %a, <vsc
; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
; CHECK-NEXT: vslideup.vx v8, v9, a0
; CHECK-NEXT: ret
- %res = call <vscale x 4 x i16> @llvm.experimental.vector.splice.nxv4i16(<vscale x 4 x i16> %a, <vscale x 4 x i16> %b, i32 7)
+ %res = call <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16> %a, <vscale x 4 x i16> %b, i32 7)
ret <vscale x 4 x i16> %res
}
-declare <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, i32)
+declare <vscale x 8 x i16> @llvm.vector.splice.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, i32)
define <vscale x 8 x i16> @splice_nxv8i16_offset_zero(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) #0 {
; CHECK-LABEL: splice_nxv8i16_offset_zero:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
- %res = call <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, i32 0)
+ %res = call <vscale x 8 x i16> @llvm.vector.splice.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, i32 0)
ret <vscale x 8 x i16> %res
}
@@ -914,7 +914,7 @@ define <vscale x 8 x i16> @splice_nxv8i16_offset_negone(<vscale x 8 x i16> %a, <
; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
; CHECK-NEXT: vslideup.vi v8, v10, 1
; CHECK-NEXT: ret
- %res = call <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, i32 -1)
+ %res = call <vscale x 8 x i16> @llvm.vector.splice.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, i32 -1)
ret <vscale x 8 x i16> %res
}
@@ -928,7 +928,7 @@ define <vscale x 8 x i16> @splice_nxv8i16_offset_min(<vscale x 8 x i16> %a, <vsc
; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
; CHECK-NEXT: vslideup.vi v8, v10, 16
; CHECK-NEXT: ret
- %res = call <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, i32 -16)
+ %res = call <vscale x 8 x i16> @llvm.vector.splice.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, i32 -16)
ret <vscale x 8 x i16> %res
}
@@ -942,17 +942,17 @@ define <vscale x 8 x i16> @splice_nxv8i16_offset_max(<vscale x 8 x i16> %a, <vsc
; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
; CHECK-NEXT: vslideup.vx v8, v10, a0
; CHECK-NEXT: ret
- %res = call <vscale x 8 x i16> @llvm.experimental.vector.splice.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, i32 15)
+ %res = call <vscale x 8 x i16> @llvm.vector.splice.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, i32 15)
ret <vscale x 8 x i16> %res
}
-declare <vscale x 16 x i16> @llvm.experimental.vector.splice.nxv16i16(<vscale x 16 x i16>, <vscale x 16 x i16>, i32)
+declare <vscale x 16 x i16> @llvm.vector.splice.nxv16i16(<vscale x 16 x i16>, <vscale x 16 x i16>, i32)
define <vscale x 16 x i16> @splice_nxv16i16_offset_zero(<vscale x 16 x i16> %a, <vscale x 16 x i16> %b) #0 {
; CHECK-LABEL: splice_nxv16i16_offset_zero:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
- %res = call <vscale x 16 x i16> @llvm.experimental.vector.splice.nxv16i16(<vscale x 16 x i16> %a, <vscale x 16 x i16> %b, i32 0)
+ %res = call <vscale x 16 x i16> @llvm.vector.splice.nxv16i16(<vscale x 16 x i16> %a, <vscale x 16 x i16> %b, i32 0)
ret <vscale x 16 x i16> %res
}
@@ -967,7 +967,7 @@ define <vscale x 16 x i16> @splice_nxv16i16_offset_negone(<vscale x 16 x i16> %a
; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
; CHECK-NEXT: vslideup.vi v8, v12, 1
; CHECK-NEXT: ret
- %res = call <vscale x 16 x i16> @llvm.experimental.vector.splice.nxv16i16(<vscale x 16 x i16> %a, <vscale x 16 x i16> %b, i32 -1)
+ %res = call <vscale x 16 x i16> @llvm.vector.splice.nxv16i16(<vscale x 16 x i16> %a, <vscale x 16 x i16> %b, i32 -1)
ret <vscale x 16 x i16> %res
}
@@ -983,7 +983,7 @@ define <vscale x 16 x i16> @splice_nxv16i16_offset_min(<vscale x 16 x i16> %a, <
; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
; CHECK-NEXT: vslideup.vx v8, v12, a1
; CHECK-NEXT: ret
- %res = call <vscale x 16 x i16> @llvm.experimental.vector.splice.nxv16i16(<vscale x 16 x i16> %a, <vscale x 16 x i16> %b, i32 -32)
+ %res = call <vscale x 16 x i16> @llvm.vector.splice.nxv16i16(<vscale x 16 x i16> %a, <vscale x 16 x i16> %b, i32 -32)
ret <vscale x 16 x i16> %res
}
@@ -998,17 +998,17 @@ define <vscale x 16 x i16> @splice_nxv16i16_offset_max(<vscale x 16 x i16> %a, <
; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
; CHECK-NEXT: vslideup.vx v8, v12, a0
; CHECK-NEXT: ret
- %res = call <vscale x 16 x i16> @llvm.experimental.vector.splice.nxv16i16(<vscale x 16 x i16> %a, <vscale x 16 x i16> %b, i32 31)
+ %res = call <vscale x 16 x i16> @llvm.vector.splice.nxv16i16(<vscale x 16 x i16> %a, <vscale x 16 x i16> %b, i32 31)
ret <vscale x 16 x i16> %res
}
-declare <vscale x 32 x i16> @llvm.experimental.vector.splice.nxv32i16(<vscale x 32 x i16>, <vscale x 32 x i16>, i32)
+declare <vscale x 32 x i16> @llvm.vector.splice.nxv32i16(<vscale x 32 x i16>, <vscale x 32 x i16>, i32)
define <vscale x 32 x i16> @splice_nxv32i16_offset_zero(<vscale x 32 x i16> %a, <vscale x 32 x i16> %b) #0 {
; CHECK-LABEL: splice_nxv32i16_offset_zero:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
- %res = call <vscale x 32 x i16> @llvm.experimental.vector.splice.nxv32i16(<vscale x 32 x i16> %a, <vscale x 32 x i16> %b, i32 0)
+ %res = call <vscale x 32 x i16> @llvm.vector.splice.nxv32i16(<vscale x 32 x i16> %a, <vscale x 32 x i16> %b, i32 0)
ret <vscale x 32 x i16> %res
}
@@ -1023,7 +1023,7 @@ define <vscale x 32 x i16> @splice_nxv32i16_offset_negone(<vscale x 32 x i16> %a
; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, ma
; CHECK-NEXT: vslideup.vi v8, v16, 1
; CHECK-NEXT: ret
- %res = call <vscale x 32 x i16> @llvm.experimental.vector.splice.nxv32i16(<vscale x 32 x i16> %a, <vscale x 32 x i16> %b, i32 -1)
+ %res = call <vscale x 32 x i16> @llvm.vector.splice.nxv32i16(<vscale x 32 x i16> %a, <vscale x 32 x i16> %b, i32 -1)
ret <vscale x 32 x i16> %res
}
@@ -1039,7 +1039,7 @@ define <vscale x 32 x i16> @splice_nxv32i16_offset_min(<vscale x 32 x i16> %a, <
; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, ma
; CHECK-NEXT: vslideup.vx v8, v16, a1
; CHECK-NEXT: ret
- %res = call <vscale x 32 x i16> @llvm.experimental.vector.splice.nxv32i16(<vscale x 32 x i16> %a, <vscale x 32 x i16> %b, i32 -64)
+ %res = call <vscale x 32 x i16> @llvm.vector.splice.nxv32i16(<vscale x 32 x i16> %a, <vscale x 32 x i16> %b, i32 -64)
ret <vscale x 32 x i16> %res
}
@@ -1055,17 +1055,17 @@ define <vscale x 32 x i16> @splice_nxv32i16_offset_max(<vscale x 32 x i16> %a, <
; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma
; CHECK-NEXT: vslideup.vx v8, v16, a0
; CHECK-NEXT: ret
- %res = call <vscale x 32 x i16> @llvm.experimental.vector.splice.nxv32i16(<vscale x 32 x i16> %a, <vscale x 32 x i16> %b, i32 63)
+ %res = call <vscale x 32 x i16> @llvm.vector.splice.nxv32i16(<vscale x 32 x i16> %a, <vscale x 32 x i16> %b, i32 63)
ret <vscale x 32 x i16> %res
}
-declare <vscale x 1 x i32> @llvm.experimental.vector.splice.nxv1i32(<vscale x 1 x i32>, <vscale x 1 x i32>, i32)
+declare <vscale x 1 x i32> @llvm.vector.splice.nxv1i32(<vscale x 1 x i32>, <vscale x 1 x i32>, i32)
define <vscale x 1 x i32> @splice_nxv1i32_offset_zero(<vscale x 1 x i32> %a, <vscale x 1 x i32> %b) #0 {
; CHECK-LABEL: splice_nxv1i32_offset_zero:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
- %res = call <vscale x 1 x i32> @llvm.experimental.vector.splice.nxv1i32(<vscale x 1 x i32> %a, <vscale x 1 x i32> %b, i32 0)
+ %res = call <vscale x 1 x i32> @llvm.vector.splice.nxv1i32(<vscale x 1 x i32> %a, <vscale x 1 x i32> %b, i32 0)
ret <vscale x 1 x i32> %res
}
@@ -1080,7 +1080,7 @@ define <vscale x 1 x i32> @splice_nxv1i32_offset_negone(<vscale x 1 x i32> %a, <
; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, ma
; CHECK-NEXT: vslideup.vi v8, v9, 1
; CHECK-NEXT: ret
- %res = call <vscale x 1 x i32> @llvm.experimental.vector.splice.nxv1i32(<vscale x 1 x i32> %a, <vscale x 1 x i32> %b, i32 -1)
+ %res = call <vscale x 1 x i32> @llvm.vector.splice.nxv1i32(<vscale x 1 x i32> %a, <vscale x 1 x i32> %b, i32 -1)
ret <vscale x 1 x i32> %res
}
@@ -1095,7 +1095,7 @@ define <vscale x 1 x i32> @splice_nxv1i32_offset_min(<vscale x 1 x i32> %a, <vsc
; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, ma
; CHECK-NEXT: vslideup.vi v8, v9, 2
; CHECK-NEXT: ret
- %res = call <vscale x 1 x i32> @llvm.experimental.vector.splice.nxv1i32(<vscale x 1 x i32> %a, <vscale x 1 x i32> %b, i32 -2)
+ %res = call <vscale x 1 x i32> @llvm.vector.splice.nxv1i32(<vscale x 1 x i32> %a, <vscale x 1 x i32> %b, i32 -2)
ret <vscale x 1 x i32> %res
}
@@ -1110,17 +1110,17 @@ define <vscale x 1 x i32> @splice_nxv1i32_offset_max(<vscale x 1 x i32> %a, <vsc
; CHECK-NEXT: vsetvli a1, zero, e32, mf2, ta, ma
; CHECK-NEXT: vslideup.vx v8, v9, a0
; CHECK-NEXT: ret
- %res = call <vscale x 1 x i32> @llvm.experimental.vector.splice.nxv1i32(<vscale x 1 x i32> %a, <vscale x 1 x i32> %b, i32 1)
+ %res = call <vscale x 1 x i32> @llvm.vector.splice.nxv1i32(<vscale x 1 x i32> %a, <vscale x 1 x i32> %b, i32 1)
ret <vscale x 1 x i32> %res
}
-declare <vscale x 2 x i32> @llvm.experimental.vector.splice.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i32>, i32)
+declare <vscale x 2 x i32> @llvm.vector.splice.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i32>, i32)
define <vscale x 2 x i32> @splice_nxv2i32_offset_zero(<vscale x 2 x i32> %a, <vscale x 2 x i32> %b) #0 {
; CHECK-LABEL: splice_nxv2i32_offset_zero:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
- %res = call <vscale x 2 x i32> @llvm.experimental.vector.splice.nxv2i32(<vscale x 2 x i32> %a, <vscale x 2 x i32> %b, i32 0)
+ %res = call <vscale x 2 x i32> @llvm.vector.splice.nxv2i32(<vscale x 2 x i32> %a, <vscale x 2 x i32> %b, i32 0)
ret <vscale x 2 x i32> %res
}
@@ -1135,7 +1135,7 @@ define <vscale x 2 x i32> @splice_nxv2i32_offset_negone(<vscale x 2 x i32> %a, <
; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma
; CHECK-NEXT: vslideup.vi v8, v9, 1
; CHECK-NEXT: ret
- %res = call <vscale x 2 x i32> @llvm.experimental.vector.splice.nxv2i32(<vscale x 2 x i32> %a, <vscale x 2 x i32> %b, i32 -1)
+ %res = call <vscale x 2 x i32> @llvm.vector.splice.nxv2i32(<vscale x 2 x i32> %a, <vscale x 2 x i32> %b, i32 -1)
ret <vscale x 2 x i32> %res
}
@@ -1150,7 +1150,7 @@ define <vscale x 2 x i32> @splice_nxv2i32_offset_min(<vscale x 2 x i32> %a, <vsc
; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma
; CHECK-NEXT: vslideup.vi v8, v9, 4
; CHECK-NEXT: ret
- %res = call <vscale x 2 x i32> @llvm.experimental.vector.splice.nxv2i32(<vscale x 2 x i32> %a, <vscale x 2 x i32> %b, i32 -4)
+ %res = call <vscale x 2 x i32> @llvm.vector.splice.nxv2i32(<vscale x 2 x i32> %a, <vscale x 2 x i32> %b, i32 -4)
ret <vscale x 2 x i32> %res
}
@@ -1165,17 +1165,17 @@ define <vscale x 2 x i32> @splice_nxv2i32_offset_max(<vscale x 2 x i32> %a, <vsc
; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma
; CHECK-NEXT: vslideup.vx v8, v9, a0
; CHECK-NEXT: ret
- %res = call <vscale x 2 x i32> @llvm.experimental.vector.splice.nxv2i32(<vscale x 2 x i32> %a, <vscale x 2 x i32> %b, i32 3)
+ %res = call <vscale x 2 x i32> @llvm.vector.splice.nxv2i32(<vscale x 2 x i32> %a, <vscale x 2 x i32> %b, i32 3)
ret <vscale x 2 x i32> %res
}
-declare <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, i32)
+declare <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, i32)
define <vscale x 4 x i32> @splice_nxv4i32_offset_zero(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) #0 {
; CHECK-LABEL: splice_nxv4i32_offset_zero:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
- %res = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 0)
+ %res = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 0)
ret <vscale x 4 x i32> %res
}
@@ -1190,7 +1190,7 @@ define <vscale x 4 x i32> @splice_nxv4i32_offset_negone(<vscale x 4 x i32> %a, <
; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma
; CHECK-NEXT: vslideup.vi v8, v10, 1
; CHECK-NEXT: ret
- %res = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 -1)
+ %res = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 -1)
ret <vscale x 4 x i32> %res
}
@@ -1205,7 +1205,7 @@ define <vscale x 4 x i32> @splice_nxv4i32_offset_min(<vscale x 4 x i32> %a, <vsc
; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma
; CHECK-NEXT: vslideup.vi v8, v10, 8
; CHECK-NEXT: ret
- %res = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 -8)
+ %res = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 -8)
ret <vscale x 4 x i32> %res
}
@@ -1220,17 +1220,17 @@ define <vscale x 4 x i32> @splice_nxv4i32_offset_max(<vscale x 4 x i32> %a, <vsc
; CHECK-NEXT: vsetvli a1, zero, e32, m2, ta, ma
; CHECK-NEXT: vslideup.vx v8, v10, a0
; CHECK-NEXT: ret
- %res = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 7)
+ %res = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 7)
ret <vscale x 4 x i32> %res
}
-declare <vscale x 8 x i32> @llvm.experimental.vector.splice.nxv8i32(<vscale x 8 x i32>, <vscale x 8 x i32>, i32)
+declare <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32>, <vscale x 8 x i32>, i32)
define <vscale x 8 x i32> @splice_nxv8i32_offset_zero(<vscale x 8 x i32> %a, <vscale x 8 x i32> %b) #0 {
; CHECK-LABEL: splice_nxv8i32_offset_zero:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
- %res = call <vscale x 8 x i32> @llvm.experimental.vector.splice.nxv8i32(<vscale x 8 x i32> %a, <vscale x 8 x i32> %b, i32 0)
+ %res = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> %a, <vscale x 8 x i32> %b, i32 0)
ret <vscale x 8 x i32> %res
}
@@ -1244,7 +1244,7 @@ define <vscale x 8 x i32> @splice_nxv8i32_offset_negone(<vscale x 8 x i32> %a, <
; CHECK-NEXT: vsetvli a0, zero, e32, m4, ta, ma
; CHECK-NEXT: vslideup.vi v8, v12, 1
; CHECK-NEXT: ret
- %res = call <vscale x 8 x i32> @llvm.experimental.vector.splice.nxv8i32(<vscale x 8 x i32> %a, <vscale x 8 x i32> %b, i32 -1)
+ %res = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> %a, <vscale x 8 x i32> %b, i32 -1)
ret <vscale x 8 x i32> %res
}
@@ -1258,7 +1258,7 @@ define <vscale x 8 x i32> @splice_nxv8i32_offset_min(<vscale x 8 x i32> %a, <vsc
; CHECK-NEXT: vsetvli a0, zero, e32, m4, ta, ma
; CHECK-NEXT: vslideup.vi v8, v12, 16
; CHECK-NEXT: ret
- %res = call <vscale x 8 x i32> @llvm.experimental.vector.splice.nxv8i32(<vscale x 8 x i32> %a, <vscale x 8 x i32> %b, i32 -16)
+ %res = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> %a, <vscale x 8 x i32> %b, i32 -16)
ret <vscale x 8 x i32> %res
}
@@ -1272,17 +1272,17 @@ define <vscale x 8 x i32> @splice_nxv8i32_offset_max(<vscale x 8 x i32> %a, <vsc
; CHECK-NEXT: vsetvli a1, zero, e32, m4, ta, ma
; CHECK-NEXT: vslideup.vx v8, v12, a0
; CHECK-NEXT: ret
- %res = call <vscale x 8 x i32> @llvm.experimental.vector.splice.nxv8i32(<vscale x 8 x i32> %a, <vscale x 8 x i32> %b, i32 15)
+ %res = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> %a, <vscale x 8 x i32> %b, i32 15)
ret <vscale x 8 x i32> %res
}
-declare <vscale x 16 x i32> @llvm.experimental.vector.splice.nxv16i32(<vscale x 16 x i32>, <vscale x 16 x i32>, i32)
+declare <vscale x 16 x i32> @llvm.vector.splice.nxv16i32(<vscale x 16 x i32>, <vscale x 16 x i32>, i32)
define <vscale x 16 x i32> @splice_nxv16i32_offset_zero(<vscale x 16 x i32> %a, <vscale x 16 x i32> %b) #0 {
; CHECK-LABEL: splice_nxv16i32_offset_zero:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
- %res = call <vscale x 16 x i32> @llvm.experimental.vector.splice.nxv16i32(<vscale x 16 x i32> %a, <vscale x 16 x i32> %b, i32 0)
+ %res = call <vscale x 16 x i32> @llvm.vector.splice.nxv16i32(<vscale x 16 x i32> %a, <vscale x 16 x i32> %b, i32 0)
ret <vscale x 16 x i32> %res
}
@@ -1297,7 +1297,7 @@ define <vscale x 16 x i32> @splice_nxv16i32_offset_negone(<vscale x 16 x i32> %a
; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, ma
; CHECK-NEXT: vslideup.vi v8, v16, 1
; CHECK-NEXT: ret
- %res = call <vscale x 16 x i32> @llvm.experimental.vector.splice.nxv16i32(<vscale x 16 x i32> %a, <vscale x 16 x i32> %b, i32 -1)
+ %res = call <vscale x 16 x i32> @llvm.vector.splice.nxv16i32(<vscale x 16 x i32> %a, <vscale x 16 x i32> %b, i32 -1)
ret <vscale x 16 x i32> %res
}
@@ -1313,7 +1313,7 @@ define <vscale x 16 x i32> @splice_nxv16i32_offset_min(<vscale x 16 x i32> %a, <
; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, ma
; CHECK-NEXT: vslideup.vx v8, v16, a1
; CHECK-NEXT: ret
- %res = call <vscale x 16 x i32> @llvm.experimental.vector.splice.nxv16i32(<vscale x 16 x i32> %a, <vscale x 16 x i32> %b, i32 -32)
+ %res = call <vscale x 16 x i32> @llvm.vector.splice.nxv16i32(<vscale x 16 x i32> %a, <vscale x 16 x i32> %b, i32 -32)
ret <vscale x 16 x i32> %res
}
@@ -1328,17 +1328,17 @@ define <vscale x 16 x i32> @splice_nxv16i32_offset_max(<vscale x 16 x i32> %a, <
; CHECK-NEXT: vsetvli a1, zero, e32, m8, ta, ma
; CHECK-NEXT: vslideup.vx v8, v16, a0
; CHECK-NEXT: ret
- %res = call <vscale x 16 x i32> @llvm.experimental.vector.splice.nxv16i32(<vscale x 16 x i32> %a, <vscale x 16 x i32> %b, i32 31)
+ %res = call <vscale x 16 x i32> @llvm.vector.splice.nxv16i32(<vscale x 16 x i32> %a, <vscale x 16 x i32> %b, i32 31)
ret <vscale x 16 x i32> %res
}
-declare <vscale x 1 x i64> @llvm.experimental.vector.splice.nxv1i64(<vscale x 1 x i64>, <vscale x 1 x i64>, i32)
+declare <vscale x 1 x i64> @llvm.vector.splice.nxv1i64(<vscale x 1 x i64>, <vscale x 1 x i64>, i32)
define <vscale x 1 x i64> @splice_nxv1i64_offset_zero(<vscale x 1 x i64> %a, <vscale x 1 x i64> %b) #0 {
; CHECK-LABEL: splice_nxv1i64_offset_zero:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
- %res = call <vscale x 1 x i64> @llvm.experimental.vector.splice.nxv1i64(<vscale x 1 x i64> %a, <vscale x 1 x i64> %b, i32 0)
+ %res = call <vscale x 1 x i64> @llvm.vector.splice.nxv1i64(<vscale x 1 x i64> %a, <vscale x 1 x i64> %b, i32 0)
ret <vscale x 1 x i64> %res
}
@@ -1353,7 +1353,7 @@ define <vscale x 1 x i64> @splice_nxv1i64_offset_negone(<vscale x 1 x i64> %a, <
; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma
; CHECK-NEXT: vslideup.vi v8, v9, 1
; CHECK-NEXT: ret
- %res = call <vscale x 1 x i64> @llvm.experimental.vector.splice.nxv1i64(<vscale x 1 x i64> %a, <vscale x 1 x i64> %b, i32 -1)
+ %res = call <vscale x 1 x i64> @llvm.vector.splice.nxv1i64(<vscale x 1 x i64> %a, <vscale x 1 x i64> %b, i32 -1)
ret <vscale x 1 x i64> %res
}
@@ -1368,7 +1368,7 @@ define <vscale x 1 x i64> @splice_nxv1i64_offset_min(<vscale x 1 x i64> %a, <vsc
; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma
; CHECK-NEXT: vslideup.vi v8, v9, 2
; CHECK-NEXT: ret
- %res = call <vscale x 1 x i64> @llvm.experimental.vector.splice.nxv1i64(<vscale x 1 x i64> %a, <vscale x 1 x i64> %b, i32 -2)
+ %res = call <vscale x 1 x i64> @llvm.vector.splice.nxv1i64(<vscale x 1 x i64> %a, <vscale x 1 x i64> %b, i32 -2)
ret <vscale x 1 x i64> %res
}
@@ -1383,17 +1383,17 @@ define <vscale x 1 x i64> @splice_nxv1i64_offset_max(<vscale x 1 x i64> %a, <vsc
; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, ma
; CHECK-NEXT: vslideup.vx v8, v9, a0
; CHECK-NEXT: ret
- %res = call <vscale x 1 x i64> @llvm.experimental.vector.splice.nxv1i64(<vscale x 1 x i64> %a, <vscale x 1 x i64> %b, i32 1)
+ %res = call <vscale x 1 x i64> @llvm.vector.splice.nxv1i64(<vscale x 1 x i64> %a, <vscale x 1 x i64> %b, i32 1)
ret <vscale x 1 x i64> %res
}
-declare <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, i32)
+declare <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, i32)
define <vscale x 2 x i64> @splice_nxv2i64_offset_zero(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) #0 {
; CHECK-LABEL: splice_nxv2i64_offset_zero:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
- %res = call <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, i32 0)
+ %res = call <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, i32 0)
ret <vscale x 2 x i64> %res
}
@@ -1408,7 +1408,7 @@ define <vscale x 2 x i64> @splice_nxv2i64_offset_negone(<vscale x 2 x i64> %a, <
; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma
; CHECK-NEXT: vslideup.vi v8, v10, 1
; CHECK-NEXT: ret
- %res = call <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, i32 -1)
+ %res = call <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, i32 -1)
ret <vscale x 2 x i64> %res
}
@@ -1423,7 +1423,7 @@ define <vscale x 2 x i64> @splice_nxv2i64_offset_min(<vscale x 2 x i64> %a, <vsc
; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma
; CHECK-NEXT: vslideup.vi v8, v10, 4
; CHECK-NEXT: ret
- %res = call <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, i32 -4)
+ %res = call <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, i32 -4)
ret <vscale x 2 x i64> %res
}
@@ -1438,17 +1438,17 @@ define <vscale x 2 x i64> @splice_nxv2i64_offset_max(<vscale x 2 x i64> %a, <vsc
; CHECK-NEXT: vsetvli a1, zero, e64, m2, ta, ma
; CHECK-NEXT: vslideup.vx v8, v10, a0
; CHECK-NEXT: ret
- %res = call <vscale x 2 x i64> @llvm.experimental.vector.splice.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, i32 3)
+ %res = call <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, i32 3)
ret <vscale x 2 x i64> %res
}
-declare <vscale x 4 x i64> @llvm.experimental.vector.splice.nxv4i64(<vscale x 4 x i64>, <vscale x 4 x i64>, i32)
+declare <vscale x 4 x i64> @llvm.vector.splice.nxv4i64(<vscale x 4 x i64>, <vscale x 4 x i64>, i32)
define <vscale x 4 x i64> @splice_nxv4i64_offset_zero(<vscale x 4 x i64> %a, <vscale x 4 x i64> %b) #0 {
; CHECK-LABEL: splice_nxv4i64_offset_zero:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
- %res = call <vscale x 4 x i64> @llvm.experimental.vector.splice.nxv4i64(<vscale x 4 x i64> %a, <vscale x 4 x i64> %b, i32 0)
+ %res = call <vscale x 4 x i64> @llvm.vector.splice.nxv4i64(<vscale x 4 x i64> %a, <vscale x 4 x i64> %b, i32 0)
ret <vscale x 4 x i64> %res
}
@@ -1463,7 +1463,7 @@ define <vscale x 4 x i64> @splice_nxv4i64_offset_negone(<vscale x 4 x i64> %a, <
; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma
; CHECK-NEXT: vslideup.vi v8, v12, 1
; CHECK-NEXT: ret
- %res = call <vscale x 4 x i64> @llvm.experimental.vector.splice.nxv4i64(<vscale x 4 x i64> %a, <vscale x 4 x i64> %b, i32 -1)
+ %res = call <vscale x 4 x i64> @llvm.vector.splice.nxv4i64(<vscale x 4 x i64> %a, <vscale x 4 x i64> %b, i32 -1)
ret <vscale x 4 x i64> %res
}
@@ -1478,7 +1478,7 @@ define <vscale x 4 x i64> @splice_nxv4i64_offset_min(<vscale x 4 x i64> %a, <vsc
; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma
; CHECK-NEXT: vslideup.vi v8, v12, 8
; CHECK-NEXT: ret
- %res = call <vscale x 4 x i64> @llvm.experimental.vector.splice.nxv4i64(<vscale x 4 x i64> %a, <vscale x 4 x i64> %b, i32 -8)
+ %res = call <vscale x 4 x i64> @llvm.vector.splice.nxv4i64(<vscale x 4 x i64> %a, <vscale x 4 x i64> %b, i32 -8)
ret <vscale x 4 x i64> %res
}
@@ -1493,17 +1493,17 @@ define <vscale x 4 x i64> @splice_nxv4i64_offset_max(<vscale x 4 x i64> %a, <vsc
; CHECK-NEXT: vsetvli a1, zero, e64, m4, ta, ma
; CHECK-NEXT: vslideup.vx v8, v12, a0
; CHECK-NEXT: ret
- %res = call <vscale x 4 x i64> @llvm.experimental.vector.splice.nxv4i64(<vscale x 4 x i64> %a, <vscale x 4 x i64> %b, i32 7)
+ %res = call <vscale x 4 x i64> @llvm.vector.splice.nxv4i64(<vscale x 4 x i64> %a, <vscale x 4 x i64> %b, i32 7)
ret <vscale x 4 x i64> %res
}
-declare <vscale x 8 x i64> @llvm.experimental.vector.splice.nxv8i64(<vscale x 8 x i64>, <vscale x 8 x i64>, i32)
+declare <vscale x 8 x i64> @llvm.vector.splice.nxv8i64(<vscale x 8 x i64>, <vscale x 8 x i64>, i32)
define <vscale x 8 x i64> @splice_nxv8i64_offset_zero(<vscale x 8 x i64> %a, <vscale x 8 x i64> %b) #0 {
; CHECK-LABEL: splice_nxv8i64_offset_zero:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
- %res = call <vscale x 8 x i64> @llvm.experimental.vector.splice.nxv8i64(<vscale x 8 x i64> %a, <vscale x 8 x i64> %b, i32 0)
+ %res = call <vscale x 8 x i64> @llvm.vector.splice.nxv8i64(<vscale x 8 x i64> %a, <vscale x 8 x i64> %b, i32 0)
ret <vscale x 8 x i64> %res
}
@@ -1517,7 +1517,7 @@ define <vscale x 8 x i64> @splice_nxv8i64_offset_negone(<vscale x 8 x i64> %a, <
; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma
; CHECK-NEXT: vslideup.vi v8, v16, 1
; CHECK-NEXT: ret
- %res = call <vscale x 8 x i64> @llvm.experimental.vector.splice.nxv8i64(<vscale x 8 x i64> %a, <vscale x 8 x i64> %b, i32 -1)
+ %res = call <vscale x 8 x i64> @llvm.vector.splice.nxv8i64(<vscale x 8 x i64> %a, <vscale x 8 x i64> %b, i32 -1)
ret <vscale x 8 x i64> %res
}
@@ -1531,7 +1531,7 @@ define <vscale x 8 x i64> @splice_nxv8i64_offset_min(<vscale x 8 x i64> %a, <vsc
; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma
; CHECK-NEXT: vslideup.vi v8, v16, 16
; CHECK-NEXT: ret
- %res = call <vscale x 8 x i64> @llvm.experimental.vector.splice.nxv8i64(<vscale x 8 x i64> %a, <vscale x 8 x i64> %b, i32 -16)
+ %res = call <vscale x 8 x i64> @llvm.vector.splice.nxv8i64(<vscale x 8 x i64> %a, <vscale x 8 x i64> %b, i32 -16)
ret <vscale x 8 x i64> %res
}
@@ -1545,17 +1545,17 @@ define <vscale x 8 x i64> @splice_nxv8i64_offset_max(<vscale x 8 x i64> %a, <vsc
; CHECK-NEXT: vsetvli a1, zero, e64, m8, ta, ma
; CHECK-NEXT: vslideup.vx v8, v16, a0
; CHECK-NEXT: ret
- %res = call <vscale x 8 x i64> @llvm.experimental.vector.splice.nxv8i64(<vscale x 8 x i64> %a, <vscale x 8 x i64> %b, i32 15)
+ %res = call <vscale x 8 x i64> @llvm.vector.splice.nxv8i64(<vscale x 8 x i64> %a, <vscale x 8 x i64> %b, i32 15)
ret <vscale x 8 x i64> %res
}
-declare <vscale x 1 x half> @llvm.experimental.vector.splice.nxv1f16(<vscale x 1 x half>, <vscale x 1 x half>, i32)
+declare <vscale x 1 x half> @llvm.vector.splice.nxv1f16(<vscale x 1 x half>, <vscale x 1 x half>, i32)
define <vscale x 1 x half> @splice_nxv1f16_offset_zero(<vscale x 1 x half> %a, <vscale x 1 x half> %b) #0 {
; CHECK-LABEL: splice_nxv1f16_offset_zero:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
- %res = call <vscale x 1 x half> @llvm.experimental.vector.splice.nxv1f16(<vscale x 1 x half> %a, <vscale x 1 x half> %b, i32 0)
+ %res = call <vscale x 1 x half> @llvm.vector.splice.nxv1f16(<vscale x 1 x half> %a, <vscale x 1 x half> %b, i32 0)
ret <vscale x 1 x half> %res
}
@@ -1570,7 +1570,7 @@ define <vscale x 1 x half> @splice_nxv1f16_offset_negone(<vscale x 1 x half> %a,
; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
; CHECK-NEXT: vslideup.vi v8, v9, 1
; CHECK-NEXT: ret
- %res = call <vscale x 1 x half> @llvm.experimental.vector.splice.nxv1f16(<vscale x 1 x half> %a, <vscale x 1 x half> %b, i32 -1)
+ %res = call <vscale x 1 x half> @llvm.vector.splice.nxv1f16(<vscale x 1 x half> %a, <vscale x 1 x half> %b, i32 -1)
ret <vscale x 1 x half> %res
}
@@ -1585,7 +1585,7 @@ define <vscale x 1 x half> @splice_nxv1f16_offset_min(<vscale x 1 x half> %a, <v
; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
; CHECK-NEXT: vslideup.vi v8, v9, 2
; CHECK-NEXT: ret
- %res = call <vscale x 1 x half> @llvm.experimental.vector.splice.nxv1f16(<vscale x 1 x half> %a, <vscale x 1 x half> %b, i32 -2)
+ %res = call <vscale x 1 x half> @llvm.vector.splice.nxv1f16(<vscale x 1 x half> %a, <vscale x 1 x half> %b, i32 -2)
ret <vscale x 1 x half> %res
}
@@ -1600,17 +1600,17 @@ define <vscale x 1 x half> @splice_nxv1f16_offset_max(<vscale x 1 x half> %a, <v
; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
; CHECK-NEXT: vslideup.vx v8, v9, a0
; CHECK-NEXT: ret
- %res = call <vscale x 1 x half> @llvm.experimental.vector.splice.nxv1f16(<vscale x 1 x half> %a, <vscale x 1 x half> %b, i32 1)
+ %res = call <vscale x 1 x half> @llvm.vector.splice.nxv1f16(<vscale x 1 x half> %a, <vscale x 1 x half> %b, i32 1)
ret <vscale x 1 x half> %res
}
-declare <vscale x 2 x half> @llvm.experimental.vector.splice.nxv2f16(<vscale x 2 x half>, <vscale x 2 x half>, i32)
+declare <vscale x 2 x half> @llvm.vector.splice.nxv2f16(<vscale x 2 x half>, <vscale x 2 x half>, i32)
define <vscale x 2 x half> @splice_nxv2f16_offset_zero(<vscale x 2 x half> %a, <vscale x 2 x half> %b) #0 {
; CHECK-LABEL: splice_nxv2f16_offset_zero:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
- %res = call <vscale x 2 x half> @llvm.experimental.vector.splice.nxv2f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b, i32 0)
+ %res = call <vscale x 2 x half> @llvm.vector.splice.nxv2f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b, i32 0)
ret <vscale x 2 x half> %res
}
@@ -1625,7 +1625,7 @@ define <vscale x 2 x half> @splice_nxv2f16_offset_negone(<vscale x 2 x half> %a,
; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
; CHECK-NEXT: vslideup.vi v8, v9, 1
; CHECK-NEXT: ret
- %res = call <vscale x 2 x half> @llvm.experimental.vector.splice.nxv2f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b, i32 -1)
+ %res = call <vscale x 2 x half> @llvm.vector.splice.nxv2f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b, i32 -1)
ret <vscale x 2 x half> %res
}
@@ -1640,7 +1640,7 @@ define <vscale x 2 x half> @splice_nxv2f16_offset_min(<vscale x 2 x half> %a, <v
; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
; CHECK-NEXT: vslideup.vi v8, v9, 4
; CHECK-NEXT: ret
- %res = call <vscale x 2 x half> @llvm.experimental.vector.splice.nxv2f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b, i32 -4)
+ %res = call <vscale x 2 x half> @llvm.vector.splice.nxv2f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b, i32 -4)
ret <vscale x 2 x half> %res
}
@@ -1655,17 +1655,17 @@ define <vscale x 2 x half> @splice_nxv2f16_offset_max(<vscale x 2 x half> %a, <v
; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
; CHECK-NEXT: vslideup.vx v8, v9, a0
; CHECK-NEXT: ret
- %res = call <vscale x 2 x half> @llvm.experimental.vector.splice.nxv2f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b, i32 3)
+ %res = call <vscale x 2 x half> @llvm.vector.splice.nxv2f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b, i32 3)
ret <vscale x 2 x half> %res
}
-declare <vscale x 4 x half> @llvm.experimental.vector.splice.nxv4f16(<vscale x 4 x half>, <vscale x 4 x half>, i32)
+declare <vscale x 4 x half> @llvm.vector.splice.nxv4f16(<vscale x 4 x half>, <vscale x 4 x half>, i32)
define <vscale x 4 x half> @splice_nxv4f16_offset_zero(<vscale x 4 x half> %a, <vscale x 4 x half> %b) #0 {
; CHECK-LABEL: splice_nxv4f16_offset_zero:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
- %res = call <vscale x 4 x half> @llvm.experimental.vector.splice.nxv4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b, i32 0)
+ %res = call <vscale x 4 x half> @llvm.vector.splice.nxv4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b, i32 0)
ret <vscale x 4 x half> %res
}
@@ -1680,7 +1680,7 @@ define <vscale x 4 x half> @splice_nxv4f16_offset_negone(<vscale x 4 x half> %a,
; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
; CHECK-NEXT: vslideup.vi v8, v9, 1
; CHECK-NEXT: ret
- %res = call <vscale x 4 x half> @llvm.experimental.vector.splice.nxv4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b, i32 -1)
+ %res = call <vscale x 4 x half> @llvm.vector.splice.nxv4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b, i32 -1)
ret <vscale x 4 x half> %res
}
@@ -1695,7 +1695,7 @@ define <vscale x 4 x half> @splice_nxv4f16_offset_min(<vscale x 4 x half> %a, <v
; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
; CHECK-NEXT: vslideup.vi v8, v9, 8
; CHECK-NEXT: ret
- %res = call <vscale x 4 x half> @llvm.experimental.vector.splice.nxv4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b, i32 -8)
+ %res = call <vscale x 4 x half> @llvm.vector.splice.nxv4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b, i32 -8)
ret <vscale x 4 x half> %res
}
@@ -1710,17 +1710,17 @@ define <vscale x 4 x half> @splice_nxv4f16_offset_max(<vscale x 4 x half> %a, <v
; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
; CHECK-NEXT: vslideup.vx v8, v9, a0
; CHECK-NEXT: ret
- %res = call <vscale x 4 x half> @llvm.experimental.vector.splice.nxv4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b, i32 7)
+ %res = call <vscale x 4 x half> @llvm.vector.splice.nxv4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b, i32 7)
ret <vscale x 4 x half> %res
}
-declare <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, i32)
+declare <vscale x 8 x half> @llvm.vector.splice.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, i32)
define <vscale x 8 x half> @splice_nxv8f16_offset_zero(<vscale x 8 x half> %a, <vscale x 8 x half> %b) #0 {
; CHECK-LABEL: splice_nxv8f16_offset_zero:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
- %res = call <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, i32 0)
+ %res = call <vscale x 8 x half> @llvm.vector.splice.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, i32 0)
ret <vscale x 8 x half> %res
}
@@ -1734,7 +1734,7 @@ define <vscale x 8 x half> @splice_nxv8f16_offset_negone(<vscale x 8 x half> %a,
; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
; CHECK-NEXT: vslideup.vi v8, v10, 1
; CHECK-NEXT: ret
- %res = call <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, i32 -1)
+ %res = call <vscale x 8 x half> @llvm.vector.splice.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, i32 -1)
ret <vscale x 8 x half> %res
}
@@ -1748,7 +1748,7 @@ define <vscale x 8 x half> @splice_nxv8f16_offset_min(<vscale x 8 x half> %a, <v
; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
; CHECK-NEXT: vslideup.vi v8, v10, 16
; CHECK-NEXT: ret
- %res = call <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, i32 -16)
+ %res = call <vscale x 8 x half> @llvm.vector.splice.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, i32 -16)
ret <vscale x 8 x half> %res
}
@@ -1762,17 +1762,17 @@ define <vscale x 8 x half> @splice_nxv8f16_offset_max(<vscale x 8 x half> %a, <v
; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
; CHECK-NEXT: vslideup.vx v8, v10, a0
; CHECK-NEXT: ret
- %res = call <vscale x 8 x half> @llvm.experimental.vector.splice.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, i32 15)
+ %res = call <vscale x 8 x half> @llvm.vector.splice.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, i32 15)
ret <vscale x 8 x half> %res
}
-declare <vscale x 16 x half> @llvm.experimental.vector.splice.nxv16f16(<vscale x 16 x half>, <vscale x 16 x half>, i32)
+declare <vscale x 16 x half> @llvm.vector.splice.nxv16f16(<vscale x 16 x half>, <vscale x 16 x half>, i32)
define <vscale x 16 x half> @splice_nxv16f16_offset_zero(<vscale x 16 x half> %a, <vscale x 16 x half> %b) #0 {
; CHECK-LABEL: splice_nxv16f16_offset_zero:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
- %res = call <vscale x 16 x half> @llvm.experimental.vector.splice.nxv16f16(<vscale x 16 x half> %a, <vscale x 16 x half> %b, i32 0)
+ %res = call <vscale x 16 x half> @llvm.vector.splice.nxv16f16(<vscale x 16 x half> %a, <vscale x 16 x half> %b, i32 0)
ret <vscale x 16 x half> %res
}
@@ -1787,7 +1787,7 @@ define <vscale x 16 x half> @splice_nxv16f16_offset_negone(<vscale x 16 x half>
; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
; CHECK-NEXT: vslideup.vi v8, v12, 1
; CHECK-NEXT: ret
- %res = call <vscale x 16 x half> @llvm.experimental.vector.splice.nxv16f16(<vscale x 16 x half> %a, <vscale x 16 x half> %b, i32 -1)
+ %res = call <vscale x 16 x half> @llvm.vector.splice.nxv16f16(<vscale x 16 x half> %a, <vscale x 16 x half> %b, i32 -1)
ret <vscale x 16 x half> %res
}
@@ -1803,7 +1803,7 @@ define <vscale x 16 x half> @splice_nxv16f16_offset_min(<vscale x 16 x half> %a,
; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
; CHECK-NEXT: vslideup.vx v8, v12, a1
; CHECK-NEXT: ret
- %res = call <vscale x 16 x half> @llvm.experimental.vector.splice.nxv16f16(<vscale x 16 x half> %a, <vscale x 16 x half> %b, i32 -32)
+ %res = call <vscale x 16 x half> @llvm.vector.splice.nxv16f16(<vscale x 16 x half> %a, <vscale x 16 x half> %b, i32 -32)
ret <vscale x 16 x half> %res
}
@@ -1818,17 +1818,17 @@ define <vscale x 16 x half> @splice_nxv16f16_offset_max(<vscale x 16 x half> %a,
; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
; CHECK-NEXT: vslideup.vx v8, v12, a0
; CHECK-NEXT: ret
- %res = call <vscale x 16 x half> @llvm.experimental.vector.splice.nxv16f16(<vscale x 16 x half> %a, <vscale x 16 x half> %b, i32 31)
+ %res = call <vscale x 16 x half> @llvm.vector.splice.nxv16f16(<vscale x 16 x half> %a, <vscale x 16 x half> %b, i32 31)
ret <vscale x 16 x half> %res
}
-declare <vscale x 32 x half> @llvm.experimental.vector.splice.nxv32f16(<vscale x 32 x half>, <vscale x 32 x half>, i32)
+declare <vscale x 32 x half> @llvm.vector.splice.nxv32f16(<vscale x 32 x half>, <vscale x 32 x half>, i32)
define <vscale x 32 x half> @splice_nxv32f16_offset_zero(<vscale x 32 x half> %a, <vscale x 32 x half> %b) #0 {
; CHECK-LABEL: splice_nxv32f16_offset_zero:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
- %res = call <vscale x 32 x half> @llvm.experimental.vector.splice.nxv32f16(<vscale x 32 x half> %a, <vscale x 32 x half> %b, i32 0)
+ %res = call <vscale x 32 x half> @llvm.vector.splice.nxv32f16(<vscale x 32 x half> %a, <vscale x 32 x half> %b, i32 0)
ret <vscale x 32 x half> %res
}
@@ -1843,7 +1843,7 @@ define <vscale x 32 x half> @splice_nxv32f16_offset_negone(<vscale x 32 x half>
; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, ma
; CHECK-NEXT: vslideup.vi v8, v16, 1
; CHECK-NEXT: ret
- %res = call <vscale x 32 x half> @llvm.experimental.vector.splice.nxv32f16(<vscale x 32 x half> %a, <vscale x 32 x half> %b, i32 -1)
+ %res = call <vscale x 32 x half> @llvm.vector.splice.nxv32f16(<vscale x 32 x half> %a, <vscale x 32 x half> %b, i32 -1)
ret <vscale x 32 x half> %res
}
@@ -1859,7 +1859,7 @@ define <vscale x 32 x half> @splice_nxv32f16_offset_min(<vscale x 32 x half> %a,
; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, ma
; CHECK-NEXT: vslideup.vx v8, v16, a1
; CHECK-NEXT: ret
- %res = call <vscale x 32 x half> @llvm.experimental.vector.splice.nxv32f16(<vscale x 32 x half> %a, <vscale x 32 x half> %b, i32 -64)
+ %res = call <vscale x 32 x half> @llvm.vector.splice.nxv32f16(<vscale x 32 x half> %a, <vscale x 32 x half> %b, i32 -64)
ret <vscale x 32 x half> %res
}
@@ -1875,17 +1875,17 @@ define <vscale x 32 x half> @splice_nxv32f16_offset_max(<vscale x 32 x half> %a,
; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma
; CHECK-NEXT: vslideup.vx v8, v16, a0
; CHECK-NEXT: ret
- %res = call <vscale x 32 x half> @llvm.experimental.vector.splice.nxv32f16(<vscale x 32 x half> %a, <vscale x 32 x half> %b, i32 63)
+ %res = call <vscale x 32 x half> @llvm.vector.splice.nxv32f16(<vscale x 32 x half> %a, <vscale x 32 x half> %b, i32 63)
ret <vscale x 32 x half> %res
}
-declare <vscale x 1 x float> @llvm.experimental.vector.splice.nxv1f32(<vscale x 1 x float>, <vscale x 1 x float>, i32)
+declare <vscale x 1 x float> @llvm.vector.splice.nxv1f32(<vscale x 1 x float>, <vscale x 1 x float>, i32)
define <vscale x 1 x float> @splice_nxv1f32_offset_zero(<vscale x 1 x float> %a, <vscale x 1 x float> %b) #0 {
; CHECK-LABEL: splice_nxv1f32_offset_zero:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
- %res = call <vscale x 1 x float> @llvm.experimental.vector.splice.nxv1f32(<vscale x 1 x float> %a, <vscale x 1 x float> %b, i32 0)
+ %res = call <vscale x 1 x float> @llvm.vector.splice.nxv1f32(<vscale x 1 x float> %a, <vscale x 1 x float> %b, i32 0)
ret <vscale x 1 x float> %res
}
@@ -1900,7 +1900,7 @@ define <vscale x 1 x float> @splice_nxv1f32_offset_negone(<vscale x 1 x float> %
; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, ma
; CHECK-NEXT: vslideup.vi v8, v9, 1
; CHECK-NEXT: ret
- %res = call <vscale x 1 x float> @llvm.experimental.vector.splice.nxv1f32(<vscale x 1 x float> %a, <vscale x 1 x float> %b, i32 -1)
+ %res = call <vscale x 1 x float> @llvm.vector.splice.nxv1f32(<vscale x 1 x float> %a, <vscale x 1 x float> %b, i32 -1)
ret <vscale x 1 x float> %res
}
@@ -1915,7 +1915,7 @@ define <vscale x 1 x float> @splice_nxv1f32_offset_min(<vscale x 1 x float> %a,
; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, ma
; CHECK-NEXT: vslideup.vi v8, v9, 2
; CHECK-NEXT: ret
- %res = call <vscale x 1 x float> @llvm.experimental.vector.splice.nxv1f32(<vscale x 1 x float> %a, <vscale x 1 x float> %b, i32 -2)
+ %res = call <vscale x 1 x float> @llvm.vector.splice.nxv1f32(<vscale x 1 x float> %a, <vscale x 1 x float> %b, i32 -2)
ret <vscale x 1 x float> %res
}
@@ -1930,17 +1930,17 @@ define <vscale x 1 x float> @splice_nxv1f32_offset_max(<vscale x 1 x float> %a,
; CHECK-NEXT: vsetvli a1, zero, e32, mf2, ta, ma
; CHECK-NEXT: vslideup.vx v8, v9, a0
; CHECK-NEXT: ret
- %res = call <vscale x 1 x float> @llvm.experimental.vector.splice.nxv1f32(<vscale x 1 x float> %a, <vscale x 1 x float> %b, i32 1)
+ %res = call <vscale x 1 x float> @llvm.vector.splice.nxv1f32(<vscale x 1 x float> %a, <vscale x 1 x float> %b, i32 1)
ret <vscale x 1 x float> %res
}
-declare <vscale x 2 x float> @llvm.experimental.vector.splice.nxv2f32(<vscale x 2 x float>, <vscale x 2 x float>, i32)
+declare <vscale x 2 x float> @llvm.vector.splice.nxv2f32(<vscale x 2 x float>, <vscale x 2 x float>, i32)
define <vscale x 2 x float> @splice_nxv2f32_offset_zero(<vscale x 2 x float> %a, <vscale x 2 x float> %b) #0 {
; CHECK-LABEL: splice_nxv2f32_offset_zero:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
- %res = call <vscale x 2 x float> @llvm.experimental.vector.splice.nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b, i32 0)
+ %res = call <vscale x 2 x float> @llvm.vector.splice.nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b, i32 0)
ret <vscale x 2 x float> %res
}
@@ -1955,7 +1955,7 @@ define <vscale x 2 x float> @splice_nxv2f32_offset_negone(<vscale x 2 x float> %
; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma
; CHECK-NEXT: vslideup.vi v8, v9, 1
; CHECK-NEXT: ret
- %res = call <vscale x 2 x float> @llvm.experimental.vector.splice.nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b, i32 -1)
+ %res = call <vscale x 2 x float> @llvm.vector.splice.nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b, i32 -1)
ret <vscale x 2 x float> %res
}
@@ -1970,7 +1970,7 @@ define <vscale x 2 x float> @splice_nxv2f32_offset_min(<vscale x 2 x float> %a,
; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma
; CHECK-NEXT: vslideup.vi v8, v9, 4
; CHECK-NEXT: ret
- %res = call <vscale x 2 x float> @llvm.experimental.vector.splice.nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b, i32 -4)
+ %res = call <vscale x 2 x float> @llvm.vector.splice.nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b, i32 -4)
ret <vscale x 2 x float> %res
}
@@ -1985,17 +1985,17 @@ define <vscale x 2 x float> @splice_nxv2f32_offset_max(<vscale x 2 x float> %a,
; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma
; CHECK-NEXT: vslideup.vx v8, v9, a0
; CHECK-NEXT: ret
- %res = call <vscale x 2 x float> @llvm.experimental.vector.splice.nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b, i32 3)
+ %res = call <vscale x 2 x float> @llvm.vector.splice.nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b, i32 3)
ret <vscale x 2 x float> %res
}
-declare <vscale x 4 x float> @llvm.experimental.vector.splice.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, i32)
+declare <vscale x 4 x float> @llvm.vector.splice.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, i32)
define <vscale x 4 x float> @splice_nxv4f32_offset_zero(<vscale x 4 x float> %a, <vscale x 4 x float> %b) #0 {
; CHECK-LABEL: splice_nxv4f32_offset_zero:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
- %res = call <vscale x 4 x float> @llvm.experimental.vector.splice.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, i32 0)
+ %res = call <vscale x 4 x float> @llvm.vector.splice.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, i32 0)
ret <vscale x 4 x float> %res
}
@@ -2010,7 +2010,7 @@ define <vscale x 4 x float> @splice_nxv4f32_offset_negone(<vscale x 4 x float> %
; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma
; CHECK-NEXT: vslideup.vi v8, v10, 1
; CHECK-NEXT: ret
- %res = call <vscale x 4 x float> @llvm.experimental.vector.splice.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, i32 -1)
+ %res = call <vscale x 4 x float> @llvm.vector.splice.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, i32 -1)
ret <vscale x 4 x float> %res
}
@@ -2025,7 +2025,7 @@ define <vscale x 4 x float> @splice_nxv4f32_offset_min(<vscale x 4 x float> %a,
; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma
; CHECK-NEXT: vslideup.vi v8, v10, 8
; CHECK-NEXT: ret
- %res = call <vscale x 4 x float> @llvm.experimental.vector.splice.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, i32 -8)
+ %res = call <vscale x 4 x float> @llvm.vector.splice.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, i32 -8)
ret <vscale x 4 x float> %res
}
@@ -2040,17 +2040,17 @@ define <vscale x 4 x float> @splice_nxv4f32_offset_max(<vscale x 4 x float> %a,
; CHECK-NEXT: vsetvli a1, zero, e32, m2, ta, ma
; CHECK-NEXT: vslideup.vx v8, v10, a0
; CHECK-NEXT: ret
- %res = call <vscale x 4 x float> @llvm.experimental.vector.splice.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, i32 7)
+ %res = call <vscale x 4 x float> @llvm.vector.splice.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, i32 7)
ret <vscale x 4 x float> %res
}
-declare <vscale x 8 x float> @llvm.experimental.vector.splice.nxv8f32(<vscale x 8 x float>, <vscale x 8 x float>, i32)
+declare <vscale x 8 x float> @llvm.vector.splice.nxv8f32(<vscale x 8 x float>, <vscale x 8 x float>, i32)
define <vscale x 8 x float> @splice_nxv8f32_offset_zero(<vscale x 8 x float> %a, <vscale x 8 x float> %b) #0 {
; CHECK-LABEL: splice_nxv8f32_offset_zero:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
- %res = call <vscale x 8 x float> @llvm.experimental.vector.splice.nxv8f32(<vscale x 8 x float> %a, <vscale x 8 x float> %b, i32 0)
+ %res = call <vscale x 8 x float> @llvm.vector.splice.nxv8f32(<vscale x 8 x float> %a, <vscale x 8 x float> %b, i32 0)
ret <vscale x 8 x float> %res
}
@@ -2064,7 +2064,7 @@ define <vscale x 8 x float> @splice_nxv8f32_offset_negone(<vscale x 8 x float> %
; CHECK-NEXT: vsetvli a0, zero, e32, m4, ta, ma
; CHECK-NEXT: vslideup.vi v8, v12, 1
; CHECK-NEXT: ret
- %res = call <vscale x 8 x float> @llvm.experimental.vector.splice.nxv8f32(<vscale x 8 x float> %a, <vscale x 8 x float> %b, i32 -1)
+ %res = call <vscale x 8 x float> @llvm.vector.splice.nxv8f32(<vscale x 8 x float> %a, <vscale x 8 x float> %b, i32 -1)
ret <vscale x 8 x float> %res
}
@@ -2078,7 +2078,7 @@ define <vscale x 8 x float> @splice_nxv8f32_offset_min(<vscale x 8 x float> %a,
; CHECK-NEXT: vsetvli a0, zero, e32, m4, ta, ma
; CHECK-NEXT: vslideup.vi v8, v12, 16
; CHECK-NEXT: ret
- %res = call <vscale x 8 x float> @llvm.experimental.vector.splice.nxv8f32(<vscale x 8 x float> %a, <vscale x 8 x float> %b, i32 -16)
+ %res = call <vscale x 8 x float> @llvm.vector.splice.nxv8f32(<vscale x 8 x float> %a, <vscale x 8 x float> %b, i32 -16)
ret <vscale x 8 x float> %res
}
@@ -2092,17 +2092,17 @@ define <vscale x 8 x float> @splice_nxv8f32_offset_max(<vscale x 8 x float> %a,
; CHECK-NEXT: vsetvli a1, zero, e32, m4, ta, ma
; CHECK-NEXT: vslideup.vx v8, v12, a0
; CHECK-NEXT: ret
- %res = call <vscale x 8 x float> @llvm.experimental.vector.splice.nxv8f32(<vscale x 8 x float> %a, <vscale x 8 x float> %b, i32 15)
+ %res = call <vscale x 8 x float> @llvm.vector.splice.nxv8f32(<vscale x 8 x float> %a, <vscale x 8 x float> %b, i32 15)
ret <vscale x 8 x float> %res
}
-declare <vscale x 16 x float> @llvm.experimental.vector.splice.nxv16f32(<vscale x 16 x float>, <vscale x 16 x float>, i32)
+declare <vscale x 16 x float> @llvm.vector.splice.nxv16f32(<vscale x 16 x float>, <vscale x 16 x float>, i32)
define <vscale x 16 x float> @splice_nxv16f32_offset_zero(<vscale x 16 x float> %a, <vscale x 16 x float> %b) #0 {
; CHECK-LABEL: splice_nxv16f32_offset_zero:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
- %res = call <vscale x 16 x float> @llvm.experimental.vector.splice.nxv16f32(<vscale x 16 x float> %a, <vscale x 16 x float> %b, i32 0)
+ %res = call <vscale x 16 x float> @llvm.vector.splice.nxv16f32(<vscale x 16 x float> %a, <vscale x 16 x float> %b, i32 0)
ret <vscale x 16 x float> %res
}
@@ -2117,7 +2117,7 @@ define <vscale x 16 x float> @splice_nxv16f32_offset_negone(<vscale x 16 x float
; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, ma
; CHECK-NEXT: vslideup.vi v8, v16, 1
; CHECK-NEXT: ret
- %res = call <vscale x 16 x float> @llvm.experimental.vector.splice.nxv16f32(<vscale x 16 x float> %a, <vscale x 16 x float> %b, i32 -1)
+ %res = call <vscale x 16 x float> @llvm.vector.splice.nxv16f32(<vscale x 16 x float> %a, <vscale x 16 x float> %b, i32 -1)
ret <vscale x 16 x float> %res
}
@@ -2133,7 +2133,7 @@ define <vscale x 16 x float> @splice_nxv16f32_offset_min(<vscale x 16 x float> %
; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, ma
; CHECK-NEXT: vslideup.vx v8, v16, a1
; CHECK-NEXT: ret
- %res = call <vscale x 16 x float> @llvm.experimental.vector.splice.nxv16f32(<vscale x 16 x float> %a, <vscale x 16 x float> %b, i32 -32)
+ %res = call <vscale x 16 x float> @llvm.vector.splice.nxv16f32(<vscale x 16 x float> %a, <vscale x 16 x float> %b, i32 -32)
ret <vscale x 16 x float> %res
}
@@ -2148,17 +2148,17 @@ define <vscale x 16 x float> @splice_nxv16f32_offset_max(<vscale x 16 x float> %
; CHECK-NEXT: vsetvli a1, zero, e32, m8, ta, ma
; CHECK-NEXT: vslideup.vx v8, v16, a0
; CHECK-NEXT: ret
- %res = call <vscale x 16 x float> @llvm.experimental.vector.splice.nxv16f32(<vscale x 16 x float> %a, <vscale x 16 x float> %b, i32 31)
+ %res = call <vscale x 16 x float> @llvm.vector.splice.nxv16f32(<vscale x 16 x float> %a, <vscale x 16 x float> %b, i32 31)
ret <vscale x 16 x float> %res
}
-declare <vscale x 1 x double> @llvm.experimental.vector.splice.nxv1f64(<vscale x 1 x double>, <vscale x 1 x double>, i32)
+declare <vscale x 1 x double> @llvm.vector.splice.nxv1f64(<vscale x 1 x double>, <vscale x 1 x double>, i32)
define <vscale x 1 x double> @splice_nxv1f64_offset_zero(<vscale x 1 x double> %a, <vscale x 1 x double> %b) #0 {
; CHECK-LABEL: splice_nxv1f64_offset_zero:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
- %res = call <vscale x 1 x double> @llvm.experimental.vector.splice.nxv1f64(<vscale x 1 x double> %a, <vscale x 1 x double> %b, i32 0)
+ %res = call <vscale x 1 x double> @llvm.vector.splice.nxv1f64(<vscale x 1 x double> %a, <vscale x 1 x double> %b, i32 0)
ret <vscale x 1 x double> %res
}
@@ -2173,7 +2173,7 @@ define <vscale x 1 x double> @splice_nxv1f64_offset_negone(<vscale x 1 x double>
; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma
; CHECK-NEXT: vslideup.vi v8, v9, 1
; CHECK-NEXT: ret
- %res = call <vscale x 1 x double> @llvm.experimental.vector.splice.nxv1f64(<vscale x 1 x double> %a, <vscale x 1 x double> %b, i32 -1)
+ %res = call <vscale x 1 x double> @llvm.vector.splice.nxv1f64(<vscale x 1 x double> %a, <vscale x 1 x double> %b, i32 -1)
ret <vscale x 1 x double> %res
}
@@ -2188,7 +2188,7 @@ define <vscale x 1 x double> @splice_nxv1f64_offset_min(<vscale x 1 x double> %a
; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma
; CHECK-NEXT: vslideup.vi v8, v9, 2
; CHECK-NEXT: ret
- %res = call <vscale x 1 x double> @llvm.experimental.vector.splice.nxv1f64(<vscale x 1 x double> %a, <vscale x 1 x double> %b, i32 -2)
+ %res = call <vscale x 1 x double> @llvm.vector.splice.nxv1f64(<vscale x 1 x double> %a, <vscale x 1 x double> %b, i32 -2)
ret <vscale x 1 x double> %res
}
@@ -2203,17 +2203,17 @@ define <vscale x 1 x double> @splice_nxv1f64_offset_max(<vscale x 1 x double> %a
; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, ma
; CHECK-NEXT: vslideup.vx v8, v9, a0
; CHECK-NEXT: ret
- %res = call <vscale x 1 x double> @llvm.experimental.vector.splice.nxv1f64(<vscale x 1 x double> %a, <vscale x 1 x double> %b, i32 1)
+ %res = call <vscale x 1 x double> @llvm.vector.splice.nxv1f64(<vscale x 1 x double> %a, <vscale x 1 x double> %b, i32 1)
ret <vscale x 1 x double> %res
}
-declare <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, i32)
+declare <vscale x 2 x double> @llvm.vector.splice.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, i32)
define <vscale x 2 x double> @splice_nxv2f64_offset_zero(<vscale x 2 x double> %a, <vscale x 2 x double> %b) #0 {
; CHECK-LABEL: splice_nxv2f64_offset_zero:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
- %res = call <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, i32 0)
+ %res = call <vscale x 2 x double> @llvm.vector.splice.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, i32 0)
ret <vscale x 2 x double> %res
}
@@ -2228,7 +2228,7 @@ define <vscale x 2 x double> @splice_nxv2f64_offset_negone(<vscale x 2 x double>
; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma
; CHECK-NEXT: vslideup.vi v8, v10, 1
; CHECK-NEXT: ret
- %res = call <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, i32 -1)
+ %res = call <vscale x 2 x double> @llvm.vector.splice.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, i32 -1)
ret <vscale x 2 x double> %res
}
@@ -2243,7 +2243,7 @@ define <vscale x 2 x double> @splice_nxv2f64_offset_min(<vscale x 2 x double> %a
; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma
; CHECK-NEXT: vslideup.vi v8, v10, 4
; CHECK-NEXT: ret
- %res = call <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, i32 -4)
+ %res = call <vscale x 2 x double> @llvm.vector.splice.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, i32 -4)
ret <vscale x 2 x double> %res
}
@@ -2258,17 +2258,17 @@ define <vscale x 2 x double> @splice_nxv2f64_offset_max(<vscale x 2 x double> %a
; CHECK-NEXT: vsetvli a1, zero, e64, m2, ta, ma
; CHECK-NEXT: vslideup.vx v8, v10, a0
; CHECK-NEXT: ret
- %res = call <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, i32 3)
+ %res = call <vscale x 2 x double> @llvm.vector.splice.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, i32 3)
ret <vscale x 2 x double> %res
}
-declare <vscale x 4 x double> @llvm.experimental.vector.splice.nxv4f64(<vscale x 4 x double>, <vscale x 4 x double>, i32)
+declare <vscale x 4 x double> @llvm.vector.splice.nxv4f64(<vscale x 4 x double>, <vscale x 4 x double>, i32)
define <vscale x 4 x double> @splice_nxv4f64_offset_zero(<vscale x 4 x double> %a, <vscale x 4 x double> %b) #0 {
; CHECK-LABEL: splice_nxv4f64_offset_zero:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
- %res = call <vscale x 4 x double> @llvm.experimental.vector.splice.nxv4f64(<vscale x 4 x double> %a, <vscale x 4 x double> %b, i32 0)
+ %res = call <vscale x 4 x double> @llvm.vector.splice.nxv4f64(<vscale x 4 x double> %a, <vscale x 4 x double> %b, i32 0)
ret <vscale x 4 x double> %res
}
@@ -2283,7 +2283,7 @@ define <vscale x 4 x double> @splice_nxv4f64_offset_negone(<vscale x 4 x double>
; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma
; CHECK-NEXT: vslideup.vi v8, v12, 1
; CHECK-NEXT: ret
- %res = call <vscale x 4 x double> @llvm.experimental.vector.splice.nxv4f64(<vscale x 4 x double> %a, <vscale x 4 x double> %b, i32 -1)
+ %res = call <vscale x 4 x double> @llvm.vector.splice.nxv4f64(<vscale x 4 x double> %a, <vscale x 4 x double> %b, i32 -1)
ret <vscale x 4 x double> %res
}
@@ -2298,7 +2298,7 @@ define <vscale x 4 x double> @splice_nxv4f64_offset_min(<vscale x 4 x double> %a
; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma
; CHECK-NEXT: vslideup.vi v8, v12, 8
; CHECK-NEXT: ret
- %res = call <vscale x 4 x double> @llvm.experimental.vector.splice.nxv4f64(<vscale x 4 x double> %a, <vscale x 4 x double> %b, i32 -8)
+ %res = call <vscale x 4 x double> @llvm.vector.splice.nxv4f64(<vscale x 4 x double> %a, <vscale x 4 x double> %b, i32 -8)
ret <vscale x 4 x double> %res
}
@@ -2313,17 +2313,17 @@ define <vscale x 4 x double> @splice_nxv4f64_offset_max(<vscale x 4 x double> %a
; CHECK-NEXT: vsetvli a1, zero, e64, m4, ta, ma
; CHECK-NEXT: vslideup.vx v8, v12, a0
; CHECK-NEXT: ret
- %res = call <vscale x 4 x double> @llvm.experimental.vector.splice.nxv4f64(<vscale x 4 x double> %a, <vscale x 4 x double> %b, i32 7)
+ %res = call <vscale x 4 x double> @llvm.vector.splice.nxv4f64(<vscale x 4 x double> %a, <vscale x 4 x double> %b, i32 7)
ret <vscale x 4 x double> %res
}
-declare <vscale x 8 x double> @llvm.experimental.vector.splice.nxv8f64(<vscale x 8 x double>, <vscale x 8 x double>, i32)
+declare <vscale x 8 x double> @llvm.vector.splice.nxv8f64(<vscale x 8 x double>, <vscale x 8 x double>, i32)
define <vscale x 8 x double> @splice_nxv8f64_offset_zero(<vscale x 8 x double> %a, <vscale x 8 x double> %b) #0 {
; CHECK-LABEL: splice_nxv8f64_offset_zero:
; CHECK: # %bb.0:
; CHECK-NEXT: ret
- %res = call <vscale x 8 x double> @llvm.experimental.vector.splice.nxv8f64(<vscale x 8 x double> %a, <vscale x 8 x double> %b, i32 0)
+ %res = call <vscale x 8 x double> @llvm.vector.splice.nxv8f64(<vscale x 8 x double> %a, <vscale x 8 x double> %b, i32 0)
ret <vscale x 8 x double> %res
}
@@ -2337,7 +2337,7 @@ define <vscale x 8 x double> @splice_nxv8f64_offset_negone(<vscale x 8 x double>
; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma
; CHECK-NEXT: vslideup.vi v8, v16, 1
; CHECK-NEXT: ret
- %res = call <vscale x 8 x double> @llvm.experimental.vector.splice.nxv8f64(<vscale x 8 x double> %a, <vscale x 8 x double> %b, i32 -1)
+ %res = call <vscale x 8 x double> @llvm.vector.splice.nxv8f64(<vscale x 8 x double> %a, <vscale x 8 x double> %b, i32 -1)
ret <vscale x 8 x double> %res
}
@@ -2351,7 +2351,7 @@ define <vscale x 8 x double> @splice_nxv8f64_offset_min(<vscale x 8 x double> %a
; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma
; CHECK-NEXT: vslideup.vi v8, v16, 16
; CHECK-NEXT: ret
- %res = call <vscale x 8 x double> @llvm.experimental.vector.splice.nxv8f64(<vscale x 8 x double> %a, <vscale x 8 x double> %b, i32 -16)
+ %res = call <vscale x 8 x double> @llvm.vector.splice.nxv8f64(<vscale x 8 x double> %a, <vscale x 8 x double> %b, i32 -16)
ret <vscale x 8 x double> %res
}
@@ -2365,7 +2365,7 @@ define <vscale x 8 x double> @splice_nxv8f64_offset_max(<vscale x 8 x double> %a
; CHECK-NEXT: vsetvli a1, zero, e64, m8, ta, ma
; CHECK-NEXT: vslideup.vx v8, v16, a0
; CHECK-NEXT: ret
- %res = call <vscale x 8 x double> @llvm.experimental.vector.splice.nxv8f64(<vscale x 8 x double> %a, <vscale x 8 x double> %b, i32 15)
+ %res = call <vscale x 8 x double> @llvm.vector.splice.nxv8f64(<vscale x 8 x double> %a, <vscale x 8 x double> %b, i32 15)
ret <vscale x 8 x double> %res
}
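A note on the intrinsic these tests exercise, renamed here from llvm.experimental.vector.splice (illustrative only, not part of the patch): vector.splice concatenates its two operands and extracts a result-sized window starting at the immediate offset; a negative offset -n instead starts n elements from the end of the first operand. A minimal fixed-width sketch of the semantics:

; Hypothetical fixed-width instance; the tests in this file use scalable types.
declare <4 x i32> @llvm.vector.splice.v4i32(<4 x i32>, <4 x i32>, i32)
; splice(<1,2,3,4>, <5,6,7,8>, i32 1)  ==> <2,3,4,5>
; splice(<1,2,3,4>, <5,6,7,8>, i32 -1) ==> <4,5,6,7>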
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.ll b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.ll
index 621445fb2dc5..4ff2fc7a5fff 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.ll
@@ -487,6 +487,54 @@ for.end: ; preds = %for.body, %entry
ret void
}
+define void @saxpy_vec_demanded_fields(i64 %n, float %a, ptr nocapture readonly %x, ptr nocapture %y) {
+; CHECK-LABEL: saxpy_vec_demanded_fields:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli a3, a0, e32, m8, ta, ma
+; CHECK-NEXT: beqz a3, .LBB9_2
+; CHECK-NEXT: .LBB9_1: # %for.body
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; CHECK-NEXT: vle32.v v8, (a1)
+; CHECK-NEXT: vle32.v v16, (a2)
+; CHECK-NEXT: slli a4, a3, 2
+; CHECK-NEXT: add a1, a1, a4
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, tu, ma
+; CHECK-NEXT: vfmacc.vf v16, fa0, v8
+; CHECK-NEXT: vse32.v v16, (a2)
+; CHECK-NEXT: sub a0, a0, a3
+; CHECK-NEXT: vsetvli a3, a0, e16, m4, ta, ma
+; CHECK-NEXT: add a2, a2, a4
+; CHECK-NEXT: bnez a3, .LBB9_1
+; CHECK-NEXT: .LBB9_2: # %for.end
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call i64 @llvm.riscv.vsetvli.i64(i64 %n, i64 2, i64 3)
+ %cmp.not13 = icmp eq i64 %0, 0
+ br i1 %cmp.not13, label %for.end, label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %1 = phi i64 [ %7, %for.body ], [ %0, %entry ]
+ %n.addr.016 = phi i64 [ %sub, %for.body ], [ %n, %entry ]
+ %x.addr.015 = phi ptr [ %add.ptr, %for.body ], [ %x, %entry ]
+ %y.addr.014 = phi ptr [ %add.ptr1, %for.body ], [ %y, %entry ]
+ %2 = bitcast ptr %x.addr.015 to ptr
+ %3 = tail call <vscale x 16 x float> @llvm.riscv.vle.nxv16f32.i64(<vscale x 16 x float> undef, ptr %2, i64 %1)
+ %add.ptr = getelementptr inbounds float, ptr %x.addr.015, i64 %1
+ %4 = bitcast ptr %y.addr.014 to ptr
+ %5 = tail call <vscale x 16 x float> @llvm.riscv.vle.nxv16f32.i64(<vscale x 16 x float> undef, ptr %4, i64 %1)
+ %6 = tail call <vscale x 16 x float> @llvm.riscv.vfmacc.nxv16f32.f32.i64(<vscale x 16 x float> %5, float %a, <vscale x 16 x float> %3, i64 7, i64 %1, i64 0)
+ tail call void @llvm.riscv.vse.nxv16f32.i64(<vscale x 16 x float> %6, ptr %4, i64 %1)
+ %add.ptr1 = getelementptr inbounds float, ptr %y.addr.014, i64 %1
+ %sub = sub i64 %n.addr.016, %1
+ %7 = tail call i64 @llvm.riscv.vsetvli.i64(i64 %sub, i64 1, i64 2)
+ %cmp.not = icmp eq i64 %7, 0
+ br i1 %cmp.not, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
declare i64 @llvm.riscv.vsetvli.i64(i64, i64 immarg, i64 immarg)
declare <vscale x 16 x float> @llvm.riscv.vle.nxv16f32.i64(<vscale x 16 x float>, ptr nocapture, i64)
declare <vscale x 16 x float> @llvm.riscv.vfmacc.nxv16f32.f32.i64(<vscale x 16 x float>, float, <vscale x 16 x float>, i64, i64, i64)
@@ -501,12 +549,12 @@ define <vscale x 2 x i32> @test_vsetvli_x0_x0(ptr %x, ptr %y, <vscale x 2 x i32>
; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma
; CHECK-NEXT: vle32.v v9, (a0)
; CHECK-NEXT: andi a3, a3, 1
-; CHECK-NEXT: beqz a3, .LBB9_2
+; CHECK-NEXT: beqz a3, .LBB10_2
; CHECK-NEXT: # %bb.1: # %if
; CHECK-NEXT: vle16.v v10, (a1)
; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
; CHECK-NEXT: vwcvt.x.x.v v8, v10
-; CHECK-NEXT: .LBB9_2: # %if.end
+; CHECK-NEXT: .LBB10_2: # %if.end
; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
; CHECK-NEXT: vadd.vv v8, v9, v8
; CHECK-NEXT: ret
@@ -540,19 +588,19 @@ define <vscale x 2 x i32> @test_vsetvli_x0_x0_2(ptr %x, ptr %y, ptr %z, i64 %vl,
; CHECK-NEXT: vsetvli zero, a3, e32, m1, ta, ma
; CHECK-NEXT: vle32.v v9, (a0)
; CHECK-NEXT: andi a4, a4, 1
-; CHECK-NEXT: beqz a4, .LBB10_2
+; CHECK-NEXT: beqz a4, .LBB11_2
; CHECK-NEXT: # %bb.1: # %if
; CHECK-NEXT: vle16.v v10, (a1)
; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
; CHECK-NEXT: vwadd.wv v9, v9, v10
-; CHECK-NEXT: .LBB10_2: # %if.end
+; CHECK-NEXT: .LBB11_2: # %if.end
; CHECK-NEXT: andi a5, a5, 1
-; CHECK-NEXT: beqz a5, .LBB10_4
+; CHECK-NEXT: beqz a5, .LBB11_4
; CHECK-NEXT: # %bb.3: # %if2
; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
; CHECK-NEXT: vle16.v v10, (a2)
; CHECK-NEXT: vwadd.wv v9, v9, v10
-; CHECK-NEXT: .LBB10_4: # %if2.end
+; CHECK-NEXT: .LBB11_4: # %if2.end
; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
; CHECK-NEXT: vadd.vv v8, v9, v8
; CHECK-NEXT: ret
@@ -586,11 +634,11 @@ define void @vlmax(i64 %N, ptr %c, ptr %a, ptr %b) {
; CHECK-LABEL: vlmax:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli a6, zero, e64, m1, ta, ma
-; CHECK-NEXT: blez a0, .LBB11_3
+; CHECK-NEXT: blez a0, .LBB12_3
; CHECK-NEXT: # %bb.1: # %for.body.preheader
; CHECK-NEXT: li a5, 0
; CHECK-NEXT: slli a4, a6, 3
-; CHECK-NEXT: .LBB11_2: # %for.body
+; CHECK-NEXT: .LBB12_2: # %for.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vle64.v v8, (a2)
; CHECK-NEXT: vle64.v v9, (a3)
@@ -600,8 +648,8 @@ define void @vlmax(i64 %N, ptr %c, ptr %a, ptr %b) {
; CHECK-NEXT: add a1, a1, a4
; CHECK-NEXT: add a3, a3, a4
; CHECK-NEXT: add a2, a2, a4
-; CHECK-NEXT: blt a5, a0, .LBB11_2
-; CHECK-NEXT: .LBB11_3: # %for.end
+; CHECK-NEXT: blt a5, a0, .LBB12_2
+; CHECK-NEXT: .LBB12_3: # %for.end
; CHECK-NEXT: ret
entry:
%0 = tail call i64 @llvm.riscv.vsetvlimax.i64(i64 3, i64 0)
@@ -633,18 +681,18 @@ define void @vector_init_vlmax(i64 %N, ptr %c) {
; CHECK-LABEL: vector_init_vlmax:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli a2, zero, e64, m1, ta, ma
-; CHECK-NEXT: blez a0, .LBB12_3
+; CHECK-NEXT: blez a0, .LBB13_3
; CHECK-NEXT: # %bb.1: # %for.body.preheader
; CHECK-NEXT: li a3, 0
; CHECK-NEXT: slli a4, a2, 3
; CHECK-NEXT: vmv.v.i v8, 0
-; CHECK-NEXT: .LBB12_2: # %for.body
+; CHECK-NEXT: .LBB13_2: # %for.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vse64.v v8, (a1)
; CHECK-NEXT: add a3, a3, a2
; CHECK-NEXT: add a1, a1, a4
-; CHECK-NEXT: blt a3, a0, .LBB12_2
-; CHECK-NEXT: .LBB12_3: # %for.end
+; CHECK-NEXT: blt a3, a0, .LBB13_2
+; CHECK-NEXT: .LBB13_3: # %for.end
; CHECK-NEXT: ret
entry:
%0 = tail call i64 @llvm.riscv.vsetvlimax.i64(i64 3, i64 0)
@@ -669,20 +717,20 @@ define void @vector_init_vsetvli_N(i64 %N, ptr %c) {
; CHECK-LABEL: vector_init_vsetvli_N:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli a2, a0, e64, m1, ta, ma
-; CHECK-NEXT: blez a0, .LBB13_3
+; CHECK-NEXT: blez a0, .LBB14_3
; CHECK-NEXT: # %bb.1: # %for.body.preheader
; CHECK-NEXT: li a3, 0
; CHECK-NEXT: slli a4, a2, 3
; CHECK-NEXT: vsetvli a5, zero, e64, m1, ta, ma
; CHECK-NEXT: vmv.v.i v8, 0
-; CHECK-NEXT: .LBB13_2: # %for.body
+; CHECK-NEXT: .LBB14_2: # %for.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vsetvli zero, a2, e64, m1, ta, ma
; CHECK-NEXT: vse64.v v8, (a1)
; CHECK-NEXT: add a3, a3, a2
; CHECK-NEXT: add a1, a1, a4
-; CHECK-NEXT: blt a3, a0, .LBB13_2
-; CHECK-NEXT: .LBB13_3: # %for.end
+; CHECK-NEXT: blt a3, a0, .LBB14_2
+; CHECK-NEXT: .LBB14_3: # %for.end
; CHECK-NEXT: ret
entry:
%0 = tail call i64 @llvm.riscv.vsetvli(i64 %N, i64 3, i64 0)
@@ -711,13 +759,13 @@ define void @vector_init_vsetvli_fv(i64 %N, ptr %c) {
; CHECK-NEXT: slli a4, a3, 3
; CHECK-NEXT: vsetvli a5, zero, e64, m1, ta, ma
; CHECK-NEXT: vmv.v.i v8, 0
-; CHECK-NEXT: .LBB14_1: # %for.body
+; CHECK-NEXT: .LBB15_1: # %for.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vsetivli zero, 4, e64, m1, ta, ma
; CHECK-NEXT: vse64.v v8, (a1)
; CHECK-NEXT: add a2, a2, a3
; CHECK-NEXT: add a1, a1, a4
-; CHECK-NEXT: blt a2, a0, .LBB14_1
+; CHECK-NEXT: blt a2, a0, .LBB15_1
; CHECK-NEXT: # %bb.2: # %for.end
; CHECK-NEXT: ret
entry:
@@ -745,13 +793,13 @@ define void @vector_init_vsetvli_fv2(i64 %N, ptr %c) {
; CHECK-NEXT: li a2, 0
; CHECK-NEXT: vsetvli a3, zero, e64, m1, ta, ma
; CHECK-NEXT: vmv.v.i v8, 0
-; CHECK-NEXT: .LBB15_1: # %for.body
+; CHECK-NEXT: .LBB16_1: # %for.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vsetivli zero, 4, e64, m1, ta, ma
; CHECK-NEXT: vse64.v v8, (a1)
; CHECK-NEXT: addi a2, a2, 4
; CHECK-NEXT: addi a1, a1, 32
-; CHECK-NEXT: blt a2, a0, .LBB15_1
+; CHECK-NEXT: blt a2, a0, .LBB16_1
; CHECK-NEXT: # %bb.2: # %for.end
; CHECK-NEXT: ret
entry:
@@ -779,13 +827,13 @@ define void @vector_init_vsetvli_fv3(i64 %N, ptr %c) {
; CHECK-NEXT: li a2, 0
; CHECK-NEXT: vsetvli a3, zero, e64, m1, ta, ma
; CHECK-NEXT: vmv.v.i v8, 0
-; CHECK-NEXT: .LBB16_1: # %for.body
+; CHECK-NEXT: .LBB17_1: # %for.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vsetivli zero, 4, e64, m1, ta, ma
; CHECK-NEXT: vse64.v v8, (a1)
; CHECK-NEXT: addi a2, a2, 4
; CHECK-NEXT: addi a1, a1, 32
-; CHECK-NEXT: blt a2, a0, .LBB16_1
+; CHECK-NEXT: blt a2, a0, .LBB17_1
; CHECK-NEXT: # %bb.2: # %for.end
; CHECK-NEXT: ret
entry:
@@ -861,10 +909,10 @@ define <vscale x 1 x double> @compat_store_consistency(i1 %cond, <vscale x 1 x d
; CHECK-NEXT: vsetvli a3, zero, e64, m1, ta, ma
; CHECK-NEXT: vfadd.vv v8, v8, v9
; CHECK-NEXT: vs1r.v v8, (a1)
-; CHECK-NEXT: beqz a0, .LBB19_2
+; CHECK-NEXT: beqz a0, .LBB20_2
; CHECK-NEXT: # %bb.1: # %if.then
; CHECK-NEXT: vse32.v v10, (a2)
-; CHECK-NEXT: .LBB19_2: # %if.end
+; CHECK-NEXT: .LBB20_2: # %if.end
; CHECK-NEXT: ret
entry:
%res = fadd <vscale x 1 x double> %a, %b
@@ -886,16 +934,16 @@ define <vscale x 2 x i32> @test_ratio_only_vmv_s_x(ptr %x, ptr %y, i1 %cond) nou
; CHECK-LABEL: test_ratio_only_vmv_s_x:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: andi a2, a2, 1
-; CHECK-NEXT: beqz a2, .LBB20_2
+; CHECK-NEXT: beqz a2, .LBB21_2
; CHECK-NEXT: # %bb.1: # %if
; CHECK-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
; CHECK-NEXT: vle16.v v9, (a1)
; CHECK-NEXT: vwcvt.x.x.v v8, v9
-; CHECK-NEXT: j .LBB20_3
-; CHECK-NEXT: .LBB20_2:
+; CHECK-NEXT: j .LBB21_3
+; CHECK-NEXT: .LBB21_2:
; CHECK-NEXT: vsetivli zero, 2, e32, m1, ta, ma
; CHECK-NEXT: vle32.v v8, (a0)
-; CHECK-NEXT: .LBB20_3: # %if.end
+; CHECK-NEXT: .LBB21_3: # %if.end
; CHECK-NEXT: vsetvli zero, zero, e32, m1, tu, ma
; CHECK-NEXT: vmv.s.x v8, zero
; CHECK-NEXT: ret
@@ -918,16 +966,16 @@ define <vscale x 2 x i32> @test_ratio_only_vmv_s_x2(ptr %x, ptr %y, i1 %cond) no
; CHECK-LABEL: test_ratio_only_vmv_s_x2:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: andi a2, a2, 1
-; CHECK-NEXT: beqz a2, .LBB21_2
+; CHECK-NEXT: beqz a2, .LBB22_2
; CHECK-NEXT: # %bb.1: # %if
; CHECK-NEXT: vsetivli zero, 2, e32, m1, ta, ma
; CHECK-NEXT: vle32.v v8, (a0)
-; CHECK-NEXT: j .LBB21_3
-; CHECK-NEXT: .LBB21_2:
+; CHECK-NEXT: j .LBB22_3
+; CHECK-NEXT: .LBB22_2:
; CHECK-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
; CHECK-NEXT: vle16.v v9, (a1)
; CHECK-NEXT: vwcvt.x.x.v v8, v9
-; CHECK-NEXT: .LBB21_3: # %if.end
+; CHECK-NEXT: .LBB22_3: # %if.end
; CHECK-NEXT: vsetvli zero, zero, e32, m1, tu, ma
; CHECK-NEXT: vmv.s.x v8, zero
; CHECK-NEXT: ret
@@ -953,13 +1001,13 @@ define void @pre_over_vle(ptr %A) {
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: addi a1, a0, 800
; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT: .LBB22_1: # %vector.body
+; CHECK-NEXT: .LBB23_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vle8.v v8, (a0)
; CHECK-NEXT: vsext.vf4 v9, v8
; CHECK-NEXT: vse32.v v9, (a0)
; CHECK-NEXT: addi a0, a0, 8
-; CHECK-NEXT: bne a0, a1, .LBB22_1
+; CHECK-NEXT: bne a0, a1, .LBB23_1
; CHECK-NEXT: # %bb.2: # %exit
; CHECK-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.mir b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.mir
index 596ea1c39fce..16c4a1a0a89e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.mir
+++ b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.mir
@@ -130,6 +130,10 @@
ret void
}
+ define void @pre_undemanded_vl() {
+ ret void
+ }
+
declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
declare <vscale x 1 x i64> @llvm.riscv.vadd.nxv1i64.nxv1i64.i64(<vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, i64) #1
@@ -1041,3 +1045,12 @@ body: |
PseudoRET
...
+---
+name: pre_undemanded_vl
+body: |
+ bb.0:
+ PseudoBR %bb.1
+ bb.1:
+ %x:gpr = PseudoVMV_X_S undef $noreg, 6
+ PseudoBR %bb.1
+...
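
The new MIR case pins down partial redundancy elimination when the one vector instruction in the loop does not demand VL: PseudoVMV_X_S is vmv.x.s, which copies element 0 of the source vector to a scalar register regardless of the active vector length, so the self-loop in bb.1 forces the PRE logic to cope with an instruction whose VL is irrelevant. A rough semantic sketch of why VL is undemanded (plain LLVM IR, not the pass's own representation):

  ; vmv.x.s reads only lane 0, so any VL yields the same result
  define i64 @vmv_x_s_semantics(<vscale x 1 x i64> %v) {
    %lane0 = extractelement <vscale x 1 x i64> %v, i64 0
    ret i64 %lane0
  }
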
diff --git a/llvm/test/CodeGen/RISCV/select.ll b/llvm/test/CodeGen/RISCV/select.ll
index e07e52091e9e..ffbbe31412ed 100644
--- a/llvm/test/CodeGen/RISCV/select.ll
+++ b/llvm/test/CodeGen/RISCV/select.ll
@@ -1858,3 +1858,113 @@ define i32 @select_cst6(i1 zeroext %cond) {
%ret = select i1 %cond, i32 2049, i32 2047
ret i32 %ret
}
+
+@select_redundant_czero_eqz_data = global i32 0, align 4
+
+define void @select_redundant_czero_eqz1(ptr %0, ptr %1) {
+; RV32IM-LABEL: select_redundant_czero_eqz1:
+; RV32IM: # %bb.0: # %entry
+; RV32IM-NEXT: bnez a0, .LBB49_2
+; RV32IM-NEXT: # %bb.1:
+; RV32IM-NEXT: lui a0, %hi(select_redundant_czero_eqz_data)
+; RV32IM-NEXT: addi a0, a0, %lo(select_redundant_czero_eqz_data)
+; RV32IM-NEXT: .LBB49_2: # %entry
+; RV32IM-NEXT: sw a0, 0(a1)
+; RV32IM-NEXT: ret
+;
+; RV64IM-LABEL: select_redundant_czero_eqz1:
+; RV64IM: # %bb.0: # %entry
+; RV64IM-NEXT: bnez a0, .LBB49_2
+; RV64IM-NEXT: # %bb.1:
+; RV64IM-NEXT: lui a0, %hi(select_redundant_czero_eqz_data)
+; RV64IM-NEXT: addi a0, a0, %lo(select_redundant_czero_eqz_data)
+; RV64IM-NEXT: .LBB49_2: # %entry
+; RV64IM-NEXT: sd a0, 0(a1)
+; RV64IM-NEXT: ret
+;
+; RV64IMXVTCONDOPS-LABEL: select_redundant_czero_eqz1:
+; RV64IMXVTCONDOPS: # %bb.0: # %entry
+; RV64IMXVTCONDOPS-NEXT: lui a2, %hi(select_redundant_czero_eqz_data)
+; RV64IMXVTCONDOPS-NEXT: addi a2, a2, %lo(select_redundant_czero_eqz_data)
+; RV64IMXVTCONDOPS-NEXT: vt.maskcn a2, a2, a0
+; RV64IMXVTCONDOPS-NEXT: or a0, a2, a0
+; RV64IMXVTCONDOPS-NEXT: sd a0, 0(a1)
+; RV64IMXVTCONDOPS-NEXT: ret
+;
+; RV32IMZICOND-LABEL: select_redundant_czero_eqz1:
+; RV32IMZICOND: # %bb.0: # %entry
+; RV32IMZICOND-NEXT: lui a2, %hi(select_redundant_czero_eqz_data)
+; RV32IMZICOND-NEXT: addi a2, a2, %lo(select_redundant_czero_eqz_data)
+; RV32IMZICOND-NEXT: czero.nez a2, a2, a0
+; RV32IMZICOND-NEXT: or a0, a2, a0
+; RV32IMZICOND-NEXT: sw a0, 0(a1)
+; RV32IMZICOND-NEXT: ret
+;
+; RV64IMZICOND-LABEL: select_redundant_czero_eqz1:
+; RV64IMZICOND: # %bb.0: # %entry
+; RV64IMZICOND-NEXT: lui a2, %hi(select_redundant_czero_eqz_data)
+; RV64IMZICOND-NEXT: addi a2, a2, %lo(select_redundant_czero_eqz_data)
+; RV64IMZICOND-NEXT: czero.nez a2, a2, a0
+; RV64IMZICOND-NEXT: or a0, a2, a0
+; RV64IMZICOND-NEXT: sd a0, 0(a1)
+; RV64IMZICOND-NEXT: ret
+entry:
+ %3 = icmp eq ptr %0, null
+ %4 = select i1 %3, ptr @select_redundant_czero_eqz_data, ptr %0
+ store ptr %4, ptr %1, align 8
+ ret void
+}
+
+define void @select_redundant_czero_eqz2(ptr %0, ptr %1) {
+; RV32IM-LABEL: select_redundant_czero_eqz2:
+; RV32IM: # %bb.0: # %entry
+; RV32IM-NEXT: bnez a0, .LBB50_2
+; RV32IM-NEXT: # %bb.1: # %entry
+; RV32IM-NEXT: lui a0, %hi(select_redundant_czero_eqz_data)
+; RV32IM-NEXT: addi a0, a0, %lo(select_redundant_czero_eqz_data)
+; RV32IM-NEXT: .LBB50_2: # %entry
+; RV32IM-NEXT: sw a0, 0(a1)
+; RV32IM-NEXT: ret
+;
+; RV64IM-LABEL: select_redundant_czero_eqz2:
+; RV64IM: # %bb.0: # %entry
+; RV64IM-NEXT: bnez a0, .LBB50_2
+; RV64IM-NEXT: # %bb.1: # %entry
+; RV64IM-NEXT: lui a0, %hi(select_redundant_czero_eqz_data)
+; RV64IM-NEXT: addi a0, a0, %lo(select_redundant_czero_eqz_data)
+; RV64IM-NEXT: .LBB50_2: # %entry
+; RV64IM-NEXT: sd a0, 0(a1)
+; RV64IM-NEXT: ret
+;
+; RV64IMXVTCONDOPS-LABEL: select_redundant_czero_eqz2:
+; RV64IMXVTCONDOPS: # %bb.0: # %entry
+; RV64IMXVTCONDOPS-NEXT: lui a2, %hi(select_redundant_czero_eqz_data)
+; RV64IMXVTCONDOPS-NEXT: addi a2, a2, %lo(select_redundant_czero_eqz_data)
+; RV64IMXVTCONDOPS-NEXT: vt.maskcn a2, a2, a0
+; RV64IMXVTCONDOPS-NEXT: or a0, a0, a2
+; RV64IMXVTCONDOPS-NEXT: sd a0, 0(a1)
+; RV64IMXVTCONDOPS-NEXT: ret
+;
+; RV32IMZICOND-LABEL: select_redundant_czero_eqz2:
+; RV32IMZICOND: # %bb.0: # %entry
+; RV32IMZICOND-NEXT: lui a2, %hi(select_redundant_czero_eqz_data)
+; RV32IMZICOND-NEXT: addi a2, a2, %lo(select_redundant_czero_eqz_data)
+; RV32IMZICOND-NEXT: czero.nez a2, a2, a0
+; RV32IMZICOND-NEXT: or a0, a0, a2
+; RV32IMZICOND-NEXT: sw a0, 0(a1)
+; RV32IMZICOND-NEXT: ret
+;
+; RV64IMZICOND-LABEL: select_redundant_czero_eqz2:
+; RV64IMZICOND: # %bb.0: # %entry
+; RV64IMZICOND-NEXT: lui a2, %hi(select_redundant_czero_eqz_data)
+; RV64IMZICOND-NEXT: addi a2, a2, %lo(select_redundant_czero_eqz_data)
+; RV64IMZICOND-NEXT: czero.nez a2, a2, a0
+; RV64IMZICOND-NEXT: or a0, a0, a2
+; RV64IMZICOND-NEXT: sd a0, 0(a1)
+; RV64IMZICOND-NEXT: ret
+entry:
+ %3 = icmp ne ptr %0, null
+ %4 = select i1 %3, ptr %0, ptr @select_redundant_czero_eqz_data
+ store ptr %4, ptr %1, align 8
+ ret void
+}
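
Both added tests pin the single-czero lowering of a null-check select. Because the false arm of select (icmp eq ptr %p, null), @g, %p is the compared pointer itself, one czero.nez plus an or suffices: when %p is nonzero the czero result is zero and the or reproduces %p, and when %p is zero the or contributes nothing and the global's address survives. The test name plausibly refers to the czero.eqz on %p that a generic two-czero select expansion would also emit, and which is redundant here. A minimal restatement with the Zicond lowering as comments (register assignments illustrative):

  @g = global i32 0

  define ptr @null_select(ptr %p) {
    %isnull = icmp eq ptr %p, null
    %r = select i1 %isnull, ptr @g, ptr %p
    ; lui/addi  a1, ...          ; a1 = &g
    ; czero.nez a1, a1, a0       ; a1 = (p != 0) ? 0 : &g
    ; or        a0, a1, a0       ; p == 0 ? &g : p
    ret ptr %r
  }
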
diff --git a/llvm/test/CodeGen/SPIRV/printf.ll b/llvm/test/CodeGen/SPIRV/printf.ll
new file mode 100644
index 000000000000..483fc1f244e5
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/printf.ll
@@ -0,0 +1,40 @@
+; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+
+; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+
+; CHECK: %[[#ExtImport:]] = OpExtInstImport "OpenCL.std"
+; CHECK: %[[#Char:]] = OpTypeInt 8 0
+; CHECK: %[[#CharPtr:]] = OpTypePointer UniformConstant %[[#Char]]
+; CHECK: %[[#GV:]] = OpVariable %[[#]] UniformConstant %[[#]]
+; CHECK: OpFunction
+; CHECK: %[[#Arg1:]] = OpFunctionParameter
+; CHECK: %[[#Arg2:]] = OpFunctionParameter
+; CHECK: %[[#CastedGV:]] = OpBitcast %[[#CharPtr]] %[[#GV]]
+; CHECK-NEXT: OpExtInst %[[#]] %[[#ExtImport]] printf %[[#CastedGV]] %[[#ArgConst:]]
+; CHECK-NEXT: OpExtInst %[[#]] %[[#ExtImport]] printf %[[#CastedGV]] %[[#ArgConst]]
+; CHECK-NEXT: OpExtInst %[[#]] %[[#ExtImport]] printf %[[#Arg1]] %[[#ArgConst:]]
+; CHECK-NEXT: OpExtInst %[[#]] %[[#ExtImport]] printf %[[#Arg1]] %[[#ArgConst]]
+; CHECK-NEXT: %[[#CastedArg2:]] = OpBitcast %[[#CharPtr]] %[[#Arg2]]
+; CHECK-NEXT: OpExtInst %[[#]] %[[#ExtImport]] printf %[[#CastedArg2]] %[[#ArgConst]]
+; CHECK-NEXT: OpExtInst %[[#]] %[[#ExtImport]] printf %[[#CastedArg2]] %[[#ArgConst]]
+; CHECK: OpFunctionEnd
+
+%struct = type { [6 x i8] }
+
+@FmtStr = internal addrspace(2) constant [6 x i8] c"c=%c\0A\00", align 1
+
+define spir_kernel void @foo(ptr addrspace(2) %_arg_fmt1, ptr addrspace(2) byval(%struct) %_arg_fmt2) {
+entry:
+ %r1 = tail call spir_func i32 (ptr addrspace(2), ...) @_Z6printfPU3AS2Kcz(ptr addrspace(2) @FmtStr, i8 signext 97)
+ %r2 = tail call spir_func i32 (ptr addrspace(2), ...) @_Z18__spirv_ocl_printfPU3AS2Kcz(ptr addrspace(2) @FmtStr, i8 signext 97)
+ %r3 = tail call spir_func i32 (ptr addrspace(2), ...) @_Z6printfPU3AS2Kcz(ptr addrspace(2) %_arg_fmt1, i8 signext 97)
+ %r4 = tail call spir_func i32 (ptr addrspace(2), ...) @_Z18__spirv_ocl_printfPU3AS2Kcz(ptr addrspace(2) %_arg_fmt1, i8 signext 97)
+ %r5 = tail call spir_func i32 (ptr addrspace(2), ...) @_Z6printfPU3AS2Kcz(ptr addrspace(2) %_arg_fmt2, i8 signext 97)
+ %r6 = tail call spir_func i32 (ptr addrspace(2), ...) @_Z18__spirv_ocl_printfPU3AS2Kcz(ptr addrspace(2) %_arg_fmt2, i8 signext 97)
+ ret void
+}
+
+declare dso_local spir_func i32 @_Z6printfPU3AS2Kcz(ptr addrspace(2), ...)
+declare dso_local spir_func i32 @_Z18__spirv_ocl_printfPU3AS2Kcz(ptr addrspace(2), ...)
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/spirv-event-null.ll b/llvm/test/CodeGen/SPIRV/transcoding/spirv-event-null.ll
new file mode 100644
index 000000000000..fe0d96f2773e
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/transcoding/spirv-event-null.ll
@@ -0,0 +1,33 @@
+; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+
+; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+
+; CHECK-DAG: %[[#TyEvent:]] = OpTypeEvent
+; CHECK-DAG: %[[#TyStruct:]] = OpTypeStruct %[[#TyEvent]]
+; CHECK-DAG: %[[#ConstEvent:]] = OpConstantNull %[[#TyEvent]]
+; CHECK-DAG: %[[#TyEventPtr:]] = OpTypePointer Function %[[#TyEvent]]
+; CHECK-DAG: %[[#TyStructPtr:]] = OpTypePointer Function %[[#TyStruct]]
+; CHECK: OpFunction
+; CHECK: OpFunctionParameter
+; CHECK: %[[#Src:]] = OpFunctionParameter
+; CHECK: OpVariable %[[#TyStructPtr]] Function
+; CHECK: %[[#EventVar:]] = OpVariable %[[#TyEventPtr]] Function
+; CHECK: %[[#Dest:]] = OpInBoundsPtrAccessChain
+; CHECK: %[[#CopyRes:]] = OpGroupAsyncCopy %[[#TyEvent]] %[[#]] %[[#Dest]] %[[#Src]] %[[#]] %[[#]] %[[#ConstEvent]]
+; CHECK: OpStore %[[#EventVar]] %[[#CopyRes]]
+
+%"class.sycl::_V1::device_event" = type { target("spirv.Event") }
+
+define spir_kernel void @foo(ptr addrspace(1) %_arg_out_ptr, ptr addrspace(3) noundef %_arg_local_acc) {
+entry:
+ %var = alloca %"class.sycl::_V1::device_event"
+ %dev_event.i.sroa.0 = alloca target("spirv.Event")
+ %add.ptr.i26 = getelementptr inbounds i32, ptr addrspace(1) %_arg_out_ptr, i64 0
+ %call3.i = tail call spir_func target("spirv.Event") @_Z22__spirv_GroupAsyncCopyjPU3AS1iPU3AS3Kimm9ocl_event(i32 2, ptr addrspace(1) %add.ptr.i26, ptr addrspace(3) %_arg_local_acc, i64 16, i64 10, target("spirv.Event") zeroinitializer)
+ store target("spirv.Event") %call3.i, ptr %dev_event.i.sroa.0
+ ret void
+}
+
+declare dso_local spir_func target("spirv.Event") @_Z22__spirv_GroupAsyncCopyjPU3AS1iPU3AS3Kimm9ocl_event(i32, ptr addrspace(1), ptr addrspace(3), i64, i64, target("spirv.Event"))
diff --git a/llvm/test/CodeGen/SystemZ/atomic-load-08.ll b/llvm/test/CodeGen/SystemZ/atomic-load-08.ll
index 4d914e3ea0e1..83050ef87591 100644
--- a/llvm/test/CodeGen/SystemZ/atomic-load-08.ll
+++ b/llvm/test/CodeGen/SystemZ/atomic-load-08.ll
@@ -2,8 +2,8 @@
; loads with a bitcast, and this test case gets converted into that form as
; well by the AtomicExpand pass.
;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
-; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck -check-prefixes=CHECK,BASE %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck -check-prefixes=CHECK,Z13 %s
define void @f1(ptr %ret, ptr %src) {
; CHECK-LABEL: f1:
@@ -17,6 +17,34 @@ define void @f1(ptr %ret, ptr %src) {
ret void
}
+define void @f1_fpuse(ptr %ret, ptr %src) {
+; CHECK-LABEL: f1_fpuse:
+; CHECK: # %bb.0:
+; BASE-NEXT: aghi %r15, -176
+; BASE-NEXT: .cfi_def_cfa_offset 336
+
+; CHECK-NEXT: lpq %r0, 0(%r3)
+
+; BASE-NEXT: stg %r1, 168(%r15)
+; BASE-NEXT: stg %r0, 160(%r15)
+; BASE-NEXT: ld %f0, 160(%r15)
+; BASE-NEXT: ld %f2, 168(%r15)
+
+; Z13-NEXT: vlvgp %v0, %r0, %r1
+; Z13-NEXT: vrepg %v2, %v0, 1
+
+; CHECK-NEXT: axbr %f0, %f0
+; CHECK-NEXT: std %f0, 0(%r2)
+; CHECK-NEXT: std %f2, 8(%r2)
+; BASE-NEXT: aghi %r15, 176
+; CHECK-NEXT: br %r14
+
+ %val = load atomic fp128, ptr %src seq_cst, align 16
+ %use = fadd fp128 %val, %val
+ store fp128 %use, ptr %ret, align 8
+ ret void
+}
+
define void @f2(ptr %ret, ptr %src) {
; CHECK-LABEL: f2:
; CHECK: brasl %r14, __atomic_load@PLT
@@ -24,3 +52,19 @@ define void @f2(ptr %ret, ptr %src) {
store fp128 %val, ptr %ret, align 8
ret void
}
+
+define void @f2_fpuse(ptr %ret, ptr %src) {
+; CHECK-LABEL: f2_fpuse:
+; CHECK: brasl %r14, __atomic_load@PLT
+; CHECK-NEXT: ld %f0, 160(%r15)
+; CHECK-NEXT: ld %f2, 168(%r15)
+; CHECK-NEXT: axbr %f0, %f0
+; CHECK-NEXT: std %f0, 0(%r13)
+; CHECK-NEXT: std %f2, 8(%r13)
+; CHECK-NEXT: lmg %r13, %r15, 280(%r15)
+; CHECK-NEXT: br %r14
+ %val = load atomic fp128, ptr %src seq_cst, align 8
+ %use = fadd fp128 %val, %val
+ store fp128 %use, ptr %ret, align 8
+ ret void
+}
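
As the file's header comment says, the fp128 atomic load is legalized to an i128 load plus a bitcast. The f1_fpuse checks show the two strategies for getting the i128 into floating-point registers: the base ISA loads a GPR pair with lpq and round-trips through a stack slot into %f0/%f2, while z13 builds the value in vector registers with vlvgp/vrepg. A shape-only sketch of the AtomicExpand rewrite (assumed form, aligned case):

  define fp128 @load_seq_cst(ptr %src) {
    ; before AtomicExpand: %val = load atomic fp128, ptr %src seq_cst, align 16
    %bits = load atomic i128, ptr %src seq_cst, align 16   ; becomes lpq
    %val = bitcast i128 %bits to fp128
    ret fp128 %val
  }
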
diff --git a/llvm/test/CodeGen/SystemZ/atomic-store-08.ll b/llvm/test/CodeGen/SystemZ/atomic-store-08.ll
index f7f4f4d967db..4d1693477f01 100644
--- a/llvm/test/CodeGen/SystemZ/atomic-store-08.ll
+++ b/llvm/test/CodeGen/SystemZ/atomic-store-08.ll
@@ -1,8 +1,8 @@
; Test long double atomic stores. The atomic store is converted to i128 by
; the AtomicExpand pass.
;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
-; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck -check-prefixes=CHECK,BASE %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck -check-prefixes=CHECK,Z13 %s
define void @f1(ptr %dst, ptr %src) {
; CHECK-LABEL: f1:
@@ -17,6 +17,29 @@ define void @f1(ptr %dst, ptr %src) {
ret void
}
+define void @f1_fpsrc(ptr %dst, ptr %src) {
+; CHECK-LABEL: f1_fpsrc:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ld %f0, 0(%r3)
+; CHECK-NEXT: ld %f2, 8(%r3)
+; CHECK-NEXT: axbr %f0, %f0
+
+; BASE-NEXT: lgdr %r1, %f2
+; BASE-NEXT: lgdr %r0, %f0
+
+; Z13-NEXT: vmrhg %v0, %v0, %v2
+; Z13-NEXT: vlgvg %r1, %v0, 1
+; Z13-NEXT: vlgvg %r0, %v0, 0
+
+; CHECK-NEXT: stpq %r0, 0(%r2)
+; CHECK-NEXT: bcr 15, %r0
+; CHECK-NEXT: br %r14
+ %val = load fp128, ptr %src, align 8
+ %add = fadd fp128 %val, %val
+ store atomic fp128 %add, ptr %dst seq_cst, align 16
+ ret void
+}
+
define void @f2(ptr %dst, ptr %src) {
; CHECK-LABEL: f2:
; CHECK: brasl %r14, __atomic_store@PLT
@@ -24,3 +47,27 @@ define void @f2(ptr %dst, ptr %src) {
store atomic fp128 %val, ptr %dst seq_cst, align 8
ret void
}
+
+define void @f2_fpuse(ptr %dst, ptr %src) {
+; CHECK-LABEL: f2_fpuse:
+; CHECK: # %bb.0:
+; CHECK-NEXT: stmg %r14, %r15, 112(%r15)
+; CHECK-NEXT: .cfi_offset %r14, -48
+; CHECK-NEXT: .cfi_offset %r15, -40
+; CHECK-NEXT: aghi %r15, -176
+; CHECK-NEXT: .cfi_def_cfa_offset 336
+; CHECK-NEXT: ld %f0, 0(%r3)
+; CHECK-NEXT: ld %f2, 8(%r3)
+; CHECK-NEXT: lgr %r3, %r2
+; CHECK-NEXT: axbr %f0, %f0
+; CHECK-NEXT: la %r4, 160(%r15)
+; CHECK-NEXT: lghi %r2, 16
+; CHECK-NEXT: lhi %r5, 5
+; CHECK-NEXT: std %f0, 160(%r15)
+; CHECK-NEXT: std %f2, 168(%r15)
+; CHECK-NEXT: brasl %r14, __atomic_store@PLT
+ %val = load fp128, ptr %src, align 8
+ %add = fadd fp128 %val, %val
+ store atomic fp128 %add, ptr %dst seq_cst, align 8
+ ret void
+}
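
The store side mirrors the load side: with 16-byte alignment the fp128 value is bitcast to i128 and stored with stpq, followed by a serializing bcr for the seq_cst ordering, whereas the under-aligned f2/f2_fpuse cases stay a __atomic_store libcall whose argument setup is visible in the checks (size 16 in %r2, destination in %r3, a stack temporary in %r4, ordering 5 = seq_cst in %r5). A sketch of the inline-expanded case (assumed form):

  define void @store_seq_cst(ptr %dst, fp128 %v) {
    %bits = bitcast fp128 %v to i128
    store atomic i128 %bits, ptr %dst seq_cst, align 16   ; becomes stpq + bcr
    ret void
  }
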
diff --git a/llvm/test/CodeGen/SystemZ/atomicrmw-xchg-07.ll b/llvm/test/CodeGen/SystemZ/atomicrmw-xchg-07.ll
index 18fa89e6ca6c..f5d8dc092a7e 100644
--- a/llvm/test/CodeGen/SystemZ/atomicrmw-xchg-07.ll
+++ b/llvm/test/CodeGen/SystemZ/atomicrmw-xchg-07.ll
@@ -26,3 +26,45 @@ define void @f1(ptr align 16 %ret, ptr align 16 %src, ptr align 16 %b) {
store fp128 %res, ptr %ret, align 16
ret void
}
+
+define void @f1_fpuse(ptr align 16 %ret, ptr align 16 %src, ptr align 16 %b) {
+; CHECK-LABEL: f1_fpuse:
+; CHECK: # %bb.0:
+; CHECK-NEXT: stmg %r12, %r15, 96(%r15)
+; CHECK-NEXT: .cfi_offset %r12, -64
+; CHECK-NEXT: .cfi_offset %r13, -56
+; CHECK-NEXT: .cfi_offset %r15, -40
+; CHECK-NEXT: aghi %r15, -176
+; CHECK-NEXT: .cfi_def_cfa_offset 336
+; CHECK-NEXT: ld %f0, 0(%r4)
+; CHECK-NEXT: ld %f2, 8(%r4)
+; CHECK-NEXT: lg %r0, 8(%r3)
+; CHECK-NEXT: lg %r1, 0(%r3)
+; CHECK-NEXT: axbr %f0, %f0
+; CHECK-NEXT: lgdr %r5, %f2
+; CHECK-NEXT: lgdr %r4, %f0
+; CHECK-NEXT: .LBB1_1: # %atomicrmw.start
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: lgr %r12, %r1
+; CHECK-NEXT: lgr %r13, %r0
+; CHECK-NEXT: cdsg %r12, %r4, 0(%r3)
+; CHECK-NEXT: lgr %r0, %r13
+; CHECK-NEXT: lgr %r1, %r12
+; CHECK-NEXT: jl .LBB1_1
+; CHECK-NEXT: # %bb.2: # %atomicrmw.end
+; CHECK-NEXT: stg %r1, 160(%r15)
+; CHECK-NEXT: stg %r0, 168(%r15)
+; CHECK-NEXT: ld %f0, 160(%r15)
+; CHECK-NEXT: ld %f2, 168(%r15)
+; CHECK-NEXT: axbr %f0, %f0
+; CHECK-NEXT: std %f0, 0(%r2)
+; CHECK-NEXT: std %f2, 8(%r2)
+; CHECK-NEXT: lmg %r12, %r15, 272(%r15)
+; CHECK-NEXT: br %r14
+ %val = load fp128, ptr %b, align 16
+ %add.src = fadd fp128 %val, %val
+ %res = atomicrmw xchg ptr %src, fp128 %add.src seq_cst
+ %res.x2 = fadd fp128 %res, %res
+ store fp128 %res.x2, ptr %ret, align 16
+ ret void
+}
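
The fp128 exchange is likewise carried out on the i128 bit pattern: the checks show the compare-and-swap loop that AtomicExpand emits, retrying cdsg (128-bit compare double and swap) until the stored pair is unchanged, with the floating-point adds before and after only there to force FP-register uses of the operand and result. A simplified IR shape of that loop (assumed; block names taken from the check comments):

  define i128 @xchg_i128(ptr %p, i128 %new) {
  entry:
    %init = load i128, ptr %p, align 16
    br label %atomicrmw.start
  atomicrmw.start:
    %old = phi i128 [ %init, %entry ], [ %loaded, %atomicrmw.start ]
    %pair = cmpxchg ptr %p, i128 %old, i128 %new seq_cst seq_cst, align 16  ; cdsg
    %loaded = extractvalue { i128, i1 } %pair, 0
    %ok = extractvalue { i128, i1 } %pair, 1
    br i1 %ok, label %atomicrmw.end, label %atomicrmw.start
  atomicrmw.end:
    ret i128 %loaded
  }
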
diff --git a/llvm/test/CodeGen/WebAssembly/unreachable.ll b/llvm/test/CodeGen/WebAssembly/unreachable.ll
index 5368c2ba5b8d..ccac31a9af4a 100644
--- a/llvm/test/CodeGen/WebAssembly/unreachable.ll
+++ b/llvm/test/CodeGen/WebAssembly/unreachable.ll
@@ -30,7 +30,6 @@ define void @trap_ret_void() {
; CHECK: .functype trap_ret_void () -> ()
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: unreachable
-; CHECK-NEXT: # fallthrough-return
; CHECK-NEXT: end_function
call void @llvm.trap()
ret void
@@ -54,7 +53,6 @@ define void @trap_unreacheable() {
; CHECK: .functype trap_unreacheable () -> ()
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: unreachable
-; CHECK-NEXT: unreachable
; CHECK-NEXT: end_function
call void @llvm.trap()
unreachable
@@ -94,3 +92,12 @@ define i32 @missing_ret_noreturn_unreachable() {
call void @ext_never_return()
unreachable
}
+
+define i32 @no_crash_for_other_instruction_after_trap(ptr %p, i32 %b) {
+; CHECK-LABEL: no_crash_for_other_instruction_after_trap:
+; CHECK: unreachable
+; CHECK-NEXT: end_function
+ %a = load i32, ptr %p
+ call void @llvm.trap()
+ ret i32 %a
+}
diff --git a/llvm/test/CodeGen/X86/AMX/amx-lower-tile-copy.ll b/llvm/test/CodeGen/X86/AMX/amx-lower-tile-copy.ll
index 4686361ad2fc..a0085afbaf02 100644
--- a/llvm/test/CodeGen/X86/AMX/amx-lower-tile-copy.ll
+++ b/llvm/test/CodeGen/X86/AMX/amx-lower-tile-copy.ll
@@ -44,12 +44,8 @@ define dso_local void @test1(ptr%buf) nounwind {
; CHECK-NEXT: tileloadd 3024(%rsp,%rax), %tmm3 # 1024-byte Folded Reload
; CHECK-NEXT: tileloadd (%rbx,%r15), %tmm0
; CHECK-NEXT: tileloadd (%rbx,%r15), %tmm1
-; CHECK-NEXT: # implicit-def: $rax
-; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT: movabsq $64, %rax
; CHECK-NEXT: tilestored %tmm3, 1024(%rsp,%rax) # 1024-byte Folded Spill
; CHECK-NEXT: tileloadd {{[-0-9]+}}(%r{{[sb]}}p), %tmm2 # 1024-byte Folded Reload
-; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; CHECK-NEXT: tdpbssd %tmm1, %tmm0, %tmm2
; CHECK-NEXT: tilestored %tmm2, (%rbx,%r15)
; CHECK-NEXT: incl %r14d
@@ -111,16 +107,10 @@ define dso_local void @test1(ptr%buf) nounwind {
; EGPR-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7b,0x4b,0x9c,0x04,0xd0,0x0b,0x00,0x00]
; EGPR-NEXT: tileloadd (%rbx,%r15), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xa2,0x7b,0x4b,0x04,0x3b]
; EGPR-NEXT: tileloadd (%rbx,%r15), %tmm1 # EVEX TO VEX Compression encoding: [0xc4,0xa2,0x7b,0x4b,0x0c,0x3b]
-; EGPR-NEXT: # implicit-def: $rax
-; EGPR-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NEXT: # encoding: [0x48,0x89,0x84,0x24,0xb8,0x03,0x00,0x00]
-; EGPR-NEXT: movabsq $64, %rax # encoding: [0x48,0xb8,0x40,0x00,0x00,0x00,0x00,0x00,0x00,0x00]
; EGPR-NEXT: tilestored %tmm3, 1024(%rsp,%rax) # 1024-byte Folded Spill
; EGPR-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x9c,0x04,0x00,0x04,0x00,0x00]
; EGPR-NEXT: tileloadd {{[-0-9]+}}(%r{{[sb]}}p), %tmm2 # 1024-byte Folded Reload
; EGPR-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7b,0x4b,0x94,0x24,0x00,0x04,0x00,0x00]
-; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; EGPR-NEXT: # encoding: [0x48,0x8b,0x84,0x24,0xb8,0x03,0x00,0x00]
; EGPR-NEXT: tdpbssd %tmm1, %tmm0, %tmm2 # encoding: [0xc4,0xe2,0x73,0x5e,0xd0]
; EGPR-NEXT: tilestored %tmm2, (%rbx,%r15) # EVEX TO VEX Compression encoding: [0xc4,0xa2,0x7a,0x4b,0x14,0x3b]
; EGPR-NEXT: incl %r14d # encoding: [0x41,0xff,0xc6]
diff --git a/llvm/test/CodeGen/X86/abdu-vector-128.ll b/llvm/test/CodeGen/X86/abdu-vector-128.ll
index dd180b67e492..0c33e8973c2d 100644
--- a/llvm/test/CodeGen/X86/abdu-vector-128.ll
+++ b/llvm/test/CodeGen/X86/abdu-vector-128.ll
@@ -715,43 +715,41 @@ define <2 x i64> @abd_cmp_v2i64_multiuse_cmp(<2 x i64> %a, <2 x i64> %b) nounwin
;
; SSE42-LABEL: abd_cmp_v2i64_multiuse_cmp:
; SSE42: # %bb.0:
-; SSE42-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; SSE42-NEXT: movdqa %xmm0, %xmm2
+; SSE42-NEXT: psubq %xmm1, %xmm2
; SSE42-NEXT: movdqa %xmm1, %xmm3
-; SSE42-NEXT: pxor %xmm2, %xmm3
-; SSE42-NEXT: pxor %xmm0, %xmm2
-; SSE42-NEXT: pcmpgtq %xmm3, %xmm2
-; SSE42-NEXT: movdqa %xmm0, %xmm3
-; SSE42-NEXT: psubq %xmm1, %xmm3
-; SSE42-NEXT: psubq %xmm0, %xmm1
-; SSE42-NEXT: movdqa %xmm2, %xmm0
-; SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1
-; SSE42-NEXT: paddq %xmm1, %xmm2
-; SSE42-NEXT: movdqa %xmm2, %xmm0
+; SSE42-NEXT: psubq %xmm0, %xmm3
+; SSE42-NEXT: movdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
+; SSE42-NEXT: pxor %xmm4, %xmm1
+; SSE42-NEXT: pxor %xmm4, %xmm0
+; SSE42-NEXT: pcmpgtq %xmm1, %xmm0
+; SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3
+; SSE42-NEXT: paddq %xmm3, %xmm0
; SSE42-NEXT: retq
;
; AVX1-LABEL: abd_cmp_v2i64_multiuse_cmp:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; AVX1-NEXT: # xmm2 = mem[0,0]
-; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3
-; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm2
-; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm3
-; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vblendvpd %xmm2, %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpaddq %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm3
+; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm4 = mem[0,0]
+; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vblendvpd %xmm0, %xmm2, %xmm3, %xmm1
+; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: abd_cmp_v2i64_multiuse_cmp:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3
-; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2
-; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm3
-; AVX2-NEXT: vpsubq %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vblendvpd %xmm2, %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vpaddq %xmm0, %xmm2, %xmm0
+; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm2
+; AVX2-NEXT: vpsubq %xmm0, %xmm1, %xmm3
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vpxor %xmm4, %xmm1, %xmm1
+; AVX2-NEXT: vpxor %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vblendvpd %xmm0, %xmm2, %xmm3, %xmm1
+; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: abd_cmp_v2i64_multiuse_cmp:
diff --git a/llvm/test/CodeGen/X86/apx/kmov-postrapseudos.ll b/llvm/test/CodeGen/X86/apx/kmov-postrapseudos.ll
index 017024c173c3..b2cb2c3e04b3 100644
--- a/llvm/test/CodeGen/X86/apx/kmov-postrapseudos.ll
+++ b/llvm/test/CodeGen/X86/apx/kmov-postrapseudos.ll
@@ -52,10 +52,7 @@ alloca_21:
define i32 @kmovrk_1(<4 x ptr> %arg) {
; AVX512-LABEL: kmovrk_1:
; AVX512: # %bb.0: # %bb
-; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512-NEXT: vptestmq %zmm0, %zmm0, %k0 # encoding: [0x62,0xf2,0xfd,0x48,0x27,0xc0]
-; AVX512-NEXT: kmovw %k0, %eax # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x93,0xc0]
-; AVX512-NEXT: testb $15, %al # encoding: [0xa8,0x0f]
+; AVX512-NEXT: vptest %ymm0, %ymm0 # encoding: [0xc4,0xe2,0x7d,0x17,0xc0]
; AVX512-NEXT: jne .LBB2_1 # encoding: [0x75,A]
; AVX512-NEXT: # fixup A - offset: 1, value: .LBB2_1-1, kind: FK_PCRel_1
; AVX512-NEXT: # %bb.2: # %bb3
@@ -66,10 +63,7 @@ define i32 @kmovrk_1(<4 x ptr> %arg) {
;
; AVX512BW-LABEL: kmovrk_1:
; AVX512BW: # %bb.0: # %bb
-; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512BW-NEXT: vptestmq %zmm0, %zmm0, %k0 # encoding: [0x62,0xf2,0xfd,0x48,0x27,0xc0]
-; AVX512BW-NEXT: kmovd %k0, %eax # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x93,0xc0]
-; AVX512BW-NEXT: testb $15, %al # encoding: [0xa8,0x0f]
+; AVX512BW-NEXT: vptest %ymm0, %ymm0 # encoding: [0xc4,0xe2,0x7d,0x17,0xc0]
; AVX512BW-NEXT: jne .LBB2_1 # encoding: [0x75,A]
; AVX512BW-NEXT: # fixup A - offset: 1, value: .LBB2_1-1, kind: FK_PCRel_1
; AVX512BW-NEXT: # %bb.2: # %bb3
diff --git a/llvm/test/CodeGen/X86/avgceils.ll b/llvm/test/CodeGen/X86/avgceils.ll
index 4529ea275df9..f44f98c2a41a 100644
--- a/llvm/test/CodeGen/X86/avgceils.ll
+++ b/llvm/test/CodeGen/X86/avgceils.ll
@@ -9,7 +9,7 @@
; 128-bit vectors
;
-define <16 x i8> @test_fixed_v16i8(<16 x i8> %a0, <16 x i8> %a1) {
+define <16 x i8> @test_fixed_v16i8(<16 x i8> %a0, <16 x i8> %a1) nounwind {
; SSE-LABEL: test_fixed_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm2
@@ -65,7 +65,7 @@ define <16 x i8> @test_fixed_v16i8(<16 x i8> %a0, <16 x i8> %a1) {
ret <16 x i8> %res
}
-define <16 x i8> @test_ext_v16i8(<16 x i8> %a0, <16 x i8> %a1) {
+define <16 x i8> @test_ext_v16i8(<16 x i8> %a0, <16 x i8> %a1) nounwind {
; SSE2-LABEL: test_ext_v16i8:
; SSE2: # %bb.0:
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
@@ -165,7 +165,7 @@ define <16 x i8> @test_ext_v16i8(<16 x i8> %a0, <16 x i8> %a1) {
ret <16 x i8> %res
}
-define <8 x i16> @test_fixed_v8i16(<8 x i16> %a0, <8 x i16> %a1) {
+define <8 x i16> @test_fixed_v8i16(<8 x i16> %a0, <8 x i16> %a1) nounwind {
; SSE-LABEL: test_fixed_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm2
@@ -190,7 +190,7 @@ define <8 x i16> @test_fixed_v8i16(<8 x i16> %a0, <8 x i16> %a1) {
ret <8 x i16> %res
}
-define <8 x i16> @test_ext_v8i16(<8 x i16> %a0, <8 x i16> %a1) {
+define <8 x i16> @test_ext_v8i16(<8 x i16> %a0, <8 x i16> %a1) nounwind {
; SSE2-LABEL: test_ext_v8i16:
; SSE2: # %bb.0:
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
@@ -289,7 +289,7 @@ define <8 x i16> @test_ext_v8i16(<8 x i16> %a0, <8 x i16> %a1) {
ret <8 x i16> %res
}
-define <4 x i32> @test_fixed_v4i32(<4 x i32> %a0, <4 x i32> %a1) {
+define <4 x i32> @test_fixed_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
; SSE-LABEL: test_fixed_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm2
@@ -314,7 +314,7 @@ define <4 x i32> @test_fixed_v4i32(<4 x i32> %a0, <4 x i32> %a1) {
ret <4 x i32> %res
}
-define <4 x i32> @test_ext_v4i32(<4 x i32> %a0, <4 x i32> %a1) {
+define <4 x i32> @test_ext_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
; SSE2-LABEL: test_ext_v4i32:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
@@ -410,7 +410,7 @@ define <4 x i32> @test_ext_v4i32(<4 x i32> %a0, <4 x i32> %a1) {
ret <4 x i32> %res
}
-define <2 x i64> @test_fixed_v2i64(<2 x i64> %a0, <2 x i64> %a1) {
+define <2 x i64> @test_fixed_v2i64(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; SSE2-LABEL: test_fixed_v2i64:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm2
@@ -472,7 +472,7 @@ define <2 x i64> @test_fixed_v2i64(<2 x i64> %a0, <2 x i64> %a1) {
ret <2 x i64> %res
}
-define <2 x i64> @test_ext_v2i64(<2 x i64> %a0, <2 x i64> %a1) {
+define <2 x i64> @test_ext_v2i64(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; SSE2-LABEL: test_ext_v2i64:
; SSE2: # %bb.0:
; SSE2-NEXT: movq %xmm0, %rax
@@ -574,7 +574,7 @@ define <2 x i64> @test_ext_v2i64(<2 x i64> %a0, <2 x i64> %a1) {
; 256-bit vectors
;
-define <32 x i8> @test_fixed_v32i8(<32 x i8> %a0, <32 x i8> %a1) {
+define <32 x i8> @test_fixed_v32i8(<32 x i8> %a0, <32 x i8> %a1) nounwind {
; SSE-LABEL: test_fixed_v32i8:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm1, %xmm4
@@ -649,7 +649,7 @@ define <32 x i8> @test_fixed_v32i8(<32 x i8> %a0, <32 x i8> %a1) {
ret <32 x i8> %res
}
-define <32 x i8> @test_ext_v32i8(<32 x i8> %a0, <32 x i8> %a1) {
+define <32 x i8> @test_ext_v32i8(<32 x i8> %a0, <32 x i8> %a1) nounwind {
; SSE2-LABEL: test_ext_v32i8:
; SSE2: # %bb.0:
; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
@@ -806,7 +806,7 @@ define <32 x i8> @test_ext_v32i8(<32 x i8> %a0, <32 x i8> %a1) {
ret <32 x i8> %res
}
-define <16 x i16> @test_fixed_v16i16(<16 x i16> %a0, <16 x i16> %a1) {
+define <16 x i16> @test_fixed_v16i16(<16 x i16> %a0, <16 x i16> %a1) nounwind {
; SSE-LABEL: test_fixed_v16i16:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm1, %xmm4
@@ -858,7 +858,7 @@ define <16 x i16> @test_fixed_v16i16(<16 x i16> %a0, <16 x i16> %a1) {
ret <16 x i16> %res
}
-define <16 x i16> @test_ext_v16i16(<16 x i16> %a0, <16 x i16> %a1) {
+define <16 x i16> @test_ext_v16i16(<16 x i16> %a0, <16 x i16> %a1) nounwind {
; SSE2-LABEL: test_ext_v16i16:
; SSE2: # %bb.0:
; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
@@ -1014,7 +1014,7 @@ define <16 x i16> @test_ext_v16i16(<16 x i16> %a0, <16 x i16> %a1) {
ret <16 x i16> %res
}
-define <8 x i32> @test_fixed_v8i32(<8 x i32> %a0, <8 x i32> %a1) {
+define <8 x i32> @test_fixed_v8i32(<8 x i32> %a0, <8 x i32> %a1) nounwind {
; SSE-LABEL: test_fixed_v8i32:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm1, %xmm4
@@ -1066,7 +1066,7 @@ define <8 x i32> @test_fixed_v8i32(<8 x i32> %a0, <8 x i32> %a1) {
ret <8 x i32> %res
}
-define <8 x i32> @test_ext_v8i32(<8 x i32> %a0, <8 x i32> %a1) {
+define <8 x i32> @test_ext_v8i32(<8 x i32> %a0, <8 x i32> %a1) nounwind {
; SSE2-LABEL: test_ext_v8i32:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[2,3,2,3]
@@ -1218,7 +1218,7 @@ define <8 x i32> @test_ext_v8i32(<8 x i32> %a0, <8 x i32> %a1) {
ret <8 x i32> %res
}
-define <4 x i64> @test_fixed_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
+define <4 x i64> @test_fixed_v4i64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; SSE2-LABEL: test_fixed_v4i64:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm1, %xmm4
@@ -1306,27 +1306,15 @@ define <4 x i64> @test_fixed_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
ret <4 x i64> %res
}
-define <4 x i64> @test_ext_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
+define <4 x i64> @test_ext_v4i64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; SSE2-LABEL: test_ext_v4i64:
; SSE2: # %bb.0:
; SSE2-NEXT: pushq %rbp
-; SSE2-NEXT: .cfi_def_cfa_offset 16
; SSE2-NEXT: pushq %r15
-; SSE2-NEXT: .cfi_def_cfa_offset 24
; SSE2-NEXT: pushq %r14
-; SSE2-NEXT: .cfi_def_cfa_offset 32
; SSE2-NEXT: pushq %r13
-; SSE2-NEXT: .cfi_def_cfa_offset 40
; SSE2-NEXT: pushq %r12
-; SSE2-NEXT: .cfi_def_cfa_offset 48
; SSE2-NEXT: pushq %rbx
-; SSE2-NEXT: .cfi_def_cfa_offset 56
-; SSE2-NEXT: .cfi_offset %rbx, -56
-; SSE2-NEXT: .cfi_offset %r12, -48
-; SSE2-NEXT: .cfi_offset %r13, -40
-; SSE2-NEXT: .cfi_offset %r14, -32
-; SSE2-NEXT: .cfi_offset %r15, -24
-; SSE2-NEXT: .cfi_offset %rbp, -16
; SSE2-NEXT: movq %xmm0, %r11
; SSE2-NEXT: movq %r11, %r12
; SSE2-NEXT: sarq $63, %r12
@@ -1382,39 +1370,21 @@ define <4 x i64> @test_ext_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; SSE2-NEXT: popq %rbx
-; SSE2-NEXT: .cfi_def_cfa_offset 48
; SSE2-NEXT: popq %r12
-; SSE2-NEXT: .cfi_def_cfa_offset 40
; SSE2-NEXT: popq %r13
-; SSE2-NEXT: .cfi_def_cfa_offset 32
; SSE2-NEXT: popq %r14
-; SSE2-NEXT: .cfi_def_cfa_offset 24
; SSE2-NEXT: popq %r15
-; SSE2-NEXT: .cfi_def_cfa_offset 16
; SSE2-NEXT: popq %rbp
-; SSE2-NEXT: .cfi_def_cfa_offset 8
; SSE2-NEXT: retq
;
; SSE4-LABEL: test_ext_v4i64:
; SSE4: # %bb.0:
; SSE4-NEXT: pushq %rbp
-; SSE4-NEXT: .cfi_def_cfa_offset 16
; SSE4-NEXT: pushq %r15
-; SSE4-NEXT: .cfi_def_cfa_offset 24
; SSE4-NEXT: pushq %r14
-; SSE4-NEXT: .cfi_def_cfa_offset 32
; SSE4-NEXT: pushq %r13
-; SSE4-NEXT: .cfi_def_cfa_offset 40
; SSE4-NEXT: pushq %r12
-; SSE4-NEXT: .cfi_def_cfa_offset 48
; SSE4-NEXT: pushq %rbx
-; SSE4-NEXT: .cfi_def_cfa_offset 56
-; SSE4-NEXT: .cfi_offset %rbx, -56
-; SSE4-NEXT: .cfi_offset %r12, -48
-; SSE4-NEXT: .cfi_offset %r13, -40
-; SSE4-NEXT: .cfi_offset %r14, -32
-; SSE4-NEXT: .cfi_offset %r15, -24
-; SSE4-NEXT: .cfi_offset %rbp, -16
; SSE4-NEXT: pextrq $1, %xmm0, %r11
; SSE4-NEXT: movq %r11, %r12
; SSE4-NEXT: sarq $63, %r12
@@ -1466,39 +1436,21 @@ define <4 x i64> @test_ext_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
; SSE4-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSE4-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; SSE4-NEXT: popq %rbx
-; SSE4-NEXT: .cfi_def_cfa_offset 48
; SSE4-NEXT: popq %r12
-; SSE4-NEXT: .cfi_def_cfa_offset 40
; SSE4-NEXT: popq %r13
-; SSE4-NEXT: .cfi_def_cfa_offset 32
; SSE4-NEXT: popq %r14
-; SSE4-NEXT: .cfi_def_cfa_offset 24
; SSE4-NEXT: popq %r15
-; SSE4-NEXT: .cfi_def_cfa_offset 16
; SSE4-NEXT: popq %rbp
-; SSE4-NEXT: .cfi_def_cfa_offset 8
; SSE4-NEXT: retq
;
; AVX1-LABEL: test_ext_v4i64:
; AVX1: # %bb.0:
; AVX1-NEXT: pushq %rbp
-; AVX1-NEXT: .cfi_def_cfa_offset 16
; AVX1-NEXT: pushq %r15
-; AVX1-NEXT: .cfi_def_cfa_offset 24
; AVX1-NEXT: pushq %r14
-; AVX1-NEXT: .cfi_def_cfa_offset 32
; AVX1-NEXT: pushq %r13
-; AVX1-NEXT: .cfi_def_cfa_offset 40
; AVX1-NEXT: pushq %r12
-; AVX1-NEXT: .cfi_def_cfa_offset 48
; AVX1-NEXT: pushq %rbx
-; AVX1-NEXT: .cfi_def_cfa_offset 56
-; AVX1-NEXT: .cfi_offset %rbx, -56
-; AVX1-NEXT: .cfi_offset %r12, -48
-; AVX1-NEXT: .cfi_offset %r13, -40
-; AVX1-NEXT: .cfi_offset %r14, -32
-; AVX1-NEXT: .cfi_offset %r15, -24
-; AVX1-NEXT: .cfi_offset %rbp, -16
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpextrq $1, %xmm2, %r11
; AVX1-NEXT: movq %r11, %r12
@@ -1553,39 +1505,21 @@ define <4 x i64> @test_ext_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: popq %rbx
-; AVX1-NEXT: .cfi_def_cfa_offset 48
; AVX1-NEXT: popq %r12
-; AVX1-NEXT: .cfi_def_cfa_offset 40
; AVX1-NEXT: popq %r13
-; AVX1-NEXT: .cfi_def_cfa_offset 32
; AVX1-NEXT: popq %r14
-; AVX1-NEXT: .cfi_def_cfa_offset 24
; AVX1-NEXT: popq %r15
-; AVX1-NEXT: .cfi_def_cfa_offset 16
; AVX1-NEXT: popq %rbp
-; AVX1-NEXT: .cfi_def_cfa_offset 8
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_ext_v4i64:
; AVX2: # %bb.0:
; AVX2-NEXT: pushq %rbp
-; AVX2-NEXT: .cfi_def_cfa_offset 16
; AVX2-NEXT: pushq %r15
-; AVX2-NEXT: .cfi_def_cfa_offset 24
; AVX2-NEXT: pushq %r14
-; AVX2-NEXT: .cfi_def_cfa_offset 32
; AVX2-NEXT: pushq %r13
-; AVX2-NEXT: .cfi_def_cfa_offset 40
; AVX2-NEXT: pushq %r12
-; AVX2-NEXT: .cfi_def_cfa_offset 48
; AVX2-NEXT: pushq %rbx
-; AVX2-NEXT: .cfi_def_cfa_offset 56
-; AVX2-NEXT: .cfi_offset %rbx, -56
-; AVX2-NEXT: .cfi_offset %r12, -48
-; AVX2-NEXT: .cfi_offset %r13, -40
-; AVX2-NEXT: .cfi_offset %r14, -32
-; AVX2-NEXT: .cfi_offset %r15, -24
-; AVX2-NEXT: .cfi_offset %rbp, -16
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX2-NEXT: vpextrq $1, %xmm2, %r11
; AVX2-NEXT: movq %r11, %r12
@@ -1640,39 +1574,21 @@ define <4 x i64> @test_ext_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT: popq %rbx
-; AVX2-NEXT: .cfi_def_cfa_offset 48
; AVX2-NEXT: popq %r12
-; AVX2-NEXT: .cfi_def_cfa_offset 40
; AVX2-NEXT: popq %r13
-; AVX2-NEXT: .cfi_def_cfa_offset 32
; AVX2-NEXT: popq %r14
-; AVX2-NEXT: .cfi_def_cfa_offset 24
; AVX2-NEXT: popq %r15
-; AVX2-NEXT: .cfi_def_cfa_offset 16
; AVX2-NEXT: popq %rbp
-; AVX2-NEXT: .cfi_def_cfa_offset 8
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_ext_v4i64:
; AVX512: # %bb.0:
; AVX512-NEXT: pushq %rbp
-; AVX512-NEXT: .cfi_def_cfa_offset 16
; AVX512-NEXT: pushq %r15
-; AVX512-NEXT: .cfi_def_cfa_offset 24
; AVX512-NEXT: pushq %r14
-; AVX512-NEXT: .cfi_def_cfa_offset 32
; AVX512-NEXT: pushq %r13
-; AVX512-NEXT: .cfi_def_cfa_offset 40
; AVX512-NEXT: pushq %r12
-; AVX512-NEXT: .cfi_def_cfa_offset 48
; AVX512-NEXT: pushq %rbx
-; AVX512-NEXT: .cfi_def_cfa_offset 56
-; AVX512-NEXT: .cfi_offset %rbx, -56
-; AVX512-NEXT: .cfi_offset %r12, -48
-; AVX512-NEXT: .cfi_offset %r13, -40
-; AVX512-NEXT: .cfi_offset %r14, -32
-; AVX512-NEXT: .cfi_offset %r15, -24
-; AVX512-NEXT: .cfi_offset %rbp, -16
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512-NEXT: vpextrq $1, %xmm2, %r11
; AVX512-NEXT: movq %r11, %r12
@@ -1727,17 +1643,11 @@ define <4 x i64> @test_ext_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX512-NEXT: popq %rbx
-; AVX512-NEXT: .cfi_def_cfa_offset 48
; AVX512-NEXT: popq %r12
-; AVX512-NEXT: .cfi_def_cfa_offset 40
; AVX512-NEXT: popq %r13
-; AVX512-NEXT: .cfi_def_cfa_offset 32
; AVX512-NEXT: popq %r14
-; AVX512-NEXT: .cfi_def_cfa_offset 24
; AVX512-NEXT: popq %r15
-; AVX512-NEXT: .cfi_def_cfa_offset 16
; AVX512-NEXT: popq %rbp
-; AVX512-NEXT: .cfi_def_cfa_offset 8
; AVX512-NEXT: retq
%x0 = sext <4 x i64> %a0 to <4 x i128>
%x1 = sext <4 x i64> %a1 to <4 x i128>
@@ -1752,7 +1662,7 @@ define <4 x i64> @test_ext_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
; 512-bit vectors
;
-define <64 x i8> @test_fixed_v64i8(<64 x i8> %a0, <64 x i8> %a1) {
+define <64 x i8> @test_fixed_v64i8(<64 x i8> %a0, <64 x i8> %a1) nounwind {
; SSE-LABEL: test_fixed_v64i8:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm3, %xmm11
@@ -1864,7 +1774,7 @@ define <64 x i8> @test_fixed_v64i8(<64 x i8> %a0, <64 x i8> %a1) {
ret <64 x i8> %res
}
-define <64 x i8> @test_ext_v64i8(<64 x i8> %a0, <64 x i8> %a1) {
+define <64 x i8> @test_ext_v64i8(<64 x i8> %a0, <64 x i8> %a1) nounwind {
; SSE2-LABEL: test_ext_v64i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm3, %xmm8
@@ -2144,7 +2054,7 @@ define <64 x i8> @test_ext_v64i8(<64 x i8> %a0, <64 x i8> %a1) {
ret <64 x i8> %res
}
-define <32 x i16> @test_fixed_v32i16(<32 x i16> %a0, <32 x i16> %a1) {
+define <32 x i16> @test_fixed_v32i16(<32 x i16> %a0, <32 x i16> %a1) nounwind {
; SSE-LABEL: test_fixed_v32i16:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm3, %xmm8
@@ -2220,7 +2130,7 @@ define <32 x i16> @test_fixed_v32i16(<32 x i16> %a0, <32 x i16> %a1) {
ret <32 x i16> %res
}
-define <32 x i16> @test_ext_v32i16(<32 x i16> %a0, <32 x i16> %a1) {
+define <32 x i16> @test_ext_v32i16(<32 x i16> %a0, <32 x i16> %a1) nounwind {
; SSE2-LABEL: test_ext_v32i16:
; SSE2: # %bb.0:
; SSE2-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3]
@@ -2498,7 +2408,7 @@ define <32 x i16> @test_ext_v32i16(<32 x i16> %a0, <32 x i16> %a1) {
ret <32 x i16> %res
}
-define <16 x i32> @test_fixed_v16i32(<16 x i32> %a0, <16 x i32> %a1) {
+define <16 x i32> @test_fixed_v16i32(<16 x i32> %a0, <16 x i32> %a1) nounwind {
; SSE-LABEL: test_fixed_v16i32:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm3, %xmm8
@@ -2574,7 +2484,7 @@ define <16 x i32> @test_fixed_v16i32(<16 x i32> %a0, <16 x i32> %a1) {
ret <16 x i32> %res
}
-define <16 x i32> @test_ext_v16i32(<16 x i32> %a0, <16 x i32> %a1) {
+define <16 x i32> @test_ext_v16i32(<16 x i32> %a0, <16 x i32> %a1) nounwind {
; SSE2-LABEL: test_ext_v16i32:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm0[2,3,2,3]
@@ -2848,7 +2758,7 @@ define <16 x i32> @test_ext_v16i32(<16 x i32> %a0, <16 x i32> %a1) {
ret <16 x i32> %res
}
-define <8 x i64> @test_fixed_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
+define <8 x i64> @test_fixed_v8i64(<8 x i64> %a0, <8 x i64> %a1) nounwind {
; SSE2-LABEL: test_fixed_v8i64:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm3, %xmm8
@@ -2985,29 +2895,16 @@ define <8 x i64> @test_fixed_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
ret <8 x i64> %res
}
-define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
+define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) nounwind {
; SSE2-LABEL: test_ext_v8i64:
; SSE2: # %bb.0:
; SSE2-NEXT: pushq %rbp
-; SSE2-NEXT: .cfi_def_cfa_offset 16
; SSE2-NEXT: pushq %r15
-; SSE2-NEXT: .cfi_def_cfa_offset 24
; SSE2-NEXT: pushq %r14
-; SSE2-NEXT: .cfi_def_cfa_offset 32
; SSE2-NEXT: pushq %r13
-; SSE2-NEXT: .cfi_def_cfa_offset 40
; SSE2-NEXT: pushq %r12
-; SSE2-NEXT: .cfi_def_cfa_offset 48
; SSE2-NEXT: pushq %rbx
-; SSE2-NEXT: .cfi_def_cfa_offset 56
; SSE2-NEXT: pushq %rax
-; SSE2-NEXT: .cfi_def_cfa_offset 64
-; SSE2-NEXT: .cfi_offset %rbx, -56
-; SSE2-NEXT: .cfi_offset %r12, -48
-; SSE2-NEXT: .cfi_offset %r13, -40
-; SSE2-NEXT: .cfi_offset %r14, -32
-; SSE2-NEXT: .cfi_offset %r15, -24
-; SSE2-NEXT: .cfi_offset %rbp, -16
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE2-NEXT: sarq $63, %rax
@@ -3137,43 +3034,23 @@ define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm6[0]
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm7[0]
; SSE2-NEXT: addq $8, %rsp
-; SSE2-NEXT: .cfi_def_cfa_offset 56
; SSE2-NEXT: popq %rbx
-; SSE2-NEXT: .cfi_def_cfa_offset 48
; SSE2-NEXT: popq %r12
-; SSE2-NEXT: .cfi_def_cfa_offset 40
; SSE2-NEXT: popq %r13
-; SSE2-NEXT: .cfi_def_cfa_offset 32
; SSE2-NEXT: popq %r14
-; SSE2-NEXT: .cfi_def_cfa_offset 24
; SSE2-NEXT: popq %r15
-; SSE2-NEXT: .cfi_def_cfa_offset 16
; SSE2-NEXT: popq %rbp
-; SSE2-NEXT: .cfi_def_cfa_offset 8
; SSE2-NEXT: retq
;
; SSE4-LABEL: test_ext_v8i64:
; SSE4: # %bb.0:
; SSE4-NEXT: pushq %rbp
-; SSE4-NEXT: .cfi_def_cfa_offset 16
; SSE4-NEXT: pushq %r15
-; SSE4-NEXT: .cfi_def_cfa_offset 24
; SSE4-NEXT: pushq %r14
-; SSE4-NEXT: .cfi_def_cfa_offset 32
; SSE4-NEXT: pushq %r13
-; SSE4-NEXT: .cfi_def_cfa_offset 40
; SSE4-NEXT: pushq %r12
-; SSE4-NEXT: .cfi_def_cfa_offset 48
; SSE4-NEXT: pushq %rbx
-; SSE4-NEXT: .cfi_def_cfa_offset 56
; SSE4-NEXT: subq $16, %rsp
-; SSE4-NEXT: .cfi_def_cfa_offset 72
-; SSE4-NEXT: .cfi_offset %rbx, -56
-; SSE4-NEXT: .cfi_offset %r12, -48
-; SSE4-NEXT: .cfi_offset %r13, -40
-; SSE4-NEXT: .cfi_offset %r14, -32
-; SSE4-NEXT: .cfi_offset %r15, -24
-; SSE4-NEXT: .cfi_offset %rbp, -16
; SSE4-NEXT: pextrq $1, %xmm0, %rax
; SSE4-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE4-NEXT: sarq $63, %rax
@@ -3301,43 +3178,23 @@ define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
; SSE4-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm6[0]
; SSE4-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm7[0]
; SSE4-NEXT: addq $16, %rsp
-; SSE4-NEXT: .cfi_def_cfa_offset 56
; SSE4-NEXT: popq %rbx
-; SSE4-NEXT: .cfi_def_cfa_offset 48
; SSE4-NEXT: popq %r12
-; SSE4-NEXT: .cfi_def_cfa_offset 40
; SSE4-NEXT: popq %r13
-; SSE4-NEXT: .cfi_def_cfa_offset 32
; SSE4-NEXT: popq %r14
-; SSE4-NEXT: .cfi_def_cfa_offset 24
; SSE4-NEXT: popq %r15
-; SSE4-NEXT: .cfi_def_cfa_offset 16
; SSE4-NEXT: popq %rbp
-; SSE4-NEXT: .cfi_def_cfa_offset 8
; SSE4-NEXT: retq
;
; AVX1-LABEL: test_ext_v8i64:
; AVX1: # %bb.0:
; AVX1-NEXT: pushq %rbp
-; AVX1-NEXT: .cfi_def_cfa_offset 16
; AVX1-NEXT: pushq %r15
-; AVX1-NEXT: .cfi_def_cfa_offset 24
; AVX1-NEXT: pushq %r14
-; AVX1-NEXT: .cfi_def_cfa_offset 32
; AVX1-NEXT: pushq %r13
-; AVX1-NEXT: .cfi_def_cfa_offset 40
; AVX1-NEXT: pushq %r12
-; AVX1-NEXT: .cfi_def_cfa_offset 48
; AVX1-NEXT: pushq %rbx
-; AVX1-NEXT: .cfi_def_cfa_offset 56
; AVX1-NEXT: pushq %rax
-; AVX1-NEXT: .cfi_def_cfa_offset 64
-; AVX1-NEXT: .cfi_offset %rbx, -56
-; AVX1-NEXT: .cfi_offset %r12, -48
-; AVX1-NEXT: .cfi_offset %r13, -40
-; AVX1-NEXT: .cfi_offset %r14, -32
-; AVX1-NEXT: .cfi_offset %r15, -24
-; AVX1-NEXT: .cfi_offset %rbp, -16
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vpextrq $1, %xmm4, %rax
; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
@@ -3465,43 +3322,23 @@ define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm7[0],xmm6[0]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT: addq $8, %rsp
-; AVX1-NEXT: .cfi_def_cfa_offset 56
; AVX1-NEXT: popq %rbx
-; AVX1-NEXT: .cfi_def_cfa_offset 48
; AVX1-NEXT: popq %r12
-; AVX1-NEXT: .cfi_def_cfa_offset 40
; AVX1-NEXT: popq %r13
-; AVX1-NEXT: .cfi_def_cfa_offset 32
; AVX1-NEXT: popq %r14
-; AVX1-NEXT: .cfi_def_cfa_offset 24
; AVX1-NEXT: popq %r15
-; AVX1-NEXT: .cfi_def_cfa_offset 16
; AVX1-NEXT: popq %rbp
-; AVX1-NEXT: .cfi_def_cfa_offset 8
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_ext_v8i64:
; AVX2: # %bb.0:
; AVX2-NEXT: pushq %rbp
-; AVX2-NEXT: .cfi_def_cfa_offset 16
; AVX2-NEXT: pushq %r15
-; AVX2-NEXT: .cfi_def_cfa_offset 24
; AVX2-NEXT: pushq %r14
-; AVX2-NEXT: .cfi_def_cfa_offset 32
; AVX2-NEXT: pushq %r13
-; AVX2-NEXT: .cfi_def_cfa_offset 40
; AVX2-NEXT: pushq %r12
-; AVX2-NEXT: .cfi_def_cfa_offset 48
; AVX2-NEXT: pushq %rbx
-; AVX2-NEXT: .cfi_def_cfa_offset 56
; AVX2-NEXT: pushq %rax
-; AVX2-NEXT: .cfi_def_cfa_offset 64
-; AVX2-NEXT: .cfi_offset %rbx, -56
-; AVX2-NEXT: .cfi_offset %r12, -48
-; AVX2-NEXT: .cfi_offset %r13, -40
-; AVX2-NEXT: .cfi_offset %r14, -32
-; AVX2-NEXT: .cfi_offset %r15, -24
-; AVX2-NEXT: .cfi_offset %rbp, -16
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
; AVX2-NEXT: vpextrq $1, %xmm4, %rax
; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
@@ -3629,43 +3466,23 @@ define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm7[0],xmm6[0]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
; AVX2-NEXT: addq $8, %rsp
-; AVX2-NEXT: .cfi_def_cfa_offset 56
; AVX2-NEXT: popq %rbx
-; AVX2-NEXT: .cfi_def_cfa_offset 48
; AVX2-NEXT: popq %r12
-; AVX2-NEXT: .cfi_def_cfa_offset 40
; AVX2-NEXT: popq %r13
-; AVX2-NEXT: .cfi_def_cfa_offset 32
; AVX2-NEXT: popq %r14
-; AVX2-NEXT: .cfi_def_cfa_offset 24
; AVX2-NEXT: popq %r15
-; AVX2-NEXT: .cfi_def_cfa_offset 16
; AVX2-NEXT: popq %rbp
-; AVX2-NEXT: .cfi_def_cfa_offset 8
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_ext_v8i64:
; AVX512: # %bb.0:
; AVX512-NEXT: pushq %rbp
-; AVX512-NEXT: .cfi_def_cfa_offset 16
; AVX512-NEXT: pushq %r15
-; AVX512-NEXT: .cfi_def_cfa_offset 24
; AVX512-NEXT: pushq %r14
-; AVX512-NEXT: .cfi_def_cfa_offset 32
; AVX512-NEXT: pushq %r13
-; AVX512-NEXT: .cfi_def_cfa_offset 40
; AVX512-NEXT: pushq %r12
-; AVX512-NEXT: .cfi_def_cfa_offset 48
; AVX512-NEXT: pushq %rbx
-; AVX512-NEXT: .cfi_def_cfa_offset 56
; AVX512-NEXT: pushq %rax
-; AVX512-NEXT: .cfi_def_cfa_offset 64
-; AVX512-NEXT: .cfi_offset %rbx, -56
-; AVX512-NEXT: .cfi_offset %r12, -48
-; AVX512-NEXT: .cfi_offset %r13, -40
-; AVX512-NEXT: .cfi_offset %r14, -32
-; AVX512-NEXT: .cfi_offset %r15, -24
-; AVX512-NEXT: .cfi_offset %rbp, -16
; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX512-NEXT: vpextrq $1, %xmm3, %rax
@@ -3796,19 +3613,12 @@ define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512-NEXT: addq $8, %rsp
-; AVX512-NEXT: .cfi_def_cfa_offset 56
; AVX512-NEXT: popq %rbx
-; AVX512-NEXT: .cfi_def_cfa_offset 48
; AVX512-NEXT: popq %r12
-; AVX512-NEXT: .cfi_def_cfa_offset 40
; AVX512-NEXT: popq %r13
-; AVX512-NEXT: .cfi_def_cfa_offset 32
; AVX512-NEXT: popq %r14
-; AVX512-NEXT: .cfi_def_cfa_offset 24
; AVX512-NEXT: popq %r15
-; AVX512-NEXT: .cfi_def_cfa_offset 16
; AVX512-NEXT: popq %rbp
-; AVX512-NEXT: .cfi_def_cfa_offset 8
; AVX512-NEXT: retq
%x0 = sext <8 x i64> %a0 to <8 x i128>
%x1 = sext <8 x i64> %a1 to <8 x i128>
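
The only functional change in avgceils.ll (and in avgceilu.ll below) is adding nounwind to every test function; without it the backend must describe the prologue for unwinding, which is where all the .cfi_def_cfa_offset/.cfi_offset lines deleted from the autogenerated checks came from. A minimal illustration (hypothetical functions, x86-64):

  declare void @callee()

  define void @may_unwind() {          ; prologue gets .cfi_def_cfa_offset 16
    call void @callee()
    ret void
  }

  define void @no_unwind() nounwind {  ; same code, no .cfi_* directives
    call void @callee()
    ret void
  }
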
diff --git a/llvm/test/CodeGen/X86/avgceilu.ll b/llvm/test/CodeGen/X86/avgceilu.ll
index dee1a5a720f9..d34894cc0fbb 100644
--- a/llvm/test/CodeGen/X86/avgceilu.ll
+++ b/llvm/test/CodeGen/X86/avgceilu.ll
@@ -9,7 +9,7 @@
; 128-bit vectors
;
-define <16 x i8> @test_fixed_v16i8(<16 x i8> %a0, <16 x i8> %a1) {
+define <16 x i8> @test_fixed_v16i8(<16 x i8> %a0, <16 x i8> %a1) nounwind {
; SSE-LABEL: test_fixed_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: pavgb %xmm1, %xmm0
@@ -26,7 +26,7 @@ define <16 x i8> @test_fixed_v16i8(<16 x i8> %a0, <16 x i8> %a1) {
ret <16 x i8> %res
}
-define <16 x i8> @test_ext_v16i8(<16 x i8> %a0, <16 x i8> %a1) {
+define <16 x i8> @test_ext_v16i8(<16 x i8> %a0, <16 x i8> %a1) nounwind {
; SSE-LABEL: test_ext_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: pavgb %xmm1, %xmm0
@@ -45,7 +45,7 @@ define <16 x i8> @test_ext_v16i8(<16 x i8> %a0, <16 x i8> %a1) {
ret <16 x i8> %res
}
-define <8 x i16> @test_fixed_v8i16(<8 x i16> %a0, <8 x i16> %a1) {
+define <8 x i16> @test_fixed_v8i16(<8 x i16> %a0, <8 x i16> %a1) nounwind {
; SSE-LABEL: test_fixed_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: pavgw %xmm1, %xmm0
@@ -62,7 +62,7 @@ define <8 x i16> @test_fixed_v8i16(<8 x i16> %a0, <8 x i16> %a1) {
ret <8 x i16> %res
}
-define <8 x i16> @test_ext_v8i16(<8 x i16> %a0, <8 x i16> %a1) {
+define <8 x i16> @test_ext_v8i16(<8 x i16> %a0, <8 x i16> %a1) nounwind {
; SSE-LABEL: test_ext_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: pavgw %xmm1, %xmm0
@@ -81,7 +81,7 @@ define <8 x i16> @test_ext_v8i16(<8 x i16> %a0, <8 x i16> %a1) {
ret <8 x i16> %res
}
-define <4 x i32> @test_fixed_v4i32(<4 x i32> %a0, <4 x i32> %a1) {
+define <4 x i32> @test_fixed_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
; SSE-LABEL: test_fixed_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm2
@@ -106,7 +106,7 @@ define <4 x i32> @test_fixed_v4i32(<4 x i32> %a0, <4 x i32> %a1) {
ret <4 x i32> %res
}
-define <4 x i32> @test_ext_v4i32(<4 x i32> %a0, <4 x i32> %a1) {
+define <4 x i32> @test_ext_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
; SSE2-LABEL: test_ext_v4i32:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm3, %xmm3
@@ -195,7 +195,7 @@ define <4 x i32> @test_ext_v4i32(<4 x i32> %a0, <4 x i32> %a1) {
ret <4 x i32> %res
}
-define <2 x i64> @test_fixed_v2i64(<2 x i64> %a0, <2 x i64> %a1) {
+define <2 x i64> @test_fixed_v2i64(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; SSE-LABEL: test_fixed_v2i64:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm2
@@ -220,7 +220,7 @@ define <2 x i64> @test_fixed_v2i64(<2 x i64> %a0, <2 x i64> %a1) {
ret <2 x i64> %res
}
-define <2 x i64> @test_ext_v2i64(<2 x i64> %a0, <2 x i64> %a1) {
+define <2 x i64> @test_ext_v2i64(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; SSE2-LABEL: test_ext_v2i64:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
@@ -310,7 +310,7 @@ define <2 x i64> @test_ext_v2i64(<2 x i64> %a0, <2 x i64> %a1) {
; 256-bit vectors
;
-define <32 x i8> @test_fixed_v32i8(<32 x i8> %a0, <32 x i8> %a1) {
+define <32 x i8> @test_fixed_v32i8(<32 x i8> %a0, <32 x i8> %a1) nounwind {
; SSE-LABEL: test_fixed_v32i8:
; SSE: # %bb.0:
; SSE-NEXT: pavgb %xmm2, %xmm0
@@ -342,7 +342,7 @@ define <32 x i8> @test_fixed_v32i8(<32 x i8> %a0, <32 x i8> %a1) {
ret <32 x i8> %res
}
-define <32 x i8> @test_ext_v32i8(<32 x i8> %a0, <32 x i8> %a1) {
+define <32 x i8> @test_ext_v32i8(<32 x i8> %a0, <32 x i8> %a1) nounwind {
; SSE-LABEL: test_ext_v32i8:
; SSE: # %bb.0:
; SSE-NEXT: pavgb %xmm2, %xmm0
@@ -376,7 +376,7 @@ define <32 x i8> @test_ext_v32i8(<32 x i8> %a0, <32 x i8> %a1) {
ret <32 x i8> %res
}
-define <16 x i16> @test_fixed_v16i16(<16 x i16> %a0, <16 x i16> %a1) {
+define <16 x i16> @test_fixed_v16i16(<16 x i16> %a0, <16 x i16> %a1) nounwind {
; SSE-LABEL: test_fixed_v16i16:
; SSE: # %bb.0:
; SSE-NEXT: pavgw %xmm2, %xmm0
@@ -408,7 +408,7 @@ define <16 x i16> @test_fixed_v16i16(<16 x i16> %a0, <16 x i16> %a1) {
ret <16 x i16> %res
}
-define <16 x i16> @test_ext_v16i16(<16 x i16> %a0, <16 x i16> %a1) {
+define <16 x i16> @test_ext_v16i16(<16 x i16> %a0, <16 x i16> %a1) nounwind {
; SSE-LABEL: test_ext_v16i16:
; SSE: # %bb.0:
; SSE-NEXT: pavgw %xmm2, %xmm0
@@ -442,7 +442,7 @@ define <16 x i16> @test_ext_v16i16(<16 x i16> %a0, <16 x i16> %a1) {
ret <16 x i16> %res
}
-define <8 x i32> @test_fixed_v8i32(<8 x i32> %a0, <8 x i32> %a1) {
+define <8 x i32> @test_fixed_v8i32(<8 x i32> %a0, <8 x i32> %a1) nounwind {
; SSE-LABEL: test_fixed_v8i32:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm1, %xmm4
@@ -494,7 +494,7 @@ define <8 x i32> @test_fixed_v8i32(<8 x i32> %a0, <8 x i32> %a1) {
ret <8 x i32> %res
}
-define <8 x i32> @test_ext_v8i32(<8 x i32> %a0, <8 x i32> %a1) {
+define <8 x i32> @test_ext_v8i32(<8 x i32> %a0, <8 x i32> %a1) nounwind {
; SSE2-LABEL: test_ext_v8i32:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm4
@@ -629,7 +629,7 @@ define <8 x i32> @test_ext_v8i32(<8 x i32> %a0, <8 x i32> %a1) {
ret <8 x i32> %res
}
-define <4 x i64> @test_fixed_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
+define <4 x i64> @test_fixed_v4i64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; SSE-LABEL: test_fixed_v4i64:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm1, %xmm4
@@ -681,7 +681,7 @@ define <4 x i64> @test_fixed_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
ret <4 x i64> %res
}
-define <4 x i64> @test_ext_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
+define <4 x i64> @test_ext_v4i64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; SSE2-LABEL: test_ext_v4i64:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
@@ -937,7 +937,7 @@ define <4 x i64> @test_ext_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
; 512-bit vectors
;
-define <64 x i8> @test_fixed_v64i8(<64 x i8> %a0, <64 x i8> %a1) {
+define <64 x i8> @test_fixed_v64i8(<64 x i8> %a0, <64 x i8> %a1) nounwind {
; SSE-LABEL: test_fixed_v64i8:
; SSE: # %bb.0:
; SSE-NEXT: pavgb %xmm4, %xmm0
@@ -977,7 +977,7 @@ define <64 x i8> @test_fixed_v64i8(<64 x i8> %a0, <64 x i8> %a1) {
ret <64 x i8> %res
}
-define <64 x i8> @test_ext_v64i8(<64 x i8> %a0, <64 x i8> %a1) {
+define <64 x i8> @test_ext_v64i8(<64 x i8> %a0, <64 x i8> %a1) nounwind {
; SSE-LABEL: test_ext_v64i8:
; SSE: # %bb.0:
; SSE-NEXT: pavgb %xmm4, %xmm0
@@ -1019,7 +1019,7 @@ define <64 x i8> @test_ext_v64i8(<64 x i8> %a0, <64 x i8> %a1) {
ret <64 x i8> %res
}
-define <32 x i16> @test_fixed_v32i16(<32 x i16> %a0, <32 x i16> %a1) {
+define <32 x i16> @test_fixed_v32i16(<32 x i16> %a0, <32 x i16> %a1) nounwind {
; SSE-LABEL: test_fixed_v32i16:
; SSE: # %bb.0:
; SSE-NEXT: pavgw %xmm4, %xmm0
@@ -1059,7 +1059,7 @@ define <32 x i16> @test_fixed_v32i16(<32 x i16> %a0, <32 x i16> %a1) {
ret <32 x i16> %res
}
-define <32 x i16> @test_ext_v32i16(<32 x i16> %a0, <32 x i16> %a1) {
+define <32 x i16> @test_ext_v32i16(<32 x i16> %a0, <32 x i16> %a1) nounwind {
; SSE-LABEL: test_ext_v32i16:
; SSE: # %bb.0:
; SSE-NEXT: pavgw %xmm4, %xmm0
@@ -1101,7 +1101,7 @@ define <32 x i16> @test_ext_v32i16(<32 x i16> %a0, <32 x i16> %a1) {
ret <32 x i16> %res
}
-define <16 x i32> @test_fixed_v16i32(<16 x i32> %a0, <16 x i32> %a1) {
+define <16 x i32> @test_fixed_v16i32(<16 x i32> %a0, <16 x i32> %a1) nounwind {
; SSE-LABEL: test_fixed_v16i32:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm3, %xmm8
@@ -1177,7 +1177,7 @@ define <16 x i32> @test_fixed_v16i32(<16 x i32> %a0, <16 x i32> %a1) {
ret <16 x i32> %res
}
-define <16 x i32> @test_ext_v16i32(<16 x i32> %a0, <16 x i32> %a1) {
+define <16 x i32> @test_ext_v16i32(<16 x i32> %a0, <16 x i32> %a1) nounwind {
; SSE2-LABEL: test_ext_v16i32:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm2, %xmm8
@@ -1413,7 +1413,7 @@ define <16 x i32> @test_ext_v16i32(<16 x i32> %a0, <16 x i32> %a1) {
ret <16 x i32> %res
}
-define <8 x i64> @test_fixed_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
+define <8 x i64> @test_fixed_v8i64(<8 x i64> %a0, <8 x i64> %a1) nounwind {
; SSE-LABEL: test_fixed_v8i64:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm3, %xmm8
@@ -1489,27 +1489,15 @@ define <8 x i64> @test_fixed_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
ret <8 x i64> %res
}
-define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
+define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) nounwind {
; SSE2-LABEL: test_ext_v8i64:
; SSE2: # %bb.0:
; SSE2-NEXT: pushq %rbp
-; SSE2-NEXT: .cfi_def_cfa_offset 16
; SSE2-NEXT: pushq %r15
-; SSE2-NEXT: .cfi_def_cfa_offset 24
; SSE2-NEXT: pushq %r14
-; SSE2-NEXT: .cfi_def_cfa_offset 32
; SSE2-NEXT: pushq %r13
-; SSE2-NEXT: .cfi_def_cfa_offset 40
; SSE2-NEXT: pushq %r12
-; SSE2-NEXT: .cfi_def_cfa_offset 48
; SSE2-NEXT: pushq %rbx
-; SSE2-NEXT: .cfi_def_cfa_offset 56
-; SSE2-NEXT: .cfi_offset %rbx, -56
-; SSE2-NEXT: .cfi_offset %r12, -48
-; SSE2-NEXT: .cfi_offset %r13, -40
-; SSE2-NEXT: .cfi_offset %r14, -32
-; SSE2-NEXT: .cfi_offset %r15, -24
-; SSE2-NEXT: .cfi_offset %rbp, -16
; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm3[2,3,2,3]
; SSE2-NEXT: movq %xmm8, %rcx
; SSE2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
@@ -1617,39 +1605,21 @@ define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm6[0]
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm7[0]
; SSE2-NEXT: popq %rbx
-; SSE2-NEXT: .cfi_def_cfa_offset 48
; SSE2-NEXT: popq %r12
-; SSE2-NEXT: .cfi_def_cfa_offset 40
; SSE2-NEXT: popq %r13
-; SSE2-NEXT: .cfi_def_cfa_offset 32
; SSE2-NEXT: popq %r14
-; SSE2-NEXT: .cfi_def_cfa_offset 24
; SSE2-NEXT: popq %r15
-; SSE2-NEXT: .cfi_def_cfa_offset 16
; SSE2-NEXT: popq %rbp
-; SSE2-NEXT: .cfi_def_cfa_offset 8
; SSE2-NEXT: retq
;
; SSE4-LABEL: test_ext_v8i64:
; SSE4: # %bb.0:
; SSE4-NEXT: pushq %rbp
-; SSE4-NEXT: .cfi_def_cfa_offset 16
; SSE4-NEXT: pushq %r15
-; SSE4-NEXT: .cfi_def_cfa_offset 24
; SSE4-NEXT: pushq %r14
-; SSE4-NEXT: .cfi_def_cfa_offset 32
; SSE4-NEXT: pushq %r13
-; SSE4-NEXT: .cfi_def_cfa_offset 40
; SSE4-NEXT: pushq %r12
-; SSE4-NEXT: .cfi_def_cfa_offset 48
; SSE4-NEXT: pushq %rbx
-; SSE4-NEXT: .cfi_def_cfa_offset 56
-; SSE4-NEXT: .cfi_offset %rbx, -56
-; SSE4-NEXT: .cfi_offset %r12, -48
-; SSE4-NEXT: .cfi_offset %r13, -40
-; SSE4-NEXT: .cfi_offset %r14, -32
-; SSE4-NEXT: .cfi_offset %r15, -24
-; SSE4-NEXT: .cfi_offset %rbp, -16
; SSE4-NEXT: movq %xmm3, %rcx
; SSE4-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE4-NEXT: movq %xmm7, %rdx
@@ -1747,39 +1717,21 @@ define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
; SSE4-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm6[0]
; SSE4-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm7[0]
; SSE4-NEXT: popq %rbx
-; SSE4-NEXT: .cfi_def_cfa_offset 48
; SSE4-NEXT: popq %r12
-; SSE4-NEXT: .cfi_def_cfa_offset 40
; SSE4-NEXT: popq %r13
-; SSE4-NEXT: .cfi_def_cfa_offset 32
; SSE4-NEXT: popq %r14
-; SSE4-NEXT: .cfi_def_cfa_offset 24
; SSE4-NEXT: popq %r15
-; SSE4-NEXT: .cfi_def_cfa_offset 16
; SSE4-NEXT: popq %rbp
-; SSE4-NEXT: .cfi_def_cfa_offset 8
; SSE4-NEXT: retq
;
; AVX1-LABEL: test_ext_v8i64:
; AVX1: # %bb.0:
; AVX1-NEXT: pushq %rbp
-; AVX1-NEXT: .cfi_def_cfa_offset 16
; AVX1-NEXT: pushq %r15
-; AVX1-NEXT: .cfi_def_cfa_offset 24
; AVX1-NEXT: pushq %r14
-; AVX1-NEXT: .cfi_def_cfa_offset 32
; AVX1-NEXT: pushq %r13
-; AVX1-NEXT: .cfi_def_cfa_offset 40
; AVX1-NEXT: pushq %r12
-; AVX1-NEXT: .cfi_def_cfa_offset 48
; AVX1-NEXT: pushq %rbx
-; AVX1-NEXT: .cfi_def_cfa_offset 56
-; AVX1-NEXT: .cfi_offset %rbx, -56
-; AVX1-NEXT: .cfi_offset %r12, -48
-; AVX1-NEXT: .cfi_offset %r13, -40
-; AVX1-NEXT: .cfi_offset %r14, -32
-; AVX1-NEXT: .cfi_offset %r15, -24
-; AVX1-NEXT: .cfi_offset %rbp, -16
; AVX1-NEXT: vmovq %xmm1, %rcx
; AVX1-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX1-NEXT: vmovq %xmm3, %rdx
@@ -1885,39 +1837,21 @@ define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm7[0],xmm6[0]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT: popq %rbx
-; AVX1-NEXT: .cfi_def_cfa_offset 48
; AVX1-NEXT: popq %r12
-; AVX1-NEXT: .cfi_def_cfa_offset 40
; AVX1-NEXT: popq %r13
-; AVX1-NEXT: .cfi_def_cfa_offset 32
; AVX1-NEXT: popq %r14
-; AVX1-NEXT: .cfi_def_cfa_offset 24
; AVX1-NEXT: popq %r15
-; AVX1-NEXT: .cfi_def_cfa_offset 16
; AVX1-NEXT: popq %rbp
-; AVX1-NEXT: .cfi_def_cfa_offset 8
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_ext_v8i64:
; AVX2: # %bb.0:
; AVX2-NEXT: pushq %rbp
-; AVX2-NEXT: .cfi_def_cfa_offset 16
; AVX2-NEXT: pushq %r15
-; AVX2-NEXT: .cfi_def_cfa_offset 24
; AVX2-NEXT: pushq %r14
-; AVX2-NEXT: .cfi_def_cfa_offset 32
; AVX2-NEXT: pushq %r13
-; AVX2-NEXT: .cfi_def_cfa_offset 40
; AVX2-NEXT: pushq %r12
-; AVX2-NEXT: .cfi_def_cfa_offset 48
; AVX2-NEXT: pushq %rbx
-; AVX2-NEXT: .cfi_def_cfa_offset 56
-; AVX2-NEXT: .cfi_offset %rbx, -56
-; AVX2-NEXT: .cfi_offset %r12, -48
-; AVX2-NEXT: .cfi_offset %r13, -40
-; AVX2-NEXT: .cfi_offset %r14, -32
-; AVX2-NEXT: .cfi_offset %r15, -24
-; AVX2-NEXT: .cfi_offset %rbp, -16
; AVX2-NEXT: vmovq %xmm1, %rcx
; AVX2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX2-NEXT: vmovq %xmm3, %rdx
@@ -2023,39 +1957,21 @@ define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm7[0],xmm6[0]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
; AVX2-NEXT: popq %rbx
-; AVX2-NEXT: .cfi_def_cfa_offset 48
; AVX2-NEXT: popq %r12
-; AVX2-NEXT: .cfi_def_cfa_offset 40
; AVX2-NEXT: popq %r13
-; AVX2-NEXT: .cfi_def_cfa_offset 32
; AVX2-NEXT: popq %r14
-; AVX2-NEXT: .cfi_def_cfa_offset 24
; AVX2-NEXT: popq %r15
-; AVX2-NEXT: .cfi_def_cfa_offset 16
; AVX2-NEXT: popq %rbp
-; AVX2-NEXT: .cfi_def_cfa_offset 8
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_ext_v8i64:
; AVX512: # %bb.0:
; AVX512-NEXT: pushq %rbp
-; AVX512-NEXT: .cfi_def_cfa_offset 16
; AVX512-NEXT: pushq %r15
-; AVX512-NEXT: .cfi_def_cfa_offset 24
; AVX512-NEXT: pushq %r14
-; AVX512-NEXT: .cfi_def_cfa_offset 32
; AVX512-NEXT: pushq %r13
-; AVX512-NEXT: .cfi_def_cfa_offset 40
; AVX512-NEXT: pushq %r12
-; AVX512-NEXT: .cfi_def_cfa_offset 48
; AVX512-NEXT: pushq %rbx
-; AVX512-NEXT: .cfi_def_cfa_offset 56
-; AVX512-NEXT: .cfi_offset %rbx, -56
-; AVX512-NEXT: .cfi_offset %r12, -48
-; AVX512-NEXT: .cfi_offset %r13, -40
-; AVX512-NEXT: .cfi_offset %r14, -32
-; AVX512-NEXT: .cfi_offset %r15, -24
-; AVX512-NEXT: .cfi_offset %rbp, -16
; AVX512-NEXT: vmovq %xmm0, %rcx
; AVX512-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512-NEXT: vmovq %xmm1, %rdx
@@ -2164,17 +2080,11 @@ define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512-NEXT: popq %rbx
-; AVX512-NEXT: .cfi_def_cfa_offset 48
; AVX512-NEXT: popq %r12
-; AVX512-NEXT: .cfi_def_cfa_offset 40
; AVX512-NEXT: popq %r13
-; AVX512-NEXT: .cfi_def_cfa_offset 32
; AVX512-NEXT: popq %r14
-; AVX512-NEXT: .cfi_def_cfa_offset 24
; AVX512-NEXT: popq %r15
-; AVX512-NEXT: .cfi_def_cfa_offset 16
; AVX512-NEXT: popq %rbp
-; AVX512-NEXT: .cfi_def_cfa_offset 8
; AVX512-NEXT: retq
%x0 = zext <8 x i64> %a0 to <8 x i128>
%x1 = zext <8 x i64> %a1 to <8 x i128>
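The recurring change across these test files is the addition of `nounwind` to every test function. Without `nounwind`, llc has to assume the function may unwind and so emits `.cfi_def_cfa_offset`/`.cfi_offset` directives around each push/pop, which the autogenerated CHECK lines then have to carry; with the attribute, the unwind table is dropped and the checks shrink accordingly. A minimal sketch of the effect (function name is illustrative, not from the patch):

define <2 x i64> @cfi_demo(<2 x i64> %a0, <2 x i64> %a1) nounwind {
  ; With nounwind (and no uwtable), llc emits no .cfi_* directives here,
  ; so update_llc_test_checks.py produces much shorter CHECK lines.
  %r = add <2 x i64> %a0, %a1
  ret <2 x i64> %r
}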
diff --git a/llvm/test/CodeGen/X86/avgfloors.ll b/llvm/test/CodeGen/X86/avgfloors.ll
index a3864ab4bb44..efee831a15c7 100644
--- a/llvm/test/CodeGen/X86/avgfloors.ll
+++ b/llvm/test/CodeGen/X86/avgfloors.ll
@@ -9,7 +9,7 @@
; 128-bit vectors
;
-define <16 x i8> @test_fixed_v16i8(<16 x i8> %a0, <16 x i8> %a1) {
+define <16 x i8> @test_fixed_v16i8(<16 x i8> %a0, <16 x i8> %a1) nounwind {
; SSE-LABEL: test_fixed_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm2
@@ -64,7 +64,7 @@ define <16 x i8> @test_fixed_v16i8(<16 x i8> %a0, <16 x i8> %a1) {
ret <16 x i8> %res
}
-define <16 x i8> @test_ext_v16i8(<16 x i8> %a0, <16 x i8> %a1) {
+define <16 x i8> @test_ext_v16i8(<16 x i8> %a0, <16 x i8> %a1) nounwind {
; SSE2-LABEL: test_ext_v16i8:
; SSE2: # %bb.0:
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
@@ -150,7 +150,7 @@ define <16 x i8> @test_ext_v16i8(<16 x i8> %a0, <16 x i8> %a1) {
ret <16 x i8> %res
}
-define <8 x i16> @test_fixed_v8i16(<8 x i16> %a0, <8 x i16> %a1) {
+define <8 x i16> @test_fixed_v8i16(<8 x i16> %a0, <8 x i16> %a1) nounwind {
; SSE-LABEL: test_fixed_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm2
@@ -174,7 +174,7 @@ define <8 x i16> @test_fixed_v8i16(<8 x i16> %a0, <8 x i16> %a1) {
ret <8 x i16> %res
}
-define <8 x i16> @test_ext_v8i16(<8 x i16> %a0, <8 x i16> %a1) {
+define <8 x i16> @test_ext_v8i16(<8 x i16> %a0, <8 x i16> %a1) nounwind {
; SSE2-LABEL: test_ext_v8i16:
; SSE2: # %bb.0:
; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
@@ -259,7 +259,7 @@ define <8 x i16> @test_ext_v8i16(<8 x i16> %a0, <8 x i16> %a1) {
ret <8 x i16> %res
}
-define <4 x i32> @test_fixed_v4i32(<4 x i32> %a0, <4 x i32> %a1) {
+define <4 x i32> @test_fixed_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
; SSE-LABEL: test_fixed_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm2
@@ -283,7 +283,7 @@ define <4 x i32> @test_fixed_v4i32(<4 x i32> %a0, <4 x i32> %a1) {
ret <4 x i32> %res
}
-define <4 x i32> @test_ext_v4i32(<4 x i32> %a0, <4 x i32> %a1) {
+define <4 x i32> @test_ext_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
; SSE2-LABEL: test_ext_v4i32:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm2, %xmm2
@@ -365,7 +365,7 @@ define <4 x i32> @test_ext_v4i32(<4 x i32> %a0, <4 x i32> %a1) {
ret <4 x i32> %res
}
-define <2 x i64> @test_fixed_v2i64(<2 x i64> %a0, <2 x i64> %a1) {
+define <2 x i64> @test_fixed_v2i64(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; SSE2-LABEL: test_fixed_v2i64:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm2
@@ -425,7 +425,7 @@ define <2 x i64> @test_fixed_v2i64(<2 x i64> %a0, <2 x i64> %a1) {
ret <2 x i64> %res
}
-define <2 x i64> @test_ext_v2i64(<2 x i64> %a0, <2 x i64> %a1) {
+define <2 x i64> @test_ext_v2i64(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; SSE2-LABEL: test_ext_v2i64:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
@@ -514,7 +514,7 @@ define <2 x i64> @test_ext_v2i64(<2 x i64> %a0, <2 x i64> %a1) {
; 256-bit vectors
;
-define <32 x i8> @test_fixed_v32i8(<32 x i8> %a0, <32 x i8> %a1) {
+define <32 x i8> @test_fixed_v32i8(<32 x i8> %a0, <32 x i8> %a1) nounwind {
; SSE-LABEL: test_fixed_v32i8:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm1, %xmm4
@@ -587,7 +587,7 @@ define <32 x i8> @test_fixed_v32i8(<32 x i8> %a0, <32 x i8> %a1) {
ret <32 x i8> %res
}
-define <32 x i8> @test_ext_v32i8(<32 x i8> %a0, <32 x i8> %a1) {
+define <32 x i8> @test_ext_v32i8(<32 x i8> %a0, <32 x i8> %a1) nounwind {
; SSE2-LABEL: test_ext_v32i8:
; SSE2: # %bb.0:
; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
@@ -723,7 +723,7 @@ define <32 x i8> @test_ext_v32i8(<32 x i8> %a0, <32 x i8> %a1) {
ret <32 x i8> %res
}
-define <16 x i16> @test_fixed_v16i16(<16 x i16> %a0, <16 x i16> %a1) {
+define <16 x i16> @test_fixed_v16i16(<16 x i16> %a0, <16 x i16> %a1) nounwind {
; SSE-LABEL: test_fixed_v16i16:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm1, %xmm4
@@ -773,7 +773,7 @@ define <16 x i16> @test_fixed_v16i16(<16 x i16> %a0, <16 x i16> %a1) {
ret <16 x i16> %res
}
-define <16 x i16> @test_ext_v16i16(<16 x i16> %a0, <16 x i16> %a1) {
+define <16 x i16> @test_ext_v16i16(<16 x i16> %a0, <16 x i16> %a1) nounwind {
; SSE2-LABEL: test_ext_v16i16:
; SSE2: # %bb.0:
; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
@@ -908,7 +908,7 @@ define <16 x i16> @test_ext_v16i16(<16 x i16> %a0, <16 x i16> %a1) {
ret <16 x i16> %res
}
-define <8 x i32> @test_fixed_v8i32(<8 x i32> %a0, <8 x i32> %a1) {
+define <8 x i32> @test_fixed_v8i32(<8 x i32> %a0, <8 x i32> %a1) nounwind {
; SSE-LABEL: test_fixed_v8i32:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm1, %xmm4
@@ -958,7 +958,7 @@ define <8 x i32> @test_fixed_v8i32(<8 x i32> %a0, <8 x i32> %a1) {
ret <8 x i32> %res
}
-define <8 x i32> @test_ext_v8i32(<8 x i32> %a0, <8 x i32> %a1) {
+define <8 x i32> @test_ext_v8i32(<8 x i32> %a0, <8 x i32> %a1) nounwind {
; SSE2-LABEL: test_ext_v8i32:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm4, %xmm4
@@ -1089,7 +1089,7 @@ define <8 x i32> @test_ext_v8i32(<8 x i32> %a0, <8 x i32> %a1) {
ret <8 x i32> %res
}
-define <4 x i64> @test_fixed_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
+define <4 x i64> @test_fixed_v4i64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; SSE2-LABEL: test_fixed_v4i64:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm1, %xmm4
@@ -1173,27 +1173,15 @@ define <4 x i64> @test_fixed_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
ret <4 x i64> %res
}
-define <4 x i64> @test_ext_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
+define <4 x i64> @test_ext_v4i64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; SSE2-LABEL: test_ext_v4i64:
; SSE2: # %bb.0:
; SSE2-NEXT: pushq %rbp
-; SSE2-NEXT: .cfi_def_cfa_offset 16
; SSE2-NEXT: pushq %r15
-; SSE2-NEXT: .cfi_def_cfa_offset 24
; SSE2-NEXT: pushq %r14
-; SSE2-NEXT: .cfi_def_cfa_offset 32
; SSE2-NEXT: pushq %r13
-; SSE2-NEXT: .cfi_def_cfa_offset 40
; SSE2-NEXT: pushq %r12
-; SSE2-NEXT: .cfi_def_cfa_offset 48
; SSE2-NEXT: pushq %rbx
-; SSE2-NEXT: .cfi_def_cfa_offset 56
-; SSE2-NEXT: .cfi_offset %rbx, -56
-; SSE2-NEXT: .cfi_offset %r12, -48
-; SSE2-NEXT: .cfi_offset %r13, -40
-; SSE2-NEXT: .cfi_offset %r14, -32
-; SSE2-NEXT: .cfi_offset %r15, -24
-; SSE2-NEXT: .cfi_offset %rbp, -16
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
; SSE2-NEXT: movq %xmm4, %rdx
; SSE2-NEXT: movq %rdx, %r14
@@ -1241,39 +1229,21 @@ define <4 x i64> @test_ext_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; SSE2-NEXT: popq %rbx
-; SSE2-NEXT: .cfi_def_cfa_offset 48
; SSE2-NEXT: popq %r12
-; SSE2-NEXT: .cfi_def_cfa_offset 40
; SSE2-NEXT: popq %r13
-; SSE2-NEXT: .cfi_def_cfa_offset 32
; SSE2-NEXT: popq %r14
-; SSE2-NEXT: .cfi_def_cfa_offset 24
; SSE2-NEXT: popq %r15
-; SSE2-NEXT: .cfi_def_cfa_offset 16
; SSE2-NEXT: popq %rbp
-; SSE2-NEXT: .cfi_def_cfa_offset 8
; SSE2-NEXT: retq
;
; SSE4-LABEL: test_ext_v4i64:
; SSE4: # %bb.0:
; SSE4-NEXT: pushq %rbp
-; SSE4-NEXT: .cfi_def_cfa_offset 16
; SSE4-NEXT: pushq %r15
-; SSE4-NEXT: .cfi_def_cfa_offset 24
; SSE4-NEXT: pushq %r14
-; SSE4-NEXT: .cfi_def_cfa_offset 32
; SSE4-NEXT: pushq %r13
-; SSE4-NEXT: .cfi_def_cfa_offset 40
; SSE4-NEXT: pushq %r12
-; SSE4-NEXT: .cfi_def_cfa_offset 48
; SSE4-NEXT: pushq %rbx
-; SSE4-NEXT: .cfi_def_cfa_offset 56
-; SSE4-NEXT: .cfi_offset %rbx, -56
-; SSE4-NEXT: .cfi_offset %r12, -48
-; SSE4-NEXT: .cfi_offset %r13, -40
-; SSE4-NEXT: .cfi_offset %r14, -32
-; SSE4-NEXT: .cfi_offset %r15, -24
-; SSE4-NEXT: .cfi_offset %rbp, -16
; SSE4-NEXT: movq %xmm1, %rdi
; SSE4-NEXT: movq %rdi, %r14
; SSE4-NEXT: sarq $63, %r14
@@ -1317,39 +1287,21 @@ define <4 x i64> @test_ext_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
; SSE4-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSE4-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; SSE4-NEXT: popq %rbx
-; SSE4-NEXT: .cfi_def_cfa_offset 48
; SSE4-NEXT: popq %r12
-; SSE4-NEXT: .cfi_def_cfa_offset 40
; SSE4-NEXT: popq %r13
-; SSE4-NEXT: .cfi_def_cfa_offset 32
; SSE4-NEXT: popq %r14
-; SSE4-NEXT: .cfi_def_cfa_offset 24
; SSE4-NEXT: popq %r15
-; SSE4-NEXT: .cfi_def_cfa_offset 16
; SSE4-NEXT: popq %rbp
-; SSE4-NEXT: .cfi_def_cfa_offset 8
; SSE4-NEXT: retq
;
; AVX1-LABEL: test_ext_v4i64:
; AVX1: # %bb.0:
; AVX1-NEXT: pushq %rbp
-; AVX1-NEXT: .cfi_def_cfa_offset 16
; AVX1-NEXT: pushq %r15
-; AVX1-NEXT: .cfi_def_cfa_offset 24
; AVX1-NEXT: pushq %r14
-; AVX1-NEXT: .cfi_def_cfa_offset 32
; AVX1-NEXT: pushq %r13
-; AVX1-NEXT: .cfi_def_cfa_offset 40
; AVX1-NEXT: pushq %r12
-; AVX1-NEXT: .cfi_def_cfa_offset 48
; AVX1-NEXT: pushq %rbx
-; AVX1-NEXT: .cfi_def_cfa_offset 56
-; AVX1-NEXT: .cfi_offset %rbx, -56
-; AVX1-NEXT: .cfi_offset %r12, -48
-; AVX1-NEXT: .cfi_offset %r13, -40
-; AVX1-NEXT: .cfi_offset %r14, -32
-; AVX1-NEXT: .cfi_offset %r15, -24
-; AVX1-NEXT: .cfi_offset %rbp, -16
; AVX1-NEXT: vmovq %xmm0, %rdx
; AVX1-NEXT: movq %rdx, %r14
; AVX1-NEXT: sarq $63, %r14
@@ -1396,39 +1348,21 @@ define <4 x i64> @test_ext_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: popq %rbx
-; AVX1-NEXT: .cfi_def_cfa_offset 48
; AVX1-NEXT: popq %r12
-; AVX1-NEXT: .cfi_def_cfa_offset 40
; AVX1-NEXT: popq %r13
-; AVX1-NEXT: .cfi_def_cfa_offset 32
; AVX1-NEXT: popq %r14
-; AVX1-NEXT: .cfi_def_cfa_offset 24
; AVX1-NEXT: popq %r15
-; AVX1-NEXT: .cfi_def_cfa_offset 16
; AVX1-NEXT: popq %rbp
-; AVX1-NEXT: .cfi_def_cfa_offset 8
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_ext_v4i64:
; AVX2: # %bb.0:
; AVX2-NEXT: pushq %rbp
-; AVX2-NEXT: .cfi_def_cfa_offset 16
; AVX2-NEXT: pushq %r15
-; AVX2-NEXT: .cfi_def_cfa_offset 24
; AVX2-NEXT: pushq %r14
-; AVX2-NEXT: .cfi_def_cfa_offset 32
; AVX2-NEXT: pushq %r13
-; AVX2-NEXT: .cfi_def_cfa_offset 40
; AVX2-NEXT: pushq %r12
-; AVX2-NEXT: .cfi_def_cfa_offset 48
; AVX2-NEXT: pushq %rbx
-; AVX2-NEXT: .cfi_def_cfa_offset 56
-; AVX2-NEXT: .cfi_offset %rbx, -56
-; AVX2-NEXT: .cfi_offset %r12, -48
-; AVX2-NEXT: .cfi_offset %r13, -40
-; AVX2-NEXT: .cfi_offset %r14, -32
-; AVX2-NEXT: .cfi_offset %r15, -24
-; AVX2-NEXT: .cfi_offset %rbp, -16
; AVX2-NEXT: vmovq %xmm0, %rdx
; AVX2-NEXT: movq %rdx, %r14
; AVX2-NEXT: sarq $63, %r14
@@ -1475,39 +1409,21 @@ define <4 x i64> @test_ext_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT: popq %rbx
-; AVX2-NEXT: .cfi_def_cfa_offset 48
; AVX2-NEXT: popq %r12
-; AVX2-NEXT: .cfi_def_cfa_offset 40
; AVX2-NEXT: popq %r13
-; AVX2-NEXT: .cfi_def_cfa_offset 32
; AVX2-NEXT: popq %r14
-; AVX2-NEXT: .cfi_def_cfa_offset 24
; AVX2-NEXT: popq %r15
-; AVX2-NEXT: .cfi_def_cfa_offset 16
; AVX2-NEXT: popq %rbp
-; AVX2-NEXT: .cfi_def_cfa_offset 8
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_ext_v4i64:
; AVX512: # %bb.0:
; AVX512-NEXT: pushq %rbp
-; AVX512-NEXT: .cfi_def_cfa_offset 16
; AVX512-NEXT: pushq %r15
-; AVX512-NEXT: .cfi_def_cfa_offset 24
; AVX512-NEXT: pushq %r14
-; AVX512-NEXT: .cfi_def_cfa_offset 32
; AVX512-NEXT: pushq %r13
-; AVX512-NEXT: .cfi_def_cfa_offset 40
; AVX512-NEXT: pushq %r12
-; AVX512-NEXT: .cfi_def_cfa_offset 48
; AVX512-NEXT: pushq %rbx
-; AVX512-NEXT: .cfi_def_cfa_offset 56
-; AVX512-NEXT: .cfi_offset %rbx, -56
-; AVX512-NEXT: .cfi_offset %r12, -48
-; AVX512-NEXT: .cfi_offset %r13, -40
-; AVX512-NEXT: .cfi_offset %r14, -32
-; AVX512-NEXT: .cfi_offset %r15, -24
-; AVX512-NEXT: .cfi_offset %rbp, -16
; AVX512-NEXT: vmovq %xmm0, %rdx
; AVX512-NEXT: movq %rdx, %r14
; AVX512-NEXT: sarq $63, %r14
@@ -1554,17 +1470,11 @@ define <4 x i64> @test_ext_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX512-NEXT: popq %rbx
-; AVX512-NEXT: .cfi_def_cfa_offset 48
; AVX512-NEXT: popq %r12
-; AVX512-NEXT: .cfi_def_cfa_offset 40
; AVX512-NEXT: popq %r13
-; AVX512-NEXT: .cfi_def_cfa_offset 32
; AVX512-NEXT: popq %r14
-; AVX512-NEXT: .cfi_def_cfa_offset 24
; AVX512-NEXT: popq %r15
-; AVX512-NEXT: .cfi_def_cfa_offset 16
; AVX512-NEXT: popq %rbp
-; AVX512-NEXT: .cfi_def_cfa_offset 8
; AVX512-NEXT: retq
%x0 = sext <4 x i64> %a0 to <4 x i128>
%x1 = sext <4 x i64> %a1 to <4 x i128>
@@ -1578,7 +1488,7 @@ define <4 x i64> @test_ext_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
; 512-bit vectors
;
-define <64 x i8> @test_fixed_v64i8(<64 x i8> %a0, <64 x i8> %a1) {
+define <64 x i8> @test_fixed_v64i8(<64 x i8> %a0, <64 x i8> %a1) nounwind {
; SSE-LABEL: test_fixed_v64i8:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm3, %xmm10
@@ -1690,7 +1600,7 @@ define <64 x i8> @test_fixed_v64i8(<64 x i8> %a0, <64 x i8> %a1) {
ret <64 x i8> %res
}
-define <64 x i8> @test_ext_v64i8(<64 x i8> %a0, <64 x i8> %a1) {
+define <64 x i8> @test_ext_v64i8(<64 x i8> %a0, <64 x i8> %a1) nounwind {
; SSE2-LABEL: test_ext_v64i8:
; SSE2: # %bb.0:
; SSE2-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3],xmm13[4],xmm3[4],xmm13[5],xmm3[5],xmm13[6],xmm3[6],xmm13[7],xmm3[7]
@@ -1934,7 +1844,7 @@ define <64 x i8> @test_ext_v64i8(<64 x i8> %a0, <64 x i8> %a1) {
ret <64 x i8> %res
}
-define <32 x i16> @test_fixed_v32i16(<32 x i16> %a0, <32 x i16> %a1) {
+define <32 x i16> @test_fixed_v32i16(<32 x i16> %a0, <32 x i16> %a1) nounwind {
; SSE-LABEL: test_fixed_v32i16:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm3, %xmm8
@@ -2007,7 +1917,7 @@ define <32 x i16> @test_fixed_v32i16(<32 x i16> %a0, <32 x i16> %a1) {
ret <32 x i16> %res
}
-define <32 x i16> @test_ext_v32i16(<32 x i16> %a0, <32 x i16> %a1) {
+define <32 x i16> @test_ext_v32i16(<32 x i16> %a0, <32 x i16> %a1) nounwind {
; SSE2-LABEL: test_ext_v32i16:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm3, %xmm9
@@ -2251,7 +2161,7 @@ define <32 x i16> @test_ext_v32i16(<32 x i16> %a0, <32 x i16> %a1) {
ret <32 x i16> %res
}
-define <16 x i32> @test_fixed_v16i32(<16 x i32> %a0, <16 x i32> %a1) {
+define <16 x i32> @test_fixed_v16i32(<16 x i32> %a0, <16 x i32> %a1) nounwind {
; SSE-LABEL: test_fixed_v16i32:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm3, %xmm8
@@ -2324,7 +2234,7 @@ define <16 x i32> @test_fixed_v16i32(<16 x i32> %a0, <16 x i32> %a1) {
ret <16 x i32> %res
}
-define <16 x i32> @test_ext_v16i32(<16 x i32> %a0, <16 x i32> %a1) {
+define <16 x i32> @test_ext_v16i32(<16 x i32> %a0, <16 x i32> %a1) nounwind {
; SSE2-LABEL: test_ext_v16i32:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm8, %xmm8
@@ -2561,7 +2471,7 @@ define <16 x i32> @test_ext_v16i32(<16 x i32> %a0, <16 x i32> %a1) {
ret <16 x i32> %res
}
-define <8 x i64> @test_fixed_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
+define <8 x i64> @test_fixed_v8i64(<8 x i64> %a0, <8 x i64> %a1) nounwind {
; SSE2-LABEL: test_fixed_v8i64:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm3, %xmm11
@@ -2698,29 +2608,16 @@ define <8 x i64> @test_fixed_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
ret <8 x i64> %res
}
-define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
+define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) nounwind {
; SSE2-LABEL: test_ext_v8i64:
; SSE2: # %bb.0:
; SSE2-NEXT: pushq %rbp
-; SSE2-NEXT: .cfi_def_cfa_offset 16
; SSE2-NEXT: pushq %r15
-; SSE2-NEXT: .cfi_def_cfa_offset 24
; SSE2-NEXT: pushq %r14
-; SSE2-NEXT: .cfi_def_cfa_offset 32
; SSE2-NEXT: pushq %r13
-; SSE2-NEXT: .cfi_def_cfa_offset 40
; SSE2-NEXT: pushq %r12
-; SSE2-NEXT: .cfi_def_cfa_offset 48
; SSE2-NEXT: pushq %rbx
-; SSE2-NEXT: .cfi_def_cfa_offset 56
; SSE2-NEXT: pushq %rax
-; SSE2-NEXT: .cfi_def_cfa_offset 64
-; SSE2-NEXT: .cfi_offset %rbx, -56
-; SSE2-NEXT: .cfi_offset %r12, -48
-; SSE2-NEXT: .cfi_offset %r13, -40
-; SSE2-NEXT: .cfi_offset %r14, -32
-; SSE2-NEXT: .cfi_offset %r15, -24
-; SSE2-NEXT: .cfi_offset %rbp, -16
; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm3[2,3,2,3]
; SSE2-NEXT: movq %xmm8, %rax
; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
@@ -2832,43 +2729,23 @@ define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm6[0]
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm7[0]
; SSE2-NEXT: addq $8, %rsp
-; SSE2-NEXT: .cfi_def_cfa_offset 56
; SSE2-NEXT: popq %rbx
-; SSE2-NEXT: .cfi_def_cfa_offset 48
; SSE2-NEXT: popq %r12
-; SSE2-NEXT: .cfi_def_cfa_offset 40
; SSE2-NEXT: popq %r13
-; SSE2-NEXT: .cfi_def_cfa_offset 32
; SSE2-NEXT: popq %r14
-; SSE2-NEXT: .cfi_def_cfa_offset 24
; SSE2-NEXT: popq %r15
-; SSE2-NEXT: .cfi_def_cfa_offset 16
; SSE2-NEXT: popq %rbp
-; SSE2-NEXT: .cfi_def_cfa_offset 8
; SSE2-NEXT: retq
;
; SSE4-LABEL: test_ext_v8i64:
; SSE4: # %bb.0:
; SSE4-NEXT: pushq %rbp
-; SSE4-NEXT: .cfi_def_cfa_offset 16
; SSE4-NEXT: pushq %r15
-; SSE4-NEXT: .cfi_def_cfa_offset 24
; SSE4-NEXT: pushq %r14
-; SSE4-NEXT: .cfi_def_cfa_offset 32
; SSE4-NEXT: pushq %r13
-; SSE4-NEXT: .cfi_def_cfa_offset 40
; SSE4-NEXT: pushq %r12
-; SSE4-NEXT: .cfi_def_cfa_offset 48
; SSE4-NEXT: pushq %rbx
-; SSE4-NEXT: .cfi_def_cfa_offset 56
; SSE4-NEXT: pushq %rax
-; SSE4-NEXT: .cfi_def_cfa_offset 64
-; SSE4-NEXT: .cfi_offset %rbx, -56
-; SSE4-NEXT: .cfi_offset %r12, -48
-; SSE4-NEXT: .cfi_offset %r13, -40
-; SSE4-NEXT: .cfi_offset %r14, -32
-; SSE4-NEXT: .cfi_offset %r15, -24
-; SSE4-NEXT: .cfi_offset %rbp, -16
; SSE4-NEXT: movq %xmm3, %rax
; SSE4-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE4-NEXT: movq %rax, %rcx
@@ -2972,43 +2849,23 @@ define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
; SSE4-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm6[0]
; SSE4-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm7[0]
; SSE4-NEXT: addq $8, %rsp
-; SSE4-NEXT: .cfi_def_cfa_offset 56
; SSE4-NEXT: popq %rbx
-; SSE4-NEXT: .cfi_def_cfa_offset 48
; SSE4-NEXT: popq %r12
-; SSE4-NEXT: .cfi_def_cfa_offset 40
; SSE4-NEXT: popq %r13
-; SSE4-NEXT: .cfi_def_cfa_offset 32
; SSE4-NEXT: popq %r14
-; SSE4-NEXT: .cfi_def_cfa_offset 24
; SSE4-NEXT: popq %r15
-; SSE4-NEXT: .cfi_def_cfa_offset 16
; SSE4-NEXT: popq %rbp
-; SSE4-NEXT: .cfi_def_cfa_offset 8
; SSE4-NEXT: retq
;
; AVX1-LABEL: test_ext_v8i64:
; AVX1: # %bb.0:
; AVX1-NEXT: pushq %rbp
-; AVX1-NEXT: .cfi_def_cfa_offset 16
; AVX1-NEXT: pushq %r15
-; AVX1-NEXT: .cfi_def_cfa_offset 24
; AVX1-NEXT: pushq %r14
-; AVX1-NEXT: .cfi_def_cfa_offset 32
; AVX1-NEXT: pushq %r13
-; AVX1-NEXT: .cfi_def_cfa_offset 40
; AVX1-NEXT: pushq %r12
-; AVX1-NEXT: .cfi_def_cfa_offset 48
; AVX1-NEXT: pushq %rbx
-; AVX1-NEXT: .cfi_def_cfa_offset 56
; AVX1-NEXT: pushq %rax
-; AVX1-NEXT: .cfi_def_cfa_offset 64
-; AVX1-NEXT: .cfi_offset %rbx, -56
-; AVX1-NEXT: .cfi_offset %r12, -48
-; AVX1-NEXT: .cfi_offset %r13, -40
-; AVX1-NEXT: .cfi_offset %r14, -32
-; AVX1-NEXT: .cfi_offset %r15, -24
-; AVX1-NEXT: .cfi_offset %rbp, -16
; AVX1-NEXT: vmovq %xmm1, %rax
; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX1-NEXT: movq %rax, %rcx
@@ -3118,43 +2975,23 @@ define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm7[0],xmm6[0]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT: addq $8, %rsp
-; AVX1-NEXT: .cfi_def_cfa_offset 56
; AVX1-NEXT: popq %rbx
-; AVX1-NEXT: .cfi_def_cfa_offset 48
; AVX1-NEXT: popq %r12
-; AVX1-NEXT: .cfi_def_cfa_offset 40
; AVX1-NEXT: popq %r13
-; AVX1-NEXT: .cfi_def_cfa_offset 32
; AVX1-NEXT: popq %r14
-; AVX1-NEXT: .cfi_def_cfa_offset 24
; AVX1-NEXT: popq %r15
-; AVX1-NEXT: .cfi_def_cfa_offset 16
; AVX1-NEXT: popq %rbp
-; AVX1-NEXT: .cfi_def_cfa_offset 8
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_ext_v8i64:
; AVX2: # %bb.0:
; AVX2-NEXT: pushq %rbp
-; AVX2-NEXT: .cfi_def_cfa_offset 16
; AVX2-NEXT: pushq %r15
-; AVX2-NEXT: .cfi_def_cfa_offset 24
; AVX2-NEXT: pushq %r14
-; AVX2-NEXT: .cfi_def_cfa_offset 32
; AVX2-NEXT: pushq %r13
-; AVX2-NEXT: .cfi_def_cfa_offset 40
; AVX2-NEXT: pushq %r12
-; AVX2-NEXT: .cfi_def_cfa_offset 48
; AVX2-NEXT: pushq %rbx
-; AVX2-NEXT: .cfi_def_cfa_offset 56
; AVX2-NEXT: pushq %rax
-; AVX2-NEXT: .cfi_def_cfa_offset 64
-; AVX2-NEXT: .cfi_offset %rbx, -56
-; AVX2-NEXT: .cfi_offset %r12, -48
-; AVX2-NEXT: .cfi_offset %r13, -40
-; AVX2-NEXT: .cfi_offset %r14, -32
-; AVX2-NEXT: .cfi_offset %r15, -24
-; AVX2-NEXT: .cfi_offset %rbp, -16
; AVX2-NEXT: vmovq %xmm1, %rax
; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX2-NEXT: movq %rax, %rcx
@@ -3264,43 +3101,23 @@ define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm7[0],xmm6[0]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
; AVX2-NEXT: addq $8, %rsp
-; AVX2-NEXT: .cfi_def_cfa_offset 56
; AVX2-NEXT: popq %rbx
-; AVX2-NEXT: .cfi_def_cfa_offset 48
; AVX2-NEXT: popq %r12
-; AVX2-NEXT: .cfi_def_cfa_offset 40
; AVX2-NEXT: popq %r13
-; AVX2-NEXT: .cfi_def_cfa_offset 32
; AVX2-NEXT: popq %r14
-; AVX2-NEXT: .cfi_def_cfa_offset 24
; AVX2-NEXT: popq %r15
-; AVX2-NEXT: .cfi_def_cfa_offset 16
; AVX2-NEXT: popq %rbp
-; AVX2-NEXT: .cfi_def_cfa_offset 8
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_ext_v8i64:
; AVX512: # %bb.0:
; AVX512-NEXT: pushq %rbp
-; AVX512-NEXT: .cfi_def_cfa_offset 16
; AVX512-NEXT: pushq %r15
-; AVX512-NEXT: .cfi_def_cfa_offset 24
; AVX512-NEXT: pushq %r14
-; AVX512-NEXT: .cfi_def_cfa_offset 32
; AVX512-NEXT: pushq %r13
-; AVX512-NEXT: .cfi_def_cfa_offset 40
; AVX512-NEXT: pushq %r12
-; AVX512-NEXT: .cfi_def_cfa_offset 48
; AVX512-NEXT: pushq %rbx
-; AVX512-NEXT: .cfi_def_cfa_offset 56
; AVX512-NEXT: pushq %rax
-; AVX512-NEXT: .cfi_def_cfa_offset 64
-; AVX512-NEXT: .cfi_offset %rbx, -56
-; AVX512-NEXT: .cfi_offset %r12, -48
-; AVX512-NEXT: .cfi_offset %r13, -40
-; AVX512-NEXT: .cfi_offset %r14, -32
-; AVX512-NEXT: .cfi_offset %r15, -24
-; AVX512-NEXT: .cfi_offset %rbp, -16
; AVX512-NEXT: vmovq %xmm0, %rax
; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512-NEXT: movq %rax, %rcx
@@ -3413,19 +3230,12 @@ define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512-NEXT: addq $8, %rsp
-; AVX512-NEXT: .cfi_def_cfa_offset 56
; AVX512-NEXT: popq %rbx
-; AVX512-NEXT: .cfi_def_cfa_offset 48
; AVX512-NEXT: popq %r12
-; AVX512-NEXT: .cfi_def_cfa_offset 40
; AVX512-NEXT: popq %r13
-; AVX512-NEXT: .cfi_def_cfa_offset 32
; AVX512-NEXT: popq %r14
-; AVX512-NEXT: .cfi_def_cfa_offset 24
; AVX512-NEXT: popq %r15
-; AVX512-NEXT: .cfi_def_cfa_offset 16
; AVX512-NEXT: popq %rbp
-; AVX512-NEXT: .cfi_def_cfa_offset 8
; AVX512-NEXT: retq
%x0 = sext <8 x i64> %a0 to <8 x i128>
%x1 = sext <8 x i64> %a1 to <8 x i128>
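The test_fixed/test_ext pairs in these files exercise two expansions of the same floor-average operation: test_ext widens both inputs (sext here, zext in the unsigned files), adds, and shifts right, while test_fixed relies on the overflow-free bitwise identity floor((a + b) / 2) == (a & b) + ((a ^ b) >> 1), using an arithmetic shift for the signed avgfloors variants and a logical shift for avgflooru. A rough IR sketch of the fixed-point form (function name is illustrative):

define <4 x i32> @fixed_sketch(<4 x i32> %a0, <4 x i32> %a1) nounwind {
  ; (a & b) counts the bits both inputs share; ((a ^ b) >> 1) halves the rest,
  ; so the sum is floor((a + b) / 2) with no intermediate overflow.
  %and = and <4 x i32> %a0, %a1
  %xor = xor <4 x i32> %a0, %a1
  %shift = ashr <4 x i32> %xor, <i32 1, i32 1, i32 1, i32 1> ; lshr for the unsigned variant
  %res = add <4 x i32> %and, %shift
  ret <4 x i32> %res
}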
diff --git a/llvm/test/CodeGen/X86/avgflooru.ll b/llvm/test/CodeGen/X86/avgflooru.ll
index e07c1f55991e..000457c5ab1e 100644
--- a/llvm/test/CodeGen/X86/avgflooru.ll
+++ b/llvm/test/CodeGen/X86/avgflooru.ll
@@ -9,7 +9,7 @@
; 128-bit vectors
;
-define <16 x i8> @test_fixed_v16i8(<16 x i8> %a0, <16 x i8> %a1) {
+define <16 x i8> @test_fixed_v16i8(<16 x i8> %a0, <16 x i8> %a1) nounwind {
; SSE-LABEL: test_fixed_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm2
@@ -53,7 +53,7 @@ define <16 x i8> @test_fixed_v16i8(<16 x i8> %a0, <16 x i8> %a1) {
ret <16 x i8> %res
}
-define <16 x i8> @test_ext_v16i8(<16 x i8> %a0, <16 x i8> %a1) {
+define <16 x i8> @test_ext_v16i8(<16 x i8> %a0, <16 x i8> %a1) nounwind {
; SSE2-LABEL: test_ext_v16i8:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm2, %xmm2
@@ -127,7 +127,7 @@ define <16 x i8> @test_ext_v16i8(<16 x i8> %a0, <16 x i8> %a1) {
ret <16 x i8> %res
}
-define <8 x i16> @test_fixed_v8i16(<8 x i16> %a0, <8 x i16> %a1) {
+define <8 x i16> @test_fixed_v8i16(<8 x i16> %a0, <8 x i16> %a1) nounwind {
; SSE-LABEL: test_fixed_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm2
@@ -151,7 +151,7 @@ define <8 x i16> @test_fixed_v8i16(<8 x i16> %a0, <8 x i16> %a1) {
ret <8 x i16> %res
}
-define <8 x i16> @test_ext_v8i16(<8 x i16> %a0, <8 x i16> %a1) {
+define <8 x i16> @test_ext_v8i16(<8 x i16> %a0, <8 x i16> %a1) nounwind {
; SSE2-LABEL: test_ext_v8i16:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm2, %xmm2
@@ -227,7 +227,7 @@ define <8 x i16> @test_ext_v8i16(<8 x i16> %a0, <8 x i16> %a1) {
ret <8 x i16> %res
}
-define <4 x i32> @test_fixed_v4i32(<4 x i32> %a0, <4 x i32> %a1) {
+define <4 x i32> @test_fixed_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
; SSE-LABEL: test_fixed_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm2
@@ -251,7 +251,7 @@ define <4 x i32> @test_fixed_v4i32(<4 x i32> %a0, <4 x i32> %a1) {
ret <4 x i32> %res
}
-define <4 x i32> @test_ext_v4i32(<4 x i32> %a0, <4 x i32> %a1) {
+define <4 x i32> @test_ext_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
; SSE2-LABEL: test_ext_v4i32:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm2, %xmm2
@@ -325,7 +325,7 @@ define <4 x i32> @test_ext_v4i32(<4 x i32> %a0, <4 x i32> %a1) {
ret <4 x i32> %res
}
-define <2 x i64> @test_fixed_v2i64(<2 x i64> %a0, <2 x i64> %a1) {
+define <2 x i64> @test_fixed_v2i64(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; SSE-LABEL: test_fixed_v2i64:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm2
@@ -349,7 +349,7 @@ define <2 x i64> @test_fixed_v2i64(<2 x i64> %a0, <2 x i64> %a1) {
ret <2 x i64> %res
}
-define <2 x i64> @test_ext_v2i64(<2 x i64> %a0, <2 x i64> %a1) {
+define <2 x i64> @test_ext_v2i64(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; SSE2-LABEL: test_ext_v2i64:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
@@ -458,7 +458,7 @@ define <2 x i64> @test_ext_v2i64(<2 x i64> %a0, <2 x i64> %a1) {
; 256-bit vectors
;
-define <32 x i8> @test_fixed_v32i8(<32 x i8> %a0, <32 x i8> %a1) {
+define <32 x i8> @test_fixed_v32i8(<32 x i8> %a0, <32 x i8> %a1) nounwind {
; SSE-LABEL: test_fixed_v32i8:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm1, %xmm4
@@ -516,7 +516,7 @@ define <32 x i8> @test_fixed_v32i8(<32 x i8> %a0, <32 x i8> %a1) {
ret <32 x i8> %res
}
-define <32 x i8> @test_ext_v32i8(<32 x i8> %a0, <32 x i8> %a1) {
+define <32 x i8> @test_ext_v32i8(<32 x i8> %a0, <32 x i8> %a1) nounwind {
; SSE2-LABEL: test_ext_v32i8:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm4, %xmm4
@@ -627,7 +627,7 @@ define <32 x i8> @test_ext_v32i8(<32 x i8> %a0, <32 x i8> %a1) {
ret <32 x i8> %res
}
-define <16 x i16> @test_fixed_v16i16(<16 x i16> %a0, <16 x i16> %a1) {
+define <16 x i16> @test_fixed_v16i16(<16 x i16> %a0, <16 x i16> %a1) nounwind {
; SSE-LABEL: test_fixed_v16i16:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm1, %xmm4
@@ -677,7 +677,7 @@ define <16 x i16> @test_fixed_v16i16(<16 x i16> %a0, <16 x i16> %a1) {
ret <16 x i16> %res
}
-define <16 x i16> @test_ext_v16i16(<16 x i16> %a0, <16 x i16> %a1) {
+define <16 x i16> @test_ext_v16i16(<16 x i16> %a0, <16 x i16> %a1) nounwind {
; SSE2-LABEL: test_ext_v16i16:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm4, %xmm4
@@ -792,7 +792,7 @@ define <16 x i16> @test_ext_v16i16(<16 x i16> %a0, <16 x i16> %a1) {
ret <16 x i16> %res
}
-define <8 x i32> @test_fixed_v8i32(<8 x i32> %a0, <8 x i32> %a1) {
+define <8 x i32> @test_fixed_v8i32(<8 x i32> %a0, <8 x i32> %a1) nounwind {
; SSE-LABEL: test_fixed_v8i32:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm1, %xmm4
@@ -842,7 +842,7 @@ define <8 x i32> @test_fixed_v8i32(<8 x i32> %a0, <8 x i32> %a1) {
ret <8 x i32> %res
}
-define <8 x i32> @test_ext_v8i32(<8 x i32> %a0, <8 x i32> %a1) {
+define <8 x i32> @test_ext_v8i32(<8 x i32> %a0, <8 x i32> %a1) nounwind {
; SSE2-LABEL: test_ext_v8i32:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm4, %xmm4
@@ -954,7 +954,7 @@ define <8 x i32> @test_ext_v8i32(<8 x i32> %a0, <8 x i32> %a1) {
ret <8 x i32> %res
}
-define <4 x i64> @test_fixed_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
+define <4 x i64> @test_fixed_v4i64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; SSE-LABEL: test_fixed_v4i64:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm1, %xmm4
@@ -1004,7 +1004,7 @@ define <4 x i64> @test_fixed_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
ret <4 x i64> %res
}
-define <4 x i64> @test_ext_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
+define <4 x i64> @test_ext_v4i64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; SSE2-LABEL: test_ext_v4i64:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
@@ -1199,7 +1199,7 @@ define <4 x i64> @test_ext_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
; 512-bit vectors
;
-define <64 x i8> @test_fixed_v64i8(<64 x i8> %a0, <64 x i8> %a1) {
+define <64 x i8> @test_fixed_v64i8(<64 x i8> %a0, <64 x i8> %a1) nounwind {
; SSE-LABEL: test_fixed_v64i8:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm3, %xmm9
@@ -1286,7 +1286,7 @@ define <64 x i8> @test_fixed_v64i8(<64 x i8> %a0, <64 x i8> %a1) {
ret <64 x i8> %res
}
-define <64 x i8> @test_ext_v64i8(<64 x i8> %a0, <64 x i8> %a1) {
+define <64 x i8> @test_ext_v64i8(<64 x i8> %a0, <64 x i8> %a1) nounwind {
; SSE2-LABEL: test_ext_v64i8:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm8, %xmm8
@@ -1481,7 +1481,7 @@ define <64 x i8> @test_ext_v64i8(<64 x i8> %a0, <64 x i8> %a1) {
ret <64 x i8> %res
}
-define <32 x i16> @test_fixed_v32i16(<32 x i16> %a0, <32 x i16> %a1) {
+define <32 x i16> @test_fixed_v32i16(<32 x i16> %a0, <32 x i16> %a1) nounwind {
; SSE-LABEL: test_fixed_v32i16:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm3, %xmm8
@@ -1554,7 +1554,7 @@ define <32 x i16> @test_fixed_v32i16(<32 x i16> %a0, <32 x i16> %a1) {
ret <32 x i16> %res
}
-define <32 x i16> @test_ext_v32i16(<32 x i16> %a0, <32 x i16> %a1) {
+define <32 x i16> @test_ext_v32i16(<32 x i16> %a0, <32 x i16> %a1) nounwind {
; SSE2-LABEL: test_ext_v32i16:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm8, %xmm8
@@ -1757,7 +1757,7 @@ define <32 x i16> @test_ext_v32i16(<32 x i16> %a0, <32 x i16> %a1) {
ret <32 x i16> %res
}
-define <16 x i32> @test_fixed_v16i32(<16 x i32> %a0, <16 x i32> %a1) {
+define <16 x i32> @test_fixed_v16i32(<16 x i32> %a0, <16 x i32> %a1) nounwind {
; SSE-LABEL: test_fixed_v16i32:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm3, %xmm8
@@ -1830,7 +1830,7 @@ define <16 x i32> @test_fixed_v16i32(<16 x i32> %a0, <16 x i32> %a1) {
ret <16 x i32> %res
}
-define <16 x i32> @test_ext_v16i32(<16 x i32> %a0, <16 x i32> %a1) {
+define <16 x i32> @test_ext_v16i32(<16 x i32> %a0, <16 x i32> %a1) nounwind {
; SSE2-LABEL: test_ext_v16i32:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm8, %xmm8
@@ -2027,7 +2027,7 @@ define <16 x i32> @test_ext_v16i32(<16 x i32> %a0, <16 x i32> %a1) {
ret <16 x i32> %res
}
-define <8 x i64> @test_fixed_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
+define <8 x i64> @test_fixed_v8i64(<8 x i64> %a0, <8 x i64> %a1) nounwind {
; SSE-LABEL: test_fixed_v8i64:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm3, %xmm8
@@ -2100,27 +2100,15 @@ define <8 x i64> @test_fixed_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
ret <8 x i64> %res
}
-define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
+define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) nounwind {
; SSE2-LABEL: test_ext_v8i64:
; SSE2: # %bb.0:
; SSE2-NEXT: pushq %rbp
-; SSE2-NEXT: .cfi_def_cfa_offset 16
; SSE2-NEXT: pushq %r15
-; SSE2-NEXT: .cfi_def_cfa_offset 24
; SSE2-NEXT: pushq %r14
-; SSE2-NEXT: .cfi_def_cfa_offset 32
; SSE2-NEXT: pushq %r13
-; SSE2-NEXT: .cfi_def_cfa_offset 40
; SSE2-NEXT: pushq %r12
-; SSE2-NEXT: .cfi_def_cfa_offset 48
; SSE2-NEXT: pushq %rbx
-; SSE2-NEXT: .cfi_def_cfa_offset 56
-; SSE2-NEXT: .cfi_offset %rbx, -56
-; SSE2-NEXT: .cfi_offset %r12, -48
-; SSE2-NEXT: .cfi_offset %r13, -40
-; SSE2-NEXT: .cfi_offset %r14, -32
-; SSE2-NEXT: .cfi_offset %r15, -24
-; SSE2-NEXT: .cfi_offset %rbp, -16
; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm3[2,3,2,3]
; SSE2-NEXT: movq %xmm3, %rbx
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
@@ -2194,39 +2182,21 @@ define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm6[0]
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm7[0]
; SSE2-NEXT: popq %rbx
-; SSE2-NEXT: .cfi_def_cfa_offset 48
; SSE2-NEXT: popq %r12
-; SSE2-NEXT: .cfi_def_cfa_offset 40
; SSE2-NEXT: popq %r13
-; SSE2-NEXT: .cfi_def_cfa_offset 32
; SSE2-NEXT: popq %r14
-; SSE2-NEXT: .cfi_def_cfa_offset 24
; SSE2-NEXT: popq %r15
-; SSE2-NEXT: .cfi_def_cfa_offset 16
; SSE2-NEXT: popq %rbp
-; SSE2-NEXT: .cfi_def_cfa_offset 8
; SSE2-NEXT: retq
;
; SSE4-LABEL: test_ext_v8i64:
; SSE4: # %bb.0:
; SSE4-NEXT: pushq %rbp
-; SSE4-NEXT: .cfi_def_cfa_offset 16
; SSE4-NEXT: pushq %r15
-; SSE4-NEXT: .cfi_def_cfa_offset 24
; SSE4-NEXT: pushq %r14
-; SSE4-NEXT: .cfi_def_cfa_offset 32
; SSE4-NEXT: pushq %r13
-; SSE4-NEXT: .cfi_def_cfa_offset 40
; SSE4-NEXT: pushq %r12
-; SSE4-NEXT: .cfi_def_cfa_offset 48
; SSE4-NEXT: pushq %rbx
-; SSE4-NEXT: .cfi_def_cfa_offset 56
-; SSE4-NEXT: .cfi_offset %rbx, -56
-; SSE4-NEXT: .cfi_offset %r12, -48
-; SSE4-NEXT: .cfi_offset %r13, -40
-; SSE4-NEXT: .cfi_offset %r14, -32
-; SSE4-NEXT: .cfi_offset %r15, -24
-; SSE4-NEXT: .cfi_offset %rbp, -16
; SSE4-NEXT: pextrq $1, %xmm3, %r14
; SSE4-NEXT: movq %xmm2, %r13
; SSE4-NEXT: pextrq $1, %xmm2, %rbp
@@ -2292,39 +2262,21 @@ define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
; SSE4-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm6[0]
; SSE4-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm7[0]
; SSE4-NEXT: popq %rbx
-; SSE4-NEXT: .cfi_def_cfa_offset 48
; SSE4-NEXT: popq %r12
-; SSE4-NEXT: .cfi_def_cfa_offset 40
; SSE4-NEXT: popq %r13
-; SSE4-NEXT: .cfi_def_cfa_offset 32
; SSE4-NEXT: popq %r14
-; SSE4-NEXT: .cfi_def_cfa_offset 24
; SSE4-NEXT: popq %r15
-; SSE4-NEXT: .cfi_def_cfa_offset 16
; SSE4-NEXT: popq %rbp
-; SSE4-NEXT: .cfi_def_cfa_offset 8
; SSE4-NEXT: retq
;
; AVX1-LABEL: test_ext_v8i64:
; AVX1: # %bb.0:
; AVX1-NEXT: pushq %rbp
-; AVX1-NEXT: .cfi_def_cfa_offset 16
; AVX1-NEXT: pushq %r15
-; AVX1-NEXT: .cfi_def_cfa_offset 24
; AVX1-NEXT: pushq %r14
-; AVX1-NEXT: .cfi_def_cfa_offset 32
; AVX1-NEXT: pushq %r13
-; AVX1-NEXT: .cfi_def_cfa_offset 40
; AVX1-NEXT: pushq %r12
-; AVX1-NEXT: .cfi_def_cfa_offset 48
; AVX1-NEXT: pushq %rbx
-; AVX1-NEXT: .cfi_def_cfa_offset 56
-; AVX1-NEXT: .cfi_offset %rbx, -56
-; AVX1-NEXT: .cfi_offset %r12, -48
-; AVX1-NEXT: .cfi_offset %r13, -40
-; AVX1-NEXT: .cfi_offset %r14, -32
-; AVX1-NEXT: .cfi_offset %r15, -24
-; AVX1-NEXT: .cfi_offset %rbp, -16
; AVX1-NEXT: vpextrq $1, %xmm1, %rbx
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT: vmovq %xmm4, %r15
@@ -2396,39 +2348,21 @@ define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm7[0],xmm6[0]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT: popq %rbx
-; AVX1-NEXT: .cfi_def_cfa_offset 48
; AVX1-NEXT: popq %r12
-; AVX1-NEXT: .cfi_def_cfa_offset 40
; AVX1-NEXT: popq %r13
-; AVX1-NEXT: .cfi_def_cfa_offset 32
; AVX1-NEXT: popq %r14
-; AVX1-NEXT: .cfi_def_cfa_offset 24
; AVX1-NEXT: popq %r15
-; AVX1-NEXT: .cfi_def_cfa_offset 16
; AVX1-NEXT: popq %rbp
-; AVX1-NEXT: .cfi_def_cfa_offset 8
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_ext_v8i64:
; AVX2: # %bb.0:
; AVX2-NEXT: pushq %rbp
-; AVX2-NEXT: .cfi_def_cfa_offset 16
; AVX2-NEXT: pushq %r15
-; AVX2-NEXT: .cfi_def_cfa_offset 24
; AVX2-NEXT: pushq %r14
-; AVX2-NEXT: .cfi_def_cfa_offset 32
; AVX2-NEXT: pushq %r13
-; AVX2-NEXT: .cfi_def_cfa_offset 40
; AVX2-NEXT: pushq %r12
-; AVX2-NEXT: .cfi_def_cfa_offset 48
; AVX2-NEXT: pushq %rbx
-; AVX2-NEXT: .cfi_def_cfa_offset 56
-; AVX2-NEXT: .cfi_offset %rbx, -56
-; AVX2-NEXT: .cfi_offset %r12, -48
-; AVX2-NEXT: .cfi_offset %r13, -40
-; AVX2-NEXT: .cfi_offset %r14, -32
-; AVX2-NEXT: .cfi_offset %r15, -24
-; AVX2-NEXT: .cfi_offset %rbp, -16
; AVX2-NEXT: vpextrq $1, %xmm1, %rbx
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4
; AVX2-NEXT: vmovq %xmm4, %r15
@@ -2500,39 +2434,21 @@ define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm7[0],xmm6[0]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
; AVX2-NEXT: popq %rbx
-; AVX2-NEXT: .cfi_def_cfa_offset 48
; AVX2-NEXT: popq %r12
-; AVX2-NEXT: .cfi_def_cfa_offset 40
; AVX2-NEXT: popq %r13
-; AVX2-NEXT: .cfi_def_cfa_offset 32
; AVX2-NEXT: popq %r14
-; AVX2-NEXT: .cfi_def_cfa_offset 24
; AVX2-NEXT: popq %r15
-; AVX2-NEXT: .cfi_def_cfa_offset 16
; AVX2-NEXT: popq %rbp
-; AVX2-NEXT: .cfi_def_cfa_offset 8
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_ext_v8i64:
; AVX512: # %bb.0:
; AVX512-NEXT: pushq %rbp
-; AVX512-NEXT: .cfi_def_cfa_offset 16
; AVX512-NEXT: pushq %r15
-; AVX512-NEXT: .cfi_def_cfa_offset 24
; AVX512-NEXT: pushq %r14
-; AVX512-NEXT: .cfi_def_cfa_offset 32
; AVX512-NEXT: pushq %r13
-; AVX512-NEXT: .cfi_def_cfa_offset 40
; AVX512-NEXT: pushq %r12
-; AVX512-NEXT: .cfi_def_cfa_offset 48
; AVX512-NEXT: pushq %rbx
-; AVX512-NEXT: .cfi_def_cfa_offset 56
-; AVX512-NEXT: .cfi_offset %rbx, -56
-; AVX512-NEXT: .cfi_offset %r12, -48
-; AVX512-NEXT: .cfi_offset %r13, -40
-; AVX512-NEXT: .cfi_offset %r14, -32
-; AVX512-NEXT: .cfi_offset %r15, -24
-; AVX512-NEXT: .cfi_offset %rbp, -16
; AVX512-NEXT: vpextrq $1, %xmm0, %r10
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512-NEXT: vpextrq $1, %xmm2, %r13
@@ -2607,17 +2523,11 @@ define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512-NEXT: popq %rbx
-; AVX512-NEXT: .cfi_def_cfa_offset 48
; AVX512-NEXT: popq %r12
-; AVX512-NEXT: .cfi_def_cfa_offset 40
; AVX512-NEXT: popq %r13
-; AVX512-NEXT: .cfi_def_cfa_offset 32
; AVX512-NEXT: popq %r14
-; AVX512-NEXT: .cfi_def_cfa_offset 24
; AVX512-NEXT: popq %r15
-; AVX512-NEXT: .cfi_def_cfa_offset 16
; AVX512-NEXT: popq %rbp
-; AVX512-NEXT: .cfi_def_cfa_offset 8
; AVX512-NEXT: retq
%x0 = zext <8 x i64> %a0 to <8 x i128>
%x1 = zext <8 x i64> %a1 to <8 x i128>
diff --git a/llvm/test/CodeGen/X86/avx512-broadcast-arith.ll b/llvm/test/CodeGen/X86/avx512-broadcast-arith.ll
index 13d1265a249d..7e48b3719cf0 100644
--- a/llvm/test/CodeGen/X86/avx512-broadcast-arith.ll
+++ b/llvm/test/CodeGen/X86/avx512-broadcast-arith.ll
@@ -30,13 +30,13 @@ define <64 x i8> @add_v64i8_broadcasts(<64 x i8> %a0, i64 %a1, i8 %a2) {
; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
; AVX512F-NEXT: vpternlogq $216, %zmm2, %zmm1, %zmm0
-; AVX512F-NEXT: vpaddb %ymm1, %ymm0, %ymm3
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT: vpaddb %ymm1, %ymm3, %ymm3
; AVX512F-NEXT: vpaddb %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm4
+; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm4
+; AVX512F-NEXT: vpaddb %ymm1, %ymm3, %ymm3
; AVX512F-NEXT: vpaddb %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpaddb %ymm1, %ymm3, %ymm1
-; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
; AVX512F-NEXT: vpternlogq $226, %zmm4, %zmm2, %zmm0
; AVX512F-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/avx512bwvl-arith.ll b/llvm/test/CodeGen/X86/avx512bwvl-arith.ll
index 4988fc35b10e..33819c9e0102 100644
--- a/llvm/test/CodeGen/X86/avx512bwvl-arith.ll
+++ b/llvm/test/CodeGen/X86/avx512bwvl-arith.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,-evex512 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=CHECK,EVEX512
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,-evex512 | FileCheck %s --check-prefixes=CHECK,EVEX256
; 256-bit
@@ -236,3 +236,34 @@ define <8 x i16> @vpmullw128_test(<8 x i16> %i, <8 x i16> %j) {
ret <8 x i16> %x
}
+define i16 @PR90356(<16 x i1> %a) {
+; EVEX512-LABEL: PR90356:
+; EVEX512: # %bb.0:
+; EVEX512-NEXT: vpsllw $7, %xmm0, %xmm0
+; EVEX512-NEXT: vpmovb2m %xmm0, %k1
+; EVEX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; EVEX512-NEXT: movb $63, %al
+; EVEX512-NEXT: kmovd %eax, %k1
+; EVEX512-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z}
+; EVEX512-NEXT: vptestmd %zmm0, %zmm0, %k0
+; EVEX512-NEXT: kmovd %k0, %eax
+; EVEX512-NEXT: # kill: def $ax killed $ax killed $eax
+; EVEX512-NEXT: vzeroupper
+; EVEX512-NEXT: retq
+;
+; EVEX256-LABEL: PR90356:
+; EVEX256: # %bb.0:
+; EVEX256-NEXT: vpsllw $7, %xmm0, %xmm0
+; EVEX256-NEXT: vpmovb2m %xmm0, %k0
+; EVEX256-NEXT: vpmovm2w %k0, %ymm0
+; EVEX256-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; EVEX256-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; EVEX256-NEXT: vpmovw2m %ymm0, %k0
+; EVEX256-NEXT: kmovd %k0, %eax
+; EVEX256-NEXT: # kill: def $ax killed $ax killed $eax
+; EVEX256-NEXT: vzeroupper
+; EVEX256-NEXT: retq
+ %1 = shufflevector <16 x i1> %a, <16 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 28, i32 29, i32 30, i32 31>
+ %2 = bitcast <16 x i1> %1 to i16
+ ret i16 %2
+}
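Both this file and cmp.ll below switch their RUN lines to --check-prefixes=CHECK,<config>, the usual FileCheck idiom for sharing assertions across configurations: output common to every RUN line is checked once under CHECK, and only blocks that differ are kept under the run-specific prefix, which is why the duplicated NDD bodies in cmp.ll can be deleted. A minimal sketch of the pattern (the prefixes and attribute are placeholders, not from the patch):

; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=CHECK,BASE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+feature | FileCheck %s --check-prefixes=CHECK,FEAT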
diff --git a/llvm/test/CodeGen/X86/cmp.ll b/llvm/test/CodeGen/X86/cmp.ll
index 30e52f063075..402da547613c 100644
--- a/llvm/test/CodeGen/X86/cmp.ll
+++ b/llvm/test/CodeGen/X86/cmp.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -show-mc-encoding | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ndd -show-mc-encoding | FileCheck --check-prefix=NDD %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -show-mc-encoding | FileCheck %s --check-prefixes=CHECK,NO-NDD
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ndd -show-mc-encoding | FileCheck --check-prefixes=CHECK,NDD %s
@d = dso_local global i8 0, align 1
@d64 = dso_local global i64 0
@@ -17,18 +17,6 @@ define i32 @test1(i32 %X, ptr %y) nounwind {
; CHECK-NEXT: .LBB0_2: # %ReturnBlock
; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
; CHECK-NEXT: retq # encoding: [0xc3]
-;
-; NDD-LABEL: test1:
-; NDD: # %bb.0: # %entry
-; NDD-NEXT: cmpl $0, (%rsi) # encoding: [0x83,0x3e,0x00]
-; NDD-NEXT: je .LBB0_2 # encoding: [0x74,A]
-; NDD-NEXT: # fixup A - offset: 1, value: .LBB0_2-1, kind: FK_PCRel_1
-; NDD-NEXT: # %bb.1: # %cond_true
-; NDD-NEXT: movl $1, %eax # encoding: [0xb8,0x01,0x00,0x00,0x00]
-; NDD-NEXT: retq # encoding: [0xc3]
-; NDD-NEXT: .LBB0_2: # %ReturnBlock
-; NDD-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
-; NDD-NEXT: retq # encoding: [0xc3]
entry:
%tmp = load i32, ptr %y
%tmp.upgrd.1 = icmp eq i32 %tmp, 0
@@ -54,19 +42,6 @@ define i32 @test2(i32 %X, ptr %y) nounwind {
; CHECK-NEXT: .LBB1_2: # %ReturnBlock
; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
; CHECK-NEXT: retq # encoding: [0xc3]
-;
-; NDD-LABEL: test2:
-; NDD: # %bb.0: # %entry
-; NDD-NEXT: testl $536870911, (%rsi) # encoding: [0xf7,0x06,0xff,0xff,0xff,0x1f]
-; NDD-NEXT: # imm = 0x1FFFFFFF
-; NDD-NEXT: je .LBB1_2 # encoding: [0x74,A]
-; NDD-NEXT: # fixup A - offset: 1, value: .LBB1_2-1, kind: FK_PCRel_1
-; NDD-NEXT: # %bb.1: # %cond_true
-; NDD-NEXT: movl $1, %eax # encoding: [0xb8,0x01,0x00,0x00,0x00]
-; NDD-NEXT: retq # encoding: [0xc3]
-; NDD-NEXT: .LBB1_2: # %ReturnBlock
-; NDD-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
-; NDD-NEXT: retq # encoding: [0xc3]
entry:
%tmp = load i32, ptr %y
%tmp1 = shl i32 %tmp, 3
@@ -92,18 +67,6 @@ define i8 @test2b(i8 %X, ptr %y) nounwind {
; CHECK-NEXT: .LBB2_2: # %ReturnBlock
; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
; CHECK-NEXT: retq # encoding: [0xc3]
-;
-; NDD-LABEL: test2b:
-; NDD: # %bb.0: # %entry
-; NDD-NEXT: testb $31, (%rsi) # encoding: [0xf6,0x06,0x1f]
-; NDD-NEXT: je .LBB2_2 # encoding: [0x74,A]
-; NDD-NEXT: # fixup A - offset: 1, value: .LBB2_2-1, kind: FK_PCRel_1
-; NDD-NEXT: # %bb.1: # %cond_true
-; NDD-NEXT: movb $1, %al # encoding: [0xb0,0x01]
-; NDD-NEXT: retq # encoding: [0xc3]
-; NDD-NEXT: .LBB2_2: # %ReturnBlock
-; NDD-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
-; NDD-NEXT: retq # encoding: [0xc3]
entry:
%tmp = load i8, ptr %y
%tmp1 = shl i8 %tmp, 3
@@ -124,13 +87,6 @@ define i64 @test3(i64 %x) nounwind {
; CHECK-NEXT: testq %rdi, %rdi # encoding: [0x48,0x85,0xff]
; CHECK-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
; CHECK-NEXT: retq # encoding: [0xc3]
-;
-; NDD-LABEL: test3:
-; NDD: # %bb.0: # %entry
-; NDD-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
-; NDD-NEXT: testq %rdi, %rdi # encoding: [0x48,0x85,0xff]
-; NDD-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
-; NDD-NEXT: retq # encoding: [0xc3]
entry:
%t = icmp eq i64 %x, 0
%r = zext i1 %t to i64
@@ -144,13 +100,6 @@ define i64 @test4(i64 %x) nounwind {
; CHECK-NEXT: testq %rdi, %rdi # encoding: [0x48,0x85,0xff]
; CHECK-NEXT: setle %al # encoding: [0x0f,0x9e,0xc0]
; CHECK-NEXT: retq # encoding: [0xc3]
-;
-; NDD-LABEL: test4:
-; NDD: # %bb.0:
-; NDD-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
-; NDD-NEXT: testq %rdi, %rdi # encoding: [0x48,0x85,0xff]
-; NDD-NEXT: setle %al # encoding: [0x0f,0x9e,0xc0]
-; NDD-NEXT: retq # encoding: [0xc3]
%t = icmp slt i64 %x, 1
%r = zext i1 %t to i64
ret i64 %r
@@ -176,26 +125,6 @@ define i32 @test5(double %A) nounwind {
; CHECK-NEXT: jmp foo@PLT # TAILCALL
; CHECK-NEXT: # encoding: [0xeb,A]
; CHECK-NEXT: # fixup A - offset: 1, value: foo@PLT-1, kind: FK_PCRel_1
-;
-; NDD-LABEL: test5:
-; NDD: # %bb.0: # %entry
-; NDD-NEXT: ucomisd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # encoding: [0x66,0x0f,0x2e,0x05,A,A,A,A]
-; NDD-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
-; NDD-NEXT: ja .LBB5_3 # encoding: [0x77,A]
-; NDD-NEXT: # fixup A - offset: 1, value: .LBB5_3-1, kind: FK_PCRel_1
-; NDD-NEXT: # %bb.1: # %entry
-; NDD-NEXT: ucomisd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # encoding: [0x66,0x0f,0x2e,0x05,A,A,A,A]
-; NDD-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
-; NDD-NEXT: jb .LBB5_3 # encoding: [0x72,A]
-; NDD-NEXT: # fixup A - offset: 1, value: .LBB5_3-1, kind: FK_PCRel_1
-; NDD-NEXT: # %bb.2: # %bb12
-; NDD-NEXT: movl $32, %eax # encoding: [0xb8,0x20,0x00,0x00,0x00]
-; NDD-NEXT: retq # encoding: [0xc3]
-; NDD-NEXT: .LBB5_3: # %bb8
-; NDD-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
-; NDD-NEXT: jmp foo@PLT # TAILCALL
-; NDD-NEXT: # encoding: [0xeb,A]
-; NDD-NEXT: # fixup A - offset: 1, value: foo@PLT-1, kind: FK_PCRel_1
entry:
%tmp2 = fcmp ogt double %A, 1.500000e+02
%tmp5 = fcmp ult double %A, 7.500000e+01
@@ -224,18 +153,6 @@ define i32 @test6() nounwind align 2 {
; CHECK-NEXT: .LBB6_1: # %T
; CHECK-NEXT: movl $1, %eax # encoding: [0xb8,0x01,0x00,0x00,0x00]
; CHECK-NEXT: retq # encoding: [0xc3]
-;
-; NDD-LABEL: test6:
-; NDD: # %bb.0: # %entry
-; NDD-NEXT: cmpq $0, -{{[0-9]+}}(%rsp) # encoding: [0x48,0x83,0x7c,0x24,0xf8,0x00]
-; NDD-NEXT: je .LBB6_1 # encoding: [0x74,A]
-; NDD-NEXT: # fixup A - offset: 1, value: .LBB6_1-1, kind: FK_PCRel_1
-; NDD-NEXT: # %bb.2: # %F
-; NDD-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
-; NDD-NEXT: retq # encoding: [0xc3]
-; NDD-NEXT: .LBB6_1: # %T
-; NDD-NEXT: movl $1, %eax # encoding: [0xb8,0x01,0x00,0x00,0x00]
-; NDD-NEXT: retq # encoding: [0xc3]
entry:
%A = alloca { i64, i64 }, align 8
%B = getelementptr inbounds { i64, i64 }, ptr %A, i64 0, i32 1
@@ -251,12 +168,12 @@ F:
}
define i32 @test7(i64 %res) nounwind {
-; CHECK-LABEL: test7:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
-; CHECK-NEXT: shrq $32, %rdi # encoding: [0x48,0xc1,0xef,0x20]
-; CHECK-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NO-NDD-LABEL: test7:
+; NO-NDD: # %bb.0: # %entry
+; NO-NDD-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
+; NO-NDD-NEXT: shrq $32, %rdi # encoding: [0x48,0xc1,0xef,0x20]
+; NO-NDD-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
+; NO-NDD-NEXT: retq # encoding: [0xc3]
;
; NDD-LABEL: test7:
; NDD: # %bb.0: # %entry
@@ -271,13 +188,13 @@ entry:
}
define i32 @test8(i64 %res) nounwind {
-; CHECK-LABEL: test8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: shrq $32, %rdi # encoding: [0x48,0xc1,0xef,0x20]
-; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
-; CHECK-NEXT: cmpl $3, %edi # encoding: [0x83,0xff,0x03]
-; CHECK-NEXT: setb %al # encoding: [0x0f,0x92,0xc0]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NO-NDD-LABEL: test8:
+; NO-NDD: # %bb.0:
+; NO-NDD-NEXT: shrq $32, %rdi # encoding: [0x48,0xc1,0xef,0x20]
+; NO-NDD-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
+; NO-NDD-NEXT: cmpl $3, %edi # encoding: [0x83,0xff,0x03]
+; NO-NDD-NEXT: setb %al # encoding: [0x0f,0x92,0xc0]
+; NO-NDD-NEXT: retq # encoding: [0xc3]
;
; NDD-LABEL: test8:
; NDD: # %bb.0:
@@ -292,12 +209,12 @@ define i32 @test8(i64 %res) nounwind {
}
define i32 @test9(i64 %res) nounwind {
-; CHECK-LABEL: test9:
-; CHECK: # %bb.0:
-; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
-; CHECK-NEXT: shrq $33, %rdi # encoding: [0x48,0xc1,0xef,0x21]
-; CHECK-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NO-NDD-LABEL: test9:
+; NO-NDD: # %bb.0:
+; NO-NDD-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
+; NO-NDD-NEXT: shrq $33, %rdi # encoding: [0x48,0xc1,0xef,0x21]
+; NO-NDD-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
+; NO-NDD-NEXT: retq # encoding: [0xc3]
;
; NDD-LABEL: test9:
; NDD: # %bb.0:
@@ -311,12 +228,12 @@ define i32 @test9(i64 %res) nounwind {
}
define i32 @test10(i64 %res) nounwind {
-; CHECK-LABEL: test10:
-; CHECK: # %bb.0:
-; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
-; CHECK-NEXT: shrq $32, %rdi # encoding: [0x48,0xc1,0xef,0x20]
-; CHECK-NEXT: setne %al # encoding: [0x0f,0x95,0xc0]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NO-NDD-LABEL: test10:
+; NO-NDD: # %bb.0:
+; NO-NDD-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
+; NO-NDD-NEXT: shrq $32, %rdi # encoding: [0x48,0xc1,0xef,0x20]
+; NO-NDD-NEXT: setne %al # encoding: [0x0f,0x95,0xc0]
+; NO-NDD-NEXT: retq # encoding: [0xc3]
;
; NDD-LABEL: test10:
; NDD: # %bb.0:
@@ -330,13 +247,13 @@ define i32 @test10(i64 %res) nounwind {
}
define i32 @test11(i64 %l) nounwind {
-; CHECK-LABEL: test11:
-; CHECK: # %bb.0:
-; CHECK-NEXT: shrq $47, %rdi # encoding: [0x48,0xc1,0xef,0x2f]
-; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
-; CHECK-NEXT: cmpl $1, %edi # encoding: [0x83,0xff,0x01]
-; CHECK-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NO-NDD-LABEL: test11:
+; NO-NDD: # %bb.0:
+; NO-NDD-NEXT: shrq $47, %rdi # encoding: [0x48,0xc1,0xef,0x2f]
+; NO-NDD-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
+; NO-NDD-NEXT: cmpl $1, %edi # encoding: [0x83,0xff,0x01]
+; NO-NDD-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
+; NO-NDD-NEXT: retq # encoding: [0xc3]
;
; NDD-LABEL: test11:
; NDD: # %bb.0:
@@ -372,27 +289,6 @@ define i32 @test12() ssp uwtable {
; CHECK-NEXT: popq %rcx # encoding: [0x59]
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq # encoding: [0xc3]
-;
-; NDD-LABEL: test12:
-; NDD: # %bb.0: # %entry
-; NDD-NEXT: pushq %rax # encoding: [0x50]
-; NDD-NEXT: .cfi_def_cfa_offset 16
-; NDD-NEXT: callq test12b@PLT # encoding: [0xe8,A,A,A,A]
-; NDD-NEXT: # fixup A - offset: 1, value: test12b@PLT-4, kind: FK_PCRel_4
-; NDD-NEXT: testb %al, %al # encoding: [0x84,0xc0]
-; NDD-NEXT: je .LBB12_2 # encoding: [0x74,A]
-; NDD-NEXT: # fixup A - offset: 1, value: .LBB12_2-1, kind: FK_PCRel_1
-; NDD-NEXT: # %bb.1: # %T
-; NDD-NEXT: movl $1, %eax # encoding: [0xb8,0x01,0x00,0x00,0x00]
-; NDD-NEXT: popq %rcx # encoding: [0x59]
-; NDD-NEXT: .cfi_def_cfa_offset 8
-; NDD-NEXT: retq # encoding: [0xc3]
-; NDD-NEXT: .LBB12_2: # %F
-; NDD-NEXT: .cfi_def_cfa_offset 16
-; NDD-NEXT: movl $2, %eax # encoding: [0xb8,0x02,0x00,0x00,0x00]
-; NDD-NEXT: popq %rcx # encoding: [0x59]
-; NDD-NEXT: .cfi_def_cfa_offset 8
-; NDD-NEXT: retq # encoding: [0xc3]
entry:
%tmp1 = call zeroext i1 @test12b()
br i1 %tmp1, label %T, label %F
@@ -407,12 +303,12 @@ F:
declare zeroext i1 @test12b()
define i32 @test13(i32 %mask, i32 %base, i32 %intra) {
-; CHECK-LABEL: test13:
-; CHECK: # %bb.0:
-; CHECK-NEXT: movl %esi, %eax # encoding: [0x89,0xf0]
-; CHECK-NEXT: testb $8, %dil # encoding: [0x40,0xf6,0xc7,0x08]
-; CHECK-NEXT: cmovnel %edx, %eax # encoding: [0x0f,0x45,0xc2]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NO-NDD-LABEL: test13:
+; NO-NDD: # %bb.0:
+; NO-NDD-NEXT: movl %esi, %eax # encoding: [0x89,0xf0]
+; NO-NDD-NEXT: testb $8, %dil # encoding: [0x40,0xf6,0xc7,0x08]
+; NO-NDD-NEXT: cmovnel %edx, %eax # encoding: [0x0f,0x45,0xc2]
+; NO-NDD-NEXT: retq # encoding: [0xc3]
;
; NDD-LABEL: test13:
; NDD: # %bb.0:
@@ -426,12 +322,12 @@ define i32 @test13(i32 %mask, i32 %base, i32 %intra) {
}
define i32 @test14(i32 %mask, i32 %base, i32 %intra) {
-; CHECK-LABEL: test14:
-; CHECK: # %bb.0:
-; CHECK-NEXT: movl %esi, %eax # encoding: [0x89,0xf0]
-; CHECK-NEXT: shrl $7, %edi # encoding: [0xc1,0xef,0x07]
-; CHECK-NEXT: cmovnsl %edx, %eax # encoding: [0x0f,0x49,0xc2]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NO-NDD-LABEL: test14:
+; NO-NDD: # %bb.0:
+; NO-NDD-NEXT: movl %esi, %eax # encoding: [0x89,0xf0]
+; NO-NDD-NEXT: shrl $7, %edi # encoding: [0xc1,0xef,0x07]
+; NO-NDD-NEXT: cmovnsl %edx, %eax # encoding: [0x0f,0x49,0xc2]
+; NO-NDD-NEXT: retq # encoding: [0xc3]
;
; NDD-LABEL: test14:
; NDD: # %bb.0:
@@ -446,14 +342,14 @@ define i32 @test14(i32 %mask, i32 %base, i32 %intra) {
; PR19964
define zeroext i1 @test15(i32 %bf.load, i32 %n) {
-; CHECK-LABEL: test15:
-; CHECK: # %bb.0:
-; CHECK-NEXT: shrl $16, %edi # encoding: [0xc1,0xef,0x10]
-; CHECK-NEXT: sete %cl # encoding: [0x0f,0x94,0xc1]
-; CHECK-NEXT: cmpl %esi, %edi # encoding: [0x39,0xf7]
-; CHECK-NEXT: setae %al # encoding: [0x0f,0x93,0xc0]
-; CHECK-NEXT: orb %cl, %al # encoding: [0x08,0xc8]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NO-NDD-LABEL: test15:
+; NO-NDD: # %bb.0:
+; NO-NDD-NEXT: shrl $16, %edi # encoding: [0xc1,0xef,0x10]
+; NO-NDD-NEXT: sete %cl # encoding: [0x0f,0x94,0xc1]
+; NO-NDD-NEXT: cmpl %esi, %edi # encoding: [0x39,0xf7]
+; NO-NDD-NEXT: setae %al # encoding: [0x0f,0x93,0xc0]
+; NO-NDD-NEXT: orb %cl, %al # encoding: [0x08,0xc8]
+; NO-NDD-NEXT: retq # encoding: [0xc3]
;
; NDD-LABEL: test15:
; NDD: # %bb.0:
@@ -476,12 +372,6 @@ define i8 @signbit_i16(i16 signext %L) {
; CHECK-NEXT: testw %di, %di # encoding: [0x66,0x85,0xff]
; CHECK-NEXT: setns %al # encoding: [0x0f,0x99,0xc0]
; CHECK-NEXT: retq # encoding: [0xc3]
-;
-; NDD-LABEL: signbit_i16:
-; NDD: # %bb.0:
-; NDD-NEXT: testw %di, %di # encoding: [0x66,0x85,0xff]
-; NDD-NEXT: setns %al # encoding: [0x0f,0x99,0xc0]
-; NDD-NEXT: retq # encoding: [0xc3]
%lshr = lshr i16 %L, 15
%trunc = trunc i16 %lshr to i8
%not = xor i8 %trunc, 1
@@ -494,12 +384,6 @@ define i8 @signbit_i32(i32 %L) {
; CHECK-NEXT: testl %edi, %edi # encoding: [0x85,0xff]
; CHECK-NEXT: setns %al # encoding: [0x0f,0x99,0xc0]
; CHECK-NEXT: retq # encoding: [0xc3]
-;
-; NDD-LABEL: signbit_i32:
-; NDD: # %bb.0:
-; NDD-NEXT: testl %edi, %edi # encoding: [0x85,0xff]
-; NDD-NEXT: setns %al # encoding: [0x0f,0x99,0xc0]
-; NDD-NEXT: retq # encoding: [0xc3]
%lshr = lshr i32 %L, 31
%trunc = trunc i32 %lshr to i8
%not = xor i8 %trunc, 1
@@ -512,12 +396,6 @@ define i8 @signbit_i64(i64 %L) {
; CHECK-NEXT: testq %rdi, %rdi # encoding: [0x48,0x85,0xff]
; CHECK-NEXT: setns %al # encoding: [0x0f,0x99,0xc0]
; CHECK-NEXT: retq # encoding: [0xc3]
-;
-; NDD-LABEL: signbit_i64:
-; NDD: # %bb.0:
-; NDD-NEXT: testq %rdi, %rdi # encoding: [0x48,0x85,0xff]
-; NDD-NEXT: setns %al # encoding: [0x0f,0x99,0xc0]
-; NDD-NEXT: retq # encoding: [0xc3]
%lshr = lshr i64 %L, 63
%trunc = trunc i64 %lshr to i8
%not = xor i8 %trunc, 1
@@ -530,12 +408,6 @@ define zeroext i1 @signbit_i32_i1(i32 %L) {
; CHECK-NEXT: testl %edi, %edi # encoding: [0x85,0xff]
; CHECK-NEXT: setns %al # encoding: [0x0f,0x99,0xc0]
; CHECK-NEXT: retq # encoding: [0xc3]
-;
-; NDD-LABEL: signbit_i32_i1:
-; NDD: # %bb.0:
-; NDD-NEXT: testl %edi, %edi # encoding: [0x85,0xff]
-; NDD-NEXT: setns %al # encoding: [0x0f,0x99,0xc0]
-; NDD-NEXT: retq # encoding: [0xc3]
%lshr = lshr i32 %L, 31
%trunc = trunc i32 %lshr to i1
%not = xor i1 %trunc, true
@@ -544,20 +416,20 @@ define zeroext i1 @signbit_i32_i1(i32 %L) {
; This test failed due to incorrect handling of the "shift + icmp" sequence
define void @test20(i32 %bf.load, i8 %x1, ptr %b_addr) {
-; CHECK-LABEL: test20:
-; CHECK: # %bb.0:
-; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
-; CHECK-NEXT: testl $16777215, %edi # encoding: [0xf7,0xc7,0xff,0xff,0xff,0x00]
-; CHECK-NEXT: # imm = 0xFFFFFF
-; CHECK-NEXT: setne %al # encoding: [0x0f,0x95,0xc0]
-; CHECK-NEXT: movzbl %sil, %ecx # encoding: [0x40,0x0f,0xb6,0xce]
-; CHECK-NEXT: addl %eax, %ecx # encoding: [0x01,0xc1]
-; CHECK-NEXT: setne (%rdx) # encoding: [0x0f,0x95,0x02]
-; CHECK-NEXT: testl $16777215, %edi # encoding: [0xf7,0xc7,0xff,0xff,0xff,0x00]
-; CHECK-NEXT: # imm = 0xFFFFFF
-; CHECK-NEXT: setne d(%rip) # encoding: [0x0f,0x95,0x05,A,A,A,A]
-; CHECK-NEXT: # fixup A - offset: 3, value: d-4, kind: reloc_riprel_4byte
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NO-NDD-LABEL: test20:
+; NO-NDD: # %bb.0:
+; NO-NDD-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
+; NO-NDD-NEXT: testl $16777215, %edi # encoding: [0xf7,0xc7,0xff,0xff,0xff,0x00]
+; NO-NDD-NEXT: # imm = 0xFFFFFF
+; NO-NDD-NEXT: setne %al # encoding: [0x0f,0x95,0xc0]
+; NO-NDD-NEXT: movzbl %sil, %ecx # encoding: [0x40,0x0f,0xb6,0xce]
+; NO-NDD-NEXT: addl %eax, %ecx # encoding: [0x01,0xc1]
+; NO-NDD-NEXT: setne (%rdx) # encoding: [0x0f,0x95,0x02]
+; NO-NDD-NEXT: testl $16777215, %edi # encoding: [0xf7,0xc7,0xff,0xff,0xff,0x00]
+; NO-NDD-NEXT: # imm = 0xFFFFFF
+; NO-NDD-NEXT: setne d(%rip) # encoding: [0x0f,0x95,0x05,A,A,A,A]
+; NO-NDD-NEXT: # fixup A - offset: 3, value: d-4, kind: reloc_riprel_4byte
+; NO-NDD-NEXT: retq # encoding: [0xc3]
;
; NDD-LABEL: test20:
; NDD: # %bb.0:
@@ -593,11 +465,6 @@ define i32 @highmask_i64_simplify(i64 %val) {
; CHECK: # %bb.0:
; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
; CHECK-NEXT: retq # encoding: [0xc3]
-;
-; NDD-LABEL: highmask_i64_simplify:
-; NDD: # %bb.0:
-; NDD-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
-; NDD-NEXT: retq # encoding: [0xc3]
%and = and i64 %val, -2199023255552
%cmp = icmp ult i64 %and, 0
%ret = zext i1 %cmp to i32
@@ -605,12 +472,12 @@ define i32 @highmask_i64_simplify(i64 %val) {
}
define i32 @highmask_i64_mask64(i64 %val) {
-; CHECK-LABEL: highmask_i64_mask64:
-; CHECK: # %bb.0:
-; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
-; CHECK-NEXT: shrq $41, %rdi # encoding: [0x48,0xc1,0xef,0x29]
-; CHECK-NEXT: setne %al # encoding: [0x0f,0x95,0xc0]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NO-NDD-LABEL: highmask_i64_mask64:
+; NO-NDD: # %bb.0:
+; NO-NDD-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
+; NO-NDD-NEXT: shrq $41, %rdi # encoding: [0x48,0xc1,0xef,0x29]
+; NO-NDD-NEXT: setne %al # encoding: [0x0f,0x95,0xc0]
+; NO-NDD-NEXT: retq # encoding: [0xc3]
;
; NDD-LABEL: highmask_i64_mask64:
; NDD: # %bb.0:
@@ -625,14 +492,14 @@ define i32 @highmask_i64_mask64(i64 %val) {
}
define i64 @highmask_i64_mask64_extra_use(i64 %val) nounwind {
-; CHECK-LABEL: highmask_i64_mask64_extra_use:
-; CHECK: # %bb.0:
-; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
-; CHECK-NEXT: movq %rdi, %rcx # encoding: [0x48,0x89,0xf9]
-; CHECK-NEXT: shrq $41, %rcx # encoding: [0x48,0xc1,0xe9,0x29]
-; CHECK-NEXT: setne %al # encoding: [0x0f,0x95,0xc0]
-; CHECK-NEXT: imulq %rdi, %rax # encoding: [0x48,0x0f,0xaf,0xc7]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NO-NDD-LABEL: highmask_i64_mask64_extra_use:
+; NO-NDD: # %bb.0:
+; NO-NDD-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
+; NO-NDD-NEXT: movq %rdi, %rcx # encoding: [0x48,0x89,0xf9]
+; NO-NDD-NEXT: shrq $41, %rcx # encoding: [0x48,0xc1,0xe9,0x29]
+; NO-NDD-NEXT: setne %al # encoding: [0x0f,0x95,0xc0]
+; NO-NDD-NEXT: imulq %rdi, %rax # encoding: [0x48,0x0f,0xaf,0xc7]
+; NO-NDD-NEXT: retq # encoding: [0xc3]
;
; NDD-LABEL: highmask_i64_mask64_extra_use:
; NDD: # %bb.0:
@@ -649,12 +516,12 @@ define i64 @highmask_i64_mask64_extra_use(i64 %val) nounwind {
}
define i32 @highmask_i64_mask32(i64 %val) {
-; CHECK-LABEL: highmask_i64_mask32:
-; CHECK: # %bb.0:
-; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
-; CHECK-NEXT: shrq $20, %rdi # encoding: [0x48,0xc1,0xef,0x14]
-; CHECK-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NO-NDD-LABEL: highmask_i64_mask32:
+; NO-NDD: # %bb.0:
+; NO-NDD-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
+; NO-NDD-NEXT: shrq $20, %rdi # encoding: [0x48,0xc1,0xef,0x14]
+; NO-NDD-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
+; NO-NDD-NEXT: retq # encoding: [0xc3]
;
; NDD-LABEL: highmask_i64_mask32:
; NDD: # %bb.0:
@@ -669,14 +536,14 @@ define i32 @highmask_i64_mask32(i64 %val) {
}
define i64 @highmask_i64_mask32_extra_use(i64 %val) nounwind {
-; CHECK-LABEL: highmask_i64_mask32_extra_use:
-; CHECK: # %bb.0:
-; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
-; CHECK-NEXT: testq $-1048576, %rdi # encoding: [0x48,0xf7,0xc7,0x00,0x00,0xf0,0xff]
-; CHECK-NEXT: # imm = 0xFFF00000
-; CHECK-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
-; CHECK-NEXT: imulq %rdi, %rax # encoding: [0x48,0x0f,0xaf,0xc7]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NO-NDD-LABEL: highmask_i64_mask32_extra_use:
+; NO-NDD: # %bb.0:
+; NO-NDD-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
+; NO-NDD-NEXT: testq $-1048576, %rdi # encoding: [0x48,0xf7,0xc7,0x00,0x00,0xf0,0xff]
+; NO-NDD-NEXT: # imm = 0xFFF00000
+; NO-NDD-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
+; NO-NDD-NEXT: imulq %rdi, %rax # encoding: [0x48,0x0f,0xaf,0xc7]
+; NO-NDD-NEXT: retq # encoding: [0xc3]
;
; NDD-LABEL: highmask_i64_mask32_extra_use:
; NDD: # %bb.0:
@@ -700,13 +567,6 @@ define i32 @highmask_i64_mask8(i64 %val) {
; CHECK-NEXT: testq $-16, %rdi # encoding: [0x48,0xf7,0xc7,0xf0,0xff,0xff,0xff]
; CHECK-NEXT: setne %al # encoding: [0x0f,0x95,0xc0]
; CHECK-NEXT: retq # encoding: [0xc3]
-;
-; NDD-LABEL: highmask_i64_mask8:
-; NDD: # %bb.0:
-; NDD-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
-; NDD-NEXT: testq $-16, %rdi # encoding: [0x48,0xf7,0xc7,0xf0,0xff,0xff,0xff]
-; NDD-NEXT: setne %al # encoding: [0x0f,0x95,0xc0]
-; NDD-NEXT: retq # encoding: [0xc3]
%and = and i64 %val, -16
%cmp = icmp ne i64 %and, 0
%ret = zext i1 %cmp to i32
@@ -714,12 +574,12 @@ define i32 @highmask_i64_mask8(i64 %val) {
}
define i32 @lowmask_i64_mask64(i64 %val) {
-; CHECK-LABEL: lowmask_i64_mask64:
-; CHECK: # %bb.0:
-; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
-; CHECK-NEXT: shlq $16, %rdi # encoding: [0x48,0xc1,0xe7,0x10]
-; CHECK-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NO-NDD-LABEL: lowmask_i64_mask64:
+; NO-NDD: # %bb.0:
+; NO-NDD-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
+; NO-NDD-NEXT: shlq $16, %rdi # encoding: [0x48,0xc1,0xe7,0x10]
+; NO-NDD-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
+; NO-NDD-NEXT: retq # encoding: [0xc3]
;
; NDD-LABEL: lowmask_i64_mask64:
; NDD: # %bb.0:
@@ -734,14 +594,14 @@ define i32 @lowmask_i64_mask64(i64 %val) {
}
define i64 @lowmask_i64_mask64_extra_use(i64 %val) nounwind {
-; CHECK-LABEL: lowmask_i64_mask64_extra_use:
-; CHECK: # %bb.0:
-; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
-; CHECK-NEXT: movq %rdi, %rcx # encoding: [0x48,0x89,0xf9]
-; CHECK-NEXT: shlq $16, %rcx # encoding: [0x48,0xc1,0xe1,0x10]
-; CHECK-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
-; CHECK-NEXT: imulq %rdi, %rax # encoding: [0x48,0x0f,0xaf,0xc7]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NO-NDD-LABEL: lowmask_i64_mask64_extra_use:
+; NO-NDD: # %bb.0:
+; NO-NDD-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
+; NO-NDD-NEXT: movq %rdi, %rcx # encoding: [0x48,0x89,0xf9]
+; NO-NDD-NEXT: shlq $16, %rcx # encoding: [0x48,0xc1,0xe1,0x10]
+; NO-NDD-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
+; NO-NDD-NEXT: imulq %rdi, %rax # encoding: [0x48,0x0f,0xaf,0xc7]
+; NO-NDD-NEXT: retq # encoding: [0xc3]
;
; NDD-LABEL: lowmask_i64_mask64_extra_use:
; NDD: # %bb.0:
@@ -758,12 +618,12 @@ define i64 @lowmask_i64_mask64_extra_use(i64 %val) nounwind {
}
define i32 @lowmask_i64_mask32(i64 %val) {
-; CHECK-LABEL: lowmask_i64_mask32:
-; CHECK: # %bb.0:
-; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
-; CHECK-NEXT: shlq $44, %rdi # encoding: [0x48,0xc1,0xe7,0x2c]
-; CHECK-NEXT: setne %al # encoding: [0x0f,0x95,0xc0]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NO-NDD-LABEL: lowmask_i64_mask32:
+; NO-NDD: # %bb.0:
+; NO-NDD-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
+; NO-NDD-NEXT: shlq $44, %rdi # encoding: [0x48,0xc1,0xe7,0x2c]
+; NO-NDD-NEXT: setne %al # encoding: [0x0f,0x95,0xc0]
+; NO-NDD-NEXT: retq # encoding: [0xc3]
;
; NDD-LABEL: lowmask_i64_mask32:
; NDD: # %bb.0:
@@ -778,14 +638,14 @@ define i32 @lowmask_i64_mask32(i64 %val) {
}
define i64 @lowmask_i64_mask32_extra_use(i64 %val) nounwind {
-; CHECK-LABEL: lowmask_i64_mask32_extra_use:
-; CHECK: # %bb.0:
-; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
-; CHECK-NEXT: testl $1048575, %edi # encoding: [0xf7,0xc7,0xff,0xff,0x0f,0x00]
-; CHECK-NEXT: # imm = 0xFFFFF
-; CHECK-NEXT: setne %al # encoding: [0x0f,0x95,0xc0]
-; CHECK-NEXT: imulq %rdi, %rax # encoding: [0x48,0x0f,0xaf,0xc7]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NO-NDD-LABEL: lowmask_i64_mask32_extra_use:
+; NO-NDD: # %bb.0:
+; NO-NDD-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
+; NO-NDD-NEXT: testl $1048575, %edi # encoding: [0xf7,0xc7,0xff,0xff,0x0f,0x00]
+; NO-NDD-NEXT: # imm = 0xFFFFF
+; NO-NDD-NEXT: setne %al # encoding: [0x0f,0x95,0xc0]
+; NO-NDD-NEXT: imulq %rdi, %rax # encoding: [0x48,0x0f,0xaf,0xc7]
+; NO-NDD-NEXT: retq # encoding: [0xc3]
;
; NDD-LABEL: lowmask_i64_mask32_extra_use:
; NDD: # %bb.0:
@@ -809,13 +669,6 @@ define i32 @lowmask_i64_mask8(i64 %val) {
; CHECK-NEXT: testb $31, %dil # encoding: [0x40,0xf6,0xc7,0x1f]
; CHECK-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
; CHECK-NEXT: retq # encoding: [0xc3]
-;
-; NDD-LABEL: lowmask_i64_mask8:
-; NDD: # %bb.0:
-; NDD-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
-; NDD-NEXT: testb $31, %dil # encoding: [0x40,0xf6,0xc7,0x1f]
-; NDD-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
-; NDD-NEXT: retq # encoding: [0xc3]
%and = and i64 %val, 31
%cmp = icmp eq i64 %and, 0
%ret = zext i1 %cmp to i32
@@ -830,14 +683,6 @@ define i32 @highmask_i32_mask32(i32 %val) {
; CHECK-NEXT: # imm = 0xFFF00000
; CHECK-NEXT: setne %al # encoding: [0x0f,0x95,0xc0]
; CHECK-NEXT: retq # encoding: [0xc3]
-;
-; NDD-LABEL: highmask_i32_mask32:
-; NDD: # %bb.0:
-; NDD-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
-; NDD-NEXT: testl $-1048576, %edi # encoding: [0xf7,0xc7,0x00,0x00,0xf0,0xff]
-; NDD-NEXT: # imm = 0xFFF00000
-; NDD-NEXT: setne %al # encoding: [0x0f,0x95,0xc0]
-; NDD-NEXT: retq # encoding: [0xc3]
%and = and i32 %val, -1048576
%cmp = icmp ne i32 %and, 0
%ret = zext i1 %cmp to i32
@@ -851,13 +696,6 @@ define i32 @highmask_i32_mask8(i32 %val) {
; CHECK-NEXT: testl $-16, %edi # encoding: [0xf7,0xc7,0xf0,0xff,0xff,0xff]
; CHECK-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
; CHECK-NEXT: retq # encoding: [0xc3]
-;
-; NDD-LABEL: highmask_i32_mask8:
-; NDD: # %bb.0:
-; NDD-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
-; NDD-NEXT: testl $-16, %edi # encoding: [0xf7,0xc7,0xf0,0xff,0xff,0xff]
-; NDD-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
-; NDD-NEXT: retq # encoding: [0xc3]
%and = and i32 %val, -16
%cmp = icmp eq i32 %and, 0
%ret = zext i1 %cmp to i32
@@ -872,14 +710,6 @@ define i32 @lowmask_i32_mask32(i32 %val) {
; CHECK-NEXT: # imm = 0xFFFFF
; CHECK-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
; CHECK-NEXT: retq # encoding: [0xc3]
-;
-; NDD-LABEL: lowmask_i32_mask32:
-; NDD: # %bb.0:
-; NDD-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
-; NDD-NEXT: testl $1048575, %edi # encoding: [0xf7,0xc7,0xff,0xff,0x0f,0x00]
-; NDD-NEXT: # imm = 0xFFFFF
-; NDD-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
-; NDD-NEXT: retq # encoding: [0xc3]
%and = and i32 %val, 1048575
%cmp = icmp eq i32 %and, 0
%ret = zext i1 %cmp to i32
@@ -893,13 +723,6 @@ define i32 @lowmask_i32_mask8(i32 %val) {
; CHECK-NEXT: testb $31, %dil # encoding: [0x40,0xf6,0xc7,0x1f]
; CHECK-NEXT: setne %al # encoding: [0x0f,0x95,0xc0]
; CHECK-NEXT: retq # encoding: [0xc3]
-;
-; NDD-LABEL: lowmask_i32_mask8:
-; NDD: # %bb.0:
-; NDD-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
-; NDD-NEXT: testb $31, %dil # encoding: [0x40,0xf6,0xc7,0x1f]
-; NDD-NEXT: setne %al # encoding: [0x0f,0x95,0xc0]
-; NDD-NEXT: retq # encoding: [0xc3]
%and = and i32 %val, 31
%cmp = icmp ne i32 %and, 0
%ret = zext i1 %cmp to i32
@@ -907,12 +730,12 @@ define i32 @lowmask_i32_mask8(i32 %val) {
}
define i1 @shifted_mask64_testb(i64 %a) {
-; CHECK-LABEL: shifted_mask64_testb:
-; CHECK: # %bb.0:
-; CHECK-NEXT: shrq $50, %rdi # encoding: [0x48,0xc1,0xef,0x32]
-; CHECK-NEXT: testb %dil, %dil # encoding: [0x40,0x84,0xff]
-; CHECK-NEXT: setne %al # encoding: [0x0f,0x95,0xc0]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NO-NDD-LABEL: shifted_mask64_testb:
+; NO-NDD: # %bb.0:
+; NO-NDD-NEXT: shrq $50, %rdi # encoding: [0x48,0xc1,0xef,0x32]
+; NO-NDD-NEXT: testb %dil, %dil # encoding: [0x40,0x84,0xff]
+; NO-NDD-NEXT: setne %al # encoding: [0x0f,0x95,0xc0]
+; NO-NDD-NEXT: retq # encoding: [0xc3]
;
; NDD-LABEL: shifted_mask64_testb:
; NDD: # %bb.0:
@@ -926,12 +749,12 @@ define i1 @shifted_mask64_testb(i64 %a) {
}
define i1 @shifted_mask64_testw(i64 %a) {
-; CHECK-LABEL: shifted_mask64_testw:
-; CHECK: # %bb.0:
-; CHECK-NEXT: shrq $33, %rdi # encoding: [0x48,0xc1,0xef,0x21]
-; CHECK-NEXT: testw %di, %di # encoding: [0x66,0x85,0xff]
-; CHECK-NEXT: setne %al # encoding: [0x0f,0x95,0xc0]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NO-NDD-LABEL: shifted_mask64_testw:
+; NO-NDD: # %bb.0:
+; NO-NDD-NEXT: shrq $33, %rdi # encoding: [0x48,0xc1,0xef,0x21]
+; NO-NDD-NEXT: testw %di, %di # encoding: [0x66,0x85,0xff]
+; NO-NDD-NEXT: setne %al # encoding: [0x0f,0x95,0xc0]
+; NO-NDD-NEXT: retq # encoding: [0xc3]
;
; NDD-LABEL: shifted_mask64_testw:
; NDD: # %bb.0:
@@ -945,12 +768,12 @@ define i1 @shifted_mask64_testw(i64 %a) {
}
define i1 @shifted_mask64_testl(i64 %a) {
-; CHECK-LABEL: shifted_mask64_testl:
-; CHECK: # %bb.0:
-; CHECK-NEXT: shrq $7, %rdi # encoding: [0x48,0xc1,0xef,0x07]
-; CHECK-NEXT: testl %edi, %edi # encoding: [0x85,0xff]
-; CHECK-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NO-NDD-LABEL: shifted_mask64_testl:
+; NO-NDD: # %bb.0:
+; NO-NDD-NEXT: shrq $7, %rdi # encoding: [0x48,0xc1,0xef,0x07]
+; NO-NDD-NEXT: testl %edi, %edi # encoding: [0x85,0xff]
+; NO-NDD-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
+; NO-NDD-NEXT: retq # encoding: [0xc3]
;
; NDD-LABEL: shifted_mask64_testl:
; NDD: # %bb.0:
@@ -964,15 +787,15 @@ define i1 @shifted_mask64_testl(i64 %a) {
}
define i1 @shifted_mask64_extra_use_const(i64 %a) {
-; CHECK-LABEL: shifted_mask64_extra_use_const:
-; CHECK: # %bb.0:
-; CHECK-NEXT: movabsq $287104476244869120, %rcx # encoding: [0x48,0xb9,0x00,0x00,0x00,0x00,0x00,0x00,0xfc,0x03]
-; CHECK-NEXT: # imm = 0x3FC000000000000
-; CHECK-NEXT: testq %rcx, %rdi # encoding: [0x48,0x85,0xcf]
-; CHECK-NEXT: setne %al # encoding: [0x0f,0x95,0xc0]
-; CHECK-NEXT: movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A]
-; CHECK-NEXT: # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NO-NDD-LABEL: shifted_mask64_extra_use_const:
+; NO-NDD: # %bb.0:
+; NO-NDD-NEXT: movabsq $287104476244869120, %rcx # encoding: [0x48,0xb9,0x00,0x00,0x00,0x00,0x00,0x00,0xfc,0x03]
+; NO-NDD-NEXT: # imm = 0x3FC000000000000
+; NO-NDD-NEXT: testq %rcx, %rdi # encoding: [0x48,0x85,0xcf]
+; NO-NDD-NEXT: setne %al # encoding: [0x0f,0x95,0xc0]
+; NO-NDD-NEXT: movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A]
+; NO-NDD-NEXT: # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
+; NO-NDD-NEXT: retq # encoding: [0xc3]
;
; NDD-LABEL: shifted_mask64_extra_use_const:
; NDD: # %bb.0:
@@ -990,15 +813,15 @@ define i1 @shifted_mask64_extra_use_const(i64 %a) {
}
define i1 @shifted_mask64_extra_use_and(i64 %a) {
-; CHECK-LABEL: shifted_mask64_extra_use_and:
-; CHECK: # %bb.0:
-; CHECK-NEXT: movabsq $287104476244869120, %rcx # encoding: [0x48,0xb9,0x00,0x00,0x00,0x00,0x00,0x00,0xfc,0x03]
-; CHECK-NEXT: # imm = 0x3FC000000000000
-; CHECK-NEXT: andq %rdi, %rcx # encoding: [0x48,0x21,0xf9]
-; CHECK-NEXT: setne %al # encoding: [0x0f,0x95,0xc0]
-; CHECK-NEXT: movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A]
-; CHECK-NEXT: # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NO-NDD-LABEL: shifted_mask64_extra_use_and:
+; NO-NDD: # %bb.0:
+; NO-NDD-NEXT: movabsq $287104476244869120, %rcx # encoding: [0x48,0xb9,0x00,0x00,0x00,0x00,0x00,0x00,0xfc,0x03]
+; NO-NDD-NEXT: # imm = 0x3FC000000000000
+; NO-NDD-NEXT: andq %rdi, %rcx # encoding: [0x48,0x21,0xf9]
+; NO-NDD-NEXT: setne %al # encoding: [0x0f,0x95,0xc0]
+; NO-NDD-NEXT: movq %rcx, d64(%rip) # encoding: [0x48,0x89,0x0d,A,A,A,A]
+; NO-NDD-NEXT: # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
+; NO-NDD-NEXT: retq # encoding: [0xc3]
;
; NDD-LABEL: shifted_mask64_extra_use_and:
; NDD: # %bb.0:
@@ -1022,13 +845,6 @@ define i1 @shifted_mask32_testl_immediate(i64 %a) {
; CHECK-NEXT: # imm = 0x3FC0000
; CHECK-NEXT: setne %al # encoding: [0x0f,0x95,0xc0]
; CHECK-NEXT: retq # encoding: [0xc3]
-;
-; NDD-LABEL: shifted_mask32_testl_immediate:
-; NDD: # %bb.0:
-; NDD-NEXT: testl $66846720, %edi # encoding: [0xf7,0xc7,0x00,0x00,0xfc,0x03]
-; NDD-NEXT: # imm = 0x3FC0000
-; NDD-NEXT: setne %al # encoding: [0x0f,0x95,0xc0]
-; NDD-NEXT: retq # encoding: [0xc3]
%v0 = and i64 %a, 66846720 ; 0xff << 18
%v1 = icmp ne i64 %v0, 0
ret i1 %v1
@@ -1044,16 +860,6 @@ define i1 @shifted_mask32_extra_use_const(i64 %a) {
; CHECK-NEXT: # fixup A - offset: 3, value: d64-8, kind: reloc_riprel_4byte
; CHECK-NEXT: # imm = 0x3FC0000
; CHECK-NEXT: retq # encoding: [0xc3]
-;
-; NDD-LABEL: shifted_mask32_extra_use_const:
-; NDD: # %bb.0:
-; NDD-NEXT: testl $66846720, %edi # encoding: [0xf7,0xc7,0x00,0x00,0xfc,0x03]
-; NDD-NEXT: # imm = 0x3FC0000
-; NDD-NEXT: setne %al # encoding: [0x0f,0x95,0xc0]
-; NDD-NEXT: movq $66846720, d64(%rip) # encoding: [0x48,0xc7,0x05,A,A,A,A,0x00,0x00,0xfc,0x03]
-; NDD-NEXT: # fixup A - offset: 3, value: d64-8, kind: reloc_riprel_4byte
-; NDD-NEXT: # imm = 0x3FC0000
-; NDD-NEXT: retq # encoding: [0xc3]
%v0 = and i64 %a, 66846720 ; 0xff << 18
%v1 = icmp ne i64 %v0, 0
store i64 66846720, ptr @d64
@@ -1061,14 +867,14 @@ define i1 @shifted_mask32_extra_use_const(i64 %a) {
}
define i1 @shifted_mask32_extra_use_and(i64 %a) {
-; CHECK-LABEL: shifted_mask32_extra_use_and:
-; CHECK: # %bb.0:
-; CHECK-NEXT: andq $66846720, %rdi # encoding: [0x48,0x81,0xe7,0x00,0x00,0xfc,0x03]
-; CHECK-NEXT: # imm = 0x3FC0000
-; CHECK-NEXT: setne %al # encoding: [0x0f,0x95,0xc0]
-; CHECK-NEXT: movq %rdi, d64(%rip) # encoding: [0x48,0x89,0x3d,A,A,A,A]
-; CHECK-NEXT: # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NO-NDD-LABEL: shifted_mask32_extra_use_and:
+; NO-NDD: # %bb.0:
+; NO-NDD-NEXT: andq $66846720, %rdi # encoding: [0x48,0x81,0xe7,0x00,0x00,0xfc,0x03]
+; NO-NDD-NEXT: # imm = 0x3FC0000
+; NO-NDD-NEXT: setne %al # encoding: [0x0f,0x95,0xc0]
+; NO-NDD-NEXT: movq %rdi, d64(%rip) # encoding: [0x48,0x89,0x3d,A,A,A,A]
+; NO-NDD-NEXT: # fixup A - offset: 3, value: d64-4, kind: reloc_riprel_4byte
+; NO-NDD-NEXT: retq # encoding: [0xc3]
;
; NDD-LABEL: shifted_mask32_extra_use_and:
; NDD: # %bb.0:
@@ -1085,14 +891,14 @@ define i1 @shifted_mask32_extra_use_and(i64 %a) {
}
define { i64, i64 } @pr39968(i64, i64, i32) {
-; CHECK-LABEL: pr39968:
-; CHECK: # %bb.0:
-; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
-; CHECK-NEXT: testb $64, %dl # encoding: [0xf6,0xc2,0x40]
-; CHECK-NEXT: cmovneq %rdi, %rsi # encoding: [0x48,0x0f,0x45,0xf7]
-; CHECK-NEXT: cmovneq %rdi, %rax # encoding: [0x48,0x0f,0x45,0xc7]
-; CHECK-NEXT: movq %rsi, %rdx # encoding: [0x48,0x89,0xf2]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NO-NDD-LABEL: pr39968:
+; NO-NDD: # %bb.0:
+; NO-NDD-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
+; NO-NDD-NEXT: testb $64, %dl # encoding: [0xf6,0xc2,0x40]
+; NO-NDD-NEXT: cmovneq %rdi, %rsi # encoding: [0x48,0x0f,0x45,0xf7]
+; NO-NDD-NEXT: cmovneq %rdi, %rax # encoding: [0x48,0x0f,0x45,0xc7]
+; NO-NDD-NEXT: movq %rsi, %rdx # encoding: [0x48,0x89,0xf2]
+; NO-NDD-NEXT: retq # encoding: [0xc3]
;
; NDD-LABEL: pr39968:
; NDD: # %bb.0:
@@ -1124,18 +930,6 @@ define i32 @pr42189(i16 signext %c) {
; CHECK-NEXT: jmp g@PLT # TAILCALL
; CHECK-NEXT: # encoding: [0xeb,A]
; CHECK-NEXT: # fixup A - offset: 1, value: g@PLT-1, kind: FK_PCRel_1
-;
-; NDD-LABEL: pr42189:
-; NDD: # %bb.0: # %entry
-; NDD-NEXT: cmpl $32767, %edi # encoding: [0x81,0xff,0xff,0x7f,0x00,0x00]
-; NDD-NEXT: # imm = 0x7FFF
-; NDD-NEXT: jne f@PLT # TAILCALL
-; NDD-NEXT: # encoding: [0x75,A]
-; NDD-NEXT: # fixup A - offset: 1, value: f@PLT-1, kind: FK_PCRel_1
-; NDD-NEXT: # %bb.1: # %if.then
-; NDD-NEXT: jmp g@PLT # TAILCALL
-; NDD-NEXT: # encoding: [0xeb,A]
-; NDD-NEXT: # fixup A - offset: 1, value: g@PLT-1, kind: FK_PCRel_1
entry:
%cmp = icmp eq i16 %c, 32767
br i1 %cmp, label %if.then, label %if.end
@@ -1160,12 +954,12 @@ declare i32 @f()
; The store makes sure the chain result of the load is used, which used to
; prevent the post-isel peephole from catching this.
define i1 @fold_test_and_with_chain(ptr %x, ptr %y, i32 %z) {
-; CHECK-LABEL: fold_test_and_with_chain:
-; CHECK: # %bb.0:
-; CHECK-NEXT: testl %edx, (%rdi) # encoding: [0x85,0x17]
-; CHECK-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
-; CHECK-NEXT: movl %edx, (%rsi) # encoding: [0x89,0x16]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NO-NDD-LABEL: fold_test_and_with_chain:
+; NO-NDD: # %bb.0:
+; NO-NDD-NEXT: testl %edx, (%rdi) # encoding: [0x85,0x17]
+; NO-NDD-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
+; NO-NDD-NEXT: movl %edx, (%rsi) # encoding: [0x89,0x16]
+; NO-NDD-NEXT: retq # encoding: [0xc3]
;
; NDD-LABEL: fold_test_and_with_chain:
; NDD: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/combine-mul.ll b/llvm/test/CodeGen/X86/combine-mul.ll
index 8d2bb77a9e1a..5d7bf4a2c978 100644
--- a/llvm/test/CodeGen/X86/combine-mul.ll
+++ b/llvm/test/CodeGen/X86/combine-mul.ll
@@ -80,13 +80,13 @@ define <4 x i32> @combine_vec_mul_pow2b(<4 x i32> %x) {
define <4 x i64> @combine_vec_mul_pow2c(<4 x i64> %x) {
; SSE-LABEL: combine_vec_mul_pow2c:
; SSE: # %bb.0:
+; SSE-NEXT: movdqa %xmm0, %xmm2
+; SSE-NEXT: paddq %xmm0, %xmm2
+; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; SSE-NEXT: movdqa %xmm1, %xmm2
; SSE-NEXT: psllq $4, %xmm2
; SSE-NEXT: psllq $2, %xmm1
; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
-; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: paddq %xmm0, %xmm2
-; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_mul_pow2c:
@@ -399,14 +399,12 @@ define i64 @combine_mul_self_demandedbits(i64 %x) {
; SSE: # %bb.0:
; SSE-NEXT: movq %rdi, %rax
; SSE-NEXT: imulq %rdi, %rax
-; SSE-NEXT: andq $-3, %rax
; SSE-NEXT: retq
;
; AVX-LABEL: combine_mul_self_demandedbits:
; AVX: # %bb.0:
; AVX-NEXT: movq %rdi, %rax
; AVX-NEXT: imulq %rdi, %rax
-; AVX-NEXT: andq $-3, %rax
; AVX-NEXT: retq
%1 = mul i64 %x, %x
%2 = and i64 %1, -3
diff --git a/llvm/test/CodeGen/X86/combine-or-shuffle.ll b/llvm/test/CodeGen/X86/combine-or-shuffle.ll
new file mode 100644
index 000000000000..175d21a4f706
--- /dev/null
+++ b/llvm/test/CodeGen/X86/combine-or-shuffle.ll
@@ -0,0 +1,862 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse | FileCheck %s -check-prefixes=SSE,SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.1 | FileCheck %s -check-prefixes=SSE,SSE4
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx | FileCheck %s -check-prefixes=AVX,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 | FileCheck %s -check-prefixes=AVX,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=x86-64-v4 | FileCheck %s -check-prefixes=AVX,AVX512
+
+; Verify that each of the following test cases is folded into a single
+; instruction which performs a blend operation.
+
+define <2 x i64> @test1(<2 x i64> %a, <2 x i64> %b) {
+; SSE2-LABEL: test1:
+; SSE2: # %bb.0:
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; SSE2-NEXT: retq
+;
+; SSE4-LABEL: test1:
+; SSE4: # %bb.0:
+; SSE4-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; SSE4-NEXT: retq
+;
+; AVX-LABEL: test1:
+; AVX: # %bb.0:
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX-NEXT: retq
+ %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
+ %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 1>
+ %or = or <2 x i64> %shuf1, %shuf2
+ ret <2 x i64> %or
+}
+
+
+define <4 x i32> @test2(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: test2:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE2-NEXT: retq
+;
+; SSE4-LABEL: test2:
+; SSE4: # %bb.0:
+; SSE4-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; SSE4-NEXT: retq
+;
+; AVX-LABEL: test2:
+; AVX: # %bb.0:
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX-NEXT: retq
+ %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 3>
+ %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 1, i32 4, i32 4>
+ %or = or <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %or
+}
+
+
+define <2 x i64> @test3(<2 x i64> %a, <2 x i64> %b) {
+; SSE2-LABEL: test3:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE2-NEXT: retq
+;
+; SSE4-LABEL: test3:
+; SSE4: # %bb.0:
+; SSE4-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; SSE4-NEXT: retq
+;
+; AVX-LABEL: test3:
+; AVX: # %bb.0:
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX-NEXT: retq
+ %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 1>
+ %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
+ %or = or <2 x i64> %shuf1, %shuf2
+ ret <2 x i64> %or
+}
+
+
+define <4 x i32> @test4(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: test4:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE4-LABEL: test4:
+; SSE4: # %bb.0:
+; SSE4-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; SSE4-NEXT: retq
+;
+; AVX-LABEL: test4:
+; AVX: # %bb.0:
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT: retq
+ %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 4, i32 4, i32 4>
+ %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 1, i32 2, i32 3>
+ %or = or <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %or
+}
+
+
+define <4 x i32> @test5(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: test5:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; SSE2-NEXT: retq
+;
+; SSE4-LABEL: test5:
+; SSE4: # %bb.0:
+; SSE4-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; SSE4-NEXT: retq
+;
+; AVX-LABEL: test5:
+; AVX: # %bb.0:
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX-NEXT: retq
+ %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 1, i32 2, i32 3>
+ %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 4, i32 4, i32 4>
+ %or = or <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %or
+}
+
+
+define <4 x i32> @test6(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: test6:
+; SSE2: # %bb.0:
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; SSE2-NEXT: retq
+;
+; SSE4-LABEL: test6:
+; SSE4: # %bb.0:
+; SSE4-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; SSE4-NEXT: retq
+;
+; AVX-LABEL: test6:
+; AVX: # %bb.0:
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX-NEXT: retq
+ %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 1, i32 4, i32 4>
+ %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 3>
+ %or = or <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %or
+}
+
+
+define <4 x i32> @test7(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: test7:
+; SSE2: # %bb.0:
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; SSE2-NEXT: retq
+;
+; SSE4-LABEL: test7:
+; SSE4: # %bb.0:
+; SSE4-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; SSE4-NEXT: retq
+;
+; AVX-LABEL: test7:
+; AVX: # %bb.0:
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX-NEXT: retq
+ %and1 = and <4 x i32> %a, <i32 -1, i32 -1, i32 0, i32 0>
+ %and2 = and <4 x i32> %b, <i32 0, i32 0, i32 -1, i32 -1>
+ %or = or <4 x i32> %and1, %and2
+ ret <4 x i32> %or
+}
+
+
+define <2 x i64> @test8(<2 x i64> %a, <2 x i64> %b) {
+; SSE2-LABEL: test8:
+; SSE2: # %bb.0:
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; SSE2-NEXT: retq
+;
+; SSE4-LABEL: test8:
+; SSE4: # %bb.0:
+; SSE4-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; SSE4-NEXT: retq
+;
+; AVX-LABEL: test8:
+; AVX: # %bb.0:
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX-NEXT: retq
+ %and1 = and <2 x i64> %a, <i64 -1, i64 0>
+ %and2 = and <2 x i64> %b, <i64 0, i64 -1>
+ %or = or <2 x i64> %and1, %and2
+ ret <2 x i64> %or
+}
+
+
+define <4 x i32> @test9(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: test9:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE2-NEXT: retq
+;
+; SSE4-LABEL: test9:
+; SSE4: # %bb.0:
+; SSE4-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; SSE4-NEXT: retq
+;
+; AVX-LABEL: test9:
+; AVX: # %bb.0:
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX-NEXT: retq
+ %and1 = and <4 x i32> %a, <i32 0, i32 0, i32 -1, i32 -1>
+ %and2 = and <4 x i32> %b, <i32 -1, i32 -1, i32 0, i32 0>
+ %or = or <4 x i32> %and1, %and2
+ ret <4 x i32> %or
+}
+
+
+define <2 x i64> @test10(<2 x i64> %a, <2 x i64> %b) {
+; SSE2-LABEL: test10:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE2-NEXT: retq
+;
+; SSE4-LABEL: test10:
+; SSE4: # %bb.0:
+; SSE4-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; SSE4-NEXT: retq
+;
+; AVX-LABEL: test10:
+; AVX: # %bb.0:
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX-NEXT: retq
+ %and1 = and <2 x i64> %a, <i64 0, i64 -1>
+ %and2 = and <2 x i64> %b, <i64 -1, i64 0>
+ %or = or <2 x i64> %and1, %and2
+ ret <2 x i64> %or
+}
+
+
+define <4 x i32> @test11(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: test11:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE4-LABEL: test11:
+; SSE4: # %bb.0:
+; SSE4-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; SSE4-NEXT: retq
+;
+; AVX-LABEL: test11:
+; AVX: # %bb.0:
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT: retq
+ %and1 = and <4 x i32> %a, <i32 -1, i32 0, i32 0, i32 0>
+ %and2 = and <4 x i32> %b, <i32 0, i32 -1, i32 -1, i32 -1>
+ %or = or <4 x i32> %and1, %and2
+ ret <4 x i32> %or
+}
+
+
+define <4 x i32> @test12(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: test12:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; SSE2-NEXT: retq
+;
+; SSE4-LABEL: test12:
+; SSE4: # %bb.0:
+; SSE4-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; SSE4-NEXT: retq
+;
+; AVX-LABEL: test12:
+; AVX: # %bb.0:
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX-NEXT: retq
+ %and1 = and <4 x i32> %a, <i32 0, i32 -1, i32 -1, i32 -1>
+ %and2 = and <4 x i32> %b, <i32 -1, i32 0, i32 0, i32 0>
+ %or = or <4 x i32> %and1, %and2
+ ret <4 x i32> %or
+}
+
+
+; Verify that the following test cases are folded into single shuffles.
+
+define <4 x i32> @test13(<4 x i32> %a, <4 x i32> %b) {
+; SSE-LABEL: test13:
+; SSE: # %bb.0:
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test13:
+; AVX: # %bb.0:
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
+; AVX-NEXT: retq
+ %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 1, i32 1, i32 4, i32 4>
+ %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 3>
+ %or = or <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %or
+}
+
+
+define <2 x i64> @test14(<2 x i64> %a, <2 x i64> %b) {
+; SSE-LABEL: test14:
+; SSE: # %bb.0:
+; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test14:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT: retq
+ %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
+ %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 0>
+ %or = or <2 x i64> %shuf1, %shuf2
+ ret <2 x i64> %or
+}
+
+
+define <4 x i32> @test15(<4 x i32> %a, <4 x i32> %b) {
+; SSE-LABEL: test15:
+; SSE: # %bb.0:
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,1],xmm0[2,1]
+; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test15:
+; AVX: # %bb.0:
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[2,1],xmm0[2,1]
+; AVX-NEXT: retq
+ %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 1>
+ %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 2, i32 1, i32 4, i32 4>
+ %or = or <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %or
+}
+
+
+define <2 x i64> @test16(<2 x i64> %a, <2 x i64> %b) {
+; SSE-LABEL: test16:
+; SSE: # %bb.0:
+; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test16:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX-NEXT: retq
+ %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 0>
+ %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
+ %or = or <2 x i64> %shuf1, %shuf2
+ ret <2 x i64> %or
+}
+
+
+; Verify that the dag-combiner does not fold an OR of two shuffles into a single
+; shuffle instruction when the shuffle indexes are not compatible.
+
+define <4 x i32> @test17(<4 x i32> %a, <4 x i32> %b) {
+; SSE-LABEL: test17:
+; SSE: # %bb.0:
+; SSE-NEXT: psllq $32, %xmm0
+; SSE-NEXT: movq {{.*#+}} xmm1 = xmm1[0],zero
+; SSE-NEXT: por %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test17:
+; AVX: # %bb.0:
+; AVX-NEXT: vpsllq $32, %xmm0, %xmm0
+; AVX-NEXT: vmovq {{.*#+}} xmm1 = xmm1[0],zero
+; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 0, i32 4, i32 2>
+ %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 1, i32 4, i32 4>
+ %or = or <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %or
+}
+
+
+define <4 x i32> @test18(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: test18:
+; SSE2: # %bb.0:
+; SSE2-NEXT: xorps %xmm2, %xmm2
+; SSE2-NEXT: xorps %xmm3, %xmm3
+; SSE2-NEXT: movss {{.*#+}} xmm3 = xmm0[0],xmm3[1,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,0,1,1]
+; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
+; SSE2-NEXT: orps %xmm0, %xmm2
+; SSE2-NEXT: movaps %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE4-LABEL: test18:
+; SSE4: # %bb.0:
+; SSE4-NEXT: pxor %xmm2, %xmm2
+; SSE4-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
+; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
+; SSE4-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3,4,5,6,7]
+; SSE4-NEXT: por %xmm0, %xmm2
+; SSE4-NEXT: movdqa %xmm2, %xmm0
+; SSE4-NEXT: retq
+;
+; AVX1-LABEL: test18:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,1,1]
+; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
+; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test18:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
+; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,1,1]
+; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
+; AVX2-NEXT: vorps %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test18:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
+; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: retq
+ %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 0, i32 4, i32 4>
+ %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 4, i32 4, i32 4>
+ %or = or <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %or
+}
+
+
+define <4 x i32> @test19(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: test19:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,2]
+; SSE2-NEXT: orps %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE4-LABEL: test19:
+; SSE4: # %bb.0:
+; SSE4-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,3]
+; SSE4-NEXT: pxor %xmm3, %xmm3
+; SSE4-NEXT: pblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
+; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,2,2]
+; SSE4-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5,6,7]
+; SSE4-NEXT: por %xmm2, %xmm0
+; SSE4-NEXT: retq
+;
+; AVX1-LABEL: test19:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,2,3]
+; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2]
+; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3]
+; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test19:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,2,3]
+; AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
+; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2]
+; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3]
+; AVX2-NEXT: vorps %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test19:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero,xmm0[12,13,14,15]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3],zero,zero,zero,zero,xmm1[8,9,10,11,8,9,10,11]
+; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: retq
+ %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 0, i32 4, i32 3>
+ %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 4, i32 2, i32 2>
+ %or = or <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %or
+}
+
+
+define <2 x i64> @test20(<2 x i64> %a, <2 x i64> %b) {
+; SSE-LABEL: test20:
+; SSE: # %bb.0:
+; SSE-NEXT: por %xmm1, %xmm0
+; SSE-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test20:
+; AVX: # %bb.0:
+; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX-NEXT: retq
+ %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
+ %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
+ %or = or <2 x i64> %shuf1, %shuf2
+ ret <2 x i64> %or
+}
+
+
+define <2 x i64> @test21(<2 x i64> %a, <2 x i64> %b) {
+; SSE-LABEL: test21:
+; SSE: # %bb.0:
+; SSE-NEXT: por %xmm1, %xmm0
+; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: test21:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test21:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test21:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vorpd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
+; AVX512-NEXT: retq
+ %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 0>
+ %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 0>
+ %or = or <2 x i64> %shuf1, %shuf2
+ ret <2 x i64> %or
+}
+
+
+; Verify that the dag-combiner keeps the correct domain for float/double vectors
+; that are bitcast to use the mask-or blend combine.
+
+define <2 x double> @test22(<2 x double> %a0, <2 x double> %a1) {
+; SSE2-LABEL: test22:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE2-NEXT: retq
+;
+; SSE4-LABEL: test22:
+; SSE4: # %bb.0:
+; SSE4-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; SSE4-NEXT: retq
+;
+; AVX-LABEL: test22:
+; AVX: # %bb.0:
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX-NEXT: retq
+ %bc1 = bitcast <2 x double> %a0 to <2 x i64>
+ %bc2 = bitcast <2 x double> %a1 to <2 x i64>
+ %and1 = and <2 x i64> %bc1, <i64 0, i64 -1>
+ %and2 = and <2 x i64> %bc2, <i64 -1, i64 0>
+ %or = or <2 x i64> %and1, %and2
+ %bc3 = bitcast <2 x i64> %or to <2 x double>
+ ret <2 x double> %bc3
+}
+
+
+define <4 x float> @test23(<4 x float> %a0, <4 x float> %a1) {
+; SSE2-LABEL: test23:
+; SSE2: # %bb.0:
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[0,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3]
+; SSE2-NEXT: retq
+;
+; SSE4-LABEL: test23:
+; SSE4: # %bb.0:
+; SSE4-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
+; SSE4-NEXT: retq
+;
+; AVX-LABEL: test23:
+; AVX: # %bb.0:
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
+; AVX-NEXT: retq
+ %bc1 = bitcast <4 x float> %a0 to <4 x i32>
+ %bc2 = bitcast <4 x float> %a1 to <4 x i32>
+ %and1 = and <4 x i32> %bc1, <i32 0, i32 -1, i32 -1, i32 0>
+ %and2 = and <4 x i32> %bc2, <i32 -1, i32 0, i32 0, i32 -1>
+ %or = or <4 x i32> %and1, %and2
+ %bc3 = bitcast <4 x i32> %or to <4 x float>
+ ret <4 x float> %bc3
+}
+
+
+define <4 x float> @test24(<4 x float> %a0, <4 x float> %a1) {
+; SSE2-LABEL: test24:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE2-NEXT: retq
+;
+; SSE4-LABEL: test24:
+; SSE4: # %bb.0:
+; SSE4-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; SSE4-NEXT: retq
+;
+; AVX-LABEL: test24:
+; AVX: # %bb.0:
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX-NEXT: retq
+ %bc1 = bitcast <4 x float> %a0 to <2 x i64>
+ %bc2 = bitcast <4 x float> %a1 to <2 x i64>
+ %and1 = and <2 x i64> %bc1, <i64 0, i64 -1>
+ %and2 = and <2 x i64> %bc2, <i64 -1, i64 0>
+ %or = or <2 x i64> %and1, %and2
+ %bc3 = bitcast <2 x i64> %or to <4 x float>
+ ret <4 x float> %bc3
+}
+
+
+define <4 x float> @test25(<4 x float> %a0) {
+; SSE2-LABEL: test25:
+; SSE2: # %bb.0:
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],mem[0,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3]
+; SSE2-NEXT: retq
+;
+; SSE4-LABEL: test25:
+; SSE4: # %bb.0:
+; SSE4-NEXT: blendps {{.*#+}} xmm0 = mem[0],xmm0[1,2],mem[3]
+; SSE4-NEXT: retq
+;
+; AVX1-LABEL: test25:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vblendps {{.*#+}} xmm0 = mem[0],xmm0[1,2],mem[3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test25:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test25:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
+; AVX512-NEXT: retq
+ %bc1 = bitcast <4 x float> %a0 to <4 x i32>
+ %bc2 = bitcast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0> to <4 x i32>
+ %and1 = and <4 x i32> %bc1, <i32 0, i32 -1, i32 -1, i32 0>
+ %and2 = and <4 x i32> %bc2, <i32 -1, i32 0, i32 0, i32 -1>
+ %or = or <4 x i32> %and1, %and2
+ %bc3 = bitcast <4 x i32> %or to <4 x float>
+ ret <4 x float> %bc3
+}
+
+
+; Verify that the DAGCombiner doesn't crash when checking whether a shuffle
+; with an illegal type has a legal mask. The method 'isShuffleMaskLegal' only
+; knows how to handle legal vector value types.
+define <4 x i8> @test_crash(<4 x i8> %a, <4 x i8> %b) {
+; SSE2-LABEL: test_crash:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movaps {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,65535,65535]
+; SSE2-NEXT: andps %xmm2, %xmm1
+; SSE2-NEXT: andnps %xmm0, %xmm2
+; SSE2-NEXT: orps %xmm1, %xmm2
+; SSE2-NEXT: movaps %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE4-LABEL: test_crash:
+; SSE4: # %bb.0:
+; SSE4-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7]
+; SSE4-NEXT: retq
+;
+; AVX-LABEL: test_crash:
+; AVX: # %bb.0:
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7]
+; AVX-NEXT: retq
+ %shuf1 = shufflevector <4 x i8> %a, <4 x i8> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 3>
+ %shuf2 = shufflevector <4 x i8> %b, <4 x i8> zeroinitializer, <4 x i32><i32 0, i32 1, i32 4, i32 4>
+ %or = or <4 x i8> %shuf1, %shuf2
+ ret <4 x i8> %or
+}
+
+; Verify that we can fold regardless of which operand is the zeroinitializer
+
+define <4 x i32> @test2b(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: test2b:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE2-NEXT: retq
+;
+; SSE4-LABEL: test2b:
+; SSE4: # %bb.0:
+; SSE4-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; SSE4-NEXT: retq
+;
+; AVX-LABEL: test2b:
+; AVX: # %bb.0:
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX-NEXT: retq
+ %shuf1 = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32><i32 0, i32 0, i32 6, i32 7>
+ %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 1, i32 4, i32 4>
+ %or = or <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %or
+}
+
+define <4 x i32> @test2c(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: test2c:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE2-NEXT: retq
+;
+; SSE4-LABEL: test2c:
+; SSE4: # %bb.0:
+; SSE4-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; SSE4-NEXT: retq
+;
+; AVX-LABEL: test2c:
+; AVX: # %bb.0:
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX-NEXT: retq
+ %shuf1 = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32><i32 0, i32 0, i32 6, i32 7>
+ %shuf2 = shufflevector <4 x i32> zeroinitializer, <4 x i32> %b, <4 x i32><i32 4, i32 5, i32 0, i32 0>
+ %or = or <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %or
+}
+
+
+define <4 x i32> @test2d(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: test2d:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE2-NEXT: retq
+;
+; SSE4-LABEL: test2d:
+; SSE4: # %bb.0:
+; SSE4-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; SSE4-NEXT: retq
+;
+; AVX-LABEL: test2d:
+; AVX: # %bb.0:
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX-NEXT: retq
+ %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 3>
+ %shuf2 = shufflevector <4 x i32> zeroinitializer, <4 x i32> %b, <4 x i32><i32 4, i32 5, i32 0, i32 0>
+ %or = or <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %or
+}
+
+; Make sure we can handle an undef where an index pointing to the zero vector should be
+
+define <4 x i32> @test2e(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: test2e:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE2-NEXT: retq
+;
+; SSE4-LABEL: test2e:
+; SSE4: # %bb.0:
+; SSE4-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; SSE4-NEXT: retq
+;
+; AVX-LABEL: test2e:
+; AVX: # %bb.0:
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX-NEXT: retq
+ %shuf1 = shufflevector <4 x i32> %a, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>, <4 x i32><i32 undef, i32 4, i32 2, i32 3>
+ %shuf2 = shufflevector <4 x i32> %b, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>, <4 x i32><i32 0, i32 1, i32 4, i32 4>
+ %or = or <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %or
+}
+
+define <4 x i32> @test2f(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: test2f:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE2-NEXT: retq
+;
+; SSE4-LABEL: test2f:
+; SSE4: # %bb.0:
+; SSE4-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; SSE4-NEXT: retq
+;
+; AVX-LABEL: test2f:
+; AVX: # %bb.0:
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX-NEXT: retq
+ %shuf1 = shufflevector <4 x i32> %a, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>, <4 x i32><i32 4, i32 4, i32 2, i32 3>
+ %shuf2 = shufflevector <4 x i32> %b, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>, <4 x i32><i32 undef, i32 1, i32 4, i32 4>
+ %or = or <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %or
+}
+
+; (or (and X, c1), c2) -> (and (or X, c2), c1|c2) iff (c1 & c2) != 0
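+; (per-bit sketch: where a bit of c2 is set, both sides are 1; where it is
+; clear, both sides reduce to the corresponding bit of X & c1)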
+
+define <2 x i64> @or_and_v2i64(<2 x i64> %a0) {
+; SSE-LABEL: or_and_v2i64:
+; SSE: # %bb.0:
+; SSE-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: or_and_v2i64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: or_and_v2i64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: or_and_v2i64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm1 = [7,7]
+; AVX512-NEXT: vpternlogq $200, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %xmm0
+; AVX512-NEXT: retq
+ %1 = and <2 x i64> %a0, <i64 7, i64 7>
+ %2 = or <2 x i64> %1, <i64 3, i64 3>
+ ret <2 x i64> %2
+}
+
+define <4 x i32> @or_and_v4i32(<4 x i32> %a0) {
+; SSE-LABEL: or_and_v4i32:
+; SSE: # %bb.0:
+; SSE-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: or_and_v4i32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: or_and_v4i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: or_and_v4i32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm1 = [3,3,15,7]
+; AVX512-NEXT: vpternlogd $200, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
+; AVX512-NEXT: retq
+ %1 = and <4 x i32> %a0, <i32 1, i32 3, i32 5, i32 7>
+ %2 = or <4 x i32> %1, <i32 3, i32 2, i32 15, i32 2>
+ ret <4 x i32> %2
+}
+
+; If all masked bits are going to be set, that's a constant fold.
+
+define <4 x i32> @or_and_v4i32_fold(<4 x i32> %a0) {
+; SSE-LABEL: or_and_v4i32_fold:
+; SSE: # %bb.0:
+; SSE-NEXT: movaps {{.*#+}} xmm0 = [3,3,3,3]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: or_and_v4i32_fold:
+; AVX: # %bb.0:
+; AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [3,3,3,3]
+; AVX-NEXT: retq
+ %1 = and <4 x i32> %a0, <i32 1, i32 1, i32 1, i32 1>
+ %2 = or <4 x i32> %1, <i32 3, i32 3, i32 3, i32 3>
+ ret <4 x i32> %2
+}
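
The or_and_* tests above depend on the identity named in the comment, (or (and X, c1), c2) == (and (or X, c2), c1|c2). A minimal standalone sketch of why it holds for the c1 = 7, c2 = 3 case from or_and_v2i64 (the function name is illustrative, not part of the patch): bits set in c2 come out as 1 on both sides, bits in c1 but not c2 pass X through, and bits outside c1|c2 are 0 on both sides.

define i64 @or_and_identity_sketch(i64 %x) {
  %lhs.and = and i64 %x, 7
  %lhs = or i64 %lhs.and, 3    ; (X & 7) | 3
  %rhs.or = or i64 %x, 3
  %rhs = and i64 %rhs.or, 7    ; (X | 3) & (7 | 3), and 7 | 3 == 7
  %same = xor i64 %lhs, %rhs   ; always 0: both forms agree on every bit
  ret i64 %same
}

The (c1 & c2) != 0 side condition from the comment restricts the fold to constants that actually overlap. Note also that when c1 | c2 == c2, the rewritten and-mask equals the or constant and the whole expression constant-folds to c2, which is exactly the or_and_v4i32_fold case above (and with 1, or with 3, result splat 3).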
diff --git a/llvm/test/CodeGen/X86/combine-or.ll b/llvm/test/CodeGen/X86/combine-or.ll
index 5188de543f72..3b2102f46a29 100644
--- a/llvm/test/CodeGen/X86/combine-or.ll
+++ b/llvm/test/CodeGen/X86/combine-or.ll
@@ -1,6 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 | FileCheck %s -check-prefixes=CHECK,CHECK-LV
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -early-live-intervals | FileCheck %s -check-prefixes=CHECK,CHECK-LIS
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 | FileCheck %s -check-prefixes=CHECK,SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -early-live-intervals | FileCheck %s -check-prefixes=CHECK,SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7-avx | FileCheck %s -check-prefixes=CHECK,AVX,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=x86-64-v3 | FileCheck %s -check-prefixes=CHECK,AVX,AVX2
define i32 @or_self(i32 %x) {
; CHECK-LABEL: or_self:
@@ -19,472 +21,34 @@ define <4 x i32> @or_self_vec(<4 x i32> %x) {
ret <4 x i32> %or
}
-; Verify that each of the following test cases is folded into a single
-; instruction which performs a blend operation.
-
-define <2 x i64> @test1(<2 x i64> %a, <2 x i64> %b) {
-; CHECK-LABEL: test1:
-; CHECK: # %bb.0:
-; CHECK-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; CHECK-NEXT: retq
- %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
- %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 1>
- %or = or <2 x i64> %shuf1, %shuf2
- ret <2 x i64> %or
-}
-
-
-define <4 x i32> @test2(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-LABEL: test2:
-; CHECK: # %bb.0:
-; CHECK-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; CHECK-NEXT: retq
- %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 3>
- %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 1, i32 4, i32 4>
- %or = or <4 x i32> %shuf1, %shuf2
- ret <4 x i32> %or
-}
-
-
-define <2 x i64> @test3(<2 x i64> %a, <2 x i64> %b) {
-; CHECK-LABEL: test3:
-; CHECK: # %bb.0:
-; CHECK-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; CHECK-NEXT: retq
- %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 1>
- %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
- %or = or <2 x i64> %shuf1, %shuf2
- ret <2 x i64> %or
-}
-
-
-define <4 x i32> @test4(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-LABEL: test4:
-; CHECK: # %bb.0:
-; CHECK-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; CHECK-NEXT: retq
- %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 4, i32 4, i32 4>
- %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 1, i32 2, i32 3>
- %or = or <4 x i32> %shuf1, %shuf2
- ret <4 x i32> %or
-}
-
-
-define <4 x i32> @test5(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-LABEL: test5:
-; CHECK: # %bb.0:
-; CHECK-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; CHECK-NEXT: retq
- %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 1, i32 2, i32 3>
- %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 4, i32 4, i32 4>
- %or = or <4 x i32> %shuf1, %shuf2
- ret <4 x i32> %or
-}
-
-
-define <4 x i32> @test6(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-LABEL: test6:
-; CHECK: # %bb.0:
-; CHECK-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; CHECK-NEXT: retq
- %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 1, i32 4, i32 4>
- %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 3>
- %or = or <4 x i32> %shuf1, %shuf2
- ret <4 x i32> %or
-}
-
-
-define <4 x i32> @test7(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-LABEL: test7:
-; CHECK: # %bb.0:
-; CHECK-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; CHECK-NEXT: retq
- %and1 = and <4 x i32> %a, <i32 -1, i32 -1, i32 0, i32 0>
- %and2 = and <4 x i32> %b, <i32 0, i32 0, i32 -1, i32 -1>
- %or = or <4 x i32> %and1, %and2
- ret <4 x i32> %or
-}
-
-
-define <2 x i64> @test8(<2 x i64> %a, <2 x i64> %b) {
-; CHECK-LABEL: test8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; CHECK-NEXT: retq
- %and1 = and <2 x i64> %a, <i64 -1, i64 0>
- %and2 = and <2 x i64> %b, <i64 0, i64 -1>
- %or = or <2 x i64> %and1, %and2
- ret <2 x i64> %or
-}
-
-
-define <4 x i32> @test9(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-LABEL: test9:
-; CHECK: # %bb.0:
-; CHECK-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; CHECK-NEXT: retq
- %and1 = and <4 x i32> %a, <i32 0, i32 0, i32 -1, i32 -1>
- %and2 = and <4 x i32> %b, <i32 -1, i32 -1, i32 0, i32 0>
- %or = or <4 x i32> %and1, %and2
- ret <4 x i32> %or
-}
-
-
-define <2 x i64> @test10(<2 x i64> %a, <2 x i64> %b) {
-; CHECK-LABEL: test10:
-; CHECK: # %bb.0:
-; CHECK-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; CHECK-NEXT: retq
- %and1 = and <2 x i64> %a, <i64 0, i64 -1>
- %and2 = and <2 x i64> %b, <i64 -1, i64 0>
- %or = or <2 x i64> %and1, %and2
- ret <2 x i64> %or
-}
-
-
-define <4 x i32> @test11(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-LABEL: test11:
-; CHECK: # %bb.0:
-; CHECK-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; CHECK-NEXT: retq
- %and1 = and <4 x i32> %a, <i32 -1, i32 0, i32 0, i32 0>
- %and2 = and <4 x i32> %b, <i32 0, i32 -1, i32 -1, i32 -1>
- %or = or <4 x i32> %and1, %and2
- ret <4 x i32> %or
-}
-
-
-define <4 x i32> @test12(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-LABEL: test12:
-; CHECK: # %bb.0:
-; CHECK-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; CHECK-NEXT: retq
- %and1 = and <4 x i32> %a, <i32 0, i32 -1, i32 -1, i32 -1>
- %and2 = and <4 x i32> %b, <i32 -1, i32 0, i32 0, i32 0>
- %or = or <4 x i32> %and1, %and2
- ret <4 x i32> %or
-}
-
-
-; Verify that the following test cases are folded into single shuffles.
-
-define <4 x i32> @test13(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-LABEL: test13:
-; CHECK: # %bb.0:
-; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
-; CHECK-NEXT: retq
- %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 1, i32 1, i32 4, i32 4>
- %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 3>
- %or = or <4 x i32> %shuf1, %shuf2
- ret <4 x i32> %or
-}
-
-
-define <2 x i64> @test14(<2 x i64> %a, <2 x i64> %b) {
-; CHECK-LABEL: test14:
-; CHECK: # %bb.0:
-; CHECK-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; CHECK-NEXT: retq
- %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
- %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 0>
- %or = or <2 x i64> %shuf1, %shuf2
- ret <2 x i64> %or
-}
-
-
-define <4 x i32> @test15(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-LABEL: test15:
-; CHECK: # %bb.0:
-; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,1],xmm0[2,1]
-; CHECK-NEXT: movaps %xmm1, %xmm0
-; CHECK-NEXT: retq
- %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 1>
- %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 2, i32 1, i32 4, i32 4>
- %or = or <4 x i32> %shuf1, %shuf2
- ret <4 x i32> %or
-}
-
-
-define <2 x i64> @test16(<2 x i64> %a, <2 x i64> %b) {
-; CHECK-LABEL: test16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; CHECK-NEXT: movaps %xmm1, %xmm0
-; CHECK-NEXT: retq
- %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 0>
- %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
- %or = or <2 x i64> %shuf1, %shuf2
- ret <2 x i64> %or
-}
-
-
-; Verify that the dag-combiner does not fold a OR of two shuffles into a single
-; shuffle instruction when the shuffle indexes are not compatible.
-
-define <4 x i32> @test17(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-LABEL: test17:
-; CHECK: # %bb.0:
-; CHECK-NEXT: psllq $32, %xmm0
-; CHECK-NEXT: movq {{.*#+}} xmm1 = xmm1[0],zero
-; CHECK-NEXT: por %xmm1, %xmm0
-; CHECK-NEXT: retq
- %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 0, i32 4, i32 2>
- %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 1, i32 4, i32 4>
- %or = or <4 x i32> %shuf1, %shuf2
- ret <4 x i32> %or
-}
-
-
-define <4 x i32> @test18(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-LABEL: test18:
-; CHECK: # %bb.0:
-; CHECK-NEXT: pxor %xmm2, %xmm2
-; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
-; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
-; CHECK-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3,4,5,6,7]
-; CHECK-NEXT: por %xmm0, %xmm2
-; CHECK-NEXT: movdqa %xmm2, %xmm0
-; CHECK-NEXT: retq
- %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 0, i32 4, i32 4>
- %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 4, i32 4, i32 4>
- %or = or <4 x i32> %shuf1, %shuf2
- ret <4 x i32> %or
-}
-
-
-define <4 x i32> @test19(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-LABEL: test19:
-; CHECK: # %bb.0:
-; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,3]
-; CHECK-NEXT: pxor %xmm3, %xmm3
-; CHECK-NEXT: pblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
-; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,2,2]
-; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5,6,7]
-; CHECK-NEXT: por %xmm2, %xmm0
-; CHECK-NEXT: retq
- %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 0, i32 4, i32 3>
- %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 4, i32 2, i32 2>
- %or = or <4 x i32> %shuf1, %shuf2
- ret <4 x i32> %or
-}
-
-
-define <2 x i64> @test20(<2 x i64> %a, <2 x i64> %b) {
-; CHECK-LABEL: test20:
-; CHECK: # %bb.0:
-; CHECK-NEXT: por %xmm1, %xmm0
-; CHECK-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
-; CHECK-NEXT: retq
- %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
- %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
- %or = or <2 x i64> %shuf1, %shuf2
- ret <2 x i64> %or
-}
-
-
-define <2 x i64> @test21(<2 x i64> %a, <2 x i64> %b) {
-; CHECK-LABEL: test21:
-; CHECK: # %bb.0:
-; CHECK-NEXT: por %xmm1, %xmm0
-; CHECK-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
-; CHECK-NEXT: retq
- %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 0>
- %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 0>
- %or = or <2 x i64> %shuf1, %shuf2
- ret <2 x i64> %or
-}
-
-
-; Verify that the dag-combiner keeps the correct domain for float/double vectors
-; bitcast to use the mask-or blend combine.
-
-define <2 x double> @test22(<2 x double> %a0, <2 x double> %a1) {
-; CHECK-LABEL: test22:
-; CHECK: # %bb.0:
-; CHECK-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; CHECK-NEXT: retq
- %bc1 = bitcast <2 x double> %a0 to <2 x i64>
- %bc2 = bitcast <2 x double> %a1 to <2 x i64>
- %and1 = and <2 x i64> %bc1, <i64 0, i64 -1>
- %and2 = and <2 x i64> %bc2, <i64 -1, i64 0>
- %or = or <2 x i64> %and1, %and2
- %bc3 = bitcast <2 x i64> %or to <2 x double>
- ret <2 x double> %bc3
-}
-
-
-define <4 x float> @test23(<4 x float> %a0, <4 x float> %a1) {
-; CHECK-LABEL: test23:
-; CHECK: # %bb.0:
-; CHECK-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
-; CHECK-NEXT: retq
- %bc1 = bitcast <4 x float> %a0 to <4 x i32>
- %bc2 = bitcast <4 x float> %a1 to <4 x i32>
- %and1 = and <4 x i32> %bc1, <i32 0, i32 -1, i32 -1, i32 0>
- %and2 = and <4 x i32> %bc2, <i32 -1, i32 0, i32 0, i32 -1>
- %or = or <4 x i32> %and1, %and2
- %bc3 = bitcast <4 x i32> %or to <4 x float>
- ret <4 x float> %bc3
-}
-
-
-define <4 x float> @test24(<4 x float> %a0, <4 x float> %a1) {
-; CHECK-LABEL: test24:
-; CHECK: # %bb.0:
-; CHECK-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; CHECK-NEXT: retq
- %bc1 = bitcast <4 x float> %a0 to <2 x i64>
- %bc2 = bitcast <4 x float> %a1 to <2 x i64>
- %and1 = and <2 x i64> %bc1, <i64 0, i64 -1>
- %and2 = and <2 x i64> %bc2, <i64 -1, i64 0>
- %or = or <2 x i64> %and1, %and2
- %bc3 = bitcast <2 x i64> %or to <4 x float>
- ret <4 x float> %bc3
-}
-
-
-define <4 x float> @test25(<4 x float> %a0) {
-; CHECK-LABEL: test25:
-; CHECK: # %bb.0:
-; CHECK-NEXT: blendps {{.*#+}} xmm0 = mem[0],xmm0[1,2],mem[3]
-; CHECK-NEXT: retq
- %bc1 = bitcast <4 x float> %a0 to <4 x i32>
- %bc2 = bitcast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0> to <4 x i32>
- %and1 = and <4 x i32> %bc1, <i32 0, i32 -1, i32 -1, i32 0>
- %and2 = and <4 x i32> %bc2, <i32 -1, i32 0, i32 0, i32 -1>
- %or = or <4 x i32> %and1, %and2
- %bc3 = bitcast <4 x i32> %or to <4 x float>
- ret <4 x float> %bc3
-}
-
-
-; Verify that the DAGCombiner doesn't crash in the attempt to check if a shuffle
-; with illegal type has a legal mask. Method 'isShuffleMaskLegal' only knows how to
-; handle legal vector value types.
-define <4 x i8> @test_crash(<4 x i8> %a, <4 x i8> %b) {
-; CHECK-LABEL: test_crash:
-; CHECK: # %bb.0:
-; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7]
-; CHECK-NEXT: retq
- %shuf1 = shufflevector <4 x i8> %a, <4 x i8> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 3>
- %shuf2 = shufflevector <4 x i8> %b, <4 x i8> zeroinitializer, <4 x i32><i32 0, i32 1, i32 4, i32 4>
- %or = or <4 x i8> %shuf1, %shuf2
- ret <4 x i8> %or
-}
-
-; Verify that we can fold regardless of which operand is the zeroinitializer
-
-define <4 x i32> @test2b(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-LABEL: test2b:
-; CHECK: # %bb.0:
-; CHECK-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; CHECK-NEXT: retq
- %shuf1 = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32><i32 0, i32 0, i32 6, i32 7>
- %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 1, i32 4, i32 4>
- %or = or <4 x i32> %shuf1, %shuf2
- ret <4 x i32> %or
-}
-
-define <4 x i32> @test2c(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-LABEL: test2c:
-; CHECK: # %bb.0:
-; CHECK-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; CHECK-NEXT: retq
- %shuf1 = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32><i32 0, i32 0, i32 6, i32 7>
- %shuf2 = shufflevector <4 x i32> zeroinitializer, <4 x i32> %b, <4 x i32><i32 4, i32 5, i32 0, i32 0>
- %or = or <4 x i32> %shuf1, %shuf2
- ret <4 x i32> %or
-}
-
-
-define <4 x i32> @test2d(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-LABEL: test2d:
-; CHECK: # %bb.0:
-; CHECK-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; CHECK-NEXT: retq
- %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 3>
- %shuf2 = shufflevector <4 x i32> zeroinitializer, <4 x i32> %b, <4 x i32><i32 4, i32 5, i32 0, i32 0>
- %or = or <4 x i32> %shuf1, %shuf2
- ret <4 x i32> %or
-}
-
-; Make sure we can have an undef where an index pointing to the zero vector should be
-
-define <4 x i32> @test2e(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-LABEL: test2e:
-; CHECK: # %bb.0:
-; CHECK-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; CHECK-NEXT: retq
- %shuf1 = shufflevector <4 x i32> %a, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>, <4 x i32><i32 undef, i32 4, i32 2, i32 3>
- %shuf2 = shufflevector <4 x i32> %b, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>, <4 x i32><i32 0, i32 1, i32 4, i32 4>
- %or = or <4 x i32> %shuf1, %shuf2
- ret <4 x i32> %or
-}
-
-define <4 x i32> @test2f(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-LABEL: test2f:
-; CHECK: # %bb.0:
-; CHECK-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; CHECK-NEXT: retq
- %shuf1 = shufflevector <4 x i32> %a, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>, <4 x i32><i32 4, i32 4, i32 2, i32 3>
- %shuf2 = shufflevector <4 x i32> %b, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>, <4 x i32><i32 undef, i32 1, i32 4, i32 4>
- %or = or <4 x i32> %shuf1, %shuf2
- ret <4 x i32> %or
-}
-
-; (or (and X, c1), c2) -> (and (or X, c2), c1|c2) iff (c1 & c2) != 0
-
-define <2 x i64> @or_and_v2i64(<2 x i64> %a0) {
-; CHECK-LABEL: or_and_v2i64:
-; CHECK: # %bb.0:
-; CHECK-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: retq
- %1 = and <2 x i64> %a0, <i64 7, i64 7>
- %2 = or <2 x i64> %1, <i64 3, i64 3>
- ret <2 x i64> %2
-}
-
-define <4 x i32> @or_and_v4i32(<4 x i32> %a0) {
-; CHECK-LABEL: or_and_v4i32:
-; CHECK: # %bb.0:
-; CHECK-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: retq
- %1 = and <4 x i32> %a0, <i32 1, i32 3, i32 5, i32 7>
- %2 = or <4 x i32> %1, <i32 3, i32 2, i32 15, i32 2>
- ret <4 x i32> %2
-}
-
-; If all masked bits are going to be set, that's a constant fold.
-
-define <4 x i32> @or_and_v4i32_fold(<4 x i32> %a0) {
-; CHECK-LABEL: or_and_v4i32_fold:
-; CHECK: # %bb.0:
-; CHECK-NEXT: movaps {{.*#+}} xmm0 = [3,3,3,3]
-; CHECK-NEXT: retq
- %1 = and <4 x i32> %a0, <i32 1, i32 1, i32 1, i32 1>
- %2 = or <4 x i32> %1, <i32 3, i32 3, i32 3, i32 3>
- ret <4 x i32> %2
-}
-
; fold (or x, c) -> c iff (x & ~c) == 0
define <2 x i64> @or_zext_v2i32(<2 x i32> %a0) {
-; CHECK-LABEL: or_zext_v2i32:
-; CHECK: # %bb.0:
-; CHECK-NEXT: movaps {{.*#+}} xmm0 = [4294967295,4294967295]
-; CHECK-NEXT: retq
+; SSE-LABEL: or_zext_v2i32:
+; SSE: # %bb.0:
+; SSE-NEXT: movaps {{.*#+}} xmm0 = [4294967295,4294967295]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: or_zext_v2i32:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovddup {{.*#+}} xmm0 = [4294967295,4294967295]
+; AVX-NEXT: # xmm0 = mem[0,0]
+; AVX-NEXT: retq
%1 = zext <2 x i32> %a0 to <2 x i64>
%2 = or <2 x i64> %1, <i64 4294967295, i64 4294967295>
ret <2 x i64> %2
}
define <4 x i32> @or_zext_v4i16(<4 x i16> %a0) {
-; CHECK-LABEL: or_zext_v4i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: movaps {{.*#+}} xmm0 = [65535,65535,65535,65535]
-; CHECK-NEXT: retq
+; SSE-LABEL: or_zext_v4i16:
+; SSE: # %bb.0:
+; SSE-NEXT: movaps {{.*#+}} xmm0 = [65535,65535,65535,65535]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: or_zext_v4i16:
+; AVX: # %bb.0:
+; AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [65535,65535,65535,65535]
+; AVX-NEXT: retq
%1 = zext <4 x i16> %a0 to <4 x i32>
%2 = or <4 x i32> %1, <i32 65535, i32 65535, i32 65535, i32 65535>
ret <4 x i32> %2
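; A standalone sketch (not part of this patch) of the knownbits argument
; behind the two or_zext tests above: after zext, every bit outside the
; original width is known zero, so (x & ~c) == 0 holds for the all-ones
; low mask and the or folds to the constant c, as the movaps/vbroadcastss
; of the constant in the checks shows.
define <2 x i64> @or_zext_knownbits_sketch(<2 x i32> %a0) {
  %x = zext <2 x i32> %a0 to <2 x i64>   ; high 32 bits of each lane: known 0
  %r = or <2 x i64> %x, <i64 4294967295, i64 4294967295>
  ret <2 x i64> %r                       ; folds to <4294967295, 4294967295>
}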
@@ -522,12 +86,19 @@ define i64 @or_and_and_commute_i64(i64 %x, i64 %y) {
}
define <4 x i32> @or_and_and_v4i32(<4 x i32> %x, <4 x i32> %y) {
-; CHECK-LABEL: or_and_and_v4i32:
-; CHECK: # %bb.0:
-; CHECK-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: orps %xmm1, %xmm0
-; CHECK-NEXT: retq
+; SSE-LABEL: or_and_and_v4i32:
+; SSE: # %bb.0:
+; SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT: orps %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: or_and_and_v4i32:
+; AVX: # %bb.0:
+; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%xy = or <4 x i32> %x, %y
%mx = and <4 x i32> %x, <i32 2, i32 4, i32 8, i32 16>
%mxy = and <4 x i32> %xy, <i32 1, i32 -1, i32 -5, i32 -25>
@@ -611,7 +182,106 @@ define i32 @or_and_multiuse_and_multiuse_i32(i32 %x, i32 %y) nounwind {
ret i32 %r
}
+define i64 @or_build_pair_not(i32 %a0, i32 %a1) {
+; CHECK-LABEL: or_build_pair_not:
+; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def $esi killed $esi def $rsi
+; CHECK-NEXT: shlq $32, %rsi
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: orq %rsi, %rax
+; CHECK-NEXT: notq %rax
+; CHECK-NEXT: retq
+ %n0 = xor i32 %a0, -1
+ %n1 = xor i32 %a1, -1
+ %x0 = zext i32 %n0 to i64
+ %x1 = zext i32 %n1 to i64
+ %hi = shl i64 %x1, 32
+ %r = or i64 %hi, %x0
+ ret i64 %r
+}
+
+define i64 @PR89533(<64 x i8> %a0) {
+; SSE-LABEL: PR89533:
+; SSE: # %bb.0:
+; SSE-NEXT: movdqa {{.*#+}} xmm4 = [95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95]
+; SSE-NEXT: pcmpeqb %xmm4, %xmm0
+; SSE-NEXT: pmovmskb %xmm0, %eax
+; SSE-NEXT: xorl $65535, %eax # imm = 0xFFFF
+; SSE-NEXT: pcmpeqb %xmm4, %xmm1
+; SSE-NEXT: pmovmskb %xmm1, %ecx
+; SSE-NEXT: notl %ecx
+; SSE-NEXT: shll $16, %ecx
+; SSE-NEXT: orl %eax, %ecx
+; SSE-NEXT: pcmpeqb %xmm4, %xmm2
+; SSE-NEXT: pmovmskb %xmm2, %edx
+; SSE-NEXT: xorl $65535, %edx # imm = 0xFFFF
+; SSE-NEXT: pcmpeqb %xmm4, %xmm3
+; SSE-NEXT: pmovmskb %xmm3, %eax
+; SSE-NEXT: notl %eax
+; SSE-NEXT: shll $16, %eax
+; SSE-NEXT: orl %edx, %eax
+; SSE-NEXT: shlq $32, %rax
+; SSE-NEXT: orq %rcx, %rax
+; SSE-NEXT: je .LBB11_2
+; SSE-NEXT: # %bb.1: # %cond.false
+; SSE-NEXT: rep bsfq %rax, %rax
+; SSE-NEXT: retq
+; SSE-NEXT: .LBB11_2: # %cond.end
+; SSE-NEXT: movl $64, %eax
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: PR89533:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95]
+; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm3
+; AVX1-NEXT: vpmovmskb %xmm3, %eax
+; AVX1-NEXT: xorl $65535, %eax # imm = 0xFFFF
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpmovmskb %xmm0, %ecx
+; AVX1-NEXT: notl %ecx
+; AVX1-NEXT: shll $16, %ecx
+; AVX1-NEXT: orl %eax, %ecx
+; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm0
+; AVX1-NEXT: vpmovmskb %xmm0, %edx
+; AVX1-NEXT: xorl $65535, %edx # imm = 0xFFFF
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpmovmskb %xmm0, %eax
+; AVX1-NEXT: notl %eax
+; AVX1-NEXT: shll $16, %eax
+; AVX1-NEXT: orl %edx, %eax
+; AVX1-NEXT: shlq $32, %rax
+; AVX1-NEXT: orq %rcx, %rax
+; AVX1-NEXT: je .LBB11_2
+; AVX1-NEXT: # %bb.1: # %cond.false
+; AVX1-NEXT: rep bsfq %rax, %rax
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+; AVX1-NEXT: .LBB11_2: # %cond.end
+; AVX1-NEXT: movl $64, %eax
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: PR89533:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95,95]
+; AVX2-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpmovmskb %ymm0, %eax
+; AVX2-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm0
+; AVX2-NEXT: vpmovmskb %ymm0, %ecx
+; AVX2-NEXT: shlq $32, %rcx
+; AVX2-NEXT: orq %rax, %rcx
+; AVX2-NEXT: notq %rcx
+; AVX2-NEXT: xorl %eax, %eax
+; AVX2-NEXT: tzcntq %rcx, %rax
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+ %cmp = icmp ne <64 x i8> %a0, <i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95, i8 95>
+ %mask = bitcast <64 x i1> %cmp to i64
+ %tz = tail call i64 @llvm.cttz.i64(i64 %mask, i1 false)
+ ret i64 %tz
+}
+
declare void @use_i32(i32)
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; CHECK-LIS: {{.*}}
-; CHECK-LV: {{.*}}
+
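The new or_build_pair_not test above relies on the two 32-bit halves occupying disjoint bits of the i64: inverting each half before packing equals packing first and inverting once, which is the single notq in the expected output. A minimal sketch of the rewritten form (function name is illustrative, not from the patch):

define i64 @pack_then_not_sketch(i32 %a0, i32 %a1) {
  %x0 = zext i32 %a0 to i64
  %x1 = zext i32 %a1 to i64
  %hi = shl i64 %x1, 32
  %pair = or i64 %hi, %x0    ; [a1 : a0], both halves fully defined
  %r = xor i64 %pair, -1     ; one 64-bit not instead of two 32-bit nots
  ret i64 %r
}
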
diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
index e12ca56023a7..33cc8e96f663 100644
--- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
+++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
@@ -182,101 +182,101 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: sarl $31, %eax
-; X86-NEXT: movl %edx, %edi
-; X86-NEXT: sarl $31, %edi
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: sarl $31, %ebx
; X86-NEXT: movl %eax, %esi
; X86-NEXT: xorl %ecx, %esi
; X86-NEXT: movl %esi, %ebp
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: xorl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, %ebx
+; X86-NEXT: movl %ecx, %edi
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: xorl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl %eax, %esi
; X86-NEXT: xorl {{[0-9]+}}(%esp), %esi
; X86-NEXT: subl %eax, %esi
-; X86-NEXT: movl %esi, (%esp) # 4-byte Spill
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: sbbl %eax, %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: sbbl %eax, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl %eax, %edi
+; X86-NEXT: movl %edi, (%esp) # 4-byte Spill
; X86-NEXT: sbbl %eax, %ebp
; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edi, %esi
+; X86-NEXT: movl %ebx, %esi
; X86-NEXT: xorl %edx, %esi
-; X86-NEXT: movl %edi, %edx
+; X86-NEXT: movl %ebx, %edx
; X86-NEXT: xorl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl %edi, %ebx
-; X86-NEXT: xorl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: movl %edi, %ebp
+; X86-NEXT: movl %ebx, %edi
+; X86-NEXT: xorl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %ebx, %ebp
; X86-NEXT: xorl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: subl %edi, %ebp
-; X86-NEXT: sbbl %edi, %ebx
-; X86-NEXT: sbbl %edi, %edx
-; X86-NEXT: sbbl %edi, %esi
-; X86-NEXT: xorl %eax, %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: subl %ebx, %ebp
+; X86-NEXT: sbbl %ebx, %edi
+; X86-NEXT: sbbl %ebx, %edx
+; X86-NEXT: sbbl %ebx, %esi
+; X86-NEXT: xorl %eax, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edi, %eax
; X86-NEXT: orl %esi, %eax
; X86-NEXT: movl %ebp, %ecx
; X86-NEXT: orl %edx, %ecx
-; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %edx, %ebx
; X86-NEXT: orl %eax, %ecx
; X86-NEXT: sete %cl
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: movl (%esp), %edx # 4-byte Reload
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: orl (%esp), %edx # 4-byte Folded Reload
; X86-NEXT: orl %eax, %edx
; X86-NEXT: sete %al
; X86-NEXT: orb %cl, %al
; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
; X86-NEXT: bsrl %esi, %edx
; X86-NEXT: xorl $31, %edx
-; X86-NEXT: bsrl %edi, %ecx
+; X86-NEXT: bsrl %ebx, %ecx
; X86-NEXT: xorl $31, %ecx
; X86-NEXT: addl $32, %ecx
; X86-NEXT: testl %esi, %esi
; X86-NEXT: cmovnel %edx, %ecx
-; X86-NEXT: bsrl %ebx, %edx
+; X86-NEXT: bsrl %edi, %edx
; X86-NEXT: xorl $31, %edx
; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: bsrl %ebp, %ebp
; X86-NEXT: xorl $31, %ebp
; X86-NEXT: addl $32, %ebp
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: testl %ebx, %ebx
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: testl %edi, %edi
; X86-NEXT: cmovnel %edx, %ebp
; X86-NEXT: addl $64, %ebp
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: orl %esi, %edi
+; X86-NEXT: orl %esi, %ebx
; X86-NEXT: cmovnel %ecx, %ebp
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: bsrl %edi, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: bsrl %ebx, %edx
; X86-NEXT: xorl $31, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl (%esp), %eax # 4-byte Reload
; X86-NEXT: bsrl %eax, %ecx
; X86-NEXT: xorl $31, %ecx
; X86-NEXT: addl $32, %ecx
-; X86-NEXT: testl %edi, %edi
+; X86-NEXT: testl %ebx, %ebx
; X86-NEXT: cmovnel %edx, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: bsrl %ebx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: bsrl %edi, %esi
; X86-NEXT: xorl $31, %esi
-; X86-NEXT: bsrl (%esp), %edx # 4-byte Folded Reload
+; X86-NEXT: bsrl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: xorl $31, %edx
; X86-NEXT: addl $32, %edx
-; X86-NEXT: testl %ebx, %ebx
+; X86-NEXT: testl %edi, %edi
; X86-NEXT: cmovnel %esi, %edx
; X86-NEXT: addl $64, %edx
; X86-NEXT: movl %eax, %esi
-; X86-NEXT: orl %edi, %esi
+; X86-NEXT: orl %ebx, %esi
; X86-NEXT: cmovnel %ecx, %edx
; X86-NEXT: xorl %esi, %esi
; X86-NEXT: subl %edx, %ebp
-; X86-NEXT: movl $0, %ebx
-; X86-NEXT: sbbl %ebx, %ebx
+; X86-NEXT: movl $0, %edi
+; X86-NEXT: sbbl %edi, %edi
; X86-NEXT: movl $0, %edx
; X86-NEXT: sbbl %edx, %edx
; X86-NEXT: movl $0, %eax
@@ -284,40 +284,40 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: movl $127, %ecx
; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: cmpl %ebp, %ecx
+; X86-NEXT: movl %edx, %ebp
; X86-NEXT: movl $0, %ecx
-; X86-NEXT: sbbl %ebx, %ecx
+; X86-NEXT: sbbl %edi, %ecx
; X86-NEXT: movl $0, %ecx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: sbbl %edx, %ecx
; X86-NEXT: movl $0, %ecx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: sbbl %eax, %ecx
; X86-NEXT: setb %cl
; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload
-; X86-NEXT: cmovnel %esi, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl %ebx, %edx
; X86-NEXT: cmovnel %esi, %edx
+; X86-NEXT: movl (%esp), %ebx # 4-byte Reload
+; X86-NEXT: cmovnel %esi, %ebx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: cmovnel %esi, %eax
-; X86-NEXT: cmovel (%esp), %esi # 4-byte Folded Reload
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X86-NEXT: jne .LBB4_8
; X86-NEXT: # %bb.1: # %_udiv-special-cases
-; X86-NEXT: movl %ebx, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: xorl $127, %ebx
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: movl %edi, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: xorl $127, %edi
+; X86-NEXT: orl %ebp, %edi
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: orl %ebx, %ecx
+; X86-NEXT: orl %edi, %ecx
; X86-NEXT: je .LBB4_8
; X86-NEXT: # %bb.2: # %udiv-bb1
-; X86-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
@@ -332,234 +332,233 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: shrb $3, %al
; X86-NEXT: andb $15, %al
; X86-NEXT: negb %al
-; X86-NEXT: movsbl %al, %ebx
-; X86-NEXT: movl 144(%esp,%ebx), %edx
-; X86-NEXT: movl 148(%esp,%ebx), %edi
+; X86-NEXT: movsbl %al, %edi
+; X86-NEXT: movl 144(%esp,%edi), %edx
+; X86-NEXT: movl 148(%esp,%edi), %esi
; X86-NEXT: movb %ch, %cl
-; X86-NEXT: shldl %cl, %edx, %edi
+; X86-NEXT: shldl %cl, %edx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: shll %cl, %edx
; X86-NEXT: notb %cl
-; X86-NEXT: movl 140(%esp,%ebx), %eax
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: shrl %esi
-; X86-NEXT: shrl %cl, %esi
-; X86-NEXT: orl %edx, %esi
-; X86-NEXT: movl %esi, %edx
-; X86-NEXT: movl 136(%esp,%ebx), %esi
+; X86-NEXT: movl 140(%esp,%edi), %eax
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: shrl %ebx
+; X86-NEXT: shrl %cl, %ebx
+; X86-NEXT: orl %edx, %ebx
+; X86-NEXT: movl 136(%esp,%edi), %edx
; X86-NEXT: movb %ch, %cl
-; X86-NEXT: shldl %cl, %esi, %eax
-; X86-NEXT: shll %cl, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: shll %cl, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: addl $1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: adcl $0, %ebx
-; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: adcl $0, %ebp
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: adcl $0, %edx
; X86-NEXT: jae .LBB4_3
; X86-NEXT: # %bb.6:
-; X86-NEXT: xorl %ebx, %ebx
-; X86-NEXT: xorl %esi, %esi
+; X86-NEXT: xorl %edi, %edi
+; X86-NEXT: xorl %ecx, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NEXT: jmp .LBB4_7
; X86-NEXT: .LBB4_3: # %udiv-preheader
-; X86-NEXT: movl (%esp), %esi # 4-byte Reload
-; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ecx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl (%esp), %esi # 4-byte Reload
+; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: movb %bl, %ch
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movb %dl, %ch
; X86-NEXT: andb $7, %ch
-; X86-NEXT: movb %bl, %cl
+; X86-NEXT: movb %dl, %cl
; X86-NEXT: shrb $3, %cl
; X86-NEXT: andb $15, %cl
-; X86-NEXT: movzbl %cl, %ebp
-; X86-NEXT: movl 100(%esp,%ebp), %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 96(%esp,%ebp), %ebx
-; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ebx, %edx
-; X86-NEXT: movb %ch, %cl
-; X86-NEXT: shrdl %cl, %esi, %edx
-; X86-NEXT: movl 88(%esp,%ebp), %ebp
-; X86-NEXT: movl 92(%esp,%eax), %esi
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: shrl %cl, %eax
-; X86-NEXT: notb %cl
-; X86-NEXT: addl %ebx, %ebx
-; X86-NEXT: shll %cl, %ebx
-; X86-NEXT: orl %eax, %ebx
+; X86-NEXT: movzbl %cl, %edx
+; X86-NEXT: movl 100(%esp,%edx), %esi
+; X86-NEXT: movl %esi, (%esp) # 4-byte Spill
+; X86-NEXT: movl 96(%esp,%edx), %edi
; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edi, %ebp
; X86-NEXT: movb %ch, %cl
-; X86-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: shrdl %cl, %esi, %ebp
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: addl $-1, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: adcl $-1, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: adcl $-1, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 88(%esp,%edx), %ebx
+; X86-NEXT: movl 92(%esp,%edx), %esi
+; X86-NEXT: movl %esi, %edx
+; X86-NEXT: shrl %cl, %edx
+; X86-NEXT: notb %cl
+; X86-NEXT: addl %edi, %edi
+; X86-NEXT: shll %cl, %edi
+; X86-NEXT: orl %edx, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movb %ch, %cl
+; X86-NEXT: shrl %cl, (%esp) # 4-byte Folded Spill
+; X86-NEXT: shrdl %cl, %esi, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: adcl $-1, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: xorl %esi, %esi
+; X86-NEXT: addl $-1, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: adcl $-1, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: adcl $-1, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: adcl $-1, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X86-NEXT: .p2align 4, 0x90
; X86-NEXT: .LBB4_4: # %udiv-do-while
; X86-NEXT: # =>This Inner Loop Header: Depth=1
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: shldl $1, %edx, %ebx
+; X86-NEXT: movl %ebp, %esi
+; X86-NEXT: shldl $1, %ebp, (%esp) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT: shldl $1, %ebp, %esi
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: shldl $1, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: shldl $1, %ebp, %edx
-; X86-NEXT: shldl $1, %edi, %ebp
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl $1, %eax, %edi
-; X86-NEXT: orl %esi, %edi
-; X86-NEXT: movl %edi, (%esp) # 4-byte Spill
-; X86-NEXT: movl %ecx, %edi
+; X86-NEXT: shldl $1, %edx, %ebp
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: shldl $1, %edi, %edx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: shldl $1, %ecx, %eax
-; X86-NEXT: orl %esi, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl $1, %ecx, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: orl %ebx, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: shldl $1, %eax, %ecx
-; X86-NEXT: orl %esi, %ecx
+; X86-NEXT: orl %ebx, %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: addl %eax, %eax
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: shldl $1, %ecx, %eax
+; X86-NEXT: orl %ebx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: cmpl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: addl %ecx, %ecx
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: cmpl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: sbbl %edx, %ecx
+; X86-NEXT: sbbl %ebp, %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: sbbl %esi, %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: sbbl %ebx, %ecx
+; X86-NEXT: sbbl (%esp), %ecx # 4-byte Folded Reload
; X86-NEXT: sarl $31, %ecx
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: andl $1, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ecx, %esi
-; X86-NEXT: andl %edi, %esi
+; X86-NEXT: movl %ecx, %ebx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
; X86-NEXT: movl %ecx, %edi
; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: subl %ecx, %ebp
-; X86-NEXT: sbbl %eax, %edx
+; X86-NEXT: subl %ecx, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: sbbl %edi, %edx
-; X86-NEXT: movl (%esp), %edi # 4-byte Reload
-; X86-NEXT: sbbl %esi, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: sbbl %eax, %ebp
+; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl %edi, %esi
+; X86-NEXT: movl %esi, %ebp
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: sbbl %ebx, (%esp) # 4-byte Folded Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: addl $-1, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: adcl $-1, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: adcl $-1, %ebx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: adcl $-1, %esi
-; X86-NEXT: adcl $-1, %ebx
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: orl %ebx, %eax
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edx, %edi
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: orl %esi, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: orl %esi, %edi
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %ebx, %ecx
+; X86-NEXT: orl %edi, %ecx
; X86-NEXT: jne .LBB4_4
; X86-NEXT: # %bb.5:
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; X86-NEXT: .LBB4_7: # %udiv-loop-exit
-; X86-NEXT: shldl $1, %edx, %edi
-; X86-NEXT: orl %esi, %edi
-; X86-NEXT: shldl $1, %eax, %edx
-; X86-NEXT: orl %esi, %edx
-; X86-NEXT: movl %esi, %ecx
+; X86-NEXT: shldl $1, %ebx, %edx
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: shldl $1, %eax, %ebx
+; X86-NEXT: orl %ecx, %ebx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: shldl $1, %esi, %eax
; X86-NEXT: orl %ecx, %eax
; X86-NEXT: addl %esi, %esi
-; X86-NEXT: orl %ebx, %esi
+; X86-NEXT: orl %edi, %esi
; X86-NEXT: .LBB4_8: # %udiv-end
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: xorl %ecx, %edi
; X86-NEXT: xorl %ecx, %edx
+; X86-NEXT: xorl %ecx, %ebx
; X86-NEXT: xorl %ecx, %eax
; X86-NEXT: xorl %ecx, %esi
; X86-NEXT: subl %ecx, %esi
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: sbbl %ecx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl %ecx, %ebx
; X86-NEXT: sbbl %ecx, %edx
-; X86-NEXT: sbbl %ecx, %edi
-; X86-NEXT: movl %edi, (%esp) # 4-byte Spill
-; X86-NEXT: movl %esi, (%ebp)
-; X86-NEXT: movl %eax, 4(%ebp)
-; X86-NEXT: movl %edx, 8(%ebp)
-; X86-NEXT: movl %edi, 12(%ebp)
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %esi, (%ecx)
+; X86-NEXT: movl %eax, 4(%ecx)
+; X86-NEXT: movl %ebx, 8(%ecx)
+; X86-NEXT: movl %edx, 12(%ecx)
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: movl %ebx, %edi
+; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: mull %edi
-; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %esi, %eax
-; X86-NEXT: mull %edi
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edx, %edi
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: adcl $0, %ecx
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: adcl $0, %ebx
; X86-NEXT: movl %esi, %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: mull %esi
-; X86-NEXT: addl %edi, %eax
+; X86-NEXT: addl %ecx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %ecx, %edx
-; X86-NEXT: movl %edx, %edi
+; X86-NEXT: adcl %ebx, %edx
+; X86-NEXT: movl %edx, %ebx
; X86-NEXT: setb %cl
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: movl %esi, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: addl %edi, %eax
+; X86-NEXT: addl %ebx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movzbl %cl, %eax
; X86-NEXT: adcl %eax, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl (%esp), %ecx # 4-byte Reload
; X86-NEXT: imull %eax, %ecx
-; X86-NEXT: mull %ebx
-; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT: imull {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: addl %edx, %ebx
-; X86-NEXT: addl %ecx, %ebx
+; X86-NEXT: mull %edi
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: imull {{[0-9]+}}(%esp), %edi
+; X86-NEXT: addl %edx, %edi
+; X86-NEXT: addl %ecx, %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: imull %esi, %ecx
@@ -568,12 +567,12 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: mull %edx
; X86-NEXT: addl %edx, %ebp
; X86-NEXT: addl %ecx, %ebp
-; X86-NEXT: addl (%esp), %eax # 4-byte Folded Reload
-; X86-NEXT: adcl %ebx, %ebp
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: adcl %edi, %ebp
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: subl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: subl (%esp), %edx # 4-byte Folded Reload
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
diff --git a/llvm/test/CodeGen/X86/fold-masked-merge.ll b/llvm/test/CodeGen/X86/fold-masked-merge.ll
index 135494ac25f8..b2614c5fe049 100644
--- a/llvm/test/CodeGen/X86/fold-masked-merge.ll
+++ b/llvm/test/CodeGen/X86/fold-masked-merge.ll
@@ -56,9 +56,7 @@ define i8 @masked_merge2(i8 %a0, i8 %a1, i8 %a2) {
; NOBMI-LABEL: masked_merge2:
; NOBMI: # %bb.0:
; NOBMI-NEXT: movl %esi, %eax
-; NOBMI-NEXT: xorb %sil, %al
-; NOBMI-NEXT: andb %dil, %al
-; NOBMI-NEXT: xorb %sil, %al
+; NOBMI-NEXT: # kill: def $al killed $al killed $eax
; NOBMI-NEXT: retq
;
; BMI-LABEL: masked_merge2:
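
The masked_merge2 change above is an instance of the xor form of a masked merge: ((x ^ y) & m) ^ y selects x's bits where m is set and y's bits elsewhere, i.e. (x & m) | (y & ~m). Judging from the old NOBMI output, the test feeds the same value in for both merge inputs, so x ^ y is zero and the merge collapses to that value, leaving only the movl. A sketch of the general pattern (names are illustrative):

define i8 @masked_merge_sketch(i8 %x, i8 %y, i8 %m) {
  %d = xor i8 %x, %y     ; bits where x and y differ
  %s = and i8 %d, %m     ; keep the differences selected by the mask
  %r = xor i8 %s, %y     ; == (x & m) | (y & ~m)
  ret i8 %r
}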
diff --git a/llvm/test/CodeGen/X86/freeze-binary.ll b/llvm/test/CodeGen/X86/freeze-binary.ll
index b212e9438e1b..c79da37988e4 100644
--- a/llvm/test/CodeGen/X86/freeze-binary.ll
+++ b/llvm/test/CodeGen/X86/freeze-binary.ll
@@ -459,8 +459,7 @@ define i32 @freeze_ashr(i32 %a0) nounwind {
; X64-LABEL: freeze_ashr:
; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
-; X64-NEXT: sarl $3, %eax
-; X64-NEXT: sarl $3, %eax
+; X64-NEXT: sarl $6, %eax
; X64-NEXT: retq
%x = ashr i32 %a0, 3
%y = freeze i32 %x
@@ -531,30 +530,12 @@ define i32 @freeze_ashr_outofrange(i32 %a0) nounwind {
define <8 x i16> @freeze_ashr_vec(<8 x i16> %a0) nounwind {
; X86-LABEL: freeze_ashr_vec:
; X86: # %bb.0:
-; X86-NEXT: movdqa %xmm0, %xmm2
-; X86-NEXT: psraw $1, %xmm2
-; X86-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0]
-; X86-NEXT: movdqa %xmm1, %xmm3
-; X86-NEXT: pandn %xmm2, %xmm3
-; X86-NEXT: psraw $3, %xmm0
-; X86-NEXT: pand %xmm1, %xmm0
-; X86-NEXT: por %xmm3, %xmm0
-; X86-NEXT: movdqa %xmm0, %xmm2
-; X86-NEXT: psraw $3, %xmm2
-; X86-NEXT: psraw $1, %xmm0
-; X86-NEXT: pand %xmm1, %xmm0
-; X86-NEXT: pandn %xmm2, %xmm1
-; X86-NEXT: por %xmm1, %xmm0
+; X86-NEXT: psraw $4, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: freeze_ashr_vec:
; X64: # %bb.0:
-; X64-NEXT: vpsraw $1, %xmm0, %xmm1
-; X64-NEXT: vpsraw $3, %xmm0, %xmm0
-; X64-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; X64-NEXT: vpsraw $3, %xmm0, %xmm1
-; X64-NEXT: vpsraw $1, %xmm0, %xmm0
-; X64-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; X64-NEXT: vpsraw $4, %xmm0, %xmm0
; X64-NEXT: retq
%x = ashr <8 x i16> %a0, <i16 3, i16 1, i16 3, i16 1, i16 3, i16 1, i16 3, i16 1>
%y = freeze <8 x i16> %x
@@ -592,8 +573,7 @@ define i32 @freeze_lshr(i32 %a0) nounwind {
; X64-LABEL: freeze_lshr:
; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
-; X64-NEXT: shrl $2, %eax
-; X64-NEXT: shrl %eax
+; X64-NEXT: shrl $3, %eax
; X64-NEXT: retq
%x = lshr i32 %a0, 2
%y = freeze i32 %x
@@ -664,30 +644,12 @@ define i32 @freeze_lshr_outofrange(i32 %a0) nounwind {
define <8 x i16> @freeze_lshr_vec(<8 x i16> %a0) nounwind {
; X86-LABEL: freeze_lshr_vec:
; X86: # %bb.0:
-; X86-NEXT: movdqa %xmm0, %xmm2
-; X86-NEXT: psrlw $1, %xmm2
-; X86-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0]
-; X86-NEXT: movdqa %xmm1, %xmm3
-; X86-NEXT: pandn %xmm2, %xmm3
-; X86-NEXT: psrlw $2, %xmm0
-; X86-NEXT: pand %xmm1, %xmm0
-; X86-NEXT: por %xmm3, %xmm0
-; X86-NEXT: movdqa %xmm0, %xmm2
-; X86-NEXT: psrlw $2, %xmm2
-; X86-NEXT: psrlw $1, %xmm0
-; X86-NEXT: pand %xmm1, %xmm0
-; X86-NEXT: pandn %xmm2, %xmm1
-; X86-NEXT: por %xmm1, %xmm0
+; X86-NEXT: psrlw $3, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: freeze_lshr_vec:
; X64: # %bb.0:
-; X64-NEXT: vpsrlw $1, %xmm0, %xmm1
-; X64-NEXT: vpsrlw $2, %xmm0, %xmm0
-; X64-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; X64-NEXT: vpsrlw $2, %xmm0, %xmm1
-; X64-NEXT: vpsrlw $1, %xmm0, %xmm0
-; X64-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; X64-NEXT: vpsrlw $3, %xmm0, %xmm0
; X64-NEXT: retq
%x = lshr <8 x i16> %a0, <i16 2, i16 1, i16 2, i16 1, i16 2, i16 1, i16 2, i16 1>
%y = freeze <8 x i16> %x
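
The freeze-binary changes above all follow from one fact: a shift by an in-range constant yields poison only when its input is poison, so the freeze can be hoisted to the shift's input, after which the two constant shifts merge (3+3 giving sarl $6, 2+1 giving shrl $3, and the splat vector cases giving a single psraw/psrlw). A sketch of the combined form (name is illustrative, not from the patch):

define i32 @freeze_ashr_sketch(i32 %a0) {
  %f = freeze i32 %a0    ; freeze hoisted from the shift result to its input
  %s = ashr i32 %f, 6    ; ashr(ashr(x, 3), 3) merged into one shift
  ret i32 %s
}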
diff --git a/llvm/test/CodeGen/X86/freeze-combine.ll b/llvm/test/CodeGen/X86/freeze-combine.ll
index b037a6d9a1b9..1cfb8627a4dd 100644
--- a/llvm/test/CodeGen/X86/freeze-combine.ll
+++ b/llvm/test/CodeGen/X86/freeze-combine.ll
@@ -3,9 +3,9 @@
define i32 @const() {
; CHECK-LABEL: name: const
; CHECK: bb.0 (%ir-block.0):
- ; CHECK: [[MOV32ri:%[0-9]+]]:gr32 = MOV32ri 1
- ; CHECK: $eax = COPY [[MOV32ri]]
- ; CHECK: RET 0, $eax
+ ; CHECK-NEXT: [[MOV32ri:%[0-9]+]]:gr32 = MOV32ri 1
+ ; CHECK-NEXT: $eax = COPY [[MOV32ri]]
+ ; CHECK-NEXT: RET 0, $eax
%y = freeze i32 1
ret i32 %y
}
@@ -13,11 +13,11 @@ define i32 @const() {
define i32 @fold(i32 %x) {
; CHECK-LABEL: name: fold
; CHECK: bb.0 (%ir-block.0):
- ; CHECK: liveins: $edi
- ; CHECK: [[COPY:%[0-9]+]]:gr32 = COPY $edi
- ; CHECK: [[COPY1:%[0-9]+]]:gr32 = COPY [[COPY]]
- ; CHECK: $eax = COPY [[COPY1]]
- ; CHECK: RET 0, $eax
+ ; CHECK-NEXT: liveins: $edi
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $edi
+ ; CHECK-NEXT: $eax = COPY [[COPY]]
+ ; CHECK-NEXT: RET 0, $eax
%y = freeze i32 %x
%z = freeze i32 %y
ret i32 %z
diff --git a/llvm/test/CodeGen/X86/freeze-vector.ll b/llvm/test/CodeGen/X86/freeze-vector.ll
index ee7f4aea02c0..fe240286462e 100644
--- a/llvm/test/CodeGen/X86/freeze-vector.ll
+++ b/llvm/test/CodeGen/X86/freeze-vector.ll
@@ -672,3 +672,23 @@ define void @pr59677(i32 %x, ptr %out) nounwind {
ret void
}
declare <4 x float> @llvm.sin.v4f32(<4 x float>)
+
+; Test that we can eliminate freeze by changing the BUILD_VECTOR to a splat
+; zero vector.
+define void @freeze_buildvector_not_simple_type(ptr %dst) nounwind {
+; X86-LABEL: freeze_buildvector_not_simple_type:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movb $0, 4(%eax)
+; X86-NEXT: movl $0, (%eax)
+; X86-NEXT: retl
+;
+; X64-LABEL: freeze_buildvector_not_simple_type:
+; X64: # %bb.0:
+; X64-NEXT: movb $0, 4(%rdi)
+; X64-NEXT: movl $0, (%rdi)
+; X64-NEXT: retq
+ %i0 = freeze <5 x i8> <i8 poison, i8 0, i8 0, i8 undef, i8 0>
+ store <5 x i8> %i0, ptr %dst
+ ret void
+}
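
The new freeze_buildvector_not_simple_type test documents the lane choice: freeze has to pick some defined value for the poison/undef lanes, and picking 0 turns the whole <5 x i8> into a zero splat that stores as a movl plus movb. A sketch of the folded form (name is illustrative, not from the patch):

define void @freeze_zero_splat_sketch(ptr %dst) {
  ; all five lanes defined as 0, matching the movl $0 + movb $0 output
  store <5 x i8> zeroinitializer, ptr %dst
  ret void
}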
diff --git a/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll b/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll
index 0c341dc63a9e..afe0ebb9dcb4 100644
--- a/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll
+++ b/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll
@@ -522,17 +522,17 @@ declare <16 x i8> @llvm.fshl.v16i8(<16 x i8>, <16 x i8>, <16 x i8>)
define <16 x i8> @splatconstant_fshr_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; GFNISSE-LABEL: splatconstant_fshr_v16i8:
; GFNISSE: # %bb.0:
+; GFNISSE-NEXT: paddb %xmm0, %xmm0
; GFNISSE-NEXT: psrlw $7, %xmm1
; GFNISSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; GFNISSE-NEXT: paddb %xmm0, %xmm0
; GFNISSE-NEXT: por %xmm1, %xmm0
; GFNISSE-NEXT: retq
;
; GFNIAVX1OR2-LABEL: splatconstant_fshr_v16i8:
; GFNIAVX1OR2: # %bb.0:
+; GFNIAVX1OR2-NEXT: vpaddb %xmm0, %xmm0, %xmm0
; GFNIAVX1OR2-NEXT: vpsrlw $7, %xmm1, %xmm1
; GFNIAVX1OR2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; GFNIAVX1OR2-NEXT: vpaddb %xmm0, %xmm0, %xmm0
; GFNIAVX1OR2-NEXT: vpor %xmm1, %xmm0, %xmm0
; GFNIAVX1OR2-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/gfni-rotates.ll b/llvm/test/CodeGen/X86/gfni-rotates.ll
index 7ab8300b269a..96aff5b2af31 100644
--- a/llvm/test/CodeGen/X86/gfni-rotates.ll
+++ b/llvm/test/CodeGen/X86/gfni-rotates.ll
@@ -421,18 +421,18 @@ define <16 x i8> @splatconstant_rotr_v16i8(<16 x i8> %a) nounwind {
; GFNISSE-LABEL: splatconstant_rotr_v16i8:
; GFNISSE: # %bb.0:
; GFNISSE-NEXT: movdqa %xmm0, %xmm1
-; GFNISSE-NEXT: psrlw $7, %xmm1
-; GFNISSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; GFNISSE-NEXT: paddb %xmm0, %xmm0
+; GFNISSE-NEXT: paddb %xmm0, %xmm1
+; GFNISSE-NEXT: psrlw $7, %xmm0
+; GFNISSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; GFNISSE-NEXT: por %xmm1, %xmm0
; GFNISSE-NEXT: retq
;
; GFNIAVX1OR2-LABEL: splatconstant_rotr_v16i8:
; GFNIAVX1OR2: # %bb.0:
-; GFNIAVX1OR2-NEXT: vpsrlw $7, %xmm0, %xmm1
-; GFNIAVX1OR2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; GFNIAVX1OR2-NEXT: vpaddb %xmm0, %xmm0, %xmm0
-; GFNIAVX1OR2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; GFNIAVX1OR2-NEXT: vpaddb %xmm0, %xmm0, %xmm1
+; GFNIAVX1OR2-NEXT: vpsrlw $7, %xmm0, %xmm0
+; GFNIAVX1OR2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; GFNIAVX1OR2-NEXT: vpor %xmm0, %xmm1, %xmm0
; GFNIAVX1OR2-NEXT: retq
;
; GFNIAVX512-LABEL: splatconstant_rotr_v16i8:
diff --git a/llvm/test/CodeGen/X86/known-never-zero.ll b/llvm/test/CodeGen/X86/known-never-zero.ll
index 39d02f9112f4..2f780e3c6fe1 100644
--- a/llvm/test/CodeGen/X86/known-never-zero.ll
+++ b/llvm/test/CodeGen/X86/known-never-zero.ll
@@ -676,12 +676,13 @@ define i32 @rotr_known_nonzero(i32 %xx, i32 %y) {
; X64: # %bb.0:
; X64-NEXT: movl %esi, %ecx
; X64-NEXT: orl $256, %edi # imm = 0x100
+; X64-NEXT: movl %edi, %eax
; X64-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-NEXT: rorl %cl, %edi
+; X64-NEXT: rorl %cl, %eax
; X64-NEXT: testl %edi, %edi
; X64-NEXT: je .LBB22_1
; X64-NEXT: # %bb.2: # %cond.false
-; X64-NEXT: rep bsfl %edi, %eax
+; X64-NEXT: rep bsfl %eax, %eax
; X64-NEXT: retq
; X64-NEXT: .LBB22_1:
; X64-NEXT: movl $32, %eax
@@ -713,12 +714,13 @@ define i32 @rotr_maybe_zero(i32 %x, i32 %y) {
; X64-LABEL: rotr_maybe_zero:
; X64: # %bb.0:
; X64-NEXT: movl %esi, %ecx
+; X64-NEXT: movl %edi, %eax
; X64-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-NEXT: rorl %cl, %edi
+; X64-NEXT: rorl %cl, %eax
; X64-NEXT: testl %edi, %edi
; X64-NEXT: je .LBB23_1
; X64-NEXT: # %bb.2: # %cond.false
-; X64-NEXT: rep bsfl %edi, %eax
+; X64-NEXT: rep bsfl %eax, %eax
; X64-NEXT: retq
; X64-NEXT: .LBB23_1:
; X64-NEXT: movl $32, %eax
@@ -773,12 +775,13 @@ define i32 @rotr_with_fshr_maybe_zero(i32 %x, i32 %y) {
; X64-LABEL: rotr_with_fshr_maybe_zero:
; X64: # %bb.0:
; X64-NEXT: movl %esi, %ecx
+; X64-NEXT: movl %edi, %eax
; X64-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-NEXT: rorl %cl, %edi
+; X64-NEXT: rorl %cl, %eax
; X64-NEXT: testl %edi, %edi
; X64-NEXT: je .LBB25_1
; X64-NEXT: # %bb.2: # %cond.false
-; X64-NEXT: rep bsfl %edi, %eax
+; X64-NEXT: rep bsfl %eax, %eax
; X64-NEXT: retq
; X64-NEXT: .LBB25_1:
; X64-NEXT: movl $32, %eax
@@ -808,12 +811,13 @@ define i32 @rotl_known_nonzero(i32 %xx, i32 %y) {
; X64: # %bb.0:
; X64-NEXT: movl %esi, %ecx
; X64-NEXT: orl $256, %edi # imm = 0x100
+; X64-NEXT: movl %edi, %eax
; X64-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-NEXT: roll %cl, %edi
+; X64-NEXT: roll %cl, %eax
; X64-NEXT: testl %edi, %edi
; X64-NEXT: je .LBB26_1
; X64-NEXT: # %bb.2: # %cond.false
-; X64-NEXT: rep bsfl %edi, %eax
+; X64-NEXT: rep bsfl %eax, %eax
; X64-NEXT: retq
; X64-NEXT: .LBB26_1:
; X64-NEXT: movl $32, %eax
@@ -845,12 +849,13 @@ define i32 @rotl_maybe_zero(i32 %x, i32 %y) {
; X64-LABEL: rotl_maybe_zero:
; X64: # %bb.0:
; X64-NEXT: movl %esi, %ecx
+; X64-NEXT: movl %edi, %eax
; X64-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-NEXT: roll %cl, %edi
+; X64-NEXT: roll %cl, %eax
; X64-NEXT: testl %edi, %edi
; X64-NEXT: je .LBB27_1
; X64-NEXT: # %bb.2: # %cond.false
-; X64-NEXT: rep bsfl %edi, %eax
+; X64-NEXT: rep bsfl %eax, %eax
; X64-NEXT: retq
; X64-NEXT: .LBB27_1:
; X64-NEXT: movl $32, %eax
@@ -905,12 +910,13 @@ define i32 @rotl_with_fshl_maybe_zero(i32 %x, i32 %y) {
; X64-LABEL: rotl_with_fshl_maybe_zero:
; X64: # %bb.0:
; X64-NEXT: movl %esi, %ecx
+; X64-NEXT: movl %edi, %eax
; X64-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-NEXT: roll %cl, %edi
+; X64-NEXT: roll %cl, %eax
; X64-NEXT: testl %edi, %edi
; X64-NEXT: je .LBB29_1
; X64-NEXT: # %bb.2: # %cond.false
-; X64-NEXT: rep bsfl %edi, %eax
+; X64-NEXT: rep bsfl %eax, %eax
; X64-NEXT: retq
; X64-NEXT: .LBB29_1:
; X64-NEXT: movl $32, %eax
diff --git a/llvm/test/CodeGen/X86/memcmp-minsize-x32.ll b/llvm/test/CodeGen/X86/memcmp-minsize-x32.ll
index ae1320f8b086..200a8184d4bd 100644
--- a/llvm/test/CodeGen/X86/memcmp-minsize-x32.ll
+++ b/llvm/test/CodeGen/X86/memcmp-minsize-x32.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=cmov | FileCheck %s --check-prefix=X86 --check-prefix=X86-NOSSE
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE2
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=cmov | FileCheck %s --check-prefixes=X86,X86-NOSSE
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=X86,X86-SSE2
; This tests codegen time inlining/optimization of memcmp
; rdar://6480398
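(Aside, not part of the patch: FileCheck's --check-prefixes=A,B is the documented comma-separated shorthand for passing --check-prefix repeatedly, so this RUN-line rewrite, repeated across the memcmp-* tests below, is purely cosmetic. The two spellings are equivalent:)

; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=cmov | FileCheck %s --check-prefix=X86 --check-prefix=X86-NOSSE
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=cmov | FileCheck %s --check-prefixes=X86,X86-NOSSE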
diff --git a/llvm/test/CodeGen/X86/memcmp-minsize.ll b/llvm/test/CodeGen/X86/memcmp-minsize.ll
index 544d1c49f26b..9c20f3e0cdef 100644
--- a/llvm/test/CodeGen/X86/memcmp-minsize.ll
+++ b/llvm/test/CodeGen/X86/memcmp-minsize.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=X64,X64-SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX2
; This tests codegen time inlining/optimization of memcmp
; rdar://6480398
diff --git a/llvm/test/CodeGen/X86/memcmp-optsize-x32.ll b/llvm/test/CodeGen/X86/memcmp-optsize-x32.ll
index 762691151f4b..3db6ae8b76b2 100644
--- a/llvm/test/CodeGen/X86/memcmp-optsize-x32.ll
+++ b/llvm/test/CodeGen/X86/memcmp-optsize-x32.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=cmov | FileCheck %s --check-prefix=X86 --check-prefix=X86-NOSSE
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE2
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=cmov | FileCheck %s --check-prefixes=X86,X86-NOSSE
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=X86,X86-SSE2
; This tests codegen time inlining/optimization of memcmp
; rdar://6480398
diff --git a/llvm/test/CodeGen/X86/memcmp-optsize.ll b/llvm/test/CodeGen/X86/memcmp-optsize.ll
index c0c7b98d471c..edd61641ad2a 100644
--- a/llvm/test/CodeGen/X86/memcmp-optsize.ll
+++ b/llvm/test/CodeGen/X86/memcmp-optsize.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=X64,X64-SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX2
; This tests codegen time inlining/optimization of memcmp
; rdar://6480398
diff --git a/llvm/test/CodeGen/X86/memcmp-pgso-x32.ll b/llvm/test/CodeGen/X86/memcmp-pgso-x32.ll
index cb45fd3ebb90..1c301da26bea 100644
--- a/llvm/test/CodeGen/X86/memcmp-pgso-x32.ll
+++ b/llvm/test/CodeGen/X86/memcmp-pgso-x32.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=cmov | FileCheck %s --check-prefix=X86 --check-prefix=X86-NOSSE
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE2
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=cmov | FileCheck %s --check-prefixes=X86,X86-NOSSE
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=X86,X86-SSE2
; This tests codegen time inlining/optimization of memcmp
; rdar://6480398
diff --git a/llvm/test/CodeGen/X86/memcmp-pgso.ll b/llvm/test/CodeGen/X86/memcmp-pgso.ll
index 720344a22e43..1ee3317b9c96 100644
--- a/llvm/test/CodeGen/X86/memcmp-pgso.ll
+++ b/llvm/test/CodeGen/X86/memcmp-pgso.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=X64,X64-SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX2
; This tests codegen time inlining/optimization of memcmp
; rdar://6480398
diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll
index d3cced3233ea..5a6375e08bca 100644
--- a/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll
+++ b/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll
@@ -893,27 +893,26 @@ define <2 x i64> @vec128_i64_signed_reg_reg(<2 x i64> %a1, <2 x i64> %a2) nounwi
; SSE41-NEXT: pcmpeqd %xmm3, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE41-NEXT: pand %xmm5, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
-; SSE41-NEXT: por %xmm3, %xmm6
-; SSE41-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
-; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
-; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: psubq %xmm1, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
+; SSE41-NEXT: por %xmm3, %xmm0
+; SSE41-NEXT: pmovsxbq {{.*#+}} xmm3 = [1,1]
+; SSE41-NEXT: por %xmm0, %xmm3
+; SSE41-NEXT: movdqa %xmm2, %xmm4
+; SSE41-NEXT: psubq %xmm1, %xmm4
; SSE41-NEXT: psubq %xmm2, %xmm1
-; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1
+; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm1
; SSE41-NEXT: movapd %xmm1, %xmm0
; SSE41-NEXT: psrlq $1, %xmm0
; SSE41-NEXT: psrlq $33, %xmm1
-; SSE41-NEXT: pmuludq %xmm6, %xmm1
-; SSE41-NEXT: movdqa %xmm6, %xmm3
-; SSE41-NEXT: psrlq $32, %xmm3
-; SSE41-NEXT: pmuludq %xmm0, %xmm3
-; SSE41-NEXT: paddq %xmm1, %xmm3
-; SSE41-NEXT: psllq $32, %xmm3
-; SSE41-NEXT: pmuludq %xmm6, %xmm0
+; SSE41-NEXT: pmuludq %xmm3, %xmm1
+; SSE41-NEXT: movdqa %xmm3, %xmm4
+; SSE41-NEXT: psrlq $32, %xmm4
+; SSE41-NEXT: pmuludq %xmm0, %xmm4
+; SSE41-NEXT: paddq %xmm1, %xmm4
+; SSE41-NEXT: psllq $32, %xmm4
+; SSE41-NEXT: pmuludq %xmm3, %xmm0
; SSE41-NEXT: paddq %xmm2, %xmm0
-; SSE41-NEXT: paddq %xmm3, %xmm0
+; SSE41-NEXT: paddq %xmm4, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: vec128_i64_signed_reg_reg:
@@ -1077,27 +1076,26 @@ define <2 x i64> @vec128_i64_unsigned_reg_reg(<2 x i64> %a1, <2 x i64> %a2) noun
; SSE41-NEXT: pcmpeqd %xmm3, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE41-NEXT: pand %xmm5, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
-; SSE41-NEXT: por %xmm3, %xmm6
-; SSE41-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
-; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
-; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: psubq %xmm1, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
+; SSE41-NEXT: por %xmm3, %xmm0
+; SSE41-NEXT: pmovsxbq {{.*#+}} xmm3 = [1,1]
+; SSE41-NEXT: por %xmm0, %xmm3
+; SSE41-NEXT: movdqa %xmm2, %xmm4
+; SSE41-NEXT: psubq %xmm1, %xmm4
; SSE41-NEXT: psubq %xmm2, %xmm1
-; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1
+; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm1
; SSE41-NEXT: movapd %xmm1, %xmm0
; SSE41-NEXT: psrlq $1, %xmm0
; SSE41-NEXT: psrlq $33, %xmm1
-; SSE41-NEXT: pmuludq %xmm6, %xmm1
-; SSE41-NEXT: movdqa %xmm6, %xmm3
-; SSE41-NEXT: psrlq $32, %xmm3
-; SSE41-NEXT: pmuludq %xmm0, %xmm3
-; SSE41-NEXT: paddq %xmm1, %xmm3
-; SSE41-NEXT: psllq $32, %xmm3
-; SSE41-NEXT: pmuludq %xmm6, %xmm0
+; SSE41-NEXT: pmuludq %xmm3, %xmm1
+; SSE41-NEXT: movdqa %xmm3, %xmm4
+; SSE41-NEXT: psrlq $32, %xmm4
+; SSE41-NEXT: pmuludq %xmm0, %xmm4
+; SSE41-NEXT: paddq %xmm1, %xmm4
+; SSE41-NEXT: psllq $32, %xmm4
+; SSE41-NEXT: pmuludq %xmm3, %xmm0
; SSE41-NEXT: paddq %xmm2, %xmm0
-; SSE41-NEXT: paddq %xmm3, %xmm0
+; SSE41-NEXT: paddq %xmm4, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: vec128_i64_unsigned_reg_reg:
@@ -1993,14 +1991,14 @@ define <8 x i16> @vec128_i16_unsigned_reg_reg(<8 x i16> %a1, <8 x i16> %a2) noun
;
; AVX512VL-FALLBACK-LABEL: vec128_i16_unsigned_reg_reg:
; AVX512VL-FALLBACK: # %bb.0:
-; AVX512VL-FALLBACK-NEXT: vpminuw %xmm1, %xmm0, %xmm2
-; AVX512VL-FALLBACK-NEXT: vpmaxuw %xmm1, %xmm0, %xmm1
-; AVX512VL-FALLBACK-NEXT: vpsubw %xmm2, %xmm1, %xmm1
-; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm1, %xmm1
-; AVX512VL-FALLBACK-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm2
-; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %xmm2, %xmm2, %xmm2
-; AVX512VL-FALLBACK-NEXT: vpxor %xmm2, %xmm1, %xmm1
-; AVX512VL-FALLBACK-NEXT: vpsubw %xmm2, %xmm1, %xmm1
+; AVX512VL-FALLBACK-NEXT: vpmaxuw %xmm1, %xmm0, %xmm2
+; AVX512VL-FALLBACK-NEXT: vpminuw %xmm1, %xmm0, %xmm1
+; AVX512VL-FALLBACK-NEXT: vpsubw %xmm1, %xmm2, %xmm2
+; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm2, %xmm2
+; AVX512VL-FALLBACK-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm1
+; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %xmm1, %xmm1, %xmm1
+; AVX512VL-FALLBACK-NEXT: vpxor %xmm1, %xmm2, %xmm2
+; AVX512VL-FALLBACK-NEXT: vpsubw %xmm1, %xmm2, %xmm1
; AVX512VL-FALLBACK-NEXT: vpaddw %xmm0, %xmm1, %xmm0
; AVX512VL-FALLBACK-NEXT: retq
;
@@ -2786,14 +2784,14 @@ define <16 x i8> @vec128_i8_unsigned_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounw
;
; AVX512VL-FALLBACK-LABEL: vec128_i8_unsigned_reg_reg:
; AVX512VL-FALLBACK: # %bb.0:
-; AVX512VL-FALLBACK-NEXT: vpminub %xmm1, %xmm0, %xmm2
-; AVX512VL-FALLBACK-NEXT: vpmaxub %xmm1, %xmm0, %xmm1
-; AVX512VL-FALLBACK-NEXT: vpsubb %xmm2, %xmm1, %xmm1
-; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm1, %xmm1
-; AVX512VL-FALLBACK-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm2
-; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %xmm2, %xmm2, %xmm2
-; AVX512VL-FALLBACK-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm1
-; AVX512VL-FALLBACK-NEXT: vpsubb %xmm2, %xmm1, %xmm1
+; AVX512VL-FALLBACK-NEXT: vpmaxub %xmm1, %xmm0, %xmm2
+; AVX512VL-FALLBACK-NEXT: vpminub %xmm1, %xmm0, %xmm1
+; AVX512VL-FALLBACK-NEXT: vpsubb %xmm1, %xmm2, %xmm2
+; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm2, %xmm2
+; AVX512VL-FALLBACK-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm1
+; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %xmm1, %xmm1, %xmm1
+; AVX512VL-FALLBACK-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm2
+; AVX512VL-FALLBACK-NEXT: vpsubb %xmm1, %xmm2, %xmm1
; AVX512VL-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX512VL-FALLBACK-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll
index cc08396ae8c7..e880a1acc9e8 100644
--- a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll
+++ b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll
@@ -1445,14 +1445,14 @@ define <16 x i16> @vec256_i16_unsigned_reg_reg(<16 x i16> %a1, <16 x i16> %a2) n
;
; AVX512VL-FALLBACK-LABEL: vec256_i16_unsigned_reg_reg:
; AVX512VL-FALLBACK: # %bb.0:
-; AVX512VL-FALLBACK-NEXT: vpminuw %ymm1, %ymm0, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpmaxuw %ymm1, %ymm0, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpsubw %ymm2, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %ymm2, %ymm2, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpxor %ymm2, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpsubw %ymm2, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpmaxuw %ymm1, %ymm0, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpminuw %ymm1, %ymm0, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpsubw %ymm1, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %ymm1, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpxor %ymm1, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpsubw %ymm1, %ymm2, %ymm1
; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm1, %ymm0
; AVX512VL-FALLBACK-NEXT: retq
;
@@ -2210,14 +2210,14 @@ define <32 x i8> @vec256_i8_unsigned_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounw
;
; AVX512VL-FALLBACK-LABEL: vec256_i8_unsigned_reg_reg:
; AVX512VL-FALLBACK: # %bb.0:
-; AVX512VL-FALLBACK-NEXT: vpminub %ymm1, %ymm0, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpmaxub %ymm1, %ymm0, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %ymm2, %ymm2, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpmaxub %ymm1, %ymm0, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpminub %ymm1, %ymm0, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %ymm1, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm2, %ymm1
; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX512VL-FALLBACK-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll
index 2fdf6ef224ca..366dad1612b4 100644
--- a/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll
+++ b/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll
@@ -684,22 +684,21 @@ define <64 x i8> @vec512_i8_signed_reg_reg(<64 x i8> %a1, <64 x i8> %a2) nounwin
; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm3, %ymm4
; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm5
; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4
-; AVX512F-NEXT: vpminsb %ymm2, %ymm3, %ymm5
-; AVX512F-NEXT: vpmaxsb %ymm2, %ymm3, %ymm2
-; AVX512F-NEXT: vpsubb %ymm5, %ymm2, %ymm2
; AVX512F-NEXT: vpminsb %ymm1, %ymm0, %ymm5
; AVX512F-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1
; AVX512F-NEXT: vpsubb %ymm5, %ymm1, %ymm1
-; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512F-NEXT: vpminsb %ymm2, %ymm3, %ymm5
+; AVX512F-NEXT: vpmaxsb %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT: vpsubb %ymm5, %ymm2, %ymm2
; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512F-NEXT: vpand %ymm5, %ymm2, %ymm2
+; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512F-NEXT: vpand %ymm5, %ymm1, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512F-NEXT: vpandq %zmm6, %zmm5, %zmm5
-; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2
-; AVX512F-NEXT: vpxor %xmm7, %xmm7, %xmm7
-; AVX512F-NEXT: vpsubb %ymm2, %ymm7, %ymm2
-; AVX512F-NEXT: vpand %ymm6, %ymm1, %ymm1
-; AVX512F-NEXT: vpsubb %ymm1, %ymm7, %ymm1
+; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX512F-NEXT: vpsubb %ymm2, %ymm6, %ymm2
+; AVX512F-NEXT: vpsubb %ymm1, %ymm6, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
; AVX512F-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm1
; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
@@ -715,22 +714,21 @@ define <64 x i8> @vec512_i8_signed_reg_reg(<64 x i8> %a1, <64 x i8> %a2) nounwin
; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm2, %ymm3, %ymm4
; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm5
; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4
-; AVX512VL-FALLBACK-NEXT: vpminsb %ymm2, %ymm3, %ymm5
-; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm2, %ymm3, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vpminsb %ymm1, %ymm0, %ymm5
; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1
; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpminsb %ymm2, %ymm3, %ymm5
+; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm2, %ymm3, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512VL-FALLBACK-NEXT: vpand %ymm5, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpand %ymm5, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5
-; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512VL-FALLBACK-NEXT: vpandq %zmm6, %zmm5, %zmm5
-; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm2, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpxor %xmm7, %xmm7, %xmm7
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm7, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm7, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm6, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm6, %ymm1
; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
; AVX512VL-FALLBACK-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm1
; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
@@ -772,20 +770,19 @@ define <64 x i8> @vec512_i8_unsigned_reg_reg(<64 x i8> %a1, <64 x i8> %a2) nounw
; AVX512F-NEXT: vpminub %ymm1, %ymm0, %ymm6
; AVX512F-NEXT: vpcmpeqb %ymm6, %ymm0, %ymm7
; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm5
-; AVX512F-NEXT: vpmaxub %ymm2, %ymm3, %ymm2
-; AVX512F-NEXT: vpsubb %ymm4, %ymm2, %ymm2
; AVX512F-NEXT: vpmaxub %ymm1, %ymm0, %ymm1
; AVX512F-NEXT: vpsubb %ymm6, %ymm1, %ymm1
-; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512F-NEXT: vpmaxub %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT: vpsubb %ymm4, %ymm2, %ymm2
; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512F-NEXT: vpand %ymm4, %ymm2, %ymm2
+; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512F-NEXT: vpand %ymm4, %ymm1, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm4
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512F-NEXT: vpandq %zmm6, %zmm4, %zmm4
-; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2
-; AVX512F-NEXT: vpxor %xmm7, %xmm7, %xmm7
-; AVX512F-NEXT: vpsubb %ymm2, %ymm7, %ymm2
-; AVX512F-NEXT: vpand %ymm6, %ymm1, %ymm1
-; AVX512F-NEXT: vpsubb %ymm1, %ymm7, %ymm1
+; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX512F-NEXT: vpsubb %ymm2, %ymm6, %ymm2
+; AVX512F-NEXT: vpsubb %ymm1, %ymm6, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
; AVX512F-NEXT: vpternlogq $184, %zmm4, %zmm5, %zmm1
; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
@@ -803,20 +800,19 @@ define <64 x i8> @vec512_i8_unsigned_reg_reg(<64 x i8> %a1, <64 x i8> %a2) nounw
; AVX512VL-FALLBACK-NEXT: vpminub %ymm1, %ymm0, %ymm6
; AVX512VL-FALLBACK-NEXT: vpcmpeqb %ymm6, %ymm0, %ymm7
; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm5
-; AVX512VL-FALLBACK-NEXT: vpmaxub %ymm2, %ymm3, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm4, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vpmaxub %ymm1, %ymm0, %ymm1
; AVX512VL-FALLBACK-NEXT: vpsubb %ymm6, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpmaxub %ymm2, %ymm3, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm4, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512VL-FALLBACK-NEXT: vpand %ymm4, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpand %ymm4, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm4
-; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512VL-FALLBACK-NEXT: vpandq %zmm6, %zmm4, %zmm4
-; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm2, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpxor %xmm7, %xmm7, %xmm7
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm7, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm7, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm6, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm6, %ymm1
; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
; AVX512VL-FALLBACK-NEXT: vpternlogq $184, %zmm4, %zmm5, %zmm1
; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
diff --git a/llvm/test/CodeGen/X86/named-vector-shuffle-reverse.ll b/llvm/test/CodeGen/X86/named-vector-shuffle-reverse.ll
index f0917be88744..2a5e834f0ac7 100644
--- a/llvm/test/CodeGen/X86/named-vector-shuffle-reverse.ll
+++ b/llvm/test/CodeGen/X86/named-vector-shuffle-reverse.ll
@@ -23,7 +23,7 @@ define <16 x i8> @reverse_v16i8(<16 x i8> %a) #0 {
; CHECK-NEXT: packuswb %xmm2, %xmm0
; CHECK-NEXT: retq
- %res = call <16 x i8> @llvm.experimental.vector.reverse.v16i8(<16 x i8> %a)
+ %res = call <16 x i8> @llvm.vector.reverse.v16i8(<16 x i8> %a)
ret <16 x i8> %res
}
@@ -34,7 +34,7 @@ define <8 x i16> @reverse_v8i16(<8 x i16> %a) #0 {
; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; CHECK-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; CHECK-NEXT: retq
- %res = call <8 x i16> @llvm.experimental.vector.reverse.v8i16(<8 x i16> %a)
+ %res = call <8 x i16> @llvm.vector.reverse.v8i16(<8 x i16> %a)
ret <8 x i16> %res
}
@@ -43,7 +43,7 @@ define <4 x i32> @reverse_v4i32(<4 x i32> %a) #0 {
; CHECK: # %bb.0:
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
; CHECK-NEXT: retq
- %res = call <4 x i32> @llvm.experimental.vector.reverse.v4i32(<4 x i32> %a)
+ %res = call <4 x i32> @llvm.vector.reverse.v4i32(<4 x i32> %a)
ret <4 x i32> %res
}
@@ -52,7 +52,7 @@ define <2 x i64> @reverse_v2i64(<2 x i64> %a) #0 {
; CHECK: # %bb.0:
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; CHECK-NEXT: retq
- %res = call <2 x i64> @llvm.experimental.vector.reverse.v2i64(<2 x i64> %a)
+ %res = call <2 x i64> @llvm.vector.reverse.v2i64(<2 x i64> %a)
ret <2 x i64> %res
}
@@ -61,7 +61,7 @@ define <4 x float> @reverse_v4f32(<4 x float> %a) #0 {
; CHECK: # %bb.0:
; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
; CHECK-NEXT: retq
- %res = call <4 x float> @llvm.experimental.vector.reverse.v4f32(<4 x float> %a)
+ %res = call <4 x float> @llvm.vector.reverse.v4f32(<4 x float> %a)
ret <4 x float> %res
}
@@ -70,7 +70,7 @@ define <2 x double> @reverse_v2f64(<2 x double> %a) #0 {
; CHECK: # %bb.0:
; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3,0,1]
; CHECK-NEXT: retq
- %res = call <2 x double> @llvm.experimental.vector.reverse.v2f64(<2 x double> %a)
+ %res = call <2 x double> @llvm.vector.reverse.v2f64(<2 x double> %a)
ret <2 x double> %res
}
@@ -83,7 +83,7 @@ define <2 x i8> @reverse_v2i8(<2 x i8> %a) #0 {
; CHECK-NEXT: psllw $8, %xmm0
; CHECK-NEXT: por %xmm1, %xmm0
; CHECK-NEXT: retq
- %res = call <2 x i8> @llvm.experimental.vector.reverse.v2i8(<2 x i8> %a)
+ %res = call <2 x i8> @llvm.vector.reverse.v2i8(<2 x i8> %a)
ret <2 x i8> %res
}
@@ -95,7 +95,7 @@ define <8 x i32> @reverse_v8i32(<8 x i32> %a) #0 {
; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,2,1,0]
; CHECK-NEXT: movdqa %xmm2, %xmm0
; CHECK-NEXT: retq
- %res = call <8 x i32> @llvm.experimental.vector.reverse.v8i32(<8 x i32> %a)
+ %res = call <8 x i32> @llvm.vector.reverse.v8i32(<8 x i32> %a)
ret <8 x i32> %res
}
@@ -115,20 +115,20 @@ define <16 x float> @reverse_v16f32(<16 x float> %a) #0 {
; CHECK-NEXT: movaps %xmm5, %xmm3
; CHECK-NEXT: retq
- %res = call <16 x float> @llvm.experimental.vector.reverse.v16f32(<16 x float> %a)
+ %res = call <16 x float> @llvm.vector.reverse.v16f32(<16 x float> %a)
ret <16 x float> %res
}
-declare <2 x i8> @llvm.experimental.vector.reverse.v2i8(<2 x i8>)
-declare <16 x i8> @llvm.experimental.vector.reverse.v16i8(<16 x i8>)
-declare <8 x i16> @llvm.experimental.vector.reverse.v8i16(<8 x i16>)
-declare <4 x i32> @llvm.experimental.vector.reverse.v4i32(<4 x i32>)
-declare <8 x i32> @llvm.experimental.vector.reverse.v8i32(<8 x i32>)
-declare <2 x i64> @llvm.experimental.vector.reverse.v2i64(<2 x i64>)
-declare <8 x half> @llvm.experimental.vector.reverse.v8f16(<8 x half>)
-declare <4 x float> @llvm.experimental.vector.reverse.v4f32(<4 x float>)
-declare <16 x float> @llvm.experimental.vector.reverse.v16f32(<16 x float>)
-declare <2 x double> @llvm.experimental.vector.reverse.v2f64(<2 x double>)
+declare <2 x i8> @llvm.vector.reverse.v2i8(<2 x i8>)
+declare <16 x i8> @llvm.vector.reverse.v16i8(<16 x i8>)
+declare <8 x i16> @llvm.vector.reverse.v8i16(<8 x i16>)
+declare <4 x i32> @llvm.vector.reverse.v4i32(<4 x i32>)
+declare <8 x i32> @llvm.vector.reverse.v8i32(<8 x i32>)
+declare <2 x i64> @llvm.vector.reverse.v2i64(<2 x i64>)
+declare <8 x half> @llvm.vector.reverse.v8f16(<8 x half>)
+declare <4 x float> @llvm.vector.reverse.v4f32(<4 x float>)
+declare <16 x float> @llvm.vector.reverse.v16f32(<16 x float>)
+declare <2 x double> @llvm.vector.reverse.v2f64(<2 x double>)
attributes #0 = { nounwind }
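(For reference, not part of the patch: this file's mechanical rename tracks the promotion of llvm.experimental.vector.reverse to llvm.vector.reverse; the semantics are unchanged, the result simply holds the input's lanes in reverse order. A minimal standalone example of the renamed intrinsic; the function name is illustrative only:)

define <4 x i32> @reverse_example(<4 x i32> %v) nounwind {
  ; for %v = <i32 0, i32 1, i32 2, i32 3> the result is <i32 3, i32 2, i32 1, i32 0>
  %r = call <4 x i32> @llvm.vector.reverse.v4i32(<4 x i32> %v)
  ret <4 x i32> %r
}
declare <4 x i32> @llvm.vector.reverse.v4i32(<4 x i32>)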
diff --git a/llvm/test/CodeGen/X86/pr38539.ll b/llvm/test/CodeGen/X86/pr38539.ll
index 04aff9b7d2e5..ace78b38d53e 100644
--- a/llvm/test/CodeGen/X86/pr38539.ll
+++ b/llvm/test/CodeGen/X86/pr38539.ll
@@ -22,7 +22,7 @@ define void @f() nounwind {
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $176, %esp
+; X86-NEXT: subl $160, %esp
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
@@ -47,55 +47,54 @@ define void @f() nounwind {
; X86-NEXT: testl %edi, %edi
; X86-NEXT: jne .LBB0_1
; X86-NEXT: # %bb.2: # %BB_udiv-special-cases
-; X86-NEXT: bsrl %esi, %ecx
-; X86-NEXT: xorl $31, %ecx
-; X86-NEXT: addl $32, %ecx
+; X86-NEXT: bsrl %esi, %eax
+; X86-NEXT: xorl $31, %eax
+; X86-NEXT: addl $32, %eax
; X86-NEXT: jmp .LBB0_3
; X86-NEXT: .LBB0_1:
-; X86-NEXT: bsrl %edi, %ecx
-; X86-NEXT: xorl $31, %ecx
+; X86-NEXT: bsrl %edi, %eax
+; X86-NEXT: xorl $31, %eax
; X86-NEXT: .LBB0_3: # %BB_udiv-special-cases
-; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: xorl %ecx, %ecx
; X86-NEXT: testl %edx, %edx
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: jne .LBB0_4
; X86-NEXT: # %bb.5: # %BB_udiv-special-cases
-; X86-NEXT: addl $64, %ecx
+; X86-NEXT: addl $64, %eax
; X86-NEXT: jmp .LBB0_6
; X86-NEXT: .LBB0_4:
-; X86-NEXT: bsrl %edx, %ecx
-; X86-NEXT: xorl $31, %ecx
-; X86-NEXT: addl $32, %ecx
+; X86-NEXT: bsrl %edx, %eax
+; X86-NEXT: xorl $31, %eax
+; X86-NEXT: addl $32, %eax
; X86-NEXT: .LBB0_6: # %BB_udiv-special-cases
-; X86-NEXT: subl $62, %ecx
+; X86-NEXT: subl $62, %eax
; X86-NEXT: movl $0, %ebx
; X86-NEXT: sbbl %ebx, %ebx
-; X86-NEXT: sbbl %eax, %eax
-; X86-NEXT: addl $-66, %ecx
+; X86-NEXT: sbbl %ecx, %ecx
+; X86-NEXT: addl $-66, %eax
; X86-NEXT: adcl $-1, %ebx
-; X86-NEXT: adcl $3, %eax
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: movb $1, %al
-; X86-NEXT: testb %al, %al
+; X86-NEXT: adcl $3, %ecx
+; X86-NEXT: movl %ecx, %esi
+; X86-NEXT: movb $1, %cl
+; X86-NEXT: testb %cl, %cl
; X86-NEXT: jne .LBB0_11
; X86-NEXT: # %bb.7: # %BB_udiv-special-cases
-; X86-NEXT: andl $3, %edi
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: xorl $65, %eax
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: orl %edi, %eax
-; X86-NEXT: orl %ebx, %eax
+; X86-NEXT: andl $3, %esi
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: xorl $65, %ecx
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %esi, %ecx
+; X86-NEXT: orl %ebx, %ecx
; X86-NEXT: je .LBB0_11
; X86-NEXT: # %bb.8: # %udiv-bb1
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: movl %eax, %ecx
; X86-NEXT: addl $1, %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl $0, %ebx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: adcl $0, %esi
-; X86-NEXT: andl $3, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: andl $3, %ebx
; X86-NEXT: movb $65, %cl
; X86-NEXT: subb %al, %cl
; X86-NEXT: movb %cl, %ch
@@ -112,29 +111,31 @@ define void @f() nounwind {
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 136(%esp,%eax), %edx
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 120(%esp,%eax), %edi
; X86-NEXT: movb %ch, %cl
-; X86-NEXT: shll %cl, %edx
+; X86-NEXT: shll %cl, %edi
; X86-NEXT: notb %cl
-; X86-NEXT: movl 128(%esp,%eax), %edi
-; X86-NEXT: movl 132(%esp,%eax), %esi
-; X86-NEXT: movl %esi, %eax
+; X86-NEXT: movl 112(%esp,%eax), %esi
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 116(%esp,%eax), %edx
+; X86-NEXT: movl %edx, %eax
; X86-NEXT: shrl %eax
; X86-NEXT: shrl %cl, %eax
; X86-NEXT: movb %ch, %cl
-; X86-NEXT: shldl %cl, %edi, %esi
+; X86-NEXT: shldl %cl, %esi, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shll %cl, %esi
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shll %cl, %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: orl %ebx, %ecx
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT: je .LBB0_11
; X86-NEXT: # %bb.9: # %udiv-preheader
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: andl $3, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %eax, %edi
+; X86-NEXT: andl $3, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
@@ -149,20 +150,20 @@ define void @f() nounwind {
; X86-NEXT: # kill: def $al killed $al killed $eax
; X86-NEXT: shrb $3, %al
; X86-NEXT: andb $15, %al
-; X86-NEXT: movzbl %al, %esi
-; X86-NEXT: movl 80(%esp,%esi), %edx
-; X86-NEXT: movl 84(%esp,%esi), %eax
-; X86-NEXT: movl %eax, %edi
+; X86-NEXT: movzbl %al, %eax
+; X86-NEXT: movl 64(%esp,%eax), %edi
+; X86-NEXT: movl 68(%esp,%eax), %edx
+; X86-NEXT: movl %edx, %esi
; X86-NEXT: movb %ch, %cl
-; X86-NEXT: shrl %cl, %edi
+; X86-NEXT: shrl %cl, %esi
; X86-NEXT: notb %cl
-; X86-NEXT: movl 88(%esp,%esi), %esi
-; X86-NEXT: addl %esi, %esi
-; X86-NEXT: shll %cl, %esi
-; X86-NEXT: orl %edi, %esi
+; X86-NEXT: movl 72(%esp,%eax), %ebx
+; X86-NEXT: addl %ebx, %ebx
+; X86-NEXT: shll %cl, %ebx
+; X86-NEXT: orl %esi, %ebx
; X86-NEXT: movb %ch, %cl
-; X86-NEXT: shrdl %cl, %eax, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shrdl %cl, %edx, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: addl $-1, %eax
@@ -179,63 +180,62 @@ define void @f() nounwind {
; X86-NEXT: .p2align 4, 0x90
; X86-NEXT: .LBB0_10: # %udiv-do-while
; X86-NEXT: # =>This Inner Loop Header: Depth=1
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl $1, %esi, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: shldl $1, %edx, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ebx, %edi
+; X86-NEXT: shldl $1, %ebx, %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: andl $2, %eax
-; X86-NEXT: shrl %eax
-; X86-NEXT: leal (%eax,%edx,2), %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: shldl $1, %edi, %ebx
+; X86-NEXT: shldl $1, %ebx, %edi
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: orl %esi, %ebx
+; X86-NEXT: movl %esi, %edx
+; X86-NEXT: andl $2, %edx
+; X86-NEXT: shrl %edx
+; X86-NEXT: leal (%edx,%ebx,2), %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl $1, %edx, %esi
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl $1, %eax, %edi
-; X86-NEXT: orl %esi, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl $1, %eax, %edx
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: addl %eax, %eax
; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl $3, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: cmpl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: sbbl %esi, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: sbbl %ecx, %ebx
-; X86-NEXT: shll $30, %ebx
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: sarl $30, %eax
-; X86-NEXT: sarl $31, %ebx
-; X86-NEXT: shrdl $1, %ebx, %eax
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: andl $1, %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl $3, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: cmpl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: sbbl %edi, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: sbbl %ecx, %esi
+; X86-NEXT: shll $30, %esi
+; X86-NEXT: movl %esi, %edx
+; X86-NEXT: sarl $30, %edx
+; X86-NEXT: sarl $31, %esi
+; X86-NEXT: shrdl $1, %esi, %edx
+; X86-NEXT: movl %edx, %eax
+; X86-NEXT: andl $1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl %esi, %eax
; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: movl %ebx, %edi
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT: subl %eax, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: sbbl %ebx, %esi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: sbbl %edi, %ecx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: subl %edx, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl %esi, %edi
+; X86-NEXT: movl %edi, %ebx
+; X86-NEXT: sbbl %eax, %ecx
; X86-NEXT: andl $3, %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: addl $-1, %eax
-; X86-NEXT: adcl $-1, %ebx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: adcl $3, %edi
-; X86-NEXT: andl $3, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: adcl $-1, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: adcl $3, %esi
+; X86-NEXT: andl $3, %esi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: orl %edi, %eax
-; X86-NEXT: orl %ebx, %eax
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %esi, %eax
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %edx, %eax
; X86-NEXT: jne .LBB0_10
; X86-NEXT: .LBB0_11: # %udiv-end
; X86-NEXT: cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
diff --git a/llvm/test/CodeGen/X86/pr62286.ll b/llvm/test/CodeGen/X86/pr62286.ll
index 782c84408f25..1b13cee628df 100644
--- a/llvm/test/CodeGen/X86/pr62286.ll
+++ b/llvm/test/CodeGen/X86/pr62286.ll
@@ -8,21 +8,20 @@ define i64 @PR62286(i32 %a) {
; SSE-LABEL: PR62286:
; SSE: # %bb.0:
; SSE-NEXT: movd %edi, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,1,0]
-; SSE-NEXT: paddd %xmm1, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,1,0]
+; SSE-NEXT: paddd %xmm0, %xmm0
; SSE-NEXT: pxor %xmm2, %xmm2
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE-NEXT: pxor %xmm3, %xmm3
-; SSE-NEXT: pcmpgtd %xmm1, %xmm3
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,0]
-; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE-NEXT: pcmpgtd %xmm0, %xmm2
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE-NEXT: paddq %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; SSE-NEXT: pxor %xmm3, %xmm3
+; SSE-NEXT: pcmpgtd %xmm0, %xmm3
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE-NEXT: pcmpgtd %xmm1, %xmm2
+; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE-NEXT: paddq %xmm0, %xmm1
-; SSE-NEXT: movq %xmm1, %rax
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT: paddq %xmm1, %xmm0
+; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: retq
;
; AVX1-LABEL: PR62286:
@@ -47,10 +46,10 @@ define i64 @PR62286(i32 %a) {
; AVX2-LABEL: PR62286:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovd %edi, %xmm0
-; AVX2-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
-; AVX2-NEXT: vpaddd %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpaddd %xmm0, %xmm0, %xmm1
+; AVX2-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
+; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/scheduler-backtracking.ll b/llvm/test/CodeGen/X86/scheduler-backtracking.ll
index a9f3e8b22fb6..785b97d8c240 100644
--- a/llvm/test/CodeGen/X86/scheduler-backtracking.ll
+++ b/llvm/test/CodeGen/X86/scheduler-backtracking.ll
@@ -14,7 +14,6 @@ define i256 @test1(i256 %a) nounwind {
; ILP: # %bb.0:
; ILP-NEXT: movq %rdi, %rax
; ILP-NEXT: leal (%rsi,%rsi), %ecx
-; ILP-NEXT: addb $3, %cl
; ILP-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; ILP-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; ILP-NEXT: movq $0, -{{[0-9]+}}(%rsp)
@@ -23,6 +22,7 @@ define i256 @test1(i256 %a) nounwind {
; ILP-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; ILP-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; ILP-NEXT: movq $0, -{{[0-9]+}}(%rsp)
+; ILP-NEXT: addb $3, %cl
; ILP-NEXT: movl %ecx, %edx
; ILP-NEXT: shrb $3, %dl
; ILP-NEXT: andb $7, %cl
diff --git a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll
index 31297a06f809..a1cabb433d87 100644
--- a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll
+++ b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll
@@ -563,18 +563,20 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
; X64-NEXT: subq $120, %rsp
; X64-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3]
-; X64-NEXT: psllq $32, %xmm3
+; X64-NEXT: pxor %xmm3, %xmm3
+; X64-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3]
; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3]
; X64-NEXT: psrad $31, %xmm2
; X64-NEXT: psrlq $31, %xmm3
; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-NEXT: movq %xmm0, %rbx
-; X64-NEXT: movq %rbx, %r13
-; X64-NEXT: sarq $63, %r13
-; X64-NEXT: shldq $31, %rbx, %r13
+; X64-NEXT: movq %xmm0, %rbp
+; X64-NEXT: movq %rbp, %r14
+; X64-NEXT: sarq $63, %r14
+; X64-NEXT: shldq $31, %rbp, %r14
+; X64-NEXT: movq %rbp, %r15
+; X64-NEXT: shlq $31, %r15
; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; X64-NEXT: pxor %xmm0, %xmm0
; X64-NEXT: pcmpgtd %xmm1, %xmm0
@@ -582,113 +584,113 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
; X64-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; X64-NEXT: movq %xmm1, %rdx
; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %rdx, %r15
-; X64-NEXT: sarq $63, %r15
-; X64-NEXT: movq %rbx, %r12
-; X64-NEXT: shlq $31, %r12
-; X64-NEXT: movq %r12, %rdi
-; X64-NEXT: movq %r13, %rsi
-; X64-NEXT: movq %r15, %rcx
+; X64-NEXT: movq %rdx, %rbx
+; X64-NEXT: sarq $63, %rbx
+; X64-NEXT: movq %r15, %rdi
+; X64-NEXT: movq %r14, %rsi
+; X64-NEXT: movq %rbx, %rcx
; X64-NEXT: callq __divti3@PLT
-; X64-NEXT: movq %rax, %rbp
+; X64-NEXT: movq %rax, %r13
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %rdx, %r14
+; X64-NEXT: movq %rdx, %r12
; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: subq $1, %rbp
-; X64-NEXT: sbbq $0, %r14
-; X64-NEXT: shrq $63, %rbx
-; X64-NEXT: xorl %r15d, %ebx
-; X64-NEXT: movq %r12, %rdi
-; X64-NEXT: movq %r13, %rsi
+; X64-NEXT: subq $1, %r13
+; X64-NEXT: sbbq $0, %r12
+; X64-NEXT: movq %r15, %rdi
+; X64-NEXT: movq %r14, %rsi
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; X64-NEXT: movq %r15, %rcx
+; X64-NEXT: movq %rbx, %rcx
; X64-NEXT: callq __modti3@PLT
; X64-NEXT: orq %rax, %rdx
; X64-NEXT: setne %al
+; X64-NEXT: shrq $63, %rbp
+; X64-NEXT: xorl %ebp, %ebx
; X64-NEXT: testb %bl, %al
-; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
-; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
+; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
+; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
; X64-NEXT: xorl %ecx, %ecx
; X64-NEXT: movl $4294967295, %edx # imm = 0xFFFFFFFF
-; X64-NEXT: cmpq %rdx, %rbp
-; X64-NEXT: movq %r14, %rax
+; X64-NEXT: cmpq %rdx, %r13
+; X64-NEXT: movq %r12, %rax
; X64-NEXT: sbbq $0, %rax
-; X64-NEXT: cmovgeq %rcx, %r14
-; X64-NEXT: cmovgeq %rdx, %rbp
+; X64-NEXT: cmovgeq %rdx, %r13
+; X64-NEXT: cmovgeq %rcx, %r12
; X64-NEXT: movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000
-; X64-NEXT: cmpq %rbp, %rcx
+; X64-NEXT: cmpq %r13, %rcx
; X64-NEXT: movq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
; X64-NEXT: movq $-1, %rax
-; X64-NEXT: sbbq %r14, %rax
-; X64-NEXT: cmovgeq %rcx, %rbp
-; X64-NEXT: movq %rbp, %xmm0
+; X64-NEXT: sbbq %r12, %rax
+; X64-NEXT: cmovgeq %rcx, %r13
+; X64-NEXT: movq %r13, %xmm0
; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; X64-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; X64-NEXT: # xmm0 = mem[2,3,2,3]
-; X64-NEXT: movq %xmm0, %rbx
-; X64-NEXT: movq %rbx, %r13
-; X64-NEXT: sarq $63, %r13
-; X64-NEXT: shldq $31, %rbx, %r13
+; X64-NEXT: movq %xmm0, %rbp
+; X64-NEXT: movq %rbp, %r14
+; X64-NEXT: sarq $63, %r14
+; X64-NEXT: shldq $31, %rbp, %r14
+; X64-NEXT: movq %rbp, %r15
+; X64-NEXT: shlq $31, %r15
; X64-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; X64-NEXT: # xmm0 = mem[2,3,2,3]
; X64-NEXT: movq %xmm0, %rdx
; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %rdx, %r15
-; X64-NEXT: sarq $63, %r15
-; X64-NEXT: movq %rbx, %r12
-; X64-NEXT: shlq $31, %r12
-; X64-NEXT: movq %r12, %rdi
-; X64-NEXT: movq %r13, %rsi
-; X64-NEXT: movq %r15, %rcx
+; X64-NEXT: movq %rdx, %rbx
+; X64-NEXT: sarq $63, %rbx
+; X64-NEXT: movq %r15, %rdi
+; X64-NEXT: movq %r14, %rsi
+; X64-NEXT: movq %rbx, %rcx
; X64-NEXT: callq __divti3@PLT
-; X64-NEXT: movq %rax, %rbp
+; X64-NEXT: movq %rax, %r13
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %rdx, %r14
+; X64-NEXT: movq %rdx, %r12
; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: subq $1, %rbp
-; X64-NEXT: sbbq $0, %r14
-; X64-NEXT: shrq $63, %rbx
-; X64-NEXT: xorl %r15d, %ebx
-; X64-NEXT: movq %r12, %rdi
-; X64-NEXT: movq %r13, %rsi
+; X64-NEXT: subq $1, %r13
+; X64-NEXT: sbbq $0, %r12
+; X64-NEXT: movq %r15, %rdi
+; X64-NEXT: movq %r14, %rsi
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; X64-NEXT: movq %r15, %rcx
+; X64-NEXT: movq %rbx, %rcx
; X64-NEXT: callq __modti3@PLT
; X64-NEXT: orq %rax, %rdx
; X64-NEXT: setne %al
+; X64-NEXT: shrq $63, %rbp
+; X64-NEXT: xorl %ebp, %ebx
; X64-NEXT: testb %bl, %al
-; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
-; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
+; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
+; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
; X64-NEXT: movl $4294967295, %ecx # imm = 0xFFFFFFFF
-; X64-NEXT: cmpq %rcx, %rbp
-; X64-NEXT: movq %r14, %rax
+; X64-NEXT: cmpq %rcx, %r13
+; X64-NEXT: movq %r12, %rax
; X64-NEXT: sbbq $0, %rax
+; X64-NEXT: cmovgeq %rcx, %r13
; X64-NEXT: movl $0, %eax
-; X64-NEXT: cmovgeq %rax, %r14
-; X64-NEXT: cmovgeq %rcx, %rbp
+; X64-NEXT: cmovgeq %rax, %r12
; X64-NEXT: movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000
-; X64-NEXT: cmpq %rbp, %rcx
+; X64-NEXT: cmpq %r13, %rcx
; X64-NEXT: movq $-1, %rax
-; X64-NEXT: sbbq %r14, %rax
-; X64-NEXT: cmovgeq %rcx, %rbp
-; X64-NEXT: movq %rbp, %xmm0
+; X64-NEXT: sbbq %r12, %rax
+; X64-NEXT: cmovgeq %rcx, %r13
+; X64-NEXT: movq %r13, %xmm0
; X64-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; X64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; X64-NEXT: psrlq $1, %xmm1
; X64-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; X64-NEXT: # xmm0 = mem[0,1,1,3]
-; X64-NEXT: psllq $32, %xmm0
+; X64-NEXT: pxor %xmm0, %xmm0
+; X64-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; X64-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
; X64-NEXT: psrad $31, %xmm1
; X64-NEXT: psrlq $31, %xmm0
; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-NEXT: movq %xmm0, %rbx
-; X64-NEXT: movq %rbx, %r13
-; X64-NEXT: sarq $63, %r13
-; X64-NEXT: shldq $31, %rbx, %r13
+; X64-NEXT: movq %xmm0, %rbp
+; X64-NEXT: movq %rbp, %r14
+; X64-NEXT: sarq $63, %r14
+; X64-NEXT: shldq $31, %rbp, %r14
+; X64-NEXT: movq %rbp, %r15
+; X64-NEXT: shlq $31, %r15
; X64-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; X64-NEXT: pxor %xmm1, %xmm1
; X64-NEXT: pcmpgtd %xmm0, %xmm1
@@ -696,94 +698,92 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; X64-NEXT: movq %xmm0, %rdx
; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %rdx, %r15
-; X64-NEXT: sarq $63, %r15
-; X64-NEXT: movq %rbx, %r12
-; X64-NEXT: shlq $31, %r12
-; X64-NEXT: movq %r12, %rdi
-; X64-NEXT: movq %r13, %rsi
-; X64-NEXT: movq %r15, %rcx
+; X64-NEXT: movq %rdx, %rbx
+; X64-NEXT: sarq $63, %rbx
+; X64-NEXT: movq %r15, %rdi
+; X64-NEXT: movq %r14, %rsi
+; X64-NEXT: movq %rbx, %rcx
; X64-NEXT: callq __divti3@PLT
-; X64-NEXT: movq %rax, %rbp
+; X64-NEXT: movq %rax, %r13
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %rdx, %r14
+; X64-NEXT: movq %rdx, %r12
; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: subq $1, %rbp
-; X64-NEXT: sbbq $0, %r14
-; X64-NEXT: shrq $63, %rbx
-; X64-NEXT: xorl %r15d, %ebx
-; X64-NEXT: movq %r12, %rdi
-; X64-NEXT: movq %r13, %rsi
+; X64-NEXT: subq $1, %r13
+; X64-NEXT: sbbq $0, %r12
+; X64-NEXT: movq %r15, %rdi
+; X64-NEXT: movq %r14, %rsi
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; X64-NEXT: movq %r15, %rcx
+; X64-NEXT: movq %rbx, %rcx
; X64-NEXT: callq __modti3@PLT
; X64-NEXT: orq %rax, %rdx
; X64-NEXT: setne %al
+; X64-NEXT: shrq $63, %rbp
+; X64-NEXT: xorl %ebp, %ebx
; X64-NEXT: testb %bl, %al
-; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
-; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
+; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
+; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
; X64-NEXT: movl $4294967295, %ecx # imm = 0xFFFFFFFF
-; X64-NEXT: cmpq %rcx, %rbp
-; X64-NEXT: movq %r14, %rax
+; X64-NEXT: cmpq %rcx, %r13
+; X64-NEXT: movq %r12, %rax
; X64-NEXT: sbbq $0, %rax
+; X64-NEXT: cmovgeq %rcx, %r13
; X64-NEXT: movl $0, %eax
-; X64-NEXT: cmovgeq %rax, %r14
-; X64-NEXT: cmovgeq %rcx, %rbp
+; X64-NEXT: cmovgeq %rax, %r12
; X64-NEXT: movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000
-; X64-NEXT: cmpq %rbp, %rcx
+; X64-NEXT: cmpq %r13, %rcx
; X64-NEXT: movq $-1, %rax
-; X64-NEXT: sbbq %r14, %rax
-; X64-NEXT: cmovgeq %rcx, %rbp
-; X64-NEXT: movq %rbp, %xmm0
+; X64-NEXT: sbbq %r12, %rax
+; X64-NEXT: cmovgeq %rcx, %r13
+; X64-NEXT: movq %r13, %xmm0
; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; X64-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; X64-NEXT: # xmm0 = mem[2,3,2,3]
-; X64-NEXT: movq %xmm0, %rbx
-; X64-NEXT: movq %rbx, %r13
-; X64-NEXT: sarq $63, %r13
-; X64-NEXT: shldq $31, %rbx, %r13
+; X64-NEXT: movq %xmm0, %rbp
+; X64-NEXT: movq %rbp, %r14
+; X64-NEXT: sarq $63, %r14
+; X64-NEXT: shldq $31, %rbp, %r14
+; X64-NEXT: movq %rbp, %r15
+; X64-NEXT: shlq $31, %r15
; X64-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; X64-NEXT: # xmm0 = mem[2,3,2,3]
; X64-NEXT: movq %xmm0, %rdx
; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %rdx, %r15
-; X64-NEXT: sarq $63, %r15
-; X64-NEXT: movq %rbx, %r12
-; X64-NEXT: shlq $31, %r12
-; X64-NEXT: movq %r12, %rdi
-; X64-NEXT: movq %r13, %rsi
-; X64-NEXT: movq %r15, %rcx
+; X64-NEXT: movq %rdx, %rbx
+; X64-NEXT: sarq $63, %rbx
+; X64-NEXT: movq %r15, %rdi
+; X64-NEXT: movq %r14, %rsi
+; X64-NEXT: movq %rbx, %rcx
; X64-NEXT: callq __divti3@PLT
-; X64-NEXT: movq %rax, %rbp
+; X64-NEXT: movq %rax, %r13
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %rdx, %r14
+; X64-NEXT: movq %rdx, %r12
; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: subq $1, %rbp
-; X64-NEXT: sbbq $0, %r14
-; X64-NEXT: shrq $63, %rbx
-; X64-NEXT: xorl %r15d, %ebx
-; X64-NEXT: movq %r12, %rdi
-; X64-NEXT: movq %r13, %rsi
+; X64-NEXT: subq $1, %r13
+; X64-NEXT: sbbq $0, %r12
+; X64-NEXT: movq %r15, %rdi
+; X64-NEXT: movq %r14, %rsi
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; X64-NEXT: movq %r15, %rcx
+; X64-NEXT: movq %rbx, %rcx
; X64-NEXT: callq __modti3@PLT
; X64-NEXT: orq %rax, %rdx
; X64-NEXT: setne %al
+; X64-NEXT: shrq $63, %rbp
+; X64-NEXT: xorl %ebp, %ebx
; X64-NEXT: testb %bl, %al
-; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
-; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
+; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
+; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
; X64-NEXT: movl $4294967295, %ecx # imm = 0xFFFFFFFF
-; X64-NEXT: cmpq %rcx, %rbp
-; X64-NEXT: movq %r14, %rax
+; X64-NEXT: cmpq %rcx, %r13
+; X64-NEXT: movq %r12, %rax
; X64-NEXT: sbbq $0, %rax
+; X64-NEXT: cmovgeq %rcx, %r13
; X64-NEXT: movl $0, %eax
-; X64-NEXT: cmovgeq %rax, %r14
-; X64-NEXT: cmovgeq %rcx, %rbp
+; X64-NEXT: cmovgeq %rax, %r12
; X64-NEXT: movabsq $-4294967296, %rax # imm = 0xFFFFFFFF00000000
-; X64-NEXT: cmpq %rbp, %rax
-; X64-NEXT: sbbq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; X64-NEXT: cmovgeq %rax, %rbp
-; X64-NEXT: movq %rbp, %xmm1
+; X64-NEXT: cmpq %r13, %rax
+; X64-NEXT: sbbq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; X64-NEXT: cmovgeq %rax, %r13
+; X64-NEXT: movq %r13, %xmm1
; X64-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64-NEXT: psrlq $1, %xmm0
diff --git a/llvm/test/CodeGen/X86/setcc-non-simple-type.ll b/llvm/test/CodeGen/X86/setcc-non-simple-type.ll
index 97c3c2040b29..a80d8d8cd01b 100644
--- a/llvm/test/CodeGen/X86/setcc-non-simple-type.ll
+++ b/llvm/test/CodeGen/X86/setcc-non-simple-type.ll
@@ -46,7 +46,6 @@ define void @failing(ptr %0, ptr %1) nounwind {
; CHECK-NEXT: movq 24(%rsi), %rcx
; CHECK-NEXT: movq 32(%rsi), %rdx
; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0]
-; CHECK-NEXT: xorl %esi, %esi
; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [1,1]
; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [2,2]
; CHECK-NEXT: .p2align 4, 0x90
@@ -54,39 +53,45 @@ define void @failing(ptr %0, ptr %1) nounwind {
; CHECK-NEXT: # =>This Loop Header: Depth=1
; CHECK-NEXT: # Child Loop BB0_2 Depth 2
; CHECK-NEXT: xorpd %xmm3, %xmm3
-; CHECK-NEXT: movq $-1024, %rdi # imm = 0xFC00
+; CHECK-NEXT: movq $-1024, %rsi # imm = 0xFC00
; CHECK-NEXT: movdqa %xmm0, %xmm4
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB0_2: # %vector.body
; CHECK-NEXT: # Parent Loop BB0_1 Depth=1
; CHECK-NEXT: # => This Inner Loop Header: Depth=2
-; CHECK-NEXT: cmpq 1024(%rdx,%rdi), %rsi
-; CHECK-NEXT: movq %rcx, %r8
-; CHECK-NEXT: sbbq 1032(%rdx,%rdi), %r8
-; CHECK-NEXT: setge %r8b
-; CHECK-NEXT: movzbl %r8b, %r8d
-; CHECK-NEXT: andl $1, %r8d
+; CHECK-NEXT: movdqu 1024(%rdx,%rsi), %xmm5
+; CHECK-NEXT: movdqu 1040(%rdx,%rsi), %xmm6
+; CHECK-NEXT: movq %xmm5, %rdi
+; CHECK-NEXT: movq %xmm6, %r8
+; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
+; CHECK-NEXT: movq %xmm5, %r9
+; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm6[2,3,2,3]
+; CHECK-NEXT: movq %xmm5, %r10
; CHECK-NEXT: negq %r8
-; CHECK-NEXT: movq %r8, %xmm5
-; CHECK-NEXT: cmpq 1040(%rdx,%rdi), %rsi
; CHECK-NEXT: movq %rcx, %r8
-; CHECK-NEXT: sbbq 1048(%rdx,%rdi), %r8
+; CHECK-NEXT: sbbq %r10, %r8
; CHECK-NEXT: setge %r8b
; CHECK-NEXT: movzbl %r8b, %r8d
-; CHECK-NEXT: andl $1, %r8d
; CHECK-NEXT: negq %r8
-; CHECK-NEXT: movq %r8, %xmm6
-; CHECK-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0]
-; CHECK-NEXT: movdqa %xmm1, %xmm6
-; CHECK-NEXT: psllq %xmm4, %xmm6
+; CHECK-NEXT: movq %r8, %xmm5
+; CHECK-NEXT: negq %rdi
+; CHECK-NEXT: movq %rcx, %rdi
+; CHECK-NEXT: sbbq %r9, %rdi
+; CHECK-NEXT: setge %dil
+; CHECK-NEXT: movzbl %dil, %edi
+; CHECK-NEXT: negq %rdi
+; CHECK-NEXT: movq %rdi, %xmm6
+; CHECK-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm5[0]
+; CHECK-NEXT: movdqa %xmm1, %xmm5
+; CHECK-NEXT: psllq %xmm4, %xmm5
; CHECK-NEXT: pshufd {{.*#+}} xmm7 = xmm4[2,3,2,3]
; CHECK-NEXT: movdqa %xmm1, %xmm8
; CHECK-NEXT: psllq %xmm7, %xmm8
-; CHECK-NEXT: movsd {{.*#+}} xmm8 = xmm6[0],xmm8[1]
-; CHECK-NEXT: andpd %xmm5, %xmm8
+; CHECK-NEXT: movsd {{.*#+}} xmm8 = xmm5[0],xmm8[1]
+; CHECK-NEXT: andpd %xmm6, %xmm8
; CHECK-NEXT: orpd %xmm8, %xmm3
; CHECK-NEXT: paddq %xmm2, %xmm4
-; CHECK-NEXT: addq $32, %rdi
+; CHECK-NEXT: addq $32, %rsi
; CHECK-NEXT: jne .LBB0_2
; CHECK-NEXT: # %bb.3: # %middle.block
; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1
@@ -101,7 +106,6 @@ define void @failing(ptr %0, ptr %1) nounwind {
; CHECK-AVX2-NEXT: movq 24(%rsi), %rcx
; CHECK-AVX2-NEXT: movq 32(%rsi), %rdx
; CHECK-AVX2-NEXT: vpmovsxbq {{.*#+}} xmm0 = [0,1]
-; CHECK-AVX2-NEXT: xorl %esi, %esi
; CHECK-AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [1,1]
; CHECK-AVX2-NEXT: vpmovsxbq {{.*#+}} xmm2 = [2,2]
; CHECK-AVX2-NEXT: .p2align 4, 0x90
@@ -109,34 +113,40 @@ define void @failing(ptr %0, ptr %1) nounwind {
; CHECK-AVX2-NEXT: # =>This Loop Header: Depth=1
; CHECK-AVX2-NEXT: # Child Loop BB0_2 Depth 2
; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-AVX2-NEXT: movq $-1024, %rdi # imm = 0xFC00
+; CHECK-AVX2-NEXT: movq $-1024, %rsi # imm = 0xFC00
; CHECK-AVX2-NEXT: vmovdqa %xmm0, %xmm4
; CHECK-AVX2-NEXT: .p2align 4, 0x90
; CHECK-AVX2-NEXT: .LBB0_2: # %vector.body
; CHECK-AVX2-NEXT: # Parent Loop BB0_1 Depth=1
; CHECK-AVX2-NEXT: # => This Inner Loop Header: Depth=2
-; CHECK-AVX2-NEXT: cmpq 1024(%rdx,%rdi), %rsi
-; CHECK-AVX2-NEXT: movq %rcx, %r8
-; CHECK-AVX2-NEXT: sbbq 1032(%rdx,%rdi), %r8
+; CHECK-AVX2-NEXT: vmovdqu 1024(%rdx,%rsi), %xmm5
+; CHECK-AVX2-NEXT: vmovdqu 1040(%rdx,%rsi), %xmm6
+; CHECK-AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm5[0],xmm6[0]
+; CHECK-AVX2-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1]
+; CHECK-AVX2-NEXT: vmovq %xmm5, %rdi
+; CHECK-AVX2-NEXT: vpextrq $1, %xmm5, %r8
+; CHECK-AVX2-NEXT: vmovq %xmm7, %r9
+; CHECK-AVX2-NEXT: vpextrq $1, %xmm7, %r10
+; CHECK-AVX2-NEXT: negq %r10
+; CHECK-AVX2-NEXT: movq %rcx, %r10
+; CHECK-AVX2-NEXT: sbbq %r8, %r10
; CHECK-AVX2-NEXT: setge %r8b
; CHECK-AVX2-NEXT: movzbl %r8b, %r8d
-; CHECK-AVX2-NEXT: andl $1, %r8d
; CHECK-AVX2-NEXT: negq %r8
; CHECK-AVX2-NEXT: vmovq %r8, %xmm5
-; CHECK-AVX2-NEXT: cmpq 1040(%rdx,%rdi), %rsi
+; CHECK-AVX2-NEXT: negq %r9
; CHECK-AVX2-NEXT: movq %rcx, %r8
-; CHECK-AVX2-NEXT: sbbq 1048(%rdx,%rdi), %r8
-; CHECK-AVX2-NEXT: setge %r8b
-; CHECK-AVX2-NEXT: movzbl %r8b, %r8d
-; CHECK-AVX2-NEXT: andl $1, %r8d
-; CHECK-AVX2-NEXT: negq %r8
-; CHECK-AVX2-NEXT: vmovq %r8, %xmm6
-; CHECK-AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0]
+; CHECK-AVX2-NEXT: sbbq %rdi, %r8
+; CHECK-AVX2-NEXT: setge %dil
+; CHECK-AVX2-NEXT: movzbl %dil, %edi
+; CHECK-AVX2-NEXT: negq %rdi
+; CHECK-AVX2-NEXT: vmovq %rdi, %xmm6
+; CHECK-AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm6[0],xmm5[0]
; CHECK-AVX2-NEXT: vpsllvq %xmm4, %xmm1, %xmm6
; CHECK-AVX2-NEXT: vpand %xmm6, %xmm5, %xmm5
; CHECK-AVX2-NEXT: vpor %xmm3, %xmm5, %xmm3
; CHECK-AVX2-NEXT: vpaddq %xmm2, %xmm4, %xmm4
-; CHECK-AVX2-NEXT: addq $32, %rdi
+; CHECK-AVX2-NEXT: addq $32, %rsi
; CHECK-AVX2-NEXT: jne .LBB0_2
; CHECK-AVX2-NEXT: # %bb.3: # %middle.block
; CHECK-AVX2-NEXT: # in Loop: Header=BB0_1 Depth=1
diff --git a/llvm/test/CodeGen/X86/subreg-to-reg-1.ll b/llvm/test/CodeGen/X86/subreg-to-reg-1.ll
index 8acdb6176f57..49bff9e075e5 100644
--- a/llvm/test/CodeGen/X86/subreg-to-reg-1.ll
+++ b/llvm/test/CodeGen/X86/subreg-to-reg-1.ll
@@ -1,14 +1,17 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s
-; CHECK: {{leal .*[)], %e.*}}
-; CHECK-NOT: {{leal .*[)], %e.*}}
-
; Don't eliminate or coalesce away the explicit zero-extension!
; This is currently using a leal because of a 3-addressification detail,
; though this isn't necessary; the point of this test is to make sure
; a 32-bit add is used.
define i64 @foo(i64 %a) nounwind {
+; CHECK-LABEL: foo:
+; CHECK: # %bb.0:
+; CHECK-NEXT: leal -1(%rdi), %eax
+; CHECK-NEXT: incq %rax
+; CHECK-NEXT: retq
%b = add i64 %a, 4294967295
%c = and i64 %b, 4294967295
%d = add i64 %c, 1
diff --git a/llvm/test/CodeGen/X86/subreg-to-reg-3.ll b/llvm/test/CodeGen/X86/subreg-to-reg-3.ll
index db9d0d12c3d7..2bd5ca1716af 100644
--- a/llvm/test/CodeGen/X86/subreg-to-reg-3.ll
+++ b/llvm/test/CodeGen/X86/subreg-to-reg-3.ll
@@ -1,10 +1,14 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s
-; CHECK: imull
-
; Don't eliminate or coalesce away the explicit zero-extension!
define i64 @foo(i64 %a) {
+; CHECK-LABEL: foo:
+; CHECK: # %bb.0:
+; CHECK-NEXT: imull $7823, %edi, %eax # imm = 0x1E8F
+; CHECK-NEXT: incq %rax
+; CHECK-NEXT: retq
%b = mul i64 %a, 7823
%c = and i64 %b, 4294967295
%d = add i64 %c, 1
diff --git a/llvm/test/CodeGen/X86/subreg-to-reg-6.ll b/llvm/test/CodeGen/X86/subreg-to-reg-6.ll
index 7a6f78fac368..f0dc17b55661 100644
--- a/llvm/test/CodeGen/X86/subreg-to-reg-6.ll
+++ b/llvm/test/CodeGen/X86/subreg-to-reg-6.ll
@@ -1,6 +1,18 @@
-; RUN: llc < %s -mtriple=x86_64--
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s
define i64 @foo() nounwind {
+; CHECK-LABEL: foo:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: cmpl $12, 0
+; CHECK-NEXT: je .LBB0_1
+; CHECK-NEXT: # %bb.2: # %bb65
+; CHECK-NEXT: xorl %ecx, %ecx
+; CHECK-NEXT: #APP
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: retq
+; CHECK-NEXT: .LBB0_1: # %bb56
entry:
%t0 = load i32, ptr null, align 8
switch i32 %t0, label %bb65 [
@@ -22,6 +34,14 @@ bb65:
}
define i64 @bar(i64 %t0) nounwind {
+; CHECK-LABEL: bar:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: xorl %ecx, %ecx
+; CHECK-NEXT: #APP
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: negl %eax
+; CHECK-NEXT: retq
call void asm "", "{cx}"(i64 0) nounwind
%t1 = sub i64 0, %t0
%t2 = and i64 %t1, 4294967295
diff --git a/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll b/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll
index e0f438eb7cc8..ae66c5420638 100644
--- a/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll
+++ b/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll
@@ -3060,12 +3060,7 @@ define void @vec384_v3i32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.
; SCALAR: # %bb.0:
; SCALAR-NEXT: movl 8(%rdi), %eax
; SCALAR-NEXT: movq (%rdi), %rcx
-; SCALAR-NEXT: movq %rcx, %rdi
-; SCALAR-NEXT: shrq $32, %rdi
-; SCALAR-NEXT: notl %edi
-; SCALAR-NEXT: shlq $32, %rdi
-; SCALAR-NEXT: notl %ecx
-; SCALAR-NEXT: orq %rdi, %rcx
+; SCALAR-NEXT: notq %rcx
; SCALAR-NEXT: notl %eax
; SCALAR-NEXT: movl %eax, 8(%rsi)
; SCALAR-NEXT: movq %rcx, (%rsi)
@@ -3196,12 +3191,7 @@ define void @vec384_v3f32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.
; SCALAR: # %bb.0:
; SCALAR-NEXT: movl 8(%rdi), %eax
; SCALAR-NEXT: movq (%rdi), %rcx
-; SCALAR-NEXT: movq %rcx, %rdi
-; SCALAR-NEXT: shrq $32, %rdi
-; SCALAR-NEXT: notl %edi
-; SCALAR-NEXT: shlq $32, %rdi
-; SCALAR-NEXT: notl %ecx
-; SCALAR-NEXT: orq %rdi, %rcx
+; SCALAR-NEXT: notq %rcx
; SCALAR-NEXT: notl %eax
; SCALAR-NEXT: movl %eax, 8(%rsi)
; SCALAR-NEXT: movq %rcx, (%rsi)
@@ -4216,25 +4206,10 @@ define void @vec384_v6i32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.
; SCALAR: # %bb.0:
; SCALAR-NEXT: movq (%rdi), %rax
; SCALAR-NEXT: movq 8(%rdi), %rcx
-; SCALAR-NEXT: movq %rax, %r8
-; SCALAR-NEXT: shrq $32, %r8
-; SCALAR-NEXT: movq %rcx, %r9
-; SCALAR-NEXT: shrq $32, %r9
; SCALAR-NEXT: movq 16(%rdi), %rdi
-; SCALAR-NEXT: movq %rdi, %r10
-; SCALAR-NEXT: shrq $32, %r10
-; SCALAR-NEXT: notl %r10d
-; SCALAR-NEXT: shlq $32, %r10
-; SCALAR-NEXT: notl %edi
-; SCALAR-NEXT: orq %r10, %rdi
-; SCALAR-NEXT: notl %r9d
-; SCALAR-NEXT: shlq $32, %r9
-; SCALAR-NEXT: notl %ecx
-; SCALAR-NEXT: orq %r9, %rcx
-; SCALAR-NEXT: notl %r8d
-; SCALAR-NEXT: shlq $32, %r8
-; SCALAR-NEXT: notl %eax
-; SCALAR-NEXT: orq %r8, %rax
+; SCALAR-NEXT: notq %rdi
+; SCALAR-NEXT: notq %rcx
+; SCALAR-NEXT: notq %rax
; SCALAR-NEXT: movq %rax, (%rsi)
; SCALAR-NEXT: movq %rcx, 8(%rsi)
; SCALAR-NEXT: movq %rdi, 16(%rsi)
@@ -4303,25 +4278,10 @@ define void @vec384_v6f32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.
; SCALAR: # %bb.0:
; SCALAR-NEXT: movq (%rdi), %rax
; SCALAR-NEXT: movq 8(%rdi), %rcx
-; SCALAR-NEXT: movq %rax, %r8
-; SCALAR-NEXT: shrq $32, %r8
-; SCALAR-NEXT: movq %rcx, %r9
-; SCALAR-NEXT: shrq $32, %r9
; SCALAR-NEXT: movq 16(%rdi), %rdi
-; SCALAR-NEXT: movq %rdi, %r10
-; SCALAR-NEXT: shrq $32, %r10
-; SCALAR-NEXT: notl %r10d
-; SCALAR-NEXT: shlq $32, %r10
-; SCALAR-NEXT: notl %edi
-; SCALAR-NEXT: orq %r10, %rdi
-; SCALAR-NEXT: notl %r9d
-; SCALAR-NEXT: shlq $32, %r9
-; SCALAR-NEXT: notl %ecx
-; SCALAR-NEXT: orq %r9, %rcx
-; SCALAR-NEXT: notl %r8d
-; SCALAR-NEXT: shlq $32, %r8
-; SCALAR-NEXT: notl %eax
-; SCALAR-NEXT: orq %r8, %rax
+; SCALAR-NEXT: notq %rdi
+; SCALAR-NEXT: notq %rcx
+; SCALAR-NEXT: notq %rax
; SCALAR-NEXT: movq %rax, (%rsi)
; SCALAR-NEXT: movq %rcx, 8(%rsi)
; SCALAR-NEXT: movq %rdi, 16(%rsi)
diff --git a/llvm/test/CodeGen/X86/vec_saddo.ll b/llvm/test/CodeGen/X86/vec_saddo.ll
index cee30f5fe5da..460c5fe11f82 100644
--- a/llvm/test/CodeGen/X86/vec_saddo.ll
+++ b/llvm/test/CodeGen/X86/vec_saddo.ll
@@ -1045,16 +1045,12 @@ define <4 x i32> @saddo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind {
;
; AVX512-LABEL: saddo_v4i1:
; AVX512: # %bb.0:
+; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm2
+; AVX512-NEXT: vpslld $31, %xmm2, %xmm2
+; AVX512-NEXT: vptestmd %xmm2, %xmm2, %k0
+; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpslld $31, %xmm0, %xmm0
-; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k0
-; AVX512-NEXT: vpslld $31, %xmm1, %xmm0
-; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k1
-; AVX512-NEXT: kxorw %k1, %k0, %k2
-; AVX512-NEXT: kandw %k1, %k0, %k1
-; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
-; AVX512-NEXT: kshiftlw $12, %k2, %k0
-; AVX512-NEXT: kshiftrw $12, %k0, %k0
+; AVX512-NEXT: vpsrad $31, %xmm0, %xmm0
; AVX512-NEXT: kmovd %k0, %eax
; AVX512-NEXT: movb %al, (%rdi)
; AVX512-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vec_ssubo.ll b/llvm/test/CodeGen/X86/vec_ssubo.ll
index 64ed08104885..d06993da6365 100644
--- a/llvm/test/CodeGen/X86/vec_ssubo.ll
+++ b/llvm/test/CodeGen/X86/vec_ssubo.ll
@@ -1062,16 +1062,12 @@ define <4 x i32> @ssubo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind {
;
; AVX512-LABEL: ssubo_v4i1:
; AVX512: # %bb.0:
+; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm2
+; AVX512-NEXT: vpslld $31, %xmm2, %xmm2
+; AVX512-NEXT: vptestmd %xmm2, %xmm2, %k0
+; AVX512-NEXT: vpandn %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpslld $31, %xmm0, %xmm0
-; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k0
-; AVX512-NEXT: vpslld $31, %xmm1, %xmm1
-; AVX512-NEXT: vptestmd %xmm1, %xmm1, %k1
-; AVX512-NEXT: kxorw %k1, %k0, %k0
-; AVX512-NEXT: vptestnmd %xmm0, %xmm0, %k1 {%k1}
-; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
-; AVX512-NEXT: kshiftlw $12, %k0, %k0
-; AVX512-NEXT: kshiftrw $12, %k0, %k0
+; AVX512-NEXT: vpsrad $31, %xmm0, %xmm0
; AVX512-NEXT: kmovd %k0, %eax
; AVX512-NEXT: movb %al, (%rdi)
; AVX512-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vec_uaddo.ll b/llvm/test/CodeGen/X86/vec_uaddo.ll
index 950e943bd902..bac118095331 100644
--- a/llvm/test/CodeGen/X86/vec_uaddo.ll
+++ b/llvm/test/CodeGen/X86/vec_uaddo.ll
@@ -1098,16 +1098,12 @@ define <4 x i32> @uaddo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind {
;
; AVX512-LABEL: uaddo_v4i1:
; AVX512: # %bb.0:
+; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm2
+; AVX512-NEXT: vpslld $31, %xmm2, %xmm2
+; AVX512-NEXT: vptestmd %xmm2, %xmm2, %k0
+; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpslld $31, %xmm0, %xmm0
-; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k0
-; AVX512-NEXT: vpslld $31, %xmm1, %xmm0
-; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k1
-; AVX512-NEXT: kxorw %k1, %k0, %k2
-; AVX512-NEXT: kandw %k1, %k0, %k1
-; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
-; AVX512-NEXT: kshiftlw $12, %k2, %k0
-; AVX512-NEXT: kshiftrw $12, %k0, %k0
+; AVX512-NEXT: vpsrad $31, %xmm0, %xmm0
; AVX512-NEXT: kmovd %k0, %eax
; AVX512-NEXT: movb %al, (%rdi)
; AVX512-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vec_usubo.ll b/llvm/test/CodeGen/X86/vec_usubo.ll
index 7de972770d8d..ab75ada72f25 100644
--- a/llvm/test/CodeGen/X86/vec_usubo.ll
+++ b/llvm/test/CodeGen/X86/vec_usubo.ll
@@ -1145,16 +1145,12 @@ define <4 x i32> @usubo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind {
;
; AVX512-LABEL: usubo_v4i1:
; AVX512: # %bb.0:
+; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm2
+; AVX512-NEXT: vpslld $31, %xmm2, %xmm2
+; AVX512-NEXT: vptestmd %xmm2, %xmm2, %k0
+; AVX512-NEXT: vpandn %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpslld $31, %xmm0, %xmm0
-; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k0
-; AVX512-NEXT: vpslld $31, %xmm1, %xmm1
-; AVX512-NEXT: vptestmd %xmm1, %xmm1, %k1
-; AVX512-NEXT: kxorw %k1, %k0, %k0
-; AVX512-NEXT: vptestnmd %xmm0, %xmm0, %k1 {%k1}
-; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
-; AVX512-NEXT: kshiftlw $12, %k0, %k0
-; AVX512-NEXT: kshiftrw $12, %k0, %k0
+; AVX512-NEXT: vpsrad $31, %xmm0, %xmm0
; AVX512-NEXT: kmovd %k0, %eax
; AVX512-NEXT: movb %al, (%rdi)
; AVX512-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-bo-select.ll b/llvm/test/CodeGen/X86/vector-bo-select.ll
index 78797b9acc2e..27aaad6353ed 100644
--- a/llvm/test/CodeGen/X86/vector-bo-select.ll
+++ b/llvm/test/CodeGen/X86/vector-bo-select.ll
@@ -3137,11 +3137,11 @@ define <8 x i64> @mul_v8i64_cast_cond(i8 noundef zeroext %pb, <8 x i64> noundef
; AVX512-LABEL: mul_v8i64_cast_cond:
; AVX512: # %bb.0:
; AVX512-NEXT: kmovw %edi, %k1
-; AVX512-NEXT: vpsrlq $32, %zmm1, %zmm2
-; AVX512-NEXT: vpmuludq %zmm2, %zmm0, %zmm2
-; AVX512-NEXT: vpsrlq $32, %zmm0, %zmm3
-; AVX512-NEXT: vpmuludq %zmm1, %zmm3, %zmm3
-; AVX512-NEXT: vpaddq %zmm3, %zmm2, %zmm2
+; AVX512-NEXT: vpsrlq $32, %zmm0, %zmm2
+; AVX512-NEXT: vpmuludq %zmm1, %zmm2, %zmm2
+; AVX512-NEXT: vpsrlq $32, %zmm1, %zmm3
+; AVX512-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
+; AVX512-NEXT: vpaddq %zmm2, %zmm3, %zmm2
; AVX512-NEXT: vpsllq $32, %zmm2, %zmm2
; AVX512-NEXT: vpmuludq %zmm1, %zmm0, %zmm1
; AVX512-NEXT: vpaddq %zmm2, %zmm1, %zmm0 {%k1}
diff --git a/llvm/test/CodeGen/X86/vector-fshr-128.ll b/llvm/test/CodeGen/X86/vector-fshr-128.ll
index b839452725a9..3aaa9268a8d8 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-128.ll
@@ -58,12 +58,12 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
; SSE41-NEXT: psrlq %xmm4, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: pandn %xmm3, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
; SSE41-NEXT: paddq %xmm0, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: psllq %xmm1, %xmm3
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: psllq %xmm2, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; SSE41-NEXT: psllq %xmm2, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT: por %xmm5, %xmm0
; SSE41-NEXT: retq
;
@@ -76,11 +76,11 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
; AVX1-NEXT: vpaddq %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vpsllq %xmm3, %xmm0, %xmm3
+; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm3
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
@@ -158,13 +158,13 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
; XOPAVX1-LABEL: var_funnnel_v2i64:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63]
-; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
-; XOPAVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; XOPAVX1-NEXT: vpsubq %xmm4, %xmm5, %xmm4
-; XOPAVX1-NEXT: vpshlq %xmm4, %xmm1, %xmm1
-; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4
; XOPAVX1-NEXT: vpaddq %xmm0, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpshlq %xmm2, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpshlq %xmm4, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; XOPAVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2
+; XOPAVX1-NEXT: vpshlq %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
@@ -366,13 +366,13 @@ define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt)
; XOPAVX1-LABEL: var_funnnel_v4i32:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [31,31,31,31]
-; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
-; XOPAVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; XOPAVX1-NEXT: vpsubd %xmm4, %xmm5, %xmm4
-; XOPAVX1-NEXT: vpshld %xmm4, %xmm1, %xmm1
-; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4
; XOPAVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpshld %xmm2, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpshld %xmm4, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; XOPAVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm2
+; XOPAVX1-NEXT: vpshld %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
@@ -646,26 +646,26 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt)
; XOPAVX1-LABEL: var_funnnel_v8i16:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
-; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
-; XOPAVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; XOPAVX1-NEXT: vpsubw %xmm4, %xmm5, %xmm4
-; XOPAVX1-NEXT: vpshlw %xmm4, %xmm1, %xmm1
-; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4
; XOPAVX1-NEXT: vpaddw %xmm0, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpshlw %xmm2, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpshlw %xmm4, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; XOPAVX1-NEXT: vpsubw %xmm2, %xmm3, %xmm2
+; XOPAVX1-NEXT: vpshlw %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_funnnel_v8i16:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpbroadcastw {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
-; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm4
-; XOPAVX2-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; XOPAVX2-NEXT: vpsubw %xmm4, %xmm5, %xmm4
-; XOPAVX2-NEXT: vpshlw %xmm4, %xmm1, %xmm1
-; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4
; XOPAVX2-NEXT: vpaddw %xmm0, %xmm0, %xmm0
-; XOPAVX2-NEXT: vpshlw %xmm2, %xmm0, %xmm0
+; XOPAVX2-NEXT: vpshlw %xmm4, %xmm0, %xmm0
+; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
+; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; XOPAVX2-NEXT: vpsubw %xmm2, %xmm3, %xmm2
+; XOPAVX2-NEXT: vpshlw %xmm2, %xmm1, %xmm1
; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
@@ -995,26 +995,26 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt)
; XOPAVX1-LABEL: var_funnnel_v16i8:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
-; XOPAVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; XOPAVX1-NEXT: vpsubb %xmm4, %xmm5, %xmm4
-; XOPAVX1-NEXT: vpshlb %xmm4, %xmm1, %xmm1
-; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4
; XOPAVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpshlb %xmm2, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpshlb %xmm4, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; XOPAVX1-NEXT: vpsubb %xmm2, %xmm3, %xmm2
+; XOPAVX1-NEXT: vpshlb %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_funnnel_v16i8:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpbroadcastb {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm4
-; XOPAVX2-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; XOPAVX2-NEXT: vpsubb %xmm4, %xmm5, %xmm4
-; XOPAVX2-NEXT: vpshlb %xmm4, %xmm1, %xmm1
-; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4
; XOPAVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm0
-; XOPAVX2-NEXT: vpshlb %xmm2, %xmm0, %xmm0
+; XOPAVX2-NEXT: vpshlb %xmm4, %xmm0, %xmm0
+; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
+; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; XOPAVX2-NEXT: vpsubb %xmm2, %xmm3, %xmm2
+; XOPAVX2-NEXT: vpshlb %xmm2, %xmm1, %xmm1
; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-fshr-256.ll b/llvm/test/CodeGen/X86/vector-fshr-256.ll
index 7b6b0ea83c7e..fc65f759f5fb 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-256.ll
@@ -486,22 +486,22 @@ define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %
; XOPAVX2-LABEL: var_funnnel_v16i16:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; XOPAVX2-NEXT: vpand %ymm3, %ymm2, %ymm4
+; XOPAVX2-NEXT: vpandn %ymm3, %ymm2, %ymm4
; XOPAVX2-NEXT: vextracti128 $1, %ymm4, %xmm5
-; XOPAVX2-NEXT: vpxor %xmm6, %xmm6, %xmm6
-; XOPAVX2-NEXT: vpsubw %xmm5, %xmm6, %xmm5
-; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm7
-; XOPAVX2-NEXT: vpshlw %xmm5, %xmm7, %xmm5
-; XOPAVX2-NEXT: vpsubw %xmm4, %xmm6, %xmm4
-; XOPAVX2-NEXT: vpshlw %xmm4, %xmm1, %xmm1
-; XOPAVX2-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1
-; XOPAVX2-NEXT: vpandn %ymm3, %ymm2, %ymm2
-; XOPAVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
; XOPAVX2-NEXT: vpaddw %ymm0, %ymm0, %ymm0
-; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
-; XOPAVX2-NEXT: vpshlw %xmm3, %xmm4, %xmm3
-; XOPAVX2-NEXT: vpshlw %xmm2, %xmm0, %xmm0
-; XOPAVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
+; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm6
+; XOPAVX2-NEXT: vpshlw %xmm5, %xmm6, %xmm5
+; XOPAVX2-NEXT: vpshlw %xmm4, %xmm0, %xmm0
+; XOPAVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0
+; XOPAVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
+; XOPAVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; XOPAVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; XOPAVX2-NEXT: vpsubw %xmm3, %xmm4, %xmm3
+; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm5
+; XOPAVX2-NEXT: vpshlw %xmm3, %xmm5, %xmm3
+; XOPAVX2-NEXT: vpsubw %xmm2, %xmm4, %xmm2
+; XOPAVX2-NEXT: vpshlw %xmm2, %xmm1, %xmm1
+; XOPAVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
%res = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %amt)
diff --git a/llvm/test/CodeGen/X86/vector-fshr-sub128.ll b/llvm/test/CodeGen/X86/vector-fshr-sub128.ll
index 0426c48aecfc..a6067a960fc0 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-sub128.ll
@@ -185,13 +185,13 @@ define <2 x i32> @var_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %amt)
; XOPAVX1-LABEL: var_funnnel_v2i32:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [31,31,31,31]
-; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
-; XOPAVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; XOPAVX1-NEXT: vpsubd %xmm4, %xmm5, %xmm4
-; XOPAVX1-NEXT: vpshld %xmm4, %xmm1, %xmm1
-; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4
; XOPAVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpshld %xmm2, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpshld %xmm4, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; XOPAVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm2
+; XOPAVX1-NEXT: vpshld %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll
index c54da38ef10c..75baba5f35f7 100644
--- a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll
@@ -927,9 +927,9 @@ define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) nounwind {
; SSE2-LABEL: constant_shift_v2i64:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psllq $7, %xmm1
-; SSE2-NEXT: paddq %xmm0, %xmm0
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; SSE2-NEXT: paddq %xmm0, %xmm1
+; SSE2-NEXT: psllq $7, %xmm0
+; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v2i64:
@@ -975,9 +975,9 @@ define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) nounwind {
; X86-SSE-LABEL: constant_shift_v2i64:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: movdqa %xmm0, %xmm1
-; X86-SSE-NEXT: psllq $7, %xmm1
-; X86-SSE-NEXT: paddq %xmm0, %xmm0
-; X86-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; X86-SSE-NEXT: paddq %xmm0, %xmm1
+; X86-SSE-NEXT: psllq $7, %xmm0
+; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; X86-SSE-NEXT: retl
%shift = shl <2 x i64> %a, <i64 1, i64 7>
ret <2 x i64> %shift
diff --git a/llvm/test/Instrumentation/MemorySanitizer/vscale.ll b/llvm/test/Instrumentation/MemorySanitizer/vscale.ll
new file mode 100644
index 000000000000..e1a4a9b7aa68
--- /dev/null
+++ b/llvm/test/Instrumentation/MemorySanitizer/vscale.ll
@@ -0,0 +1,188 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt < %s -S -msan-check-access-address=0 -passes="msan" 2>&1 | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @test_load_store_i32(ptr %a, ptr %b) sanitize_memory {
+; CHECK-LABEL: define void @test_load_store_i32(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP1:%.*]] = load <vscale x 4 x i32>, ptr [[A]], align 16
+; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT: [[TMP3:%.*]] = xor i64 [[TMP2]], 87960930222080
+; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP4]], align 16
+; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[B]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: store <vscale x 4 x i32> [[_MSLD]], ptr [[TMP7]], align 16
+; CHECK-NEXT: store <vscale x 4 x i32> [[TMP1]], ptr [[B]], align 16
+; CHECK-NEXT: ret void
+;
+ %1 = load <vscale x 4 x i32>, ptr %a
+ store <vscale x 4 x i32> %1, ptr %b
+ ret void
+}
+
+define void @test_load_store_add_int(ptr %a, ptr %b) sanitize_memory {
+; CHECK-LABEL: define void @test_load_store_add_int(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP1:%.*]] = load <vscale x 8 x i64>, ptr [[A]], align 64
+; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT: [[TMP3:%.*]] = xor i64 [[TMP2]], 87960930222080
+; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load <vscale x 8 x i64>, ptr [[TMP4]], align 64
+; CHECK-NEXT: [[TMP5:%.*]] = load <vscale x 8 x i64>, ptr [[B]], align 64
+; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[B]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
+; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: [[_MSLD1:%.*]] = load <vscale x 8 x i64>, ptr [[TMP8]], align 64
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <vscale x 8 x i64> [[_MSLD]], [[_MSLD1]]
+; CHECK-NEXT: [[TMP9:%.*]] = add <vscale x 8 x i64> [[TMP1]], [[TMP5]]
+; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[B]] to i64
+; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 87960930222080
+; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+; CHECK-NEXT: store <vscale x 8 x i64> [[_MSLD1]], ptr [[TMP12]], align 64
+; CHECK-NEXT: store <vscale x 8 x i64> [[TMP5]], ptr [[B]], align 64
+; CHECK-NEXT: ret void
+;
+ %1 = load <vscale x 8 x i64>, ptr %a
+ %2 = load <vscale x 8 x i64>, ptr %b
+ %3 = add <vscale x 8 x i64> %1, %2
+ store <vscale x 8 x i64> %2, ptr %b
+ ret void
+}
+
+define void @test_load_store_float(ptr %a, ptr %b) sanitize_memory {
+; CHECK-LABEL: define void @test_load_store_float(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP1:%.*]] = load <vscale x 4 x float>, ptr [[A]], align 16
+; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT: [[TMP3:%.*]] = xor i64 [[TMP2]], 87960930222080
+; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP4]], align 16
+; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[B]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: store <vscale x 4 x i32> [[_MSLD]], ptr [[TMP7]], align 16
+; CHECK-NEXT: store <vscale x 4 x float> [[TMP1]], ptr [[B]], align 16
+; CHECK-NEXT: ret void
+;
+ %1 = load <vscale x 4 x float>, ptr %a
+ store <vscale x 4 x float> %1, ptr %b
+ ret void
+}
+
+define void @test_load_store_add_float(ptr %a, ptr %b) sanitize_memory {
+; CHECK-LABEL: define void @test_load_store_add_float(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP1:%.*]] = load <vscale x 2 x float>, ptr [[A]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT: [[TMP3:%.*]] = xor i64 [[TMP2]], 87960930222080
+; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load <vscale x 2 x i32>, ptr [[TMP4]], align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load <vscale x 2 x float>, ptr [[B]], align 8
+; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[B]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
+; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: [[_MSLD1:%.*]] = load <vscale x 2 x i32>, ptr [[TMP8]], align 8
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <vscale x 2 x i32> [[_MSLD]], [[_MSLD1]]
+; CHECK-NEXT: [[TMP9:%.*]] = fadd <vscale x 2 x float> [[TMP1]], [[TMP5]]
+; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[B]] to i64
+; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 87960930222080
+; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+; CHECK-NEXT: store <vscale x 2 x i32> [[_MSLD1]], ptr [[TMP12]], align 8
+; CHECK-NEXT: store <vscale x 2 x float> [[TMP5]], ptr [[B]], align 8
+; CHECK-NEXT: ret void
+;
+ %1 = load <vscale x 2 x float>, ptr %a
+ %2 = load <vscale x 2 x float>, ptr %b
+ %3 = fadd <vscale x 2 x float> %1, %2
+ store <vscale x 2 x float> %2, ptr %b
+ ret void
+}
+
+define <vscale x 2 x float> @fn_ret(ptr %a) sanitize_memory {
+; CHECK-LABEL: define <vscale x 2 x float> @fn_ret(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP1:%.*]] = load <vscale x 2 x float>, ptr [[A]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT: [[TMP3:%.*]] = xor i64 [[TMP2]], 87960930222080
+; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load <vscale x 2 x i32>, ptr [[TMP4]], align 8
+; CHECK-NEXT: store <vscale x 2 x i32> [[_MSLD]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <vscale x 2 x float> [[TMP1]]
+;
+ %1 = load <vscale x 2 x float>, ptr %a
+ ret <vscale x 2 x float> %1
+}
+
+define void @test_ret(ptr %a, ptr %b) sanitize_memory {
+; CHECK-LABEL: define void @test_ret(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: store i64 [[TMP1]], ptr @__msan_param_tls, align 8
+; CHECK-NEXT: store <vscale x 2 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 2 x float> @fn_ret(ptr [[A]])
+; CHECK-NEXT: [[_MSRET:%.*]] = load <vscale x 2 x i32>, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[B]] to i64
+; CHECK-NEXT: [[TMP3:%.*]] = xor i64 [[TMP2]], 87960930222080
+; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr
+; CHECK-NEXT: store <vscale x 2 x i32> [[_MSRET]], ptr [[TMP4]], align 8
+; CHECK-NEXT: store <vscale x 2 x float> [[TMP5]], ptr [[B]], align 8
+; CHECK-NEXT: ret void
+;
+ %1 = call <vscale x 2 x float> @fn_ret(ptr %a)
+ store <vscale x 2 x float> %1, ptr %b
+ ret void
+}
+
+define void @fn_param(<vscale x 2 x float> %a, ptr %b) sanitize_memory {
+; CHECK-LABEL: define void @fn_param(
+; CHECK-SAME: <vscale x 2 x float> [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64
+; CHECK-NEXT: [[TMP2:%.*]] = xor i64 [[TMP1]], 87960930222080
+; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+; CHECK-NEXT: store <vscale x 2 x i32> zeroinitializer, ptr [[TMP3]], align 8
+; CHECK-NEXT: store <vscale x 2 x float> [[A]], ptr [[B]], align 8
+; CHECK-NEXT: ret void
+;
+ store <vscale x 2 x float> %a, ptr %b
+ ret void
+}
+
+define void @test_param(ptr %a, ptr %b) sanitize_memory {
+; CHECK-LABEL: define void @test_param(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP2:%.*]] = load <vscale x 2 x float>, ptr [[A]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 87960930222080
+; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load <vscale x 2 x i32>, ptr [[TMP5]], align 8
+; CHECK-NEXT: store i64 [[TMP1]], ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.or.nxv2i32(<vscale x 2 x i32> [[_MSLD]])
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP6]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0:![0-9]+]]
+; CHECK: 7:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4:[0-9]+]]
+; CHECK-NEXT: unreachable
+; CHECK: 8:
+; CHECK-NEXT: call void @fn_param(<vscale x 2 x float> [[TMP2]], ptr [[B]])
+; CHECK-NEXT: ret void
+;
+ %1 = load <vscale x 2 x float>, ptr %a
+ call void @fn_param(<vscale x 2 x float> %1, ptr %b)
+ ret void
+}
+;.
+; CHECK: [[PROF0]] = !{!"branch_weights", i32 1, i32 1048575}
+;.
diff --git a/llvm/test/MC/AArch64/SME2p1/fadd-diagnostics.s b/llvm/test/MC/AArch64/SME2p1/fadd-diagnostics.s
index c13a1be05b1c..a18989880a34 100644
--- a/llvm/test/MC/AArch64/SME2p1/fadd-diagnostics.s
+++ b/llvm/test/MC/AArch64/SME2p1/fadd-diagnostics.s
@@ -1,4 +1,4 @@
-// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+sme-f16f16 2>&1 < %s | FileCheck %s
+// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f16f16 2>&1 < %s | FileCheck %s
// --------------------------------------------------------------------------//
// Out of range index offset
diff --git a/llvm/test/MC/AArch64/SME2p1/fadd.s b/llvm/test/MC/AArch64/SME2p1/fadd.s
index a8e64a63dbdb..bdb769093c83 100644
--- a/llvm/test/MC/AArch64/SME2p1/fadd.s
+++ b/llvm/test/MC/AArch64/SME2p1/fadd.s
@@ -1,300 +1,302 @@
-// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+sme-f16f16 < %s \
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f16f16 < %s \
+// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f8f16 < %s \
// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
// RUN: | FileCheck %s --check-prefix=CHECK-ERROR
-// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p1,+sme-f16f16 < %s \
-// RUN: | llvm-objdump -d --mattr=+sme2p1,+sme-f16f16 - | FileCheck %s --check-prefix=CHECK-INST
-// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p1,+sme-f16f16 < %s \
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme-f16f16 < %s \
+// RUN: | llvm-objdump -d --mattr=+sme-f16f16 - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme-f16f16 < %s \
// RUN: | llvm-objdump -d --mattr=-sme2p1 - | FileCheck %s --check-prefix=CHECK-UNKNOWN
-// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+sme-f16f16 < %s \
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f16f16 < %s \
// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \
-// RUN: | llvm-mc -triple=aarch64 -mattr=+sme2p1,+sme-f16f16 -disassemble -show-encoding \
+// RUN: | llvm-mc -triple=aarch64 -mattr=+sme-f16f16 -disassemble -show-encoding \
// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
fadd za.h[w8, 0, vgx2], {z0.h, z1.h} // 11000001-10100100-00011100-00000000
// CHECK-INST: fadd za.h[w8, 0, vgx2], { z0.h, z1.h }
// CHECK-ENCODING: [0x00,0x1c,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a41c00 <unknown>
fadd za.h[w8, 0], {z0.h - z1.h} // 11000001-10100100-00011100-00000000
// CHECK-INST: fadd za.h[w8, 0, vgx2], { z0.h, z1.h }
// CHECK-ENCODING: [0x00,0x1c,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a41c00 <unknown>
fadd za.h[w10, 5, vgx2], {z10.h, z11.h} // 11000001-10100100-01011101-01000101
// CHECK-INST: fadd za.h[w10, 5, vgx2], { z10.h, z11.h }
// CHECK-ENCODING: [0x45,0x5d,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a45d45 <unknown>
fadd za.h[w10, 5], {z10.h - z11.h} // 11000001-10100100-01011101-01000101
// CHECK-INST: fadd za.h[w10, 5, vgx2], { z10.h, z11.h }
// CHECK-ENCODING: [0x45,0x5d,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a45d45 <unknown>
fadd za.h[w11, 7, vgx2], {z12.h, z13.h} // 11000001-10100100-01111101-10000111
// CHECK-INST: fadd za.h[w11, 7, vgx2], { z12.h, z13.h }
// CHECK-ENCODING: [0x87,0x7d,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a47d87 <unknown>
fadd za.h[w11, 7], {z12.h - z13.h} // 11000001-10100100-01111101-10000111
// CHECK-INST: fadd za.h[w11, 7, vgx2], { z12.h, z13.h }
// CHECK-ENCODING: [0x87,0x7d,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a47d87 <unknown>
fadd za.h[w11, 7, vgx2], {z30.h, z31.h} // 11000001-10100100-01111111-11000111
// CHECK-INST: fadd za.h[w11, 7, vgx2], { z30.h, z31.h }
// CHECK-ENCODING: [0xc7,0x7f,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a47fc7 <unknown>
fadd za.h[w11, 7], {z30.h - z31.h} // 11000001-10100100-01111111-11000111
// CHECK-INST: fadd za.h[w11, 7, vgx2], { z30.h, z31.h }
// CHECK-ENCODING: [0xc7,0x7f,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a47fc7 <unknown>
fadd za.h[w8, 5, vgx2], {z16.h, z17.h} // 11000001-10100100-00011110-00000101
// CHECK-INST: fadd za.h[w8, 5, vgx2], { z16.h, z17.h }
// CHECK-ENCODING: [0x05,0x1e,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a41e05 <unknown>
fadd za.h[w8, 5], {z16.h - z17.h} // 11000001-10100100-00011110-00000101
// CHECK-INST: fadd za.h[w8, 5, vgx2], { z16.h, z17.h }
// CHECK-ENCODING: [0x05,0x1e,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a41e05 <unknown>
fadd za.h[w8, 1, vgx2], {z0.h, z1.h} // 11000001-10100100-00011100-00000001
// CHECK-INST: fadd za.h[w8, 1, vgx2], { z0.h, z1.h }
// CHECK-ENCODING: [0x01,0x1c,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a41c01 <unknown>
fadd za.h[w8, 1], {z0.h - z1.h} // 11000001-10100100-00011100-00000001
// CHECK-INST: fadd za.h[w8, 1, vgx2], { z0.h, z1.h }
// CHECK-ENCODING: [0x01,0x1c,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a41c01 <unknown>
fadd za.h[w10, 0, vgx2], {z18.h, z19.h}  // 11000001-10100100-01011110-01000000
// CHECK-INST: fadd za.h[w10, 0, vgx2], { z18.h, z19.h }
// CHECK-ENCODING: [0x40,0x5e,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a45e40 <unknown>
fadd za.h[w10, 0], {z18.h - z19.h} // 11000001-10100100-01011110-01000000
// CHECK-INST: fadd za.h[w10, 0, vgx2], { z18.h, z19.h }
// CHECK-ENCODING: [0x40,0x5e,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a45e40 <unknown>
fadd za.h[w8, 0, vgx2], {z12.h, z13.h} // 11000001-10100100-00011101-10000000
// CHECK-INST: fadd za.h[w8, 0, vgx2], { z12.h, z13.h }
// CHECK-ENCODING: [0x80,0x1d,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a41d80 <unknown>
fadd za.h[w8, 0], {z12.h - z13.h} // 11000001-10100100-00011101-10000000
// CHECK-INST: fadd za.h[w8, 0, vgx2], { z12.h, z13.h }
// CHECK-ENCODING: [0x80,0x1d,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a41d80 <unknown>
fadd za.h[w10, 1, vgx2], {z0.h, z1.h} // 11000001-10100100-01011100-00000001
// CHECK-INST: fadd za.h[w10, 1, vgx2], { z0.h, z1.h }
// CHECK-ENCODING: [0x01,0x5c,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a45c01 <unknown>
fadd za.h[w10, 1], {z0.h - z1.h} // 11000001-10100100-01011100-00000001
// CHECK-INST: fadd za.h[w10, 1, vgx2], { z0.h, z1.h }
// CHECK-ENCODING: [0x01,0x5c,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a45c01 <unknown>
fadd za.h[w8, 5, vgx2], {z22.h, z23.h}  // 11000001-10100100-00011110-11000101
// CHECK-INST: fadd za.h[w8, 5, vgx2], { z22.h, z23.h }
// CHECK-ENCODING: [0xc5,0x1e,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a41ec5 <unknown>
fadd za.h[w8, 5], {z22.h - z23.h} // 11000001-10100100-00011110-11000101
// CHECK-INST: fadd za.h[w8, 5, vgx2], { z22.h, z23.h }
// CHECK-ENCODING: [0xc5,0x1e,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a41ec5 <unknown>
fadd za.h[w11, 2, vgx2], {z8.h, z9.h} // 11000001-10100100-01111101-00000010
// CHECK-INST: fadd za.h[w11, 2, vgx2], { z8.h, z9.h }
// CHECK-ENCODING: [0x02,0x7d,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a47d02 <unknown>
fadd za.h[w11, 2], {z8.h - z9.h} // 11000001-10100100-01111101-00000010
// CHECK-INST: fadd za.h[w11, 2, vgx2], { z8.h, z9.h }
// CHECK-ENCODING: [0x02,0x7d,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a47d02 <unknown>
fadd za.h[w9, 7, vgx2], {z12.h, z13.h} // 11000001-10100100-00111101-10000111
// CHECK-INST: fadd za.h[w9, 7, vgx2], { z12.h, z13.h }
// CHECK-ENCODING: [0x87,0x3d,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a43d87 <unknown>
fadd za.h[w9, 7], {z12.h - z13.h} // 11000001-10100100-00111101-10000111
// CHECK-INST: fadd za.h[w9, 7, vgx2], { z12.h, z13.h }
// CHECK-ENCODING: [0x87,0x3d,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a43d87 <unknown>
fadd za.h[w8, 0, vgx4], {z0.h - z3.h} // 11000001-10100101-00011100-00000000
// CHECK-INST: fadd za.h[w8, 0, vgx4], { z0.h - z3.h }
// CHECK-ENCODING: [0x00,0x1c,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a51c00 <unknown>
fadd za.h[w8, 0], {z0.h - z3.h} // 11000001-10100101-00011100-00000000
// CHECK-INST: fadd za.h[w8, 0, vgx4], { z0.h - z3.h }
// CHECK-ENCODING: [0x00,0x1c,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a51c00 <unknown>
fadd za.h[w10, 5, vgx4], {z8.h - z11.h} // 11000001-10100101-01011101-00000101
// CHECK-INST: fadd za.h[w10, 5, vgx4], { z8.h - z11.h }
// CHECK-ENCODING: [0x05,0x5d,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a55d05 <unknown>
fadd za.h[w10, 5], {z8.h - z11.h} // 11000001-10100101-01011101-00000101
// CHECK-INST: fadd za.h[w10, 5, vgx4], { z8.h - z11.h }
// CHECK-ENCODING: [0x05,0x5d,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a55d05 <unknown>
fadd za.h[w11, 7, vgx4], {z12.h - z15.h} // 11000001-10100101-01111101-10000111
// CHECK-INST: fadd za.h[w11, 7, vgx4], { z12.h - z15.h }
// CHECK-ENCODING: [0x87,0x7d,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a57d87 <unknown>
fadd za.h[w11, 7], {z12.h - z15.h} // 11000001-10100101-01111101-10000111
// CHECK-INST: fadd za.h[w11, 7, vgx4], { z12.h - z15.h }
// CHECK-ENCODING: [0x87,0x7d,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a57d87 <unknown>
fadd za.h[w11, 7, vgx4], {z28.h - z31.h} // 11000001-10100101-01111111-10000111
// CHECK-INST: fadd za.h[w11, 7, vgx4], { z28.h - z31.h }
// CHECK-ENCODING: [0x87,0x7f,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a57f87 <unknown>
fadd za.h[w11, 7], {z28.h - z31.h} // 11000001-10100101-01111111-10000111
// CHECK-INST: fadd za.h[w11, 7, vgx4], { z28.h - z31.h }
// CHECK-ENCODING: [0x87,0x7f,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a57f87 <unknown>
fadd za.h[w8, 5, vgx4], {z16.h - z19.h} // 11000001-10100101-00011110-00000101
// CHECK-INST: fadd za.h[w8, 5, vgx4], { z16.h - z19.h }
// CHECK-ENCODING: [0x05,0x1e,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a51e05 <unknown>
fadd za.h[w8, 5], {z16.h - z19.h} // 11000001-10100101-00011110-00000101
// CHECK-INST: fadd za.h[w8, 5, vgx4], { z16.h - z19.h }
// CHECK-ENCODING: [0x05,0x1e,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a51e05 <unknown>
fadd za.h[w8, 1, vgx4], {z0.h - z3.h} // 11000001-10100101-00011100-00000001
// CHECK-INST: fadd za.h[w8, 1, vgx4], { z0.h - z3.h }
// CHECK-ENCODING: [0x01,0x1c,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a51c01 <unknown>
fadd za.h[w8, 1], {z0.h - z3.h} // 11000001-10100101-00011100-00000001
// CHECK-INST: fadd za.h[w8, 1, vgx4], { z0.h - z3.h }
// CHECK-ENCODING: [0x01,0x1c,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a51c01 <unknown>
fadd za.h[w10, 0, vgx4], {z16.h - z19.h} // 11000001-10100101-01011110-00000000
// CHECK-INST: fadd za.h[w10, 0, vgx4], { z16.h - z19.h }
// CHECK-ENCODING: [0x00,0x5e,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a55e00 <unknown>
fadd za.h[w10, 0], {z16.h - z19.h} // 11000001-10100101-01011110-00000000
// CHECK-INST: fadd za.h[w10, 0, vgx4], { z16.h - z19.h }
// CHECK-ENCODING: [0x00,0x5e,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a55e00 <unknown>
fadd za.h[w8, 0, vgx4], {z12.h - z15.h} // 11000001-10100101-00011101-10000000
// CHECK-INST: fadd za.h[w8, 0, vgx4], { z12.h - z15.h }
// CHECK-ENCODING: [0x80,0x1d,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a51d80 <unknown>
fadd za.h[w8, 0], {z12.h - z15.h} // 11000001-10100101-00011101-10000000
// CHECK-INST: fadd za.h[w8, 0, vgx4], { z12.h - z15.h }
// CHECK-ENCODING: [0x80,0x1d,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a51d80 <unknown>
fadd za.h[w10, 1, vgx4], {z0.h - z3.h} // 11000001-10100101-01011100-00000001
// CHECK-INST: fadd za.h[w10, 1, vgx4], { z0.h - z3.h }
// CHECK-ENCODING: [0x01,0x5c,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a55c01 <unknown>
fadd za.h[w10, 1], {z0.h - z3.h} // 11000001-10100101-01011100-00000001
// CHECK-INST: fadd za.h[w10, 1, vgx4], { z0.h - z3.h }
// CHECK-ENCODING: [0x01,0x5c,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a55c01 <unknown>
fadd za.h[w8, 5, vgx4], {z20.h - z23.h} // 11000001-10100101-00011110-10000101
// CHECK-INST: fadd za.h[w8, 5, vgx4], { z20.h - z23.h }
// CHECK-ENCODING: [0x85,0x1e,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a51e85 <unknown>
fadd za.h[w8, 5], {z20.h - z23.h} // 11000001-10100101-00011110-10000101
// CHECK-INST: fadd za.h[w8, 5, vgx4], { z20.h - z23.h }
// CHECK-ENCODING: [0x85,0x1e,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a51e85 <unknown>
fadd za.h[w11, 2, vgx4], {z8.h - z11.h} // 11000001-10100101-01111101-00000010
// CHECK-INST: fadd za.h[w11, 2, vgx4], { z8.h - z11.h }
// CHECK-ENCODING: [0x02,0x7d,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a57d02 <unknown>
fadd za.h[w11, 2], {z8.h - z11.h} // 11000001-10100101-01111101-00000010
// CHECK-INST: fadd za.h[w11, 2, vgx4], { z8.h - z11.h }
// CHECK-ENCODING: [0x02,0x7d,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a57d02 <unknown>
fadd za.h[w9, 7, vgx4], {z12.h - z15.h} // 11000001-10100101-00111101-10000111
// CHECK-INST: fadd za.h[w9, 7, vgx4], { z12.h - z15.h }
// CHECK-ENCODING: [0x87,0x3d,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a53d87 <unknown>
fadd za.h[w9, 7], {z12.h - z15.h} // 11000001-10100101-00111101-10000111
// CHECK-INST: fadd za.h[w9, 7, vgx4], { z12.h - z15.h }
// CHECK-ENCODING: [0x87,0x3d,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a53d87 <unknown>
diff --git a/llvm/test/MC/AArch64/SME2p1/fcvt.s b/llvm/test/MC/AArch64/SME2p1/fcvt.s
index b5707bad0a24..2731055dedec 100644
--- a/llvm/test/MC/AArch64/SME2p1/fcvt.s
+++ b/llvm/test/MC/AArch64/SME2p1/fcvt.s
@@ -1,36 +1,36 @@
-// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+sme-f16f16 < %s \
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f16f16 < %s \
// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
// RUN: | FileCheck %s --check-prefix=CHECK-ERROR
-// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p1,+sme-f16f16 < %s \
-// RUN: | llvm-objdump -d --mattr=+sme2p1,+sme-f16f16 - | FileCheck %s --check-prefix=CHECK-INST
-// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p1,+sme-f16f16 < %s \
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme-f16f16 < %s \
+// RUN: | llvm-objdump -d --mattr=+sme-f16f16 - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme-f16f16 < %s \
// RUN: | llvm-objdump -d --mattr=-sme2p1 - | FileCheck %s --check-prefix=CHECK-UNKNOWN
-// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+sme-f16f16 < %s \
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f16f16 < %s \
// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \
-// RUN: | llvm-mc -triple=aarch64 -mattr=+sme2p1,+sme-f16f16 -disassemble -show-encoding \
+// RUN: | llvm-mc -triple=aarch64 -mattr=+sme-f16f16 -disassemble -show-encoding \
// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
fcvt {z0.s, z1.s}, z0.h // 11000001-10100000-11100000-00000000
// CHECK-INST: fcvt { z0.s, z1.s }, z0.h
// CHECK-ENCODING: [0x00,0xe0,0xa0,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1a0e000 <unknown>
fcvt {z20.s, z21.s}, z10.h // 11000001-10100000-11100001-01010100
// CHECK-INST: fcvt { z20.s, z21.s }, z10.h
// CHECK-ENCODING: [0x54,0xe1,0xa0,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1a0e154 <unknown>
fcvt {z22.s, z23.s}, z13.h // 11000001-10100000-11100001-10110110
// CHECK-INST: fcvt { z22.s, z23.s }, z13.h
// CHECK-ENCODING: [0xb6,0xe1,0xa0,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1a0e1b6 <unknown>
fcvt {z30.s, z31.s}, z31.h // 11000001-10100000-11100011-11111110
// CHECK-INST: fcvt { z30.s, z31.s }, z31.h
// CHECK-ENCODING: [0xfe,0xe3,0xa0,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1a0e3fe <unknown>
diff --git a/llvm/test/MC/AArch64/SME2p1/fcvtl-diagnostics.s b/llvm/test/MC/AArch64/SME2p1/fcvtl-diagnostics.s
index a723d2fc6f3a..ad3eaba7bdc2 100644
--- a/llvm/test/MC/AArch64/SME2p1/fcvtl-diagnostics.s
+++ b/llvm/test/MC/AArch64/SME2p1/fcvtl-diagnostics.s
@@ -1,4 +1,4 @@
-// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+sme-f16f16 2>&1 < %s | FileCheck %s
+// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f16f16 2>&1 < %s | FileCheck %s
// --------------------------------------------------------------------------//
// Invalid vector list
diff --git a/llvm/test/MC/AArch64/SME2p1/fcvtl.s b/llvm/test/MC/AArch64/SME2p1/fcvtl.s
index 31cf90d03796..6284915e4983 100644
--- a/llvm/test/MC/AArch64/SME2p1/fcvtl.s
+++ b/llvm/test/MC/AArch64/SME2p1/fcvtl.s
@@ -1,36 +1,36 @@
-// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+sme-f16f16 < %s \
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f16f16 < %s \
// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
// RUN: | FileCheck %s --check-prefix=CHECK-ERROR
-// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p1,+sme-f16f16 < %s \
-// RUN: | llvm-objdump -d --mattr=+sme2p1,+sme-f16f16 - | FileCheck %s --check-prefix=CHECK-INST
-// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p1,+sme-f16f16 < %s \
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme-f16f16 < %s \
+// RUN: | llvm-objdump -d --mattr=+sme-f16f16 - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme-f16f16 < %s \
// RUN: | llvm-objdump -d --mattr=-sme2p1 - | FileCheck %s --check-prefix=CHECK-UNKNOWN
-// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+sme-f16f16 < %s \
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f16f16 < %s \
// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \
-// RUN: | llvm-mc -triple=aarch64 -mattr=+sme2p1,+sme-f16f16 -disassemble -show-encoding \
+// RUN: | llvm-mc -triple=aarch64 -mattr=+sme-f16f16 -disassemble -show-encoding \
// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
fcvtl {z0.s, z1.s}, z0.h // 11000001-10100000-11100000-00000001
// CHECK-INST: fcvtl { z0.s, z1.s }, z0.h
// CHECK-ENCODING: [0x01,0xe0,0xa0,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1a0e001 <unknown>
fcvtl {z20.s, z21.s}, z10.h // 11000001-10100000-11100001-01010101
// CHECK-INST: fcvtl { z20.s, z21.s }, z10.h
// CHECK-ENCODING: [0x55,0xe1,0xa0,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1a0e155 <unknown>
fcvtl {z22.s, z23.s}, z13.h // 11000001-10100000-11100001-10110111
// CHECK-INST: fcvtl { z22.s, z23.s }, z13.h
// CHECK-ENCODING: [0xb7,0xe1,0xa0,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1a0e1b7 <unknown>
fcvtl {z30.s, z31.s}, z31.h // 11000001-10100000-11100011-11111111
// CHECK-INST: fcvtl { z30.s, z31.s }, z31.h
// CHECK-ENCODING: [0xff,0xe3,0xa0,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1a0e3ff <unknown>
diff --git a/llvm/test/MC/AArch64/SME2p1/fmla-diagnostics.s b/llvm/test/MC/AArch64/SME2p1/fmla-diagnostics.s
index d32f795728a2..2f0dccb57c90 100644
--- a/llvm/test/MC/AArch64/SME2p1/fmla-diagnostics.s
+++ b/llvm/test/MC/AArch64/SME2p1/fmla-diagnostics.s
@@ -1,4 +1,4 @@
-// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+sme-f16f16 2>&1 < %s | FileCheck %s
+// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f16f16 2>&1 < %s | FileCheck %s
// --------------------------------------------------------------------------//
// Invalid vector list
diff --git a/llvm/test/MC/AArch64/SME2p1/fmla.s b/llvm/test/MC/AArch64/SME2p1/fmla.s
index 10529d81eed6..df9ac8076e56 100644
--- a/llvm/test/MC/AArch64/SME2p1/fmla.s
+++ b/llvm/test/MC/AArch64/SME2p1/fmla.s
@@ -1,877 +1,877 @@
-// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+sme-f16f16 < %s \
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f16f16 < %s \
// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
// RUN: | FileCheck %s --check-prefix=CHECK-ERROR
-// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p1,+sme-f16f16 < %s \
-// RUN: | llvm-objdump -d --mattr=+sme2p1,+sme-f16f16 - | FileCheck %s --check-prefix=CHECK-INST
-// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p1,+sme-f16f16 < %s \
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme-f16f16 < %s \
+// RUN: | llvm-objdump -d --mattr=+sme-f16f16 - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme-f16f16 < %s \
// RUN: | llvm-objdump -d --mattr=-sme2p1 - | FileCheck %s --check-prefix=CHECK-UNKNOWN
-// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+sme-f16f16 < %s \
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f16f16 < %s \
// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \
-// RUN: | llvm-mc -triple=aarch64 -mattr=+sme2p1,+sme-f16f16 -disassemble -show-encoding \
+// RUN: | llvm-mc -triple=aarch64 -mattr=+sme-f16f16 -disassemble -show-encoding \
// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
fmla za.h[w8, 0, vgx2], {z0.h, z1.h}, z0.h // 11000001-00100000-00011100-00000000
// CHECK-INST: fmla za.h[w8, 0, vgx2], { z0.h, z1.h }, z0.h
// CHECK-ENCODING: [0x00,0x1c,0x20,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1201c00 <unknown>
fmla za.h[w8, 0], {z0.h - z1.h}, z0.h // 11000001-00100000-00011100-00000000
// CHECK-INST: fmla za.h[w8, 0, vgx2], { z0.h, z1.h }, z0.h
// CHECK-ENCODING: [0x00,0x1c,0x20,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1201c00 <unknown>
fmla za.h[w10, 5, vgx2], {z10.h, z11.h}, z5.h // 11000001-00100101-01011101-01000101
// CHECK-INST: fmla za.h[w10, 5, vgx2], { z10.h, z11.h }, z5.h
// CHECK-ENCODING: [0x45,0x5d,0x25,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1255d45 <unknown>
fmla za.h[w10, 5], {z10.h - z11.h}, z5.h // 11000001-00100101-01011101-01000101
// CHECK-INST: fmla za.h[w10, 5, vgx2], { z10.h, z11.h }, z5.h
// CHECK-ENCODING: [0x45,0x5d,0x25,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1255d45 <unknown>
fmla za.h[w11, 7, vgx2], {z13.h, z14.h}, z8.h // 11000001-00101000-01111101-10100111
// CHECK-INST: fmla za.h[w11, 7, vgx2], { z13.h, z14.h }, z8.h
// CHECK-ENCODING: [0xa7,0x7d,0x28,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1287da7 <unknown>
fmla za.h[w11, 7], {z13.h - z14.h}, z8.h // 11000001-00101000-01111101-10100111
// CHECK-INST: fmla za.h[w11, 7, vgx2], { z13.h, z14.h }, z8.h
// CHECK-ENCODING: [0xa7,0x7d,0x28,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1287da7 <unknown>
fmla za.h[w11, 7, vgx2], {z31.h, z0.h}, z15.h // 11000001-00101111-01111111-11100111
// CHECK-INST: fmla za.h[w11, 7, vgx2], { z31.h, z0.h }, z15.h
// CHECK-ENCODING: [0xe7,0x7f,0x2f,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c12f7fe7 <unknown>
fmla za.h[w11, 7], {z31.h - z0.h}, z15.h // 11000001-00101111-01111111-11100111
// CHECK-INST: fmla za.h[w11, 7, vgx2], { z31.h, z0.h }, z15.h
// CHECK-ENCODING: [0xe7,0x7f,0x2f,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c12f7fe7 <unknown>
fmla za.h[w8, 5, vgx2], {z17.h, z18.h}, z0.h // 11000001-00100000-00011110-00100101
// CHECK-INST: fmla za.h[w8, 5, vgx2], { z17.h, z18.h }, z0.h
// CHECK-ENCODING: [0x25,0x1e,0x20,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1201e25 <unknown>
fmla za.h[w8, 5], {z17.h - z18.h}, z0.h // 11000001-00100000-00011110-00100101
// CHECK-INST: fmla za.h[w8, 5, vgx2], { z17.h, z18.h }, z0.h
// CHECK-ENCODING: [0x25,0x1e,0x20,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1201e25 <unknown>
fmla za.h[w8, 1, vgx2], {z1.h, z2.h}, z14.h // 11000001-00101110-00011100-00100001
// CHECK-INST: fmla za.h[w8, 1, vgx2], { z1.h, z2.h }, z14.h
// CHECK-ENCODING: [0x21,0x1c,0x2e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c12e1c21 <unknown>
fmla za.h[w8, 1], {z1.h - z2.h}, z14.h // 11000001-00101110-00011100-00100001
// CHECK-INST: fmla za.h[w8, 1, vgx2], { z1.h, z2.h }, z14.h
// CHECK-ENCODING: [0x21,0x1c,0x2e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c12e1c21 <unknown>
fmla za.h[w10, 0, vgx2], {z19.h, z20.h}, z4.h // 11000001-00100100-01011110-01100000
// CHECK-INST: fmla za.h[w10, 0, vgx2], { z19.h, z20.h }, z4.h
// CHECK-ENCODING: [0x60,0x5e,0x24,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1245e60 <unknown>
fmla za.h[w10, 0], {z19.h - z20.h}, z4.h // 11000001-00100100-01011110-01100000
// CHECK-INST: fmla za.h[w10, 0, vgx2], { z19.h, z20.h }, z4.h
// CHECK-ENCODING: [0x60,0x5e,0x24,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1245e60 <unknown>
fmla za.h[w8, 0, vgx2], {z12.h, z13.h}, z2.h // 11000001-00100010-00011101-10000000
// CHECK-INST: fmla za.h[w8, 0, vgx2], { z12.h, z13.h }, z2.h
// CHECK-ENCODING: [0x80,0x1d,0x22,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1221d80 <unknown>
fmla za.h[w8, 0], {z12.h - z13.h}, z2.h // 11000001-00100010-00011101-10000000
// CHECK-INST: fmla za.h[w8, 0, vgx2], { z12.h, z13.h }, z2.h
// CHECK-ENCODING: [0x80,0x1d,0x22,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1221d80 <unknown>
fmla za.h[w10, 1, vgx2], {z1.h, z2.h}, z10.h // 11000001-00101010-01011100-00100001
// CHECK-INST: fmla za.h[w10, 1, vgx2], { z1.h, z2.h }, z10.h
// CHECK-ENCODING: [0x21,0x5c,0x2a,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c12a5c21 <unknown>
fmla za.h[w10, 1], {z1.h - z2.h}, z10.h // 11000001-00101010-01011100-00100001
// CHECK-INST: fmla za.h[w10, 1, vgx2], { z1.h, z2.h }, z10.h
// CHECK-ENCODING: [0x21,0x5c,0x2a,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c12a5c21 <unknown>
fmla za.h[w8, 5, vgx2], {z22.h, z23.h}, z14.h // 11000001-00101110-00011110-11000101
// CHECK-INST: fmla za.h[w8, 5, vgx2], { z22.h, z23.h }, z14.h
// CHECK-ENCODING: [0xc5,0x1e,0x2e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c12e1ec5 <unknown>
fmla za.h[w8, 5], {z22.h - z23.h}, z14.h // 11000001-00101110-00011110-11000101
// CHECK-INST: fmla za.h[w8, 5, vgx2], { z22.h, z23.h }, z14.h
// CHECK-ENCODING: [0xc5,0x1e,0x2e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c12e1ec5 <unknown>
fmla za.h[w11, 2, vgx2], {z9.h, z10.h}, z1.h // 11000001-00100001-01111101-00100010
// CHECK-INST: fmla za.h[w11, 2, vgx2], { z9.h, z10.h }, z1.h
// CHECK-ENCODING: [0x22,0x7d,0x21,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1217d22 <unknown>
fmla za.h[w11, 2], {z9.h - z10.h}, z1.h // 11000001-00100001-01111101-00100010
// CHECK-INST: fmla za.h[w11, 2, vgx2], { z9.h, z10.h }, z1.h
// CHECK-ENCODING: [0x22,0x7d,0x21,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1217d22 <unknown>
fmla za.h[w9, 7, vgx2], {z12.h, z13.h}, z11.h // 11000001-00101011-00111101-10000111
// CHECK-INST: fmla za.h[w9, 7, vgx2], { z12.h, z13.h }, z11.h
// CHECK-ENCODING: [0x87,0x3d,0x2b,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c12b3d87 <unknown>
fmla za.h[w9, 7], {z12.h - z13.h}, z11.h // 11000001-00101011-00111101-10000111
// CHECK-INST: fmla za.h[w9, 7, vgx2], { z12.h, z13.h }, z11.h
// CHECK-ENCODING: [0x87,0x3d,0x2b,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c12b3d87 <unknown>
fmla za.h[w8, 0, vgx2], {z0.h, z1.h}, z0.h[0] // 11000001-00010000-00010000-00000000
// CHECK-INST: fmla za.h[w8, 0, vgx2], { z0.h, z1.h }, z0.h[0]
// CHECK-ENCODING: [0x00,0x10,0x10,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1101000 <unknown>
fmla za.h[w8, 0], {z0.h - z1.h}, z0.h[0] // 11000001-00010000-00010000-00000000
// CHECK-INST: fmla za.h[w8, 0, vgx2], { z0.h, z1.h }, z0.h[0]
// CHECK-ENCODING: [0x00,0x10,0x10,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1101000 <unknown>
fmla za.h[w10, 5, vgx2], {z10.h, z11.h}, z5.h[2] // 11000001-00010101-01010101-01000101
// CHECK-INST: fmla za.h[w10, 5, vgx2], { z10.h, z11.h }, z5.h[2]
// CHECK-ENCODING: [0x45,0x55,0x15,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1155545 <unknown>
fmla za.h[w10, 5], {z10.h - z11.h}, z5.h[2] // 11000001-00010101-01010101-01000101
// CHECK-INST: fmla za.h[w10, 5, vgx2], { z10.h, z11.h }, z5.h[2]
// CHECK-ENCODING: [0x45,0x55,0x15,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1155545 <unknown>
fmla za.h[w11, 7, vgx2], {z12.h, z13.h}, z8.h[6] // 11000001-00011000-01111101-10000111
// CHECK-INST: fmla za.h[w11, 7, vgx2], { z12.h, z13.h }, z8.h[6]
// CHECK-ENCODING: [0x87,0x7d,0x18,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1187d87 <unknown>
fmla za.h[w11, 7], {z12.h - z13.h}, z8.h[6] // 11000001-00011000-01111101-10000111
// CHECK-INST: fmla za.h[w11, 7, vgx2], { z12.h, z13.h }, z8.h[6]
// CHECK-ENCODING: [0x87,0x7d,0x18,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1187d87 <unknown>
fmla za.h[w11, 7, vgx2], {z30.h, z31.h}, z15.h[7] // 11000001-00011111-01111111-11001111
// CHECK-INST: fmla za.h[w11, 7, vgx2], { z30.h, z31.h }, z15.h[7]
// CHECK-ENCODING: [0xcf,0x7f,0x1f,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c11f7fcf <unknown>
fmla za.h[w11, 7], {z30.h - z31.h}, z15.h[7] // 11000001-00011111-01111111-11001111
// CHECK-INST: fmla za.h[w11, 7, vgx2], { z30.h, z31.h }, z15.h[7]
// CHECK-ENCODING: [0xcf,0x7f,0x1f,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c11f7fcf <unknown>
fmla za.h[w8, 5, vgx2], {z16.h, z17.h}, z0.h[6] // 11000001-00010000-00011110-00000101
// CHECK-INST: fmla za.h[w8, 5, vgx2], { z16.h, z17.h }, z0.h[6]
// CHECK-ENCODING: [0x05,0x1e,0x10,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1101e05 <unknown>
fmla za.h[w8, 5], {z16.h - z17.h}, z0.h[6] // 11000001-00010000-00011110-00000101
// CHECK-INST: fmla za.h[w8, 5, vgx2], { z16.h, z17.h }, z0.h[6]
// CHECK-ENCODING: [0x05,0x1e,0x10,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1101e05 <unknown>
fmla za.h[w8, 1, vgx2], {z0.h, z1.h}, z14.h[2] // 11000001-00011110-00010100-00000001
// CHECK-INST: fmla za.h[w8, 1, vgx2], { z0.h, z1.h }, z14.h[2]
// CHECK-ENCODING: [0x01,0x14,0x1e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c11e1401 <unknown>
fmla za.h[w8, 1], {z0.h - z1.h}, z14.h[2] // 11000001-00011110-00010100-00000001
// CHECK-INST: fmla za.h[w8, 1, vgx2], { z0.h, z1.h }, z14.h[2]
// CHECK-ENCODING: [0x01,0x14,0x1e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c11e1401 <unknown>
fmla za.h[w10, 0, vgx2], {z18.h, z19.h}, z4.h[3] // 11000001-00010100-01010110-01001000
// CHECK-INST: fmla za.h[w10, 0, vgx2], { z18.h, z19.h }, z4.h[3]
// CHECK-ENCODING: [0x48,0x56,0x14,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1145648 <unknown>
fmla za.h[w10, 0], {z18.h - z19.h}, z4.h[3] // 11000001-00010100-01010110-01001000
// CHECK-INST: fmla za.h[w10, 0, vgx2], { z18.h, z19.h }, z4.h[3]
// CHECK-ENCODING: [0x48,0x56,0x14,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1145648 <unknown>
fmla za.h[w8, 0, vgx2], {z12.h, z13.h}, z2.h[4] // 11000001-00010010-00011001-10000000
// CHECK-INST: fmla za.h[w8, 0, vgx2], { z12.h, z13.h }, z2.h[4]
// CHECK-ENCODING: [0x80,0x19,0x12,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1121980 <unknown>
fmla za.h[w8, 0], {z12.h - z13.h}, z2.h[4] // 11000001-00010010-00011001-10000000
// CHECK-INST: fmla za.h[w8, 0, vgx2], { z12.h, z13.h }, z2.h[4]
// CHECK-ENCODING: [0x80,0x19,0x12,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1121980 <unknown>
fmla za.h[w10, 1, vgx2], {z0.h, z1.h}, z10.h[4] // 11000001-00011010-01011000-00000001
// CHECK-INST: fmla za.h[w10, 1, vgx2], { z0.h, z1.h }, z10.h[4]
// CHECK-ENCODING: [0x01,0x58,0x1a,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c11a5801 <unknown>
fmla za.h[w10, 1], {z0.h - z1.h}, z10.h[4] // 11000001-00011010-01011000-00000001
// CHECK-INST: fmla za.h[w10, 1, vgx2], { z0.h, z1.h }, z10.h[4]
// CHECK-ENCODING: [0x01,0x58,0x1a,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c11a5801 <unknown>
fmla za.h[w8, 5, vgx2], {z22.h, z23.h}, z14.h[5] // 11000001-00011110-00011010-11001101
// CHECK-INST: fmla za.h[w8, 5, vgx2], { z22.h, z23.h }, z14.h[5]
// CHECK-ENCODING: [0xcd,0x1a,0x1e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c11e1acd <unknown>
fmla za.h[w8, 5], {z22.h - z23.h}, z14.h[5] // 11000001-00011110-00011010-11001101
// CHECK-INST: fmla za.h[w8, 5, vgx2], { z22.h, z23.h }, z14.h[5]
// CHECK-ENCODING: [0xcd,0x1a,0x1e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c11e1acd <unknown>
fmla za.h[w11, 2, vgx2], {z8.h, z9.h}, z1.h[2] // 11000001-00010001-01110101-00000010
// CHECK-INST: fmla za.h[w11, 2, vgx2], { z8.h, z9.h }, z1.h[2]
// CHECK-ENCODING: [0x02,0x75,0x11,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1117502 <unknown>
fmla za.h[w11, 2], {z8.h - z9.h}, z1.h[2] // 11000001-00010001-01110101-00000010
// CHECK-INST: fmla za.h[w11, 2, vgx2], { z8.h, z9.h }, z1.h[2]
// CHECK-ENCODING: [0x02,0x75,0x11,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1117502 <unknown>
fmla za.h[w9, 7, vgx2], {z12.h, z13.h}, z11.h[4] // 11000001-00011011-00111001-10000111
// CHECK-INST: fmla za.h[w9, 7, vgx2], { z12.h, z13.h }, z11.h[4]
// CHECK-ENCODING: [0x87,0x39,0x1b,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c11b3987 <unknown>
fmla za.h[w9, 7], {z12.h - z13.h}, z11.h[4] // 11000001-00011011-00111001-10000111
// CHECK-INST: fmla za.h[w9, 7, vgx2], { z12.h, z13.h }, z11.h[4]
// CHECK-ENCODING: [0x87,0x39,0x1b,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c11b3987 <unknown>
fmla za.h[w8, 0, vgx2], {z0.h, z1.h}, {z0.h, z1.h} // 11000001-10100000-00010000-00001000
// CHECK-INST: fmla za.h[w8, 0, vgx2], { z0.h, z1.h }, { z0.h, z1.h }
// CHECK-ENCODING: [0x08,0x10,0xa0,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1a01008 <unknown>
fmla za.h[w8, 0], {z0.h - z1.h}, {z0.h - z1.h} // 11000001-10100000-00010000-00001000
// CHECK-INST: fmla za.h[w8, 0, vgx2], { z0.h, z1.h }, { z0.h, z1.h }
// CHECK-ENCODING: [0x08,0x10,0xa0,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1a01008 <unknown>
fmla za.h[w10, 5, vgx2], {z10.h, z11.h}, {z20.h, z21.h} // 11000001-10110100-01010001-01001101
// CHECK-INST: fmla za.h[w10, 5, vgx2], { z10.h, z11.h }, { z20.h, z21.h }
// CHECK-ENCODING: [0x4d,0x51,0xb4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1b4514d <unknown>
fmla za.h[w10, 5], {z10.h - z11.h}, {z20.h - z21.h} // 11000001-10110100-01010001-01001101
// CHECK-INST: fmla za.h[w10, 5, vgx2], { z10.h, z11.h }, { z20.h, z21.h }
// CHECK-ENCODING: [0x4d,0x51,0xb4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1b4514d <unknown>
fmla za.h[w11, 7, vgx2], {z12.h, z13.h}, {z8.h, z9.h} // 11000001-10101000-01110001-10001111
// CHECK-INST: fmla za.h[w11, 7, vgx2], { z12.h, z13.h }, { z8.h, z9.h }
// CHECK-ENCODING: [0x8f,0x71,0xa8,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1a8718f <unknown>
fmla za.h[w11, 7], {z12.h - z13.h}, {z8.h - z9.h} // 11000001-10101000-01110001-10001111
// CHECK-INST: fmla za.h[w11, 7, vgx2], { z12.h, z13.h }, { z8.h, z9.h }
// CHECK-ENCODING: [0x8f,0x71,0xa8,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1a8718f <unknown>
fmla za.h[w11, 7, vgx2], {z30.h, z31.h}, {z30.h, z31.h} // 11000001-10111110-01110011-11001111
// CHECK-INST: fmla za.h[w11, 7, vgx2], { z30.h, z31.h }, { z30.h, z31.h }
// CHECK-ENCODING: [0xcf,0x73,0xbe,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1be73cf <unknown>
fmla za.h[w11, 7], {z30.h - z31.h}, {z30.h - z31.h} // 11000001-10111110-01110011-11001111
// CHECK-INST: fmla za.h[w11, 7, vgx2], { z30.h, z31.h }, { z30.h, z31.h }
// CHECK-ENCODING: [0xcf,0x73,0xbe,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1be73cf <unknown>
fmla za.h[w8, 5, vgx2], {z16.h, z17.h}, {z16.h, z17.h} // 11000001-10110000-00010010-00001101
// CHECK-INST: fmla za.h[w8, 5, vgx2], { z16.h, z17.h }, { z16.h, z17.h }
// CHECK-ENCODING: [0x0d,0x12,0xb0,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1b0120d <unknown>
fmla za.h[w8, 5], {z16.h - z17.h}, {z16.h - z17.h} // 11000001-10110000-00010010-00001101
// CHECK-INST: fmla za.h[w8, 5, vgx2], { z16.h, z17.h }, { z16.h, z17.h }
// CHECK-ENCODING: [0x0d,0x12,0xb0,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1b0120d <unknown>
fmla za.h[w8, 1, vgx2], {z0.h, z1.h}, {z30.h, z31.h} // 11000001-10111110-00010000-00001001
// CHECK-INST: fmla za.h[w8, 1, vgx2], { z0.h, z1.h }, { z30.h, z31.h }
// CHECK-ENCODING: [0x09,0x10,0xbe,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1be1009 <unknown>
fmla za.h[w8, 1], {z0.h - z1.h}, {z30.h - z31.h} // 11000001-10111110-00010000-00001001
// CHECK-INST: fmla za.h[w8, 1, vgx2], { z0.h, z1.h }, { z30.h, z31.h }
// CHECK-ENCODING: [0x09,0x10,0xbe,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1be1009 <unknown>
fmla za.h[w10, 0, vgx2], {z18.h, z19.h}, {z20.h, z21.h} // 11000001-10110100-01010010-01001000
// CHECK-INST: fmla za.h[w10, 0, vgx2], { z18.h, z19.h }, { z20.h, z21.h }
// CHECK-ENCODING: [0x48,0x52,0xb4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1b45248 <unknown>
fmla za.h[w10, 0], {z18.h - z19.h}, {z20.h - z21.h} // 11000001-10110100-01010010-01001000
// CHECK-INST: fmla za.h[w10, 0, vgx2], { z18.h, z19.h }, { z20.h, z21.h }
// CHECK-ENCODING: [0x48,0x52,0xb4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1b45248 <unknown>
fmla za.h[w8, 0, vgx2], {z12.h, z13.h}, {z2.h, z3.h} // 11000001-10100010-00010001-10001000
// CHECK-INST: fmla za.h[w8, 0, vgx2], { z12.h, z13.h }, { z2.h, z3.h }
// CHECK-ENCODING: [0x88,0x11,0xa2,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1a21188 <unknown>
fmla za.h[w8, 0], {z12.h - z13.h}, {z2.h - z3.h} // 11000001-10100010-00010001-10001000
// CHECK-INST: fmla za.h[w8, 0, vgx2], { z12.h, z13.h }, { z2.h, z3.h }
// CHECK-ENCODING: [0x88,0x11,0xa2,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1a21188 <unknown>
fmla za.h[w10, 1, vgx2], {z0.h, z1.h}, {z26.h, z27.h} // 11000001-10111010-01010000-00001001
// CHECK-INST: fmla za.h[w10, 1, vgx2], { z0.h, z1.h }, { z26.h, z27.h }
// CHECK-ENCODING: [0x09,0x50,0xba,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1ba5009 <unknown>
fmla za.h[w10, 1], {z0.h - z1.h}, {z26.h - z27.h} // 11000001-10111010-01010000-00001001
// CHECK-INST: fmla za.h[w10, 1, vgx2], { z0.h, z1.h }, { z26.h, z27.h }
// CHECK-ENCODING: [0x09,0x50,0xba,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1ba5009 <unknown>
fmla za.h[w8, 5, vgx2], {z22.h, z23.h}, {z30.h, z31.h} // 11000001-10111110-00010010-11001101
// CHECK-INST: fmla za.h[w8, 5, vgx2], { z22.h, z23.h }, { z30.h, z31.h }
// CHECK-ENCODING: [0xcd,0x12,0xbe,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1be12cd <unknown>
fmla za.h[w8, 5], {z22.h - z23.h}, {z30.h - z31.h} // 11000001-10111110-00010010-11001101
// CHECK-INST: fmla za.h[w8, 5, vgx2], { z22.h, z23.h }, { z30.h, z31.h }
// CHECK-ENCODING: [0xcd,0x12,0xbe,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1be12cd <unknown>
fmla za.h[w11, 2, vgx2], {z8.h, z9.h}, {z0.h, z1.h} // 11000001-10100000-01110001-00001010
// CHECK-INST: fmla za.h[w11, 2, vgx2], { z8.h, z9.h }, { z0.h, z1.h }
// CHECK-ENCODING: [0x0a,0x71,0xa0,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1a0710a <unknown>
fmla za.h[w11, 2], {z8.h - z9.h}, {z0.h - z1.h} // 11000001-10100000-01110001-00001010
// CHECK-INST: fmla za.h[w11, 2, vgx2], { z8.h, z9.h }, { z0.h, z1.h }
// CHECK-ENCODING: [0x0a,0x71,0xa0,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1a0710a <unknown>
fmla za.h[w9, 7, vgx2], {z12.h, z13.h}, {z10.h, z11.h} // 11000001-10101010-00110001-10001111
// CHECK-INST: fmla za.h[w9, 7, vgx2], { z12.h, z13.h }, { z10.h, z11.h }
// CHECK-ENCODING: [0x8f,0x31,0xaa,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1aa318f <unknown>
fmla za.h[w9, 7], {z12.h - z13.h}, {z10.h - z11.h} // 11000001-10101010-00110001-10001111
// CHECK-INST: fmla za.h[w9, 7, vgx2], { z12.h, z13.h }, { z10.h, z11.h }
// CHECK-ENCODING: [0x8f,0x31,0xaa,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1aa318f <unknown>
fmla za.h[w8, 0, vgx4], {z0.h - z3.h}, z0.h // 11000001-00110000-00011100-00000000
// CHECK-INST: fmla za.h[w8, 0, vgx4], { z0.h - z3.h }, z0.h
// CHECK-ENCODING: [0x00,0x1c,0x30,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1301c00 <unknown>
fmla za.h[w8, 0], {z0.h - z3.h}, z0.h // 11000001-00110000-00011100-00000000
// CHECK-INST: fmla za.h[w8, 0, vgx4], { z0.h - z3.h }, z0.h
// CHECK-ENCODING: [0x00,0x1c,0x30,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1301c00 <unknown>
fmla za.h[w10, 5, vgx4], {z10.h - z13.h}, z5.h // 11000001-00110101-01011101-01000101
// CHECK-INST: fmla za.h[w10, 5, vgx4], { z10.h - z13.h }, z5.h
// CHECK-ENCODING: [0x45,0x5d,0x35,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1355d45 <unknown>
fmla za.h[w10, 5], {z10.h - z13.h}, z5.h // 11000001-00110101-01011101-01000101
// CHECK-INST: fmla za.h[w10, 5, vgx4], { z10.h - z13.h }, z5.h
// CHECK-ENCODING: [0x45,0x5d,0x35,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1355d45 <unknown>
fmla za.h[w11, 7, vgx4], {z13.h - z16.h}, z8.h // 11000001-00111000-01111101-10100111
// CHECK-INST: fmla za.h[w11, 7, vgx4], { z13.h - z16.h }, z8.h
// CHECK-ENCODING: [0xa7,0x7d,0x38,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1387da7 <unknown>
fmla za.h[w11, 7], {z13.h - z16.h}, z8.h // 11000001-00111000-01111101-10100111
// CHECK-INST: fmla za.h[w11, 7, vgx4], { z13.h - z16.h }, z8.h
// CHECK-ENCODING: [0xa7,0x7d,0x38,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1387da7 <unknown>
fmla za.h[w11, 7, vgx4], {z31.h, z0.h, z1.h, z2.h}, z15.h // 11000001-00111111-01111111-11100111
// CHECK-INST: fmla za.h[w11, 7, vgx4], { z31.h, z0.h, z1.h, z2.h }, z15.h
// CHECK-ENCODING: [0xe7,0x7f,0x3f,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c13f7fe7 <unknown>
fmla za.h[w11, 7], {z31.h, z0.h, z1.h, z2.h}, z15.h // 11000001-00111111-01111111-11100111
// CHECK-INST: fmla za.h[w11, 7, vgx4], { z31.h, z0.h, z1.h, z2.h }, z15.h
// CHECK-ENCODING: [0xe7,0x7f,0x3f,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c13f7fe7 <unknown>
fmla za.h[w8, 5, vgx4], {z17.h - z20.h}, z0.h // 11000001-00110000-00011110-00100101
// CHECK-INST: fmla za.h[w8, 5, vgx4], { z17.h - z20.h }, z0.h
// CHECK-ENCODING: [0x25,0x1e,0x30,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1301e25 <unknown>
fmla za.h[w8, 5], {z17.h - z20.h}, z0.h // 11000001-00110000-00011110-00100101
// CHECK-INST: fmla za.h[w8, 5, vgx4], { z17.h - z20.h }, z0.h
// CHECK-ENCODING: [0x25,0x1e,0x30,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1301e25 <unknown>
fmla za.h[w8, 1, vgx4], {z1.h - z4.h}, z14.h // 11000001-00111110-00011100-00100001
// CHECK-INST: fmla za.h[w8, 1, vgx4], { z1.h - z4.h }, z14.h
// CHECK-ENCODING: [0x21,0x1c,0x3e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c13e1c21 <unknown>
fmla za.h[w8, 1], {z1.h - z4.h}, z14.h // 11000001-00111110-00011100-00100001
// CHECK-INST: fmla za.h[w8, 1, vgx4], { z1.h - z4.h }, z14.h
// CHECK-ENCODING: [0x21,0x1c,0x3e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c13e1c21 <unknown>
fmla za.h[w10, 0, vgx4], {z19.h - z22.h}, z4.h // 11000001-00110100-01011110-01100000
// CHECK-INST: fmla za.h[w10, 0, vgx4], { z19.h - z22.h }, z4.h
// CHECK-ENCODING: [0x60,0x5e,0x34,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1345e60 <unknown>
fmla za.h[w10, 0], {z19.h - z22.h}, z4.h // 11000001-00110100-01011110-01100000
// CHECK-INST: fmla za.h[w10, 0, vgx4], { z19.h - z22.h }, z4.h
// CHECK-ENCODING: [0x60,0x5e,0x34,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1345e60 <unknown>
fmla za.h[w8, 0, vgx4], {z12.h - z15.h}, z2.h // 11000001-00110010-00011101-10000000
// CHECK-INST: fmla za.h[w8, 0, vgx4], { z12.h - z15.h }, z2.h
// CHECK-ENCODING: [0x80,0x1d,0x32,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1321d80 <unknown>
fmla za.h[w8, 0], {z12.h - z15.h}, z2.h // 11000001-00110010-00011101-10000000
// CHECK-INST: fmla za.h[w8, 0, vgx4], { z12.h - z15.h }, z2.h
// CHECK-ENCODING: [0x80,0x1d,0x32,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1321d80 <unknown>
fmla za.h[w10, 1, vgx4], {z1.h - z4.h}, z10.h // 11000001-00111010-01011100-00100001
// CHECK-INST: fmla za.h[w10, 1, vgx4], { z1.h - z4.h }, z10.h
// CHECK-ENCODING: [0x21,0x5c,0x3a,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c13a5c21 <unknown>
fmla za.h[w10, 1], {z1.h - z4.h}, z10.h // 11000001-00111010-01011100-00100001
// CHECK-INST: fmla za.h[w10, 1, vgx4], { z1.h - z4.h }, z10.h
// CHECK-ENCODING: [0x21,0x5c,0x3a,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c13a5c21 <unknown>
fmla za.h[w8, 5, vgx4], {z22.h - z25.h}, z14.h // 11000001-00111110-00011110-11000101
// CHECK-INST: fmla za.h[w8, 5, vgx4], { z22.h - z25.h }, z14.h
// CHECK-ENCODING: [0xc5,0x1e,0x3e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c13e1ec5 <unknown>
fmla za.h[w8, 5], {z22.h - z25.h}, z14.h // 11000001-00111110-00011110-11000101
// CHECK-INST: fmla za.h[w8, 5, vgx4], { z22.h - z25.h }, z14.h
// CHECK-ENCODING: [0xc5,0x1e,0x3e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c13e1ec5 <unknown>
fmla za.h[w11, 2, vgx4], {z9.h - z12.h}, z1.h // 11000001-00110001-01111101-00100010
// CHECK-INST: fmla za.h[w11, 2, vgx4], { z9.h - z12.h }, z1.h
// CHECK-ENCODING: [0x22,0x7d,0x31,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1317d22 <unknown>
fmla za.h[w11, 2], {z9.h - z12.h}, z1.h // 11000001-00110001-01111101-00100010
// CHECK-INST: fmla za.h[w11, 2, vgx4], { z9.h - z12.h }, z1.h
// CHECK-ENCODING: [0x22,0x7d,0x31,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1317d22 <unknown>
fmla za.h[w9, 7, vgx4], {z12.h - z15.h}, z11.h // 11000001-00111011-00111101-10000111
// CHECK-INST: fmla za.h[w9, 7, vgx4], { z12.h - z15.h }, z11.h
// CHECK-ENCODING: [0x87,0x3d,0x3b,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c13b3d87 <unknown>
fmla za.h[w9, 7], {z12.h - z15.h}, z11.h // 11000001-00111011-00111101-10000111
// CHECK-INST: fmla za.h[w9, 7, vgx4], { z12.h - z15.h }, z11.h
// CHECK-ENCODING: [0x87,0x3d,0x3b,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c13b3d87 <unknown>
fmla za.h[w8, 0, vgx4], {z0.h - z3.h}, z0.h[0] // 11000001-00010000-10010000-00000000
// CHECK-INST: fmla za.h[w8, 0, vgx4], { z0.h - z3.h }, z0.h[0]
// CHECK-ENCODING: [0x00,0x90,0x10,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1109000 <unknown>
fmla za.h[w8, 0], {z0.h - z3.h}, z0.h[0] // 11000001-00010000-10010000-00000000
// CHECK-INST: fmla za.h[w8, 0, vgx4], { z0.h - z3.h }, z0.h[0]
// CHECK-ENCODING: [0x00,0x90,0x10,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1109000 <unknown>
fmla za.h[w10, 5, vgx4], {z8.h - z11.h}, z5.h[2] // 11000001-00010101-11010101-00000101
// CHECK-INST: fmla za.h[w10, 5, vgx4], { z8.h - z11.h }, z5.h[2]
// CHECK-ENCODING: [0x05,0xd5,0x15,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c115d505 <unknown>
fmla za.h[w10, 5], {z8.h - z11.h}, z5.h[2] // 11000001-00010101-11010101-00000101
// CHECK-INST: fmla za.h[w10, 5, vgx4], { z8.h - z11.h }, z5.h[2]
// CHECK-ENCODING: [0x05,0xd5,0x15,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c115d505 <unknown>
fmla za.h[w11, 7, vgx4], {z12.h - z15.h}, z8.h[6] // 11000001-00011000-11111101-10000111
// CHECK-INST: fmla za.h[w11, 7, vgx4], { z12.h - z15.h }, z8.h[6]
// CHECK-ENCODING: [0x87,0xfd,0x18,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c118fd87 <unknown>
fmla za.h[w11, 7], {z12.h - z15.h}, z8.h[6] // 11000001-00011000-11111101-10000111
// CHECK-INST: fmla za.h[w11, 7, vgx4], { z12.h - z15.h }, z8.h[6]
// CHECK-ENCODING: [0x87,0xfd,0x18,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c118fd87 <unknown>
fmla za.h[w11, 7, vgx4], {z28.h - z31.h}, z15.h[7] // 11000001-00011111-11111111-10001111
// CHECK-INST: fmla za.h[w11, 7, vgx4], { z28.h - z31.h }, z15.h[7]
// CHECK-ENCODING: [0x8f,0xff,0x1f,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c11fff8f <unknown>
fmla za.h[w11, 7], {z28.h - z31.h}, z15.h[7] // 11000001-00011111-11111111-10001111
// CHECK-INST: fmla za.h[w11, 7, vgx4], { z28.h - z31.h }, z15.h[7]
// CHECK-ENCODING: [0x8f,0xff,0x1f,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c11fff8f <unknown>
fmla za.h[w8, 5, vgx4], {z16.h - z19.h}, z0.h[6] // 11000001-00010000-10011110-00000101
// CHECK-INST: fmla za.h[w8, 5, vgx4], { z16.h - z19.h }, z0.h[6]
// CHECK-ENCODING: [0x05,0x9e,0x10,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1109e05 <unknown>
fmla za.h[w8, 5], {z16.h - z19.h}, z0.h[6] // 11000001-00010000-10011110-00000101
// CHECK-INST: fmla za.h[w8, 5, vgx4], { z16.h - z19.h }, z0.h[6]
// CHECK-ENCODING: [0x05,0x9e,0x10,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1109e05 <unknown>
fmla za.h[w8, 1, vgx4], {z0.h - z3.h}, z14.h[2] // 11000001-00011110-10010100-00000001
// CHECK-INST: fmla za.h[w8, 1, vgx4], { z0.h - z3.h }, z14.h[2]
// CHECK-ENCODING: [0x01,0x94,0x1e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c11e9401 <unknown>
fmla za.h[w8, 1], {z0.h - z3.h}, z14.h[2] // 11000001-00011110-10010100-00000001
// CHECK-INST: fmla za.h[w8, 1, vgx4], { z0.h - z3.h }, z14.h[2]
// CHECK-ENCODING: [0x01,0x94,0x1e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c11e9401 <unknown>
fmla za.h[w10, 0, vgx4], {z16.h - z19.h}, z4.h[3] // 11000001-00010100-11010110-00001000
// CHECK-INST: fmla za.h[w10, 0, vgx4], { z16.h - z19.h }, z4.h[3]
// CHECK-ENCODING: [0x08,0xd6,0x14,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c114d608 <unknown>
fmla za.h[w10, 0], {z16.h - z19.h}, z4.h[3] // 11000001-00010100-11010110-00001000
// CHECK-INST: fmla za.h[w10, 0, vgx4], { z16.h - z19.h }, z4.h[3]
// CHECK-ENCODING: [0x08,0xd6,0x14,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c114d608 <unknown>
fmla za.h[w8, 0, vgx4], {z12.h - z15.h}, z2.h[4] // 11000001-00010010-10011001-10000000
// CHECK-INST: fmla za.h[w8, 0, vgx4], { z12.h - z15.h }, z2.h[4]
// CHECK-ENCODING: [0x80,0x99,0x12,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1129980 <unknown>
fmla za.h[w8, 0], {z12.h - z15.h}, z2.h[4] // 11000001-00010010-10011001-10000000
// CHECK-INST: fmla za.h[w8, 0, vgx4], { z12.h - z15.h }, z2.h[4]
// CHECK-ENCODING: [0x80,0x99,0x12,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1129980 <unknown>
fmla za.h[w10, 1, vgx4], {z0.h - z3.h}, z10.h[4] // 11000001-00011010-11011000-00000001
// CHECK-INST: fmla za.h[w10, 1, vgx4], { z0.h - z3.h }, z10.h[4]
// CHECK-ENCODING: [0x01,0xd8,0x1a,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c11ad801 <unknown>
fmla za.h[w10, 1], {z0.h - z3.h}, z10.h[4] // 11000001-00011010-11011000-00000001
// CHECK-INST: fmla za.h[w10, 1, vgx4], { z0.h - z3.h }, z10.h[4]
// CHECK-ENCODING: [0x01,0xd8,0x1a,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c11ad801 <unknown>
fmla za.h[w8, 5, vgx4], {z20.h - z23.h}, z14.h[5] // 11000001-00011110-10011010-10001101
// CHECK-INST: fmla za.h[w8, 5, vgx4], { z20.h - z23.h }, z14.h[5]
// CHECK-ENCODING: [0x8d,0x9a,0x1e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c11e9a8d <unknown>
fmla za.h[w8, 5], {z20.h - z23.h}, z14.h[5] // 11000001-00011110-10011010-10001101
// CHECK-INST: fmla za.h[w8, 5, vgx4], { z20.h - z23.h }, z14.h[5]
// CHECK-ENCODING: [0x8d,0x9a,0x1e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c11e9a8d <unknown>
fmla za.h[w11, 2, vgx4], {z8.h - z11.h}, z1.h[2] // 11000001-00010001-11110101-00000010
// CHECK-INST: fmla za.h[w11, 2, vgx4], { z8.h - z11.h }, z1.h[2]
// CHECK-ENCODING: [0x02,0xf5,0x11,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c111f502 <unknown>
fmla za.h[w11, 2], {z8.h - z11.h}, z1.h[2] // 11000001-00010001-11110101-00000010
// CHECK-INST: fmla za.h[w11, 2, vgx4], { z8.h - z11.h }, z1.h[2]
// CHECK-ENCODING: [0x02,0xf5,0x11,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c111f502 <unknown>
fmla za.h[w9, 7, vgx4], {z12.h - z15.h}, z11.h[4] // 11000001-00011011-10111001-10000111
// CHECK-INST: fmla za.h[w9, 7, vgx4], { z12.h - z15.h }, z11.h[4]
// CHECK-ENCODING: [0x87,0xb9,0x1b,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c11bb987 <unknown>
fmla za.h[w9, 7], {z12.h - z15.h}, z11.h[4] // 11000001-00011011-10111001-10000111
// CHECK-INST: fmla za.h[w9, 7, vgx4], { z12.h - z15.h }, z11.h[4]
// CHECK-ENCODING: [0x87,0xb9,0x1b,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c11bb987 <unknown>
fmla za.h[w8, 0, vgx4], {z0.h - z3.h}, {z0.h - z3.h} // 11000001-10100001-00010000-00001000
// CHECK-INST: fmla za.h[w8, 0, vgx4], { z0.h - z3.h }, { z0.h - z3.h }
// CHECK-ENCODING: [0x08,0x10,0xa1,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1a11008 <unknown>
fmla za.h[w8, 0], {z0.h - z3.h}, {z0.h - z3.h} // 11000001-10100001-00010000-00001000
// CHECK-INST: fmla za.h[w8, 0, vgx4], { z0.h - z3.h }, { z0.h - z3.h }
// CHECK-ENCODING: [0x08,0x10,0xa1,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1a11008 <unknown>
fmla za.h[w10, 5, vgx4], {z8.h - z11.h}, {z20.h - z23.h} // 11000001-10110101-01010001-00001101
// CHECK-INST: fmla za.h[w10, 5, vgx4], { z8.h - z11.h }, { z20.h - z23.h }
// CHECK-ENCODING: [0x0d,0x51,0xb5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1b5510d <unknown>
fmla za.h[w10, 5], {z8.h - z11.h}, {z20.h - z23.h} // 11000001-10110101-01010001-00001101
// CHECK-INST: fmla za.h[w10, 5, vgx4], { z8.h - z11.h }, { z20.h - z23.h }
// CHECK-ENCODING: [0x0d,0x51,0xb5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1b5510d <unknown>
fmla za.h[w11, 7, vgx4], {z12.h - z15.h}, {z8.h - z11.h} // 11000001-10101001-01110001-10001111
// CHECK-INST: fmla za.h[w11, 7, vgx4], { z12.h - z15.h }, { z8.h - z11.h }
// CHECK-ENCODING: [0x8f,0x71,0xa9,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1a9718f <unknown>
fmla za.h[w11, 7], {z12.h - z15.h}, {z8.h - z11.h} // 11000001-10101001-01110001-10001111
// CHECK-INST: fmla za.h[w11, 7, vgx4], { z12.h - z15.h }, { z8.h - z11.h }
// CHECK-ENCODING: [0x8f,0x71,0xa9,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1a9718f <unknown>
fmla za.h[w11, 7, vgx4], {z28.h - z31.h}, {z28.h - z31.h} // 11000001-10111101-01110011-10001111
// CHECK-INST: fmla za.h[w11, 7, vgx4], { z28.h - z31.h }, { z28.h - z31.h }
// CHECK-ENCODING: [0x8f,0x73,0xbd,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1bd738f <unknown>
fmla za.h[w11, 7], {z28.h - z31.h}, {z28.h - z31.h} // 11000001-10111101-01110011-10001111
// CHECK-INST: fmla za.h[w11, 7, vgx4], { z28.h - z31.h }, { z28.h - z31.h }
// CHECK-ENCODING: [0x8f,0x73,0xbd,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1bd738f <unknown>
fmla za.h[w8, 5, vgx4], {z16.h - z19.h}, {z16.h - z19.h} // 11000001-10110001-00010010-00001101
// CHECK-INST: fmla za.h[w8, 5, vgx4], { z16.h - z19.h }, { z16.h - z19.h }
// CHECK-ENCODING: [0x0d,0x12,0xb1,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1b1120d <unknown>
fmla za.h[w8, 5], {z16.h - z19.h}, {z16.h - z19.h} // 11000001-10110001-00010010-00001101
// CHECK-INST: fmla za.h[w8, 5, vgx4], { z16.h - z19.h }, { z16.h - z19.h }
// CHECK-ENCODING: [0x0d,0x12,0xb1,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1b1120d <unknown>
fmla za.h[w8, 1, vgx4], {z0.h - z3.h}, {z28.h - z31.h} // 11000001-10111101-00010000-00001001
// CHECK-INST: fmla za.h[w8, 1, vgx4], { z0.h - z3.h }, { z28.h - z31.h }
// CHECK-ENCODING: [0x09,0x10,0xbd,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1bd1009 <unknown>
fmla za.h[w8, 1], {z0.h - z3.h}, {z28.h - z31.h} // 11000001-10111101-00010000-00001001
// CHECK-INST: fmla za.h[w8, 1, vgx4], { z0.h - z3.h }, { z28.h - z31.h }
// CHECK-ENCODING: [0x09,0x10,0xbd,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1bd1009 <unknown>
fmla za.h[w10, 0, vgx4], {z16.h - z19.h}, {z20.h - z23.h} // 11000001-10110101-01010010-00001000
// CHECK-INST: fmla za.h[w10, 0, vgx4], { z16.h - z19.h }, { z20.h - z23.h }
// CHECK-ENCODING: [0x08,0x52,0xb5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1b55208 <unknown>
fmla za.h[w10, 0], {z16.h - z19.h}, {z20.h - z23.h} // 11000001-10110101-01010010-00001000
// CHECK-INST: fmla za.h[w10, 0, vgx4], { z16.h - z19.h }, { z20.h - z23.h }
// CHECK-ENCODING: [0x08,0x52,0xb5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1b55208 <unknown>
fmla za.h[w8, 0, vgx4], {z12.h - z15.h}, {z0.h - z3.h} // 11000001-10100001-00010001-10001000
// CHECK-INST: fmla za.h[w8, 0, vgx4], { z12.h - z15.h }, { z0.h - z3.h }
// CHECK-ENCODING: [0x88,0x11,0xa1,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1a11188 <unknown>
fmla za.h[w8, 0], {z12.h - z15.h}, {z0.h - z3.h} // 11000001-10100001-00010001-10001000
// CHECK-INST: fmla za.h[w8, 0, vgx4], { z12.h - z15.h }, { z0.h - z3.h }
// CHECK-ENCODING: [0x88,0x11,0xa1,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1a11188 <unknown>
fmla za.h[w10, 1, vgx4], {z0.h - z3.h}, {z24.h - z27.h} // 11000001-10111001-01010000-00001001
// CHECK-INST: fmla za.h[w10, 1, vgx4], { z0.h - z3.h }, { z24.h - z27.h }
// CHECK-ENCODING: [0x09,0x50,0xb9,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1b95009 <unknown>
fmla za.h[w10, 1], {z0.h - z3.h}, {z24.h - z27.h} // 11000001-10111001-01010000-00001001
// CHECK-INST: fmla za.h[w10, 1, vgx4], { z0.h - z3.h }, { z24.h - z27.h }
// CHECK-ENCODING: [0x09,0x50,0xb9,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1b95009 <unknown>
fmla za.h[w8, 5, vgx4], {z20.h - z23.h}, {z28.h - z31.h} // 11000001-10111101-00010010-10001101
// CHECK-INST: fmla za.h[w8, 5, vgx4], { z20.h - z23.h }, { z28.h - z31.h }
// CHECK-ENCODING: [0x8d,0x12,0xbd,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1bd128d <unknown>
fmla za.h[w8, 5], {z20.h - z23.h}, {z28.h - z31.h} // 11000001-10111101-00010010-10001101
// CHECK-INST: fmla za.h[w8, 5, vgx4], { z20.h - z23.h }, { z28.h - z31.h }
// CHECK-ENCODING: [0x8d,0x12,0xbd,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1bd128d <unknown>
fmla za.h[w11, 2, vgx4], {z8.h - z11.h}, {z0.h - z3.h} // 11000001-10100001-01110001-00001010
// CHECK-INST: fmla za.h[w11, 2, vgx4], { z8.h - z11.h }, { z0.h - z3.h }
// CHECK-ENCODING: [0x0a,0x71,0xa1,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1a1710a <unknown>
fmla za.h[w11, 2], {z8.h - z11.h}, {z0.h - z3.h} // 11000001-10100001-01110001-00001010
// CHECK-INST: fmla za.h[w11, 2, vgx4], { z8.h - z11.h }, { z0.h - z3.h }
// CHECK-ENCODING: [0x0a,0x71,0xa1,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1a1710a <unknown>
fmla za.h[w9, 7, vgx4], {z12.h - z15.h}, {z8.h - z11.h} // 11000001-10101001-00110001-10001111
// CHECK-INST: fmla za.h[w9, 7, vgx4], { z12.h - z15.h }, { z8.h - z11.h }
// CHECK-ENCODING: [0x8f,0x31,0xa9,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1a9318f <unknown>
fmla za.h[w9, 7], {z12.h - z15.h}, {z8.h - z11.h} // 11000001-10101001-00110001-10001111
// CHECK-INST: fmla za.h[w9, 7, vgx4], { z12.h - z15.h }, { z8.h - z11.h }
// CHECK-ENCODING: [0x8f,0x31,0xa9,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1a9318f <unknown>
diff --git a/llvm/test/MC/AArch64/SME2p1/fmls-diagnostics.s b/llvm/test/MC/AArch64/SME2p1/fmls-diagnostics.s
index 2174e4202ba0..3ff09321e343 100644
--- a/llvm/test/MC/AArch64/SME2p1/fmls-diagnostics.s
+++ b/llvm/test/MC/AArch64/SME2p1/fmls-diagnostics.s
@@ -1,4 +1,4 @@
-// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+sme-f16f16 2>&1 < %s | FileCheck %s
+// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f16f16 2>&1 < %s | FileCheck %s
// --------------------------------------------------------------------------//
// Invalid vector list
diff --git a/llvm/test/MC/AArch64/SME2p1/fmls.s b/llvm/test/MC/AArch64/SME2p1/fmls.s
index 9bbb21869e37..67b1430240e8 100644
--- a/llvm/test/MC/AArch64/SME2p1/fmls.s
+++ b/llvm/test/MC/AArch64/SME2p1/fmls.s
@@ -1,878 +1,878 @@
-// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+sme-f16f16 < %s \
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f16f16 < %s \
// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
// RUN: | FileCheck %s --check-prefix=CHECK-ERROR
-// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p1,+sme-f16f16 < %s \
-// RUN: | llvm-objdump -d --mattr=+sme2p1,+sme-f16f16 - | FileCheck %s --check-prefix=CHECK-INST
-// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p1,+sme-f16f16 < %s \
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme-f16f16 < %s \
+// RUN: | llvm-objdump -d --mattr=+sme-f16f16 - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme-f16f16 < %s \
// RUN: | llvm-objdump -d --mattr=-sme2p1 - | FileCheck %s --check-prefix=CHECK-UNKNOWN
-// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+sme-f16f16 < %s \
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f16f16 < %s \
// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \
-// RUN: | llvm-mc -triple=aarch64 -mattr=+sme2p1,+sme-f16f16 -disassemble -show-encoding \
+// RUN: | llvm-mc -triple=aarch64 -mattr=+sme-f16f16 -disassemble -show-encoding \
// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
fmls za.h[w8, 0, vgx2], {z0.h, z1.h}, z0.h // 11000001-00100000-00011100-00001000
// CHECK-INST: fmls za.h[w8, 0, vgx2], { z0.h, z1.h }, z0.h
// CHECK-ENCODING: [0x08,0x1c,0x20,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1201c08 <unknown>
fmls za.h[w8, 0], {z0.h - z1.h}, z0.h // 11000001-00100000-00011100-00001000
// CHECK-INST: fmls za.h[w8, 0, vgx2], { z0.h, z1.h }, z0.h
// CHECK-ENCODING: [0x08,0x1c,0x20,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1201c08 <unknown>
fmls za.h[w10, 5, vgx2], {z10.h, z11.h}, z5.h // 11000001-00100101-01011101-01001101
// CHECK-INST: fmls za.h[w10, 5, vgx2], { z10.h, z11.h }, z5.h
// CHECK-ENCODING: [0x4d,0x5d,0x25,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1255d4d <unknown>
fmls za.h[w10, 5], {z10.h - z11.h}, z5.h // 11000001-00100101-01011101-01001101
// CHECK-INST: fmls za.h[w10, 5, vgx2], { z10.h, z11.h }, z5.h
// CHECK-ENCODING: [0x4d,0x5d,0x25,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1255d4d <unknown>
fmls za.h[w11, 7, vgx2], {z13.h, z14.h}, z8.h // 11000001-00101000-01111101-10101111
// CHECK-INST: fmls za.h[w11, 7, vgx2], { z13.h, z14.h }, z8.h
// CHECK-ENCODING: [0xaf,0x7d,0x28,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1287daf <unknown>
fmls za.h[w11, 7], {z13.h - z14.h}, z8.h // 11000001-00101000-01111101-10101111
// CHECK-INST: fmls za.h[w11, 7, vgx2], { z13.h, z14.h }, z8.h
// CHECK-ENCODING: [0xaf,0x7d,0x28,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1287daf <unknown>
fmls za.h[w11, 7, vgx2], {z31.h, z0.h}, z15.h // 11000001-00101111-01111111-11101111
// CHECK-INST: fmls za.h[w11, 7, vgx2], { z31.h, z0.h }, z15.h
// CHECK-ENCODING: [0xef,0x7f,0x2f,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c12f7fef <unknown>
fmls za.h[w11, 7], {z31.h - z0.h}, z15.h // 11000001-00101111-01111111-11101111
// CHECK-INST: fmls za.h[w11, 7, vgx2], { z31.h, z0.h }, z15.h
// CHECK-ENCODING: [0xef,0x7f,0x2f,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c12f7fef <unknown>
fmls za.h[w8, 5, vgx2], {z17.h, z18.h}, z0.h // 11000001-00100000-00011110-00101101
// CHECK-INST: fmls za.h[w8, 5, vgx2], { z17.h, z18.h }, z0.h
// CHECK-ENCODING: [0x2d,0x1e,0x20,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1201e2d <unknown>
fmls za.h[w8, 5], {z17.h - z18.h}, z0.h // 11000001-00100000-00011110-00101101
// CHECK-INST: fmls za.h[w8, 5, vgx2], { z17.h, z18.h }, z0.h
// CHECK-ENCODING: [0x2d,0x1e,0x20,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1201e2d <unknown>
fmls za.h[w8, 1, vgx2], {z1.h, z2.h}, z14.h // 11000001-00101110-00011100-00101001
// CHECK-INST: fmls za.h[w8, 1, vgx2], { z1.h, z2.h }, z14.h
// CHECK-ENCODING: [0x29,0x1c,0x2e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c12e1c29 <unknown>
fmls za.h[w8, 1], {z1.h - z2.h}, z14.h // 11000001-00101110-00011100-00101001
// CHECK-INST: fmls za.h[w8, 1, vgx2], { z1.h, z2.h }, z14.h
// CHECK-ENCODING: [0x29,0x1c,0x2e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c12e1c29 <unknown>
fmls za.h[w10, 0, vgx2], {z19.h, z20.h}, z4.h // 11000001-00100100-01011110-01101000
// CHECK-INST: fmls za.h[w10, 0, vgx2], { z19.h, z20.h }, z4.h
// CHECK-ENCODING: [0x68,0x5e,0x24,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1245e68 <unknown>
fmls za.h[w10, 0], {z19.h - z20.h}, z4.h // 11000001-00100100-01011110-01101000
// CHECK-INST: fmls za.h[w10, 0, vgx2], { z19.h, z20.h }, z4.h
// CHECK-ENCODING: [0x68,0x5e,0x24,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1245e68 <unknown>
fmls za.h[w8, 0, vgx2], {z12.h, z13.h}, z2.h // 11000001-00100010-00011101-10001000
// CHECK-INST: fmls za.h[w8, 0, vgx2], { z12.h, z13.h }, z2.h
// CHECK-ENCODING: [0x88,0x1d,0x22,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1221d88 <unknown>
fmls za.h[w8, 0], {z12.h - z13.h}, z2.h // 11000001-00100010-00011101-10001000
// CHECK-INST: fmls za.h[w8, 0, vgx2], { z12.h, z13.h }, z2.h
// CHECK-ENCODING: [0x88,0x1d,0x22,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1221d88 <unknown>
fmls za.h[w10, 1, vgx2], {z1.h, z2.h}, z10.h // 11000001-00101010-01011100-00101001
// CHECK-INST: fmls za.h[w10, 1, vgx2], { z1.h, z2.h }, z10.h
// CHECK-ENCODING: [0x29,0x5c,0x2a,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c12a5c29 <unknown>
fmls za.h[w10, 1], {z1.h - z2.h}, z10.h // 11000001-00101010-01011100-00101001
// CHECK-INST: fmls za.h[w10, 1, vgx2], { z1.h, z2.h }, z10.h
// CHECK-ENCODING: [0x29,0x5c,0x2a,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c12a5c29 <unknown>
fmls za.h[w8, 5, vgx2], {z22.h, z23.h}, z14.h // 11000001-00101110-00011110-11001101
// CHECK-INST: fmls za.h[w8, 5, vgx2], { z22.h, z23.h }, z14.h
// CHECK-ENCODING: [0xcd,0x1e,0x2e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c12e1ecd <unknown>
fmls za.h[w8, 5], {z22.h - z23.h}, z14.h // 11000001-00101110-00011110-11001101
// CHECK-INST: fmls za.h[w8, 5, vgx2], { z22.h, z23.h }, z14.h
// CHECK-ENCODING: [0xcd,0x1e,0x2e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c12e1ecd <unknown>
fmls za.h[w11, 2, vgx2], {z9.h, z10.h}, z1.h // 11000001-00100001-01111101-00101010
// CHECK-INST: fmls za.h[w11, 2, vgx2], { z9.h, z10.h }, z1.h
// CHECK-ENCODING: [0x2a,0x7d,0x21,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1217d2a <unknown>
fmls za.h[w11, 2], {z9.h - z10.h}, z1.h // 11000001-00100001-01111101-00101010
// CHECK-INST: fmls za.h[w11, 2, vgx2], { z9.h, z10.h }, z1.h
// CHECK-ENCODING: [0x2a,0x7d,0x21,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1217d2a <unknown>
fmls za.h[w9, 7, vgx2], {z12.h, z13.h}, z11.h // 11000001-00101011-00111101-10001111
// CHECK-INST: fmls za.h[w9, 7, vgx2], { z12.h, z13.h }, z11.h
// CHECK-ENCODING: [0x8f,0x3d,0x2b,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c12b3d8f <unknown>
fmls za.h[w9, 7], {z12.h - z13.h}, z11.h // 11000001-00101011-00111101-10001111
// CHECK-INST: fmls za.h[w9, 7, vgx2], { z12.h, z13.h }, z11.h
// CHECK-ENCODING: [0x8f,0x3d,0x2b,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c12b3d8f <unknown>
fmls za.h[w8, 0, vgx2], {z0.h, z1.h}, z0.h[0] // 11000001-00010000-00010000-00010000
// CHECK-INST: fmls za.h[w8, 0, vgx2], { z0.h, z1.h }, z0.h[0]
// CHECK-ENCODING: [0x10,0x10,0x10,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1101010 <unknown>
fmls za.h[w8, 0], {z0.h - z1.h}, z0.h[0] // 11000001-00010000-00010000-00010000
// CHECK-INST: fmls za.h[w8, 0, vgx2], { z0.h, z1.h }, z0.h[0]
// CHECK-ENCODING: [0x10,0x10,0x10,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1101010 <unknown>
fmls za.h[w10, 5, vgx2], {z10.h, z11.h}, z5.h[2] // 11000001-00010101-01010101-01010101
// CHECK-INST: fmls za.h[w10, 5, vgx2], { z10.h, z11.h }, z5.h[2]
// CHECK-ENCODING: [0x55,0x55,0x15,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1155555 <unknown>
fmls za.h[w10, 5], {z10.h - z11.h}, z5.h[2] // 11000001-00010101-01010101-01010101
// CHECK-INST: fmls za.h[w10, 5, vgx2], { z10.h, z11.h }, z5.h[2]
// CHECK-ENCODING: [0x55,0x55,0x15,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1155555 <unknown>
fmls za.h[w11, 7, vgx2], {z12.h, z13.h}, z8.h[6] // 11000001-00011000-01111101-10010111
// CHECK-INST: fmls za.h[w11, 7, vgx2], { z12.h, z13.h }, z8.h[6]
// CHECK-ENCODING: [0x97,0x7d,0x18,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1187d97 <unknown>
fmls za.h[w11, 7], {z12.h - z13.h}, z8.h[6] // 11000001-00011000-01111101-10010111
// CHECK-INST: fmls za.h[w11, 7, vgx2], { z12.h, z13.h }, z8.h[6]
// CHECK-ENCODING: [0x97,0x7d,0x18,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1187d97 <unknown>
fmls za.h[w11, 7, vgx2], {z30.h, z31.h}, z15.h[7] // 11000001-00011111-01111111-11011111
// CHECK-INST: fmls za.h[w11, 7, vgx2], { z30.h, z31.h }, z15.h[7]
// CHECK-ENCODING: [0xdf,0x7f,0x1f,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c11f7fdf <unknown>
fmls za.h[w11, 7], {z30.h - z31.h}, z15.h[7] // 11000001-00011111-01111111-11011111
// CHECK-INST: fmls za.h[w11, 7, vgx2], { z30.h, z31.h }, z15.h[7]
// CHECK-ENCODING: [0xdf,0x7f,0x1f,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c11f7fdf <unknown>
fmls za.h[w8, 5, vgx2], {z16.h, z17.h}, z0.h[6] // 11000001-00010000-00011110-00010101
// CHECK-INST: fmls za.h[w8, 5, vgx2], { z16.h, z17.h }, z0.h[6]
// CHECK-ENCODING: [0x15,0x1e,0x10,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1101e15 <unknown>
fmls za.h[w8, 5], {z16.h - z17.h}, z0.h[6] // 11000001-00010000-00011110-00010101
// CHECK-INST: fmls za.h[w8, 5, vgx2], { z16.h, z17.h }, z0.h[6]
// CHECK-ENCODING: [0x15,0x1e,0x10,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1101e15 <unknown>
fmls za.h[w8, 1, vgx2], {z0.h, z1.h}, z14.h[2] // 11000001-00011110-00010100-00010001
// CHECK-INST: fmls za.h[w8, 1, vgx2], { z0.h, z1.h }, z14.h[2]
// CHECK-ENCODING: [0x11,0x14,0x1e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c11e1411 <unknown>
fmls za.h[w8, 1], {z0.h - z1.h}, z14.h[2] // 11000001-00011110-00010100-00010001
// CHECK-INST: fmls za.h[w8, 1, vgx2], { z0.h, z1.h }, z14.h[2]
// CHECK-ENCODING: [0x11,0x14,0x1e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c11e1411 <unknown>
fmls za.h[w10, 0, vgx2], {z18.h, z19.h}, z4.h[3] // 11000001-00010100-01010110-01011000
// CHECK-INST: fmls za.h[w10, 0, vgx2], { z18.h, z19.h }, z4.h[3]
// CHECK-ENCODING: [0x58,0x56,0x14,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1145658 <unknown>
fmls za.h[w10, 0], {z18.h - z19.h}, z4.h[3] // 11000001-00010100-01010110-01011000
// CHECK-INST: fmls za.h[w10, 0, vgx2], { z18.h, z19.h }, z4.h[3]
// CHECK-ENCODING: [0x58,0x56,0x14,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1145658 <unknown>
fmls za.h[w8, 0, vgx2], {z12.h, z13.h}, z2.h[4] // 11000001-00010010-00011001-10010000
// CHECK-INST: fmls za.h[w8, 0, vgx2], { z12.h, z13.h }, z2.h[4]
// CHECK-ENCODING: [0x90,0x19,0x12,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1121990 <unknown>
fmls za.h[w8, 0], {z12.h - z13.h}, z2.h[4] // 11000001-00010010-00011001-10010000
// CHECK-INST: fmls za.h[w8, 0, vgx2], { z12.h, z13.h }, z2.h[4]
// CHECK-ENCODING: [0x90,0x19,0x12,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1121990 <unknown>
fmls za.h[w10, 1, vgx2], {z0.h, z1.h}, z10.h[4] // 11000001-00011010-01011000-00010001
// CHECK-INST: fmls za.h[w10, 1, vgx2], { z0.h, z1.h }, z10.h[4]
// CHECK-ENCODING: [0x11,0x58,0x1a,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c11a5811 <unknown>
fmls za.h[w10, 1], {z0.h - z1.h}, z10.h[4] // 11000001-00011010-01011000-00010001
// CHECK-INST: fmls za.h[w10, 1, vgx2], { z0.h, z1.h }, z10.h[4]
// CHECK-ENCODING: [0x11,0x58,0x1a,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c11a5811 <unknown>
fmls za.h[w8, 5, vgx2], {z22.h, z23.h}, z14.h[5] // 11000001-00011110-00011010-11011101
// CHECK-INST: fmls za.h[w8, 5, vgx2], { z22.h, z23.h }, z14.h[5]
// CHECK-ENCODING: [0xdd,0x1a,0x1e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c11e1add <unknown>
fmls za.h[w8, 5], {z22.h - z23.h}, z14.h[5] // 11000001-00011110-00011010-11011101
// CHECK-INST: fmls za.h[w8, 5, vgx2], { z22.h, z23.h }, z14.h[5]
// CHECK-ENCODING: [0xdd,0x1a,0x1e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c11e1add <unknown>
fmls za.h[w11, 2, vgx2], {z8.h, z9.h}, z1.h[2] // 11000001-00010001-01110101-00010010
// CHECK-INST: fmls za.h[w11, 2, vgx2], { z8.h, z9.h }, z1.h[2]
// CHECK-ENCODING: [0x12,0x75,0x11,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1117512 <unknown>
fmls za.h[w11, 2], {z8.h - z9.h}, z1.h[2] // 11000001-00010001-01110101-00010010
// CHECK-INST: fmls za.h[w11, 2, vgx2], { z8.h, z9.h }, z1.h[2]
// CHECK-ENCODING: [0x12,0x75,0x11,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1117512 <unknown>
fmls za.h[w9, 7, vgx2], {z12.h, z13.h}, z11.h[4] // 11000001-00011011-00111001-10010111
// CHECK-INST: fmls za.h[w9, 7, vgx2], { z12.h, z13.h }, z11.h[4]
// CHECK-ENCODING: [0x97,0x39,0x1b,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c11b3997 <unknown>
fmls za.h[w9, 7], {z12.h - z13.h}, z11.h[4] // 11000001-00011011-00111001-10010111
// CHECK-INST: fmls za.h[w9, 7, vgx2], { z12.h, z13.h }, z11.h[4]
// CHECK-ENCODING: [0x97,0x39,0x1b,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c11b3997 <unknown>
fmls za.h[w8, 0, vgx2], {z0.h, z1.h}, {z0.h, z1.h} // 11000001-10100000-00010000-00011000
// CHECK-INST: fmls za.h[w8, 0, vgx2], { z0.h, z1.h }, { z0.h, z1.h }
// CHECK-ENCODING: [0x18,0x10,0xa0,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1a01018 <unknown>
fmls za.h[w8, 0], {z0.h - z1.h}, {z0.h - z1.h} // 11000001-10100000-00010000-00011000
// CHECK-INST: fmls za.h[w8, 0, vgx2], { z0.h, z1.h }, { z0.h, z1.h }
// CHECK-ENCODING: [0x18,0x10,0xa0,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1a01018 <unknown>
fmls za.h[w10, 5, vgx2], {z10.h, z11.h}, {z20.h, z21.h} // 11000001-10110100-01010001-01011101
// CHECK-INST: fmls za.h[w10, 5, vgx2], { z10.h, z11.h }, { z20.h, z21.h }
// CHECK-ENCODING: [0x5d,0x51,0xb4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1b4515d <unknown>
fmls za.h[w10, 5], {z10.h - z11.h}, {z20.h - z21.h} // 11000001-10110100-01010001-01011101
// CHECK-INST: fmls za.h[w10, 5, vgx2], { z10.h, z11.h }, { z20.h, z21.h }
// CHECK-ENCODING: [0x5d,0x51,0xb4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1b4515d <unknown>
fmls za.h[w11, 7, vgx2], {z12.h, z13.h}, {z8.h, z9.h} // 11000001-10101000-01110001-10011111
// CHECK-INST: fmls za.h[w11, 7, vgx2], { z12.h, z13.h }, { z8.h, z9.h }
// CHECK-ENCODING: [0x9f,0x71,0xa8,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1a8719f <unknown>
fmls za.h[w11, 7], {z12.h - z13.h}, {z8.h - z9.h} // 11000001-10101000-01110001-10011111
// CHECK-INST: fmls za.h[w11, 7, vgx2], { z12.h, z13.h }, { z8.h, z9.h }
// CHECK-ENCODING: [0x9f,0x71,0xa8,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1a8719f <unknown>
fmls za.h[w11, 7, vgx2], {z30.h, z31.h}, {z30.h, z31.h} // 11000001-10111110-01110011-11011111
// CHECK-INST: fmls za.h[w11, 7, vgx2], { z30.h, z31.h }, { z30.h, z31.h }
// CHECK-ENCODING: [0xdf,0x73,0xbe,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1be73df <unknown>
fmls za.h[w11, 7], {z30.h - z31.h}, {z30.h - z31.h} // 11000001-10111110-01110011-11011111
// CHECK-INST: fmls za.h[w11, 7, vgx2], { z30.h, z31.h }, { z30.h, z31.h }
// CHECK-ENCODING: [0xdf,0x73,0xbe,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1be73df <unknown>
fmls za.h[w8, 5, vgx2], {z16.h, z17.h}, {z16.h, z17.h} // 11000001-10110000-00010010-00011101
// CHECK-INST: fmls za.h[w8, 5, vgx2], { z16.h, z17.h }, { z16.h, z17.h }
// CHECK-ENCODING: [0x1d,0x12,0xb0,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1b0121d <unknown>
fmls za.h[w8, 5], {z16.h - z17.h}, {z16.h - z17.h} // 11000001-10110000-00010010-00011101
// CHECK-INST: fmls za.h[w8, 5, vgx2], { z16.h, z17.h }, { z16.h, z17.h }
// CHECK-ENCODING: [0x1d,0x12,0xb0,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1b0121d <unknown>
fmls za.h[w8, 1, vgx2], {z0.h, z1.h}, {z30.h, z31.h} // 11000001-10111110-00010000-00011001
// CHECK-INST: fmls za.h[w8, 1, vgx2], { z0.h, z1.h }, { z30.h, z31.h }
// CHECK-ENCODING: [0x19,0x10,0xbe,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1be1019 <unknown>
fmls za.h[w8, 1], {z0.h - z1.h}, {z30.h - z31.h} // 11000001-10111110-00010000-00011001
// CHECK-INST: fmls za.h[w8, 1, vgx2], { z0.h, z1.h }, { z30.h, z31.h }
// CHECK-ENCODING: [0x19,0x10,0xbe,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1be1019 <unknown>
fmls za.h[w10, 0, vgx2], {z18.h, z19.h}, {z20.h, z21.h} // 11000001-10110100-01010010-01011000
// CHECK-INST: fmls za.h[w10, 0, vgx2], { z18.h, z19.h }, { z20.h, z21.h }
// CHECK-ENCODING: [0x58,0x52,0xb4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1b45258 <unknown>
fmls za.h[w10, 0], {z18.h - z19.h}, {z20.h - z21.h} // 11000001-10110100-01010010-01011000
// CHECK-INST: fmls za.h[w10, 0, vgx2], { z18.h, z19.h }, { z20.h, z21.h }
// CHECK-ENCODING: [0x58,0x52,0xb4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1b45258 <unknown>
fmls za.h[w8, 0, vgx2], {z12.h, z13.h}, {z2.h, z3.h} // 11000001-10100010-00010001-10011000
// CHECK-INST: fmls za.h[w8, 0, vgx2], { z12.h, z13.h }, { z2.h, z3.h }
// CHECK-ENCODING: [0x98,0x11,0xa2,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1a21198 <unknown>
fmls za.h[w8, 0], {z12.h - z13.h}, {z2.h - z3.h} // 11000001-10100010-00010001-10011000
// CHECK-INST: fmls za.h[w8, 0, vgx2], { z12.h, z13.h }, { z2.h, z3.h }
// CHECK-ENCODING: [0x98,0x11,0xa2,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1a21198 <unknown>
fmls za.h[w10, 1, vgx2], {z0.h, z1.h}, {z26.h, z27.h} // 11000001-10111010-01010000-00011001
// CHECK-INST: fmls za.h[w10, 1, vgx2], { z0.h, z1.h }, { z26.h, z27.h }
// CHECK-ENCODING: [0x19,0x50,0xba,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1ba5019 <unknown>
fmls za.h[w10, 1], {z0.h - z1.h}, {z26.h - z27.h} // 11000001-10111010-01010000-00011001
// CHECK-INST: fmls za.h[w10, 1, vgx2], { z0.h, z1.h }, { z26.h, z27.h }
// CHECK-ENCODING: [0x19,0x50,0xba,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1ba5019 <unknown>
fmls za.h[w8, 5, vgx2], {z22.h, z23.h}, {z30.h, z31.h} // 11000001-10111110-00010010-11011101
// CHECK-INST: fmls za.h[w8, 5, vgx2], { z22.h, z23.h }, { z30.h, z31.h }
// CHECK-ENCODING: [0xdd,0x12,0xbe,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1be12dd <unknown>
fmls za.h[w8, 5], {z22.h - z23.h}, {z30.h - z31.h} // 11000001-10111110-00010010-11011101
// CHECK-INST: fmls za.h[w8, 5, vgx2], { z22.h, z23.h }, { z30.h, z31.h }
// CHECK-ENCODING: [0xdd,0x12,0xbe,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1be12dd <unknown>
fmls za.h[w11, 2, vgx2], {z8.h, z9.h}, {z0.h, z1.h} // 11000001-10100000-01110001-00011010
// CHECK-INST: fmls za.h[w11, 2, vgx2], { z8.h, z9.h }, { z0.h, z1.h }
// CHECK-ENCODING: [0x1a,0x71,0xa0,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1a0711a <unknown>
fmls za.h[w11, 2], {z8.h - z9.h}, {z0.h - z1.h} // 11000001-10100000-01110001-00011010
// CHECK-INST: fmls za.h[w11, 2, vgx2], { z8.h, z9.h }, { z0.h, z1.h }
// CHECK-ENCODING: [0x1a,0x71,0xa0,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1a0711a <unknown>
fmls za.h[w9, 7, vgx2], {z12.h, z13.h}, {z10.h, z11.h} // 11000001-10101010-00110001-10011111
// CHECK-INST: fmls za.h[w9, 7, vgx2], { z12.h, z13.h }, { z10.h, z11.h }
// CHECK-ENCODING: [0x9f,0x31,0xaa,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1aa319f <unknown>
fmls za.h[w9, 7], {z12.h - z13.h}, {z10.h - z11.h} // 11000001-10101010-00110001-10011111
// CHECK-INST: fmls za.h[w9, 7, vgx2], { z12.h, z13.h }, { z10.h, z11.h }
// CHECK-ENCODING: [0x9f,0x31,0xaa,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1aa319f <unknown>
fmls za.h[w8, 0, vgx4], {z0.h - z3.h}, z0.h // 11000001-00110000-00011100-00001000
// CHECK-INST: fmls za.h[w8, 0, vgx4], { z0.h - z3.h }, z0.h
// CHECK-ENCODING: [0x08,0x1c,0x30,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1301c08 <unknown>
fmls za.h[w8, 0], {z0.h - z3.h}, z0.h // 11000001-00110000-00011100-00001000
// CHECK-INST: fmls za.h[w8, 0, vgx4], { z0.h - z3.h }, z0.h
// CHECK-ENCODING: [0x08,0x1c,0x30,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1301c08 <unknown>
fmls za.h[w10, 5, vgx4], {z10.h - z13.h}, z5.h // 11000001-00110101-01011101-01001101
// CHECK-INST: fmls za.h[w10, 5, vgx4], { z10.h - z13.h }, z5.h
// CHECK-ENCODING: [0x4d,0x5d,0x35,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1355d4d <unknown>
fmls za.h[w10, 5], {z10.h - z13.h}, z5.h // 11000001-00110101-01011101-01001101
// CHECK-INST: fmls za.h[w10, 5, vgx4], { z10.h - z13.h }, z5.h
// CHECK-ENCODING: [0x4d,0x5d,0x35,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1355d4d <unknown>
fmls za.h[w11, 7, vgx4], {z13.h - z16.h}, z8.h // 11000001-00111000-01111101-10101111
// CHECK-INST: fmls za.h[w11, 7, vgx4], { z13.h - z16.h }, z8.h
// CHECK-ENCODING: [0xaf,0x7d,0x38,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1387daf <unknown>
fmls za.h[w11, 7], {z13.h - z16.h}, z8.h // 11000001-00111000-01111101-10101111
// CHECK-INST: fmls za.h[w11, 7, vgx4], { z13.h - z16.h }, z8.h
// CHECK-ENCODING: [0xaf,0x7d,0x38,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1387daf <unknown>
fmls za.h[w11, 7, vgx4], {z31.h, z0.h, z1.h, z2.h}, z15.h // 11000001-00111111-01111111-11101111
// CHECK-INST: fmls za.h[w11, 7, vgx4], { z31.h, z0.h, z1.h, z2.h }, z15.h
// CHECK-ENCODING: [0xef,0x7f,0x3f,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c13f7fef <unknown>
fmls za.h[w11, 7], {z31.h, z0.h, z1.h, z2.h}, z15.h // 11000001-00111111-01111111-11101111
// CHECK-INST: fmls za.h[w11, 7, vgx4], { z31.h, z0.h, z1.h, z2.h }, z15.h
// CHECK-ENCODING: [0xef,0x7f,0x3f,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c13f7fef <unknown>
fmls za.h[w8, 5, vgx4], {z17.h - z20.h}, z0.h // 11000001-00110000-00011110-00101101
// CHECK-INST: fmls za.h[w8, 5, vgx4], { z17.h - z20.h }, z0.h
// CHECK-ENCODING: [0x2d,0x1e,0x30,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1301e2d <unknown>
fmls za.h[w8, 5], {z17.h - z20.h}, z0.h // 11000001-00110000-00011110-00101101
// CHECK-INST: fmls za.h[w8, 5, vgx4], { z17.h - z20.h }, z0.h
// CHECK-ENCODING: [0x2d,0x1e,0x30,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1301e2d <unknown>
fmls za.h[w8, 1, vgx4], {z1.h - z4.h}, z14.h // 11000001-00111110-00011100-00101001
// CHECK-INST: fmls za.h[w8, 1, vgx4], { z1.h - z4.h }, z14.h
// CHECK-ENCODING: [0x29,0x1c,0x3e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c13e1c29 <unknown>
fmls za.h[w8, 1], {z1.h - z4.h}, z14.h // 11000001-00111110-00011100-00101001
// CHECK-INST: fmls za.h[w8, 1, vgx4], { z1.h - z4.h }, z14.h
// CHECK-ENCODING: [0x29,0x1c,0x3e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c13e1c29 <unknown>
fmls za.h[w10, 0, vgx4], {z19.h - z22.h}, z4.h // 11000001-00110100-01011110-01101000
// CHECK-INST: fmls za.h[w10, 0, vgx4], { z19.h - z22.h }, z4.h
// CHECK-ENCODING: [0x68,0x5e,0x34,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1345e68 <unknown>
fmls za.h[w10, 0], {z19.h - z22.h}, z4.h // 11000001-00110100-01011110-01101000
// CHECK-INST: fmls za.h[w10, 0, vgx4], { z19.h - z22.h }, z4.h
// CHECK-ENCODING: [0x68,0x5e,0x34,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1345e68 <unknown>
fmls za.h[w8, 0, vgx4], {z12.h - z15.h}, z2.h // 11000001-00110010-00011101-10001000
// CHECK-INST: fmls za.h[w8, 0, vgx4], { z12.h - z15.h }, z2.h
// CHECK-ENCODING: [0x88,0x1d,0x32,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1321d88 <unknown>
fmls za.h[w8, 0], {z12.h - z15.h}, z2.h // 11000001-00110010-00011101-10001000
// CHECK-INST: fmls za.h[w8, 0, vgx4], { z12.h - z15.h }, z2.h
// CHECK-ENCODING: [0x88,0x1d,0x32,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1321d88 <unknown>
fmls za.h[w10, 1, vgx4], {z1.h - z4.h}, z10.h // 11000001-00111010-01011100-00101001
// CHECK-INST: fmls za.h[w10, 1, vgx4], { z1.h - z4.h }, z10.h
// CHECK-ENCODING: [0x29,0x5c,0x3a,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c13a5c29 <unknown>
fmls za.h[w10, 1], {z1.h - z4.h}, z10.h // 11000001-00111010-01011100-00101001
// CHECK-INST: fmls za.h[w10, 1, vgx4], { z1.h - z4.h }, z10.h
// CHECK-ENCODING: [0x29,0x5c,0x3a,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c13a5c29 <unknown>
fmls za.h[w8, 5, vgx4], {z22.h - z25.h}, z14.h // 11000001-00111110-00011110-11001101
// CHECK-INST: fmls za.h[w8, 5, vgx4], { z22.h - z25.h }, z14.h
// CHECK-ENCODING: [0xcd,0x1e,0x3e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c13e1ecd <unknown>
fmls za.h[w8, 5], {z22.h - z25.h}, z14.h // 11000001-00111110-00011110-11001101
// CHECK-INST: fmls za.h[w8, 5, vgx4], { z22.h - z25.h }, z14.h
// CHECK-ENCODING: [0xcd,0x1e,0x3e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c13e1ecd <unknown>
fmls za.h[w11, 2, vgx4], {z9.h - z12.h}, z1.h // 11000001-00110001-01111101-00101010
// CHECK-INST: fmls za.h[w11, 2, vgx4], { z9.h - z12.h }, z1.h
// CHECK-ENCODING: [0x2a,0x7d,0x31,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1317d2a <unknown>
fmls za.h[w11, 2], {z9.h - z12.h}, z1.h // 11000001-00110001-01111101-00101010
// CHECK-INST: fmls za.h[w11, 2, vgx4], { z9.h - z12.h }, z1.h
// CHECK-ENCODING: [0x2a,0x7d,0x31,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1317d2a <unknown>
fmls za.h[w9, 7, vgx4], {z12.h - z15.h}, z11.h // 11000001-00111011-00111101-10001111
// CHECK-INST: fmls za.h[w9, 7, vgx4], { z12.h - z15.h }, z11.h
// CHECK-ENCODING: [0x8f,0x3d,0x3b,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c13b3d8f <unknown>
fmls za.h[w9, 7], {z12.h - z15.h}, z11.h // 11000001-00111011-00111101-10001111
// CHECK-INST: fmls za.h[w9, 7, vgx4], { z12.h - z15.h }, z11.h
// CHECK-ENCODING: [0x8f,0x3d,0x3b,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c13b3d8f <unknown>
fmls za.h[w8, 0, vgx4], {z0.h - z3.h}, z0.h[0] // 11000001-00010000-10010000-00010000
// CHECK-INST: fmls za.h[w8, 0, vgx4], { z0.h - z3.h }, z0.h[0]
// CHECK-ENCODING: [0x10,0x90,0x10,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1109010 <unknown>
fmls za.h[w8, 0], {z0.h - z3.h}, z0.h[0] // 11000001-00010000-10010000-00010000
// CHECK-INST: fmls za.h[w8, 0, vgx4], { z0.h - z3.h }, z0.h[0]
// CHECK-ENCODING: [0x10,0x90,0x10,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1109010 <unknown>
fmls za.h[w10, 5, vgx4], {z8.h - z11.h}, z5.h[2] // 11000001-00010101-11010101-00010101
// CHECK-INST: fmls za.h[w10, 5, vgx4], { z8.h - z11.h }, z5.h[2]
// CHECK-ENCODING: [0x15,0xd5,0x15,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c115d515 <unknown>
fmls za.h[w10, 5], {z8.h - z11.h}, z5.h[2] // 11000001-00010101-11010101-00010101
// CHECK-INST: fmls za.h[w10, 5, vgx4], { z8.h - z11.h }, z5.h[2]
// CHECK-ENCODING: [0x15,0xd5,0x15,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c115d515 <unknown>
fmls za.h[w11, 7, vgx4], {z12.h - z15.h}, z8.h[6] // 11000001-00011000-11111101-10010111
// CHECK-INST: fmls za.h[w11, 7, vgx4], { z12.h - z15.h }, z8.h[6]
// CHECK-ENCODING: [0x97,0xfd,0x18,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c118fd97 <unknown>
fmls za.h[w11, 7], {z12.h - z15.h}, z8.h[6] // 11000001-00011000-11111101-10010111
// CHECK-INST: fmls za.h[w11, 7, vgx4], { z12.h - z15.h }, z8.h[6]
// CHECK-ENCODING: [0x97,0xfd,0x18,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c118fd97 <unknown>
fmls za.h[w11, 7, vgx4], {z28.h - z31.h}, z15.h[7] // 11000001-00011111-11111111-10011111
// CHECK-INST: fmls za.h[w11, 7, vgx4], { z28.h - z31.h }, z15.h[7]
// CHECK-ENCODING: [0x9f,0xff,0x1f,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c11fff9f <unknown>
fmls za.h[w11, 7], {z28.h - z31.h}, z15.h[7] // 11000001-00011111-11111111-10011111
// CHECK-INST: fmls za.h[w11, 7, vgx4], { z28.h - z31.h }, z15.h[7]
// CHECK-ENCODING: [0x9f,0xff,0x1f,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c11fff9f <unknown>
fmls za.h[w8, 5, vgx4], {z16.h - z19.h}, z0.h[6] // 11000001-00010000-10011110-00010101
// CHECK-INST: fmls za.h[w8, 5, vgx4], { z16.h - z19.h }, z0.h[6]
// CHECK-ENCODING: [0x15,0x9e,0x10,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1109e15 <unknown>
fmls za.h[w8, 5], {z16.h - z19.h}, z0.h[6] // 11000001-00010000-10011110-00010101
// CHECK-INST: fmls za.h[w8, 5, vgx4], { z16.h - z19.h }, z0.h[6]
// CHECK-ENCODING: [0x15,0x9e,0x10,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1109e15 <unknown>
fmls za.h[w8, 1, vgx4], {z0.h - z3.h}, z14.h[2] // 11000001-00011110-10010100-00010001
// CHECK-INST: fmls za.h[w8, 1, vgx4], { z0.h - z3.h }, z14.h[2]
// CHECK-ENCODING: [0x11,0x94,0x1e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c11e9411 <unknown>
fmls za.h[w8, 1], {z0.h - z3.h}, z14.h[2] // 11000001-00011110-10010100-00010001
// CHECK-INST: fmls za.h[w8, 1, vgx4], { z0.h - z3.h }, z14.h[2]
// CHECK-ENCODING: [0x11,0x94,0x1e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c11e9411 <unknown>
fmls za.h[w10, 0, vgx4], {z16.h - z19.h}, z4.h[3] // 11000001-00010100-11010110-00011000
// CHECK-INST: fmls za.h[w10, 0, vgx4], { z16.h - z19.h }, z4.h[3]
// CHECK-ENCODING: [0x18,0xd6,0x14,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c114d618 <unknown>
fmls za.h[w10, 0], {z16.h - z19.h}, z4.h[3] // 11000001-00010100-11010110-00011000
// CHECK-INST: fmls za.h[w10, 0, vgx4], { z16.h - z19.h }, z4.h[3]
// CHECK-ENCODING: [0x18,0xd6,0x14,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c114d618 <unknown>
fmls za.h[w8, 0, vgx4], {z12.h - z15.h}, z2.h[4] // 11000001-00010010-10011001-10010000
// CHECK-INST: fmls za.h[w8, 0, vgx4], { z12.h - z15.h }, z2.h[4]
// CHECK-ENCODING: [0x90,0x99,0x12,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1129990 <unknown>
fmls za.h[w8, 0], {z12.h - z15.h}, z2.h[4] // 11000001-00010010-10011001-10010000
// CHECK-INST: fmls za.h[w8, 0, vgx4], { z12.h - z15.h }, z2.h[4]
// CHECK-ENCODING: [0x90,0x99,0x12,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1129990 <unknown>
fmls za.h[w10, 1, vgx4], {z0.h - z3.h}, z10.h[4] // 11000001-00011010-11011000-00010001
// CHECK-INST: fmls za.h[w10, 1, vgx4], { z0.h - z3.h }, z10.h[4]
// CHECK-ENCODING: [0x11,0xd8,0x1a,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c11ad811 <unknown>
fmls za.h[w10, 1], {z0.h - z3.h}, z10.h[4] // 11000001-00011010-11011000-00010001
// CHECK-INST: fmls za.h[w10, 1, vgx4], { z0.h - z3.h }, z10.h[4]
// CHECK-ENCODING: [0x11,0xd8,0x1a,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c11ad811 <unknown>
fmls za.h[w8, 5, vgx4], {z20.h - z23.h}, z14.h[5] // 11000001-00011110-10011010-10011101
// CHECK-INST: fmls za.h[w8, 5, vgx4], { z20.h - z23.h }, z14.h[5]
// CHECK-ENCODING: [0x9d,0x9a,0x1e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c11e9a9d <unknown>
fmls za.h[w8, 5], {z20.h - z23.h}, z14.h[5] // 11000001-00011110-10011010-10011101
// CHECK-INST: fmls za.h[w8, 5, vgx4], { z20.h - z23.h }, z14.h[5]
// CHECK-ENCODING: [0x9d,0x9a,0x1e,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c11e9a9d <unknown>
fmls za.h[w11, 2, vgx4], {z8.h - z11.h}, z1.h[2] // 11000001-00010001-11110101-00010010
// CHECK-INST: fmls za.h[w11, 2, vgx4], { z8.h - z11.h }, z1.h[2]
// CHECK-ENCODING: [0x12,0xf5,0x11,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c111f512 <unknown>
fmls za.h[w11, 2], {z8.h - z11.h}, z1.h[2] // 11000001-00010001-11110101-00010010
// CHECK-INST: fmls za.h[w11, 2, vgx4], { z8.h - z11.h }, z1.h[2]
// CHECK-ENCODING: [0x12,0xf5,0x11,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c111f512 <unknown>
fmls za.h[w9, 7, vgx4], {z12.h - z15.h}, z11.h[4] // 11000001-00011011-10111001-10010111
// CHECK-INST: fmls za.h[w9, 7, vgx4], { z12.h - z15.h }, z11.h[4]
// CHECK-ENCODING: [0x97,0xb9,0x1b,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c11bb997 <unknown>
fmls za.h[w9, 7], {z12.h - z15.h}, z11.h[4] // 11000001-00011011-10111001-10010111
// CHECK-INST: fmls za.h[w9, 7, vgx4], { z12.h - z15.h }, z11.h[4]
// CHECK-ENCODING: [0x97,0xb9,0x1b,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c11bb997 <unknown>
fmls za.h[w8, 0, vgx4], {z0.h - z3.h}, {z0.h - z3.h} // 11000001-10100001-00010000-00011000
// CHECK-INST: fmls za.h[w8, 0, vgx4], { z0.h - z3.h }, { z0.h - z3.h }
// CHECK-ENCODING: [0x18,0x10,0xa1,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1a11018 <unknown>
fmls za.h[w8, 0], {z0.h - z3.h}, {z0.h - z3.h} // 11000001-10100001-00010000-00011000
// CHECK-INST: fmls za.h[w8, 0, vgx4], { z0.h - z3.h }, { z0.h - z3.h }
// CHECK-ENCODING: [0x18,0x10,0xa1,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1a11018 <unknown>
fmls za.h[w10, 5, vgx4], {z8.h - z11.h}, {z20.h - z23.h} // 11000001-10110101-01010001-00011101
// CHECK-INST: fmls za.h[w10, 5, vgx4], { z8.h - z11.h }, { z20.h - z23.h }
// CHECK-ENCODING: [0x1d,0x51,0xb5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1b5511d <unknown>
fmls za.h[w10, 5], {z8.h - z11.h}, {z20.h - z23.h} // 11000001-10110101-01010001-00011101
// CHECK-INST: fmls za.h[w10, 5, vgx4], { z8.h - z11.h }, { z20.h - z23.h }
// CHECK-ENCODING: [0x1d,0x51,0xb5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1b5511d <unknown>
fmls za.h[w11, 7, vgx4], {z12.h - z15.h}, {z8.h - z11.h} // 11000001-10101001-01110001-10011111
// CHECK-INST: fmls za.h[w11, 7, vgx4], { z12.h - z15.h }, { z8.h - z11.h }
// CHECK-ENCODING: [0x9f,0x71,0xa9,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1a9719f <unknown>
fmls za.h[w11, 7], {z12.h - z15.h}, {z8.h - z11.h} // 11000001-10101001-01110001-10011111
// CHECK-INST: fmls za.h[w11, 7, vgx4], { z12.h - z15.h }, { z8.h - z11.h }
// CHECK-ENCODING: [0x9f,0x71,0xa9,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1a9719f <unknown>
fmls za.h[w11, 7, vgx4], {z28.h - z31.h}, {z28.h - z31.h} // 11000001-10111101-01110011-10011111
// CHECK-INST: fmls za.h[w11, 7, vgx4], { z28.h - z31.h }, { z28.h - z31.h }
// CHECK-ENCODING: [0x9f,0x73,0xbd,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1bd739f <unknown>
fmls za.h[w11, 7], {z28.h - z31.h}, {z28.h - z31.h} // 11000001-10111101-01110011-10011111
// CHECK-INST: fmls za.h[w11, 7, vgx4], { z28.h - z31.h }, { z28.h - z31.h }
// CHECK-ENCODING: [0x9f,0x73,0xbd,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1bd739f <unknown>
fmls za.h[w8, 5, vgx4], {z16.h - z19.h}, {z16.h - z19.h} // 11000001-10110001-00010010-00011101
// CHECK-INST: fmls za.h[w8, 5, vgx4], { z16.h - z19.h }, { z16.h - z19.h }
// CHECK-ENCODING: [0x1d,0x12,0xb1,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1b1121d <unknown>
fmls za.h[w8, 5], {z16.h - z19.h}, {z16.h - z19.h} // 11000001-10110001-00010010-00011101
// CHECK-INST: fmls za.h[w8, 5, vgx4], { z16.h - z19.h }, { z16.h - z19.h }
// CHECK-ENCODING: [0x1d,0x12,0xb1,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1b1121d <unknown>
fmls za.h[w8, 1, vgx4], {z0.h - z3.h}, {z28.h - z31.h} // 11000001-10111101-00010000-00011001
// CHECK-INST: fmls za.h[w8, 1, vgx4], { z0.h - z3.h }, { z28.h - z31.h }
// CHECK-ENCODING: [0x19,0x10,0xbd,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1bd1019 <unknown>
fmls za.h[w8, 1], {z0.h - z3.h}, {z28.h - z31.h} // 11000001-10111101-00010000-00011001
// CHECK-INST: fmls za.h[w8, 1, vgx4], { z0.h - z3.h }, { z28.h - z31.h }
// CHECK-ENCODING: [0x19,0x10,0xbd,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1bd1019 <unknown>
fmls za.h[w10, 0, vgx4], {z16.h - z19.h}, {z20.h - z23.h} // 11000001-10110101-01010010-00011000
// CHECK-INST: fmls za.h[w10, 0, vgx4], { z16.h - z19.h }, { z20.h - z23.h }
// CHECK-ENCODING: [0x18,0x52,0xb5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1b55218 <unknown>
fmls za.h[w10, 0], {z16.h - z19.h}, {z20.h - z23.h} // 11000001-10110101-01010010-00011000
// CHECK-INST: fmls za.h[w10, 0, vgx4], { z16.h - z19.h }, { z20.h - z23.h }
// CHECK-ENCODING: [0x18,0x52,0xb5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1b55218 <unknown>
fmls za.h[w8, 0, vgx4], {z12.h - z15.h}, {z0.h - z3.h} // 11000001-10100001-00010001-10011000
// CHECK-INST: fmls za.h[w8, 0, vgx4], { z12.h - z15.h }, { z0.h - z3.h }
// CHECK-ENCODING: [0x98,0x11,0xa1,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1a11198 <unknown>
fmls za.h[w8, 0], {z12.h - z15.h}, {z0.h - z3.h} // 11000001-10100001-00010001-10011000
// CHECK-INST: fmls za.h[w8, 0, vgx4], { z12.h - z15.h }, { z0.h - z3.h }
// CHECK-ENCODING: [0x98,0x11,0xa1,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1a11198 <unknown>
fmls za.h[w10, 1, vgx4], {z0.h - z3.h}, {z24.h - z27.h} // 11000001-10111001-01010000-00011001
// CHECK-INST: fmls za.h[w10, 1, vgx4], { z0.h - z3.h }, { z24.h - z27.h }
// CHECK-ENCODING: [0x19,0x50,0xb9,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1b95019 <unknown>
fmls za.h[w10, 1], {z0.h - z3.h}, {z24.h - z27.h} // 11000001-10111001-01010000-00011001
// CHECK-INST: fmls za.h[w10, 1, vgx4], { z0.h - z3.h }, { z24.h - z27.h }
// CHECK-ENCODING: [0x19,0x50,0xb9,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1b95019 <unknown>
fmls za.h[w8, 5, vgx4], {z20.h - z23.h}, {z28.h - z31.h} // 11000001-10111101-00010010-10011101
// CHECK-INST: fmls za.h[w8, 5, vgx4], { z20.h - z23.h }, { z28.h - z31.h }
// CHECK-ENCODING: [0x9d,0x12,0xbd,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1bd129d <unknown>
fmls za.h[w8, 5], {z20.h - z23.h}, {z28.h - z31.h} // 11000001-10111101-00010010-10011101
// CHECK-INST: fmls za.h[w8, 5, vgx4], { z20.h - z23.h }, { z28.h - z31.h }
// CHECK-ENCODING: [0x9d,0x12,0xbd,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1bd129d <unknown>
fmls za.h[w11, 2, vgx4], {z8.h - z11.h}, {z0.h - z3.h} // 11000001-10100001-01110001-00011010
// CHECK-INST: fmls za.h[w11, 2, vgx4], { z8.h - z11.h }, { z0.h - z3.h }
// CHECK-ENCODING: [0x1a,0x71,0xa1,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1a1711a <unknown>
fmls za.h[w11, 2], {z8.h - z11.h}, {z0.h - z3.h} // 11000001-10100001-01110001-00011010
// CHECK-INST: fmls za.h[w11, 2, vgx4], { z8.h - z11.h }, { z0.h - z3.h }
// CHECK-ENCODING: [0x1a,0x71,0xa1,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1a1711a <unknown>
fmls za.h[w9, 7, vgx4], {z12.h - z15.h}, {z8.h - z11.h} // 11000001-10101001-00110001-10011111
// CHECK-INST: fmls za.h[w9, 7, vgx4], { z12.h - z15.h }, { z8.h - z11.h }
// CHECK-ENCODING: [0x9f,0x31,0xa9,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1a9319f <unknown>
fmls za.h[w9, 7], {z12.h - z15.h}, {z8.h - z11.h} // 11000001-10101001-00110001-10011111
// CHECK-INST: fmls za.h[w9, 7, vgx4], { z12.h - z15.h }, { z8.h - z11.h }
// CHECK-ENCODING: [0x9f,0x31,0xa9,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: c1a9319f <unknown>
diff --git a/llvm/test/MC/AArch64/SME2p1/fmopa-diagnostics.s b/llvm/test/MC/AArch64/SME2p1/fmopa-diagnostics.s
index def19a316c2a..1c561959c25e 100644
--- a/llvm/test/MC/AArch64/SME2p1/fmopa-diagnostics.s
+++ b/llvm/test/MC/AArch64/SME2p1/fmopa-diagnostics.s
@@ -1,4 +1,4 @@
-// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+sme-f16f16 2>&1 < %s | FileCheck %s
+// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f16f16 2>&1 < %s | FileCheck %s
// --------------------------------------------------------------------------//
// Invalid predicate register
diff --git a/llvm/test/MC/AArch64/SME2p1/fmopa.s b/llvm/test/MC/AArch64/SME2p1/fmopa.s
index e53d21244fde..0a586d3acc42 100644
--- a/llvm/test/MC/AArch64/SME2p1/fmopa.s
+++ b/llvm/test/MC/AArch64/SME2p1/fmopa.s
@@ -1,85 +1,85 @@
-// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+sme-f16f16 < %s \
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f16f16 < %s \
// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
// RUN: | FileCheck %s --check-prefix=CHECK-ERROR
-// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p1,+sme-f16f16 < %s \
-// RUN: | llvm-objdump -d --mattr=+sme2p1,+sme-f16f16 - | FileCheck %s --check-prefix=CHECK-INST
-// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p1,+sme-f16f16 < %s \
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme-f16f16 < %s \
+// RUN: | llvm-objdump -d --mattr=+sme-f16f16 - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme-f16f16 < %s \
// RUN: | llvm-objdump -d --mattr=-sme2p1 - | FileCheck %s --check-prefix=CHECK-UNKNOWN
-// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+sme-f16f16 < %s \
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f16f16 < %s \
// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \
-// RUN: | llvm-mc -triple=aarch64 -mattr=+sme2p1,+sme-f16f16 -disassemble -show-encoding \
+// RUN: | llvm-mc -triple=aarch64 -mattr=+sme-f16f16 -disassemble -show-encoding \
// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
fmopa za0.h, p0/m, p0/m, z0.h, z0.h // 10000001-10000000-00000000-00001000
// CHECK-INST: fmopa za0.h, p0/m, p0/m, z0.h, z0.h
// CHECK-ENCODING: [0x08,0x00,0x80,0x81]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: 81800008 <unknown>
fmopa za1.h, p5/m, p2/m, z10.h, z21.h // 10000001-10010101-01010101-01001001
// CHECK-INST: fmopa za1.h, p5/m, p2/m, z10.h, z21.h
// CHECK-ENCODING: [0x49,0x55,0x95,0x81]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: 81955549 <unknown>
fmopa za1.h, p3/m, p7/m, z13.h, z8.h // 10000001-10001000-11101101-10101001
// CHECK-INST: fmopa za1.h, p3/m, p7/m, z13.h, z8.h
// CHECK-ENCODING: [0xa9,0xed,0x88,0x81]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: 8188eda9 <unknown>
fmopa za1.h, p7/m, p7/m, z31.h, z31.h // 10000001-10011111-11111111-11101001
// CHECK-INST: fmopa za1.h, p7/m, p7/m, z31.h, z31.h
// CHECK-ENCODING: [0xe9,0xff,0x9f,0x81]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: 819fffe9 <unknown>
fmopa za1.h, p3/m, p0/m, z17.h, z16.h // 10000001-10010000-00001110-00101001
// CHECK-INST: fmopa za1.h, p3/m, p0/m, z17.h, z16.h
// CHECK-ENCODING: [0x29,0x0e,0x90,0x81]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: 81900e29 <unknown>
fmopa za1.h, p1/m, p4/m, z1.h, z30.h // 10000001-10011110-10000100-00101001
// CHECK-INST: fmopa za1.h, p1/m, p4/m, z1.h, z30.h
// CHECK-ENCODING: [0x29,0x84,0x9e,0x81]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: 819e8429 <unknown>
fmopa za0.h, p5/m, p2/m, z19.h, z20.h // 10000001-10010100-01010110-01101000
// CHECK-INST: fmopa za0.h, p5/m, p2/m, z19.h, z20.h
// CHECK-ENCODING: [0x68,0x56,0x94,0x81]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: 81945668 <unknown>
fmopa za0.h, p6/m, p0/m, z12.h, z2.h // 10000001-10000010-00011001-10001000
// CHECK-INST: fmopa za0.h, p6/m, p0/m, z12.h, z2.h
// CHECK-ENCODING: [0x88,0x19,0x82,0x81]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: 81821988 <unknown>
fmopa za1.h, p2/m, p6/m, z1.h, z26.h // 10000001-10011010-11001000-00101001
// CHECK-INST: fmopa za1.h, p2/m, p6/m, z1.h, z26.h
// CHECK-ENCODING: [0x29,0xc8,0x9a,0x81]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: 819ac829 <unknown>
fmopa za1.h, p2/m, p0/m, z22.h, z30.h // 10000001-10011110-00001010-11001001
// CHECK-INST: fmopa za1.h, p2/m, p0/m, z22.h, z30.h
// CHECK-ENCODING: [0xc9,0x0a,0x9e,0x81]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: 819e0ac9 <unknown>
fmopa za0.h, p5/m, p7/m, z9.h, z1.h // 10000001-10000001-11110101-00101000
// CHECK-INST: fmopa za0.h, p5/m, p7/m, z9.h, z1.h
// CHECK-ENCODING: [0x28,0xf5,0x81,0x81]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: 8181f528 <unknown>
fmopa za1.h, p2/m, p5/m, z12.h, z11.h // 10000001-10001011-10101001-10001001
// CHECK-INST: fmopa za1.h, p2/m, p5/m, z12.h, z11.h
// CHECK-ENCODING: [0x89,0xa9,0x8b,0x81]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: 818ba989 <unknown>
diff --git a/llvm/test/MC/AArch64/SME2p1/fmops-diagnostics.s b/llvm/test/MC/AArch64/SME2p1/fmops-diagnostics.s
index 75eea8113262..0ec227ae0e68 100644
--- a/llvm/test/MC/AArch64/SME2p1/fmops-diagnostics.s
+++ b/llvm/test/MC/AArch64/SME2p1/fmops-diagnostics.s
@@ -1,4 +1,4 @@
-// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+sme-f16f16 2>&1 < %s | FileCheck %s
+// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f16f16 2>&1 < %s | FileCheck %s
// --------------------------------------------------------------------------//
// Invalid predicate register
diff --git a/llvm/test/MC/AArch64/SME2p1/fmops.s b/llvm/test/MC/AArch64/SME2p1/fmops.s
index 325d4c125b60..597665d59150 100644
--- a/llvm/test/MC/AArch64/SME2p1/fmops.s
+++ b/llvm/test/MC/AArch64/SME2p1/fmops.s
@@ -1,84 +1,84 @@
-// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+sme-f16f16 < %s \
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f16f16 < %s \
// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
// RUN: | FileCheck %s --check-prefix=CHECK-ERROR
-// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p1,+sme-f16f16 < %s \
-// RUN: | llvm-objdump -d --mattr=+sme2p1,+sme-f16f16 - | FileCheck %s --check-prefix=CHECK-INST
-// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p1,+sme-f16f16 < %s \
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme-f16f16 < %s \
+// RUN: | llvm-objdump -d --mattr=+sme-f16f16 - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme-f16f16 < %s \
// RUN: | llvm-objdump -d --mattr=-sme2p1 - | FileCheck %s --check-prefix=CHECK-UNKNOWN
-// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+sme-f16f16 < %s \
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f16f16 < %s \
// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \
-// RUN: | llvm-mc -triple=aarch64 -mattr=+sme2p1,+sme-f16f16 -disassemble -show-encoding \
+// RUN: | llvm-mc -triple=aarch64 -mattr=+sme-f16f16 -disassemble -show-encoding \
// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
fmops za0.h, p0/m, p0/m, z0.h, z0.h // 10000001-10000000-00000000-00011000
// CHECK-INST: fmops za0.h, p0/m, p0/m, z0.h, z0.h
// CHECK-ENCODING: [0x18,0x00,0x80,0x81]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: 81800018 <unknown>
fmops za1.h, p5/m, p2/m, z10.h, z21.h // 10000001-10010101-01010101-01011001
// CHECK-INST: fmops za1.h, p5/m, p2/m, z10.h, z21.h
// CHECK-ENCODING: [0x59,0x55,0x95,0x81]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: 81955559 <unknown>
fmops za1.h, p3/m, p7/m, z13.h, z8.h // 10000001-10001000-11101101-10111001
// CHECK-INST: fmops za1.h, p3/m, p7/m, z13.h, z8.h
// CHECK-ENCODING: [0xb9,0xed,0x88,0x81]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: 8188edb9 <unknown>
fmops za1.h, p7/m, p7/m, z31.h, z31.h // 10000001-10011111-11111111-11111001
// CHECK-INST: fmops za1.h, p7/m, p7/m, z31.h, z31.h
// CHECK-ENCODING: [0xf9,0xff,0x9f,0x81]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: 819ffff9 <unknown>
fmops za1.h, p3/m, p0/m, z17.h, z16.h // 10000001-10010000-00001110-00111001
// CHECK-INST: fmops za1.h, p3/m, p0/m, z17.h, z16.h
// CHECK-ENCODING: [0x39,0x0e,0x90,0x81]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: 81900e39 <unknown>
fmops za1.h, p1/m, p4/m, z1.h, z30.h // 10000001-10011110-10000100-00111001
// CHECK-INST: fmops za1.h, p1/m, p4/m, z1.h, z30.h
// CHECK-ENCODING: [0x39,0x84,0x9e,0x81]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: 819e8439 <unknown>
fmops za0.h, p5/m, p2/m, z19.h, z20.h // 10000001-10010100-01010110-01111000
// CHECK-INST: fmops za0.h, p5/m, p2/m, z19.h, z20.h
// CHECK-ENCODING: [0x78,0x56,0x94,0x81]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: 81945678 <unknown>
fmops za0.h, p6/m, p0/m, z12.h, z2.h // 10000001-10000010-00011001-10011000
// CHECK-INST: fmops za0.h, p6/m, p0/m, z12.h, z2.h
// CHECK-ENCODING: [0x98,0x19,0x82,0x81]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: 81821998 <unknown>
fmops za1.h, p2/m, p6/m, z1.h, z26.h // 10000001-10011010-11001000-00111001
// CHECK-INST: fmops za1.h, p2/m, p6/m, z1.h, z26.h
// CHECK-ENCODING: [0x39,0xc8,0x9a,0x81]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: 819ac839 <unknown>
fmops za1.h, p2/m, p0/m, z22.h, z30.h // 10000001-10011110-00001010-11011001
// CHECK-INST: fmops za1.h, p2/m, p0/m, z22.h, z30.h
// CHECK-ENCODING: [0xd9,0x0a,0x9e,0x81]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: 819e0ad9 <unknown>
fmops za0.h, p5/m, p7/m, z9.h, z1.h // 10000001-10000001-11110101-00111000
// CHECK-INST: fmops za0.h, p5/m, p7/m, z9.h, z1.h
// CHECK-ENCODING: [0x38,0xf5,0x81,0x81]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: 8181f538 <unknown>
fmops za1.h, p2/m, p5/m, z12.h, z11.h // 10000001-10001011-10101001-10011001
// CHECK-INST: fmops za1.h, p2/m, p5/m, z12.h, z11.h
// CHECK-ENCODING: [0x99,0xa9,0x8b,0x81]
-// CHECK-ERROR: instruction requires: sme2p1 sme-f16f16
+// CHECK-ERROR: instruction requires: sme-f16f16
// CHECK-UNKNOWN: 818ba999 <unknown>
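In these SME tests, the trailing binary comment on each instruction and its CHECK-ENCODING byte list describe the same 32-bit word; the byte list is in little-endian memory order. A minimal Python sketch (hypothetical helper names, not part of the patch) cross-checking the first fmops case above:

  # Sketch: a test's binary comment and its CHECK-ENCODING byte list
  # are the same word; the bytes are in little-endian memory order.
  def word_from_comment(comment: str) -> int:
      # "10000001-10000000-00000000-00011000" -> 0x81800018
      return int(comment.replace("-", ""), 2)

  def word_from_bytes(enc: list) -> int:
      # [0x18, 0x00, 0x80, 0x81] -> 0x81800018
      return int.from_bytes(bytes(enc), "little")

  assert word_from_comment("10000001-10000000-00000000-00011000") == 0x81800018
  assert word_from_bytes([0x18, 0x00, 0x80, 0x81]) == 0x81800018

The CHECK-UNKNOWN value is this same little-endian word, printed when the feature is disabled.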
diff --git a/llvm/test/MC/AArch64/SME2p1/fsub-diagnostics.s b/llvm/test/MC/AArch64/SME2p1/fsub-diagnostics.s
index 716427a2f725..60cef4260be9 100644
--- a/llvm/test/MC/AArch64/SME2p1/fsub-diagnostics.s
+++ b/llvm/test/MC/AArch64/SME2p1/fsub-diagnostics.s
@@ -1,5 +1,4 @@
-// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+sme-f16f16 2>&1 < %s | FileCheck %s
-
+// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f16f16 2>&1 < %s | FileCheck %s
// --------------------------------------------------------------------------//
// Out of range index offset
diff --git a/llvm/test/MC/AArch64/SME2p1/fsub.s b/llvm/test/MC/AArch64/SME2p1/fsub.s
index b3735d554765..66410008eb11 100644
--- a/llvm/test/MC/AArch64/SME2p1/fsub.s
+++ b/llvm/test/MC/AArch64/SME2p1/fsub.s
@@ -1,296 +1,298 @@
-// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+sme-f16f16 < %s \
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f16f16 < %s \
+// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f8f16 < %s \
// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
// RUN: | FileCheck %s --check-prefix=CHECK-ERROR
-// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p1,+sme-f16f16 < %s \
-// RUN: | llvm-objdump -d --mattr=+sme2p1,+sme-f16f16 - | FileCheck %s --check-prefix=CHECK-INST
-// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p1,+sme-f16f16 < %s \
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme-f16f16 < %s \
+// RUN: | llvm-objdump -d --mattr=+sme-f16f16 - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme-f16f16 < %s \
// RUN: | llvm-objdump -d --mattr=-sme2p1 - | FileCheck %s --check-prefix=CHECK-UNKNOWN
-// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p1,+sme-f16f16 < %s \
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme-f16f16 < %s \
// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \
-// RUN: | llvm-mc -triple=aarch64 -mattr=+sme2p1,+sme-f16f16 -disassemble -show-encoding \
+// RUN: | llvm-mc -triple=aarch64 -mattr=+sme-f16f16 -disassemble -show-encoding \
// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
fsub za.h[w8, 0], {z0.h - z1.h} // 11000001-10100100-00011100-00001000
// CHECK-INST: fsub za.h[w8, 0, vgx2], { z0.h, z1.h }
// CHECK-ENCODING: [0x08,0x1c,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a41c08 <unknown>
fsub za.h[w10, 5, vgx2], {z10.h, z11.h} // 11000001-10100100-01011101-01001101
// CHECK-INST: fsub za.h[w10, 5, vgx2], { z10.h, z11.h }
// CHECK-ENCODING: [0x4d,0x5d,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a45d4d <unknown>
fsub za.h[w10, 5], {z10.h - z11.h} // 11000001-10100100-01011101-01001101
// CHECK-INST: fsub za.h[w10, 5, vgx2], { z10.h, z11.h }
// CHECK-ENCODING: [0x4d,0x5d,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a45d4d <unknown>
fsub za.h[w11, 7, vgx2], {z12.h, z13.h} // 11000001-10100100-01111101-10001111
// CHECK-INST: fsub za.h[w11, 7, vgx2], { z12.h, z13.h }
// CHECK-ENCODING: [0x8f,0x7d,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a47d8f <unknown>
fsub za.h[w11, 7], {z12.h - z13.h} // 11000001-10100100-01111101-10001111
// CHECK-INST: fsub za.h[w11, 7, vgx2], { z12.h, z13.h }
// CHECK-ENCODING: [0x8f,0x7d,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a47d8f <unknown>
fsub za.h[w11, 7, vgx2], {z30.h, z31.h} // 11000001-10100100-01111111-11001111
// CHECK-INST: fsub za.h[w11, 7, vgx2], { z30.h, z31.h }
// CHECK-ENCODING: [0xcf,0x7f,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a47fcf <unknown>
fsub za.h[w11, 7], {z30.h - z31.h} // 11000001-10100100-01111111-11001111
// CHECK-INST: fsub za.h[w11, 7, vgx2], { z30.h, z31.h }
// CHECK-ENCODING: [0xcf,0x7f,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a47fcf <unknown>
fsub za.h[w8, 5, vgx2], {z16.h, z17.h} // 11000001-10100100-00011110-00001101
// CHECK-INST: fsub za.h[w8, 5, vgx2], { z16.h, z17.h }
// CHECK-ENCODING: [0x0d,0x1e,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a41e0d <unknown>
fsub za.h[w8, 5], {z16.h - z17.h} // 11000001-10100100-00011110-00001101
// CHECK-INST: fsub za.h[w8, 5, vgx2], { z16.h, z17.h }
// CHECK-ENCODING: [0x0d,0x1e,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a41e0d <unknown>
fsub za.h[w8, 1, vgx2], {z0.h, z1.h} // 11000001-10100100-00011100-00001001
// CHECK-INST: fsub za.h[w8, 1, vgx2], { z0.h, z1.h }
// CHECK-ENCODING: [0x09,0x1c,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a41c09 <unknown>
fsub za.h[w8, 1], {z0.h - z1.h} // 11000001-10100100-00011100-00001001
// CHECK-INST: fsub za.h[w8, 1, vgx2], { z0.h, z1.h }
// CHECK-ENCODING: [0x09,0x1c,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a41c09 <unknown>
fsub za.h[w10, 0, vgx2], {z18.h, z19.h}  // 11000001-10100100-01011110-01001000
// CHECK-INST: fsub za.h[w10, 0, vgx2], { z18.h, z19.h }
// CHECK-ENCODING: [0x48,0x5e,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a45e48 <unknown>
fsub za.h[w10, 0], {z18.h - z19.h} // 11000001-10100100-01011110-01001000
// CHECK-INST: fsub za.h[w10, 0, vgx2], { z18.h, z19.h }
// CHECK-ENCODING: [0x48,0x5e,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a45e48 <unknown>
fsub za.h[w8, 0, vgx2], {z12.h, z13.h} // 11000001-10100100-00011101-10001000
// CHECK-INST: fsub za.h[w8, 0, vgx2], { z12.h, z13.h }
// CHECK-ENCODING: [0x88,0x1d,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a41d88 <unknown>
fsub za.h[w8, 0], {z12.h - z13.h} // 11000001-10100100-00011101-10001000
// CHECK-INST: fsub za.h[w8, 0, vgx2], { z12.h, z13.h }
// CHECK-ENCODING: [0x88,0x1d,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a41d88 <unknown>
fsub za.h[w10, 1, vgx2], {z0.h, z1.h} // 11000001-10100100-01011100-00001001
// CHECK-INST: fsub za.h[w10, 1, vgx2], { z0.h, z1.h }
// CHECK-ENCODING: [0x09,0x5c,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a45c09 <unknown>
fsub za.h[w10, 1], {z0.h - z1.h} // 11000001-10100100-01011100-00001001
// CHECK-INST: fsub za.h[w10, 1, vgx2], { z0.h, z1.h }
// CHECK-ENCODING: [0x09,0x5c,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a45c09 <unknown>
fsub za.h[w8, 5, vgx2], {z22.h, z23.h}  // 11000001-10100100-00011110-11001101
// CHECK-INST: fsub za.h[w8, 5, vgx2], { z22.h, z23.h }
// CHECK-ENCODING: [0xcd,0x1e,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a41ecd <unknown>
fsub za.h[w8, 5], {z22.h - z23.h} // 11000001-10100100-00011110-11001101
// CHECK-INST: fsub za.h[w8, 5, vgx2], { z22.h, z23.h }
// CHECK-ENCODING: [0xcd,0x1e,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a41ecd <unknown>
fsub za.h[w11, 2, vgx2], {z8.h, z9.h} // 11000001-10100100-01111101-00001010
// CHECK-INST: fsub za.h[w11, 2, vgx2], { z8.h, z9.h }
// CHECK-ENCODING: [0x0a,0x7d,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a47d0a <unknown>
fsub za.h[w11, 2], {z8.h - z9.h} // 11000001-10100100-01111101-00001010
// CHECK-INST: fsub za.h[w11, 2, vgx2], { z8.h, z9.h }
// CHECK-ENCODING: [0x0a,0x7d,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a47d0a <unknown>
fsub za.h[w9, 7, vgx2], {z12.h, z13.h} // 11000001-10100100-00111101-10001111
// CHECK-INST: fsub za.h[w9, 7, vgx2], { z12.h, z13.h }
// CHECK-ENCODING: [0x8f,0x3d,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a43d8f <unknown>
fsub za.h[w9, 7], {z12.h - z13.h} // 11000001-10100100-00111101-10001111
// CHECK-INST: fsub za.h[w9, 7, vgx2], { z12.h, z13.h }
// CHECK-ENCODING: [0x8f,0x3d,0xa4,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a43d8f <unknown>
fsub za.h[w8, 0, vgx4], {z0.h - z3.h} // 11000001-10100101-00011100-00001000
// CHECK-INST: fsub za.h[w8, 0, vgx4], { z0.h - z3.h }
// CHECK-ENCODING: [0x08,0x1c,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a51c08 <unknown>
fsub za.h[w8, 0], {z0.h - z3.h} // 11000001-10100101-00011100-00001000
// CHECK-INST: fsub za.h[w8, 0, vgx4], { z0.h - z3.h }
// CHECK-ENCODING: [0x08,0x1c,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a51c08 <unknown>
fsub za.h[w10, 5, vgx4], {z8.h - z11.h} // 11000001-10100101-01011101-00001101
// CHECK-INST: fsub za.h[w10, 5, vgx4], { z8.h - z11.h }
// CHECK-ENCODING: [0x0d,0x5d,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a55d0d <unknown>
fsub za.h[w10, 5], {z8.h - z11.h} // 11000001-10100101-01011101-00001101
// CHECK-INST: fsub za.h[w10, 5, vgx4], { z8.h - z11.h }
// CHECK-ENCODING: [0x0d,0x5d,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a55d0d <unknown>
fsub za.h[w11, 7, vgx4], {z12.h - z15.h} // 11000001-10100101-01111101-10001111
// CHECK-INST: fsub za.h[w11, 7, vgx4], { z12.h - z15.h }
// CHECK-ENCODING: [0x8f,0x7d,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a57d8f <unknown>
fsub za.h[w11, 7], {z12.h - z15.h} // 11000001-10100101-01111101-10001111
// CHECK-INST: fsub za.h[w11, 7, vgx4], { z12.h - z15.h }
// CHECK-ENCODING: [0x8f,0x7d,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a57d8f <unknown>
fsub za.h[w11, 7, vgx4], {z28.h - z31.h} // 11000001-10100101-01111111-10001111
// CHECK-INST: fsub za.h[w11, 7, vgx4], { z28.h - z31.h }
// CHECK-ENCODING: [0x8f,0x7f,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a57f8f <unknown>
fsub za.h[w11, 7], {z28.h - z31.h} // 11000001-10100101-01111111-10001111
// CHECK-INST: fsub za.h[w11, 7, vgx4], { z28.h - z31.h }
// CHECK-ENCODING: [0x8f,0x7f,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a57f8f <unknown>
fsub za.h[w8, 5, vgx4], {z16.h - z19.h} // 11000001-10100101-00011110-00001101
// CHECK-INST: fsub za.h[w8, 5, vgx4], { z16.h - z19.h }
// CHECK-ENCODING: [0x0d,0x1e,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a51e0d <unknown>
fsub za.h[w8, 5], {z16.h - z19.h} // 11000001-10100101-00011110-00001101
// CHECK-INST: fsub za.h[w8, 5, vgx4], { z16.h - z19.h }
// CHECK-ENCODING: [0x0d,0x1e,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a51e0d <unknown>
fsub za.h[w8, 1, vgx4], {z0.h - z3.h} // 11000001-10100101-00011100-00001001
// CHECK-INST: fsub za.h[w8, 1, vgx4], { z0.h - z3.h }
// CHECK-ENCODING: [0x09,0x1c,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a51c09 <unknown>
fsub za.h[w8, 1], {z0.h - z3.h} // 11000001-10100101-00011100-00001001
// CHECK-INST: fsub za.h[w8, 1, vgx4], { z0.h - z3.h }
// CHECK-ENCODING: [0x09,0x1c,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a51c09 <unknown>
fsub za.h[w10, 0, vgx4], {z16.h - z19.h} // 11000001-10100101-01011110-00001000
// CHECK-INST: fsub za.h[w10, 0, vgx4], { z16.h - z19.h }
// CHECK-ENCODING: [0x08,0x5e,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a55e08 <unknown>
fsub za.h[w10, 0], {z16.h - z19.h} // 11000001-10100101-01011110-00001000
// CHECK-INST: fsub za.h[w10, 0, vgx4], { z16.h - z19.h }
// CHECK-ENCODING: [0x08,0x5e,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a55e08 <unknown>
fsub za.h[w8, 0, vgx4], {z12.h - z15.h} // 11000001-10100101-00011101-10001000
// CHECK-INST: fsub za.h[w8, 0, vgx4], { z12.h - z15.h }
// CHECK-ENCODING: [0x88,0x1d,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a51d88 <unknown>
fsub za.h[w8, 0], {z12.h - z15.h} // 11000001-10100101-00011101-10001000
// CHECK-INST: fsub za.h[w8, 0, vgx4], { z12.h - z15.h }
// CHECK-ENCODING: [0x88,0x1d,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a51d88 <unknown>
fsub za.h[w10, 1, vgx4], {z0.h - z3.h} // 11000001-10100101-01011100-00001001
// CHECK-INST: fsub za.h[w10, 1, vgx4], { z0.h - z3.h }
// CHECK-ENCODING: [0x09,0x5c,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a55c09 <unknown>
fsub za.h[w10, 1], {z0.h - z3.h} // 11000001-10100101-01011100-00001001
// CHECK-INST: fsub za.h[w10, 1, vgx4], { z0.h - z3.h }
// CHECK-ENCODING: [0x09,0x5c,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a55c09 <unknown>
fsub za.h[w8, 5, vgx4], {z20.h - z23.h} // 11000001-10100101-00011110-10001101
// CHECK-INST: fsub za.h[w8, 5, vgx4], { z20.h - z23.h }
// CHECK-ENCODING: [0x8d,0x1e,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a51e8d <unknown>
fsub za.h[w8, 5], {z20.h - z23.h} // 11000001-10100101-00011110-10001101
// CHECK-INST: fsub za.h[w8, 5, vgx4], { z20.h - z23.h }
// CHECK-ENCODING: [0x8d,0x1e,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a51e8d <unknown>
fsub za.h[w11, 2, vgx4], {z8.h - z11.h} // 11000001-10100101-01111101-00001010
// CHECK-INST: fsub za.h[w11, 2, vgx4], { z8.h - z11.h }
// CHECK-ENCODING: [0x0a,0x7d,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a57d0a <unknown>
fsub za.h[w11, 2], {z8.h - z11.h} // 11000001-10100101-01111101-00001010
// CHECK-INST: fsub za.h[w11, 2, vgx4], { z8.h - z11.h }
// CHECK-ENCODING: [0x0a,0x7d,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a57d0a <unknown>
fsub za.h[w9, 7, vgx4], {z12.h - z15.h} // 11000001-10100101-00111101-10001111
// CHECK-INST: fsub za.h[w9, 7, vgx4], { z12.h - z15.h }
// CHECK-ENCODING: [0x8f,0x3d,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a53d8f <unknown>
fsub za.h[w9, 7], {z12.h - z15.h} // 11000001-10100101-00111101-10001111
// CHECK-INST: fsub za.h[w9, 7, vgx4], { z12.h - z15.h }
// CHECK-ENCODING: [0x8f,0x3d,0xa5,0xc1]
-// CHECK-ERROR: instruction requires: sme2p1
+// CHECK-ERROR: instruction requires: sme-f16f16 or sme-f8f16
// CHECK-UNKNOWN: c1a53d8f <unknown>
diff --git a/llvm/test/MC/AMDGPU/ds-err.s b/llvm/test/MC/AMDGPU/ds-err.s
index 2d25fdf5e302..c31f4c759395 100644
--- a/llvm/test/MC/AMDGPU/ds-err.s
+++ b/llvm/test/MC/AMDGPU/ds-err.s
@@ -18,19 +18,19 @@ ds_write2_b32 v2, v4, v6 offset0:4 offset0:8
ds_write2_b32 v2, v4, v6 offset1:4 offset1:8
// offset0 too big
-// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: invalid offset0 value.
ds_write2_b32 v2, v4, v6 offset0:1000000000
// offset0 too big
-// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: invalid offset0 value.
ds_write2_b32 v2, v4, v6 offset0:0x100
// offset1 too big
-// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: invalid offset1 value.
ds_write2_b32 v2, v4, v6 offset1:1000000000
// offset1 too big
-// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+// CHECK: :[[@LINE+1]]:{{[0-9]+}}: error: invalid offset1 value.
ds_write2_b32 v2, v4, v6 offset1:0x100
//===----------------------------------------------------------------------===//
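The reworded diagnostics above name the offending field. Both fields appear to be 8-bit, since the tests reject offset0:0x100 and offset1:0x100. A hedged Python sketch of the range check these diagnostics imply (the field width is inferred from the tests, not stated by the patch):

  # Sketch of the implied check: ds_write2 offset0/offset1 look 8-bit,
  # given that 0x100 draws "invalid offsetN value." in the tests above.
  def check_ds_offset(name: str, value: int) -> None:
      if not 0 <= value <= 0xFF:  # assumed 8-bit field
          raise ValueError(f"invalid {name} value.")

  check_ds_offset("offset0", 255)      # accepted
  # check_ds_offset("offset0", 0x100)  # raises: invalid offset0 value.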
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_err.s b/llvm/test/MC/AMDGPU/gfx11_asm_err.s
index 3ec31626be5b..7f99afe01925 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_err.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_err.s
@@ -22,13 +22,13 @@ s_delay_alu instid0(VALU_DEP_1) | SALU_CYCLE_1)
// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: expected a left parenthesis
lds_direct_load v15 wait_vdst:16
-// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid wait_vdst value.
lds_direct_load v15 wait_vdst
// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
v_interp_p10_f32 v0, v1, v2, v3 wait_exp:8
-// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid wait_exp value.
v_interp_p2_f32 v0, -v1, v2, v3 wait_exp
// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s
index be9edc3e019e..b0854881d428 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s
@@ -1126,6 +1126,18 @@ v_cvt_sr_fp8_f32 v10, s2, v5
v_cvt_sr_fp8_f32 v5, -|v255|, v4
// GFX12: encoding: [0x05,0x01,0x6b,0xd7,0xff,0x09,0x02,0x20]
+v_cvt_sr_fp8_f32 v1, v2, v3 byte_sel:0
+// GFX12: v_cvt_sr_fp8_f32 v1, v2, v3 ; encoding: [0x01,0x00,0x6b,0xd7,0x02,0x07,0x02,0x00]
+
+v_cvt_sr_fp8_f32 v1, v2, v3 byte_sel:1
+// GFX12: v_cvt_sr_fp8_f32 v1, v2, v3 byte_sel:1 ; encoding: [0x01,0x20,0x6b,0xd7,0x02,0x07,0x02,0x00]
+
+v_cvt_sr_fp8_f32 v1, v2, v3 byte_sel:2
+// GFX12: v_cvt_sr_fp8_f32 v1, v2, v3 byte_sel:2 ; encoding: [0x01,0x40,0x6b,0xd7,0x02,0x07,0x02,0x00]
+
+v_cvt_sr_fp8_f32 v1, v2, v3 byte_sel:3
+// GFX12: v_cvt_sr_fp8_f32 v1, v2, v3 byte_sel:3 ; encoding: [0x01,0x60,0x6b,0xd7,0x02,0x07,0x02,0x00]
+
v_cvt_sr_bf8_f32 v1, v2, v3
// GFX12: encoding: [0x01,0x00,0x6c,0xd7,0x02,0x07,0x02,0x00]
@@ -1135,6 +1147,18 @@ v_cvt_sr_bf8_f32 v10, s2, v5
v_cvt_sr_bf8_f32 v5, -|v255|, v4
// GFX12: encoding: [0x05,0x01,0x6c,0xd7,0xff,0x09,0x02,0x20]
+v_cvt_sr_bf8_f32 v1, v2, v3 byte_sel:0
+// GFX12: v_cvt_sr_bf8_f32 v1, v2, v3 ; encoding: [0x01,0x00,0x6c,0xd7,0x02,0x07,0x02,0x00]
+
+v_cvt_sr_bf8_f32 v1, v2, v3 byte_sel:1
+// GFX12: v_cvt_sr_bf8_f32 v1, v2, v3 byte_sel:1 ; encoding: [0x01,0x20,0x6c,0xd7,0x02,0x07,0x02,0x00]
+
+v_cvt_sr_bf8_f32 v1, v2, v3 byte_sel:2
+// GFX12: v_cvt_sr_bf8_f32 v1, v2, v3 byte_sel:2 ; encoding: [0x01,0x40,0x6c,0xd7,0x02,0x07,0x02,0x00]
+
+v_cvt_sr_bf8_f32 v1, v2, v3 byte_sel:3
+// GFX12: v_cvt_sr_bf8_f32 v1, v2, v3 byte_sel:3 ; encoding: [0x01,0x60,0x6c,0xd7,0x02,0x07,0x02,0x00]
+
v_cvt_pk_i16_f32 v5, v1, v2
// GFX12: encoding: [0x05,0x00,0x06,0xd7,0x01,0x05,0x02,0x00]
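The byte_sel encodings above follow a visible pattern: the second encoding byte is 0x00, 0x20, 0x40, 0x60 for byte_sel 0 through 3, so the field appears to occupy bits 5-6 of that byte (byte_sel:0 is also the default and is omitted when disassembled). A small Python sketch, inferred from the CHECK lines rather than from the encoding documentation:

  # Sketch: byte_sel lands in bits 5-6 of the second encoding byte,
  # per the 0x00/0x20/0x40/0x60 values in the CHECK lines above.
  def vop3_second_byte(byte_sel: int, base: int = 0x00) -> int:
      assert 0 <= byte_sel <= 3, "invalid byte_sel value."
      return base | (byte_sel << 5)

  assert [vop3_second_byte(b) for b in range(4)] == [0x00, 0x20, 0x40, 0x60]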
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s
index d0e309adce41..16cd8d5aa5e9 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s
@@ -1192,6 +1192,18 @@ v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:
v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1
// GFX12: encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed]
+v_cvt_sr_bf8_f32 v1, v2, v3 byte_sel:0 quad_perm:[3,2,1,0]
+// GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x00,0x02,0x1b,0x00,0xff]
+
+v_cvt_sr_bf8_f32 v1, v2, v3 byte_sel:1 quad_perm:[3,2,1,0]
+// GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, v2, v3 byte_sel:1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x20,0x6c,0xd7,0xfa,0x06,0x02,0x00,0x02,0x1b,0x00,0xff]
+
+v_cvt_sr_bf8_f32 v1, v2, v3 byte_sel:2 quad_perm:[3,2,1,0]
+// GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, v2, v3 byte_sel:2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x40,0x6c,0xd7,0xfa,0x06,0x02,0x00,0x02,0x1b,0x00,0xff]
+
+v_cvt_sr_bf8_f32 v1, v2, v3 byte_sel:3 quad_perm:[3,2,1,0]
+// GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, v2, v3 byte_sel:3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x60,0x6c,0xd7,0xfa,0x06,0x02,0x00,0x02,0x1b,0x00,0xff]
+
v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd
// GFX12: encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed]
@@ -1219,6 +1231,18 @@ v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:
v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1
// GFX12: encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed]
+v_cvt_sr_fp8_f32 v1, v2, v3 byte_sel:0 quad_perm:[3,2,1,0]
+// GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x00,0x02,0x1b,0x00,0xff]
+
+v_cvt_sr_fp8_f32 v1, v2, v3 byte_sel:1 quad_perm:[3,2,1,0]
+// GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, v2, v3 byte_sel:1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x20,0x6b,0xd7,0xfa,0x06,0x02,0x00,0x02,0x1b,0x00,0xff]
+
+v_cvt_sr_fp8_f32 v1, v2, v3 byte_sel:2 quad_perm:[3,2,1,0]
+// GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, v2, v3 byte_sel:2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x40,0x6b,0xd7,0xfa,0x06,0x02,0x00,0x02,0x1b,0x00,0xff]
+
+v_cvt_sr_fp8_f32 v1, v2, v3 byte_sel:3 quad_perm:[3,2,1,0]
+// GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, v2, v3 byte_sel:3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x60,0x6b,0xd7,0xfa,0x06,0x02,0x00,0x02,0x1b,0x00,0xff]
+
v_cvt_pk_i16_f32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x06,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s
index 25b13ac62e4a..d6ef14cff5fa 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s
@@ -698,6 +698,18 @@ v_cvt_sr_fp8_f32_e64_dpp v5, -v1, v2 dpp8:[7,6,5,4,3,2,1,0]
v_cvt_sr_fp8_f32_e64_dpp v255, -|v255|, v255 dpp8:[0,0,0,0,0,0,0,0]
// GFX12: encoding: [0xff,0x01,0x6b,0xd7,0xe9,0xfe,0x03,0x20,0xff,0x00,0x00,0x00]
+v_cvt_sr_fp8_f32 v1, v2, v3 byte_sel:0 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x00,0x6b,0xd7,0xe9,0x06,0x02,0x00,0x02,0x77,0x39,0x05]
+
+v_cvt_sr_fp8_f32 v1, v2, v3 byte_sel:1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, v2, v3 byte_sel:1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x20,0x6b,0xd7,0xe9,0x06,0x02,0x00,0x02,0x77,0x39,0x05]
+
+v_cvt_sr_fp8_f32 v1, v2, v3 byte_sel:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, v2, v3 byte_sel:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x40,0x6b,0xd7,0xe9,0x06,0x02,0x00,0x02,0x77,0x39,0x05]
+
+v_cvt_sr_fp8_f32 v1, v2, v3 byte_sel:3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, v2, v3 byte_sel:3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x60,0x6b,0xd7,0xe9,0x06,0x02,0x00,0x02,0x77,0x39,0x05]
+
v_cvt_sr_bf8_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: encoding: [0x05,0x00,0x6c,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
@@ -710,6 +722,18 @@ v_cvt_sr_bf8_f32_e64_dpp v5, -v1, v2 dpp8:[7,6,5,4,3,2,1,0]
v_cvt_sr_bf8_f32_e64_dpp v255, -|v255|, v255 dpp8:[0,0,0,0,0,0,0,0]
// GFX12: encoding: [0xff,0x01,0x6c,0xd7,0xe9,0xfe,0x03,0x20,0xff,0x00,0x00,0x00]
+v_cvt_sr_bf8_f32 v1, v2, v3 byte_sel:0 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x00,0x6c,0xd7,0xe9,0x06,0x02,0x00,0x02,0x77,0x39,0x05]
+
+v_cvt_sr_bf8_f32 v1, v2, v3 byte_sel:1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, v2, v3 byte_sel:1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x20,0x6c,0xd7,0xe9,0x06,0x02,0x00,0x02,0x77,0x39,0x05]
+
+v_cvt_sr_bf8_f32 v1, v2, v3 byte_sel:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, v2, v3 byte_sel:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x40,0x6c,0xd7,0xe9,0x06,0x02,0x00,0x02,0x77,0x39,0x05]
+
+v_cvt_sr_bf8_f32 v1, v2, v3 byte_sel:3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, v2, v3 byte_sel:3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x60,0x6c,0xd7,0xe9,0x06,0x02,0x00,0x02,0x77,0x39,0x05]
+
v_cvt_pk_i16_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x06,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_err.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_err.s
index 55c5fcabea73..a9dd290ea67d 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_err.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_err.s
@@ -101,3 +101,8 @@ v_permlane16_var_b32 v5, v1, v2 op_sel:[0, 0, 1]
// GFX12: error: invalid op_sel operand
// GFX12-NEXT:{{^}}v_permlane16_var_b32 v5, v1, v2 op_sel:[0, 0, 1]
// GFX12-NEXT:{{^}} ^
+
+v_cvt_sr_bf8_f32 v1, v2, v3 byte_sel:4
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: invalid byte_sel value.
+// GFX12-NEXT:{{^}}v_cvt_sr_bf8_f32 v1, v2, v3 byte_sel:4
+// GFX12-NEXT:{{^}} ^
diff --git a/llvm/test/MC/AMDGPU/pal-msgpack.s b/llvm/test/MC/AMDGPU/pal-msgpack.s
index 886cc8b0538b..03c6c547af8a 100644
--- a/llvm/test/MC/AMDGPU/pal-msgpack.s
+++ b/llvm/test/MC/AMDGPU/pal-msgpack.s
@@ -14,10 +14,10 @@ amdpal.pipelines:
- 0x123456789abcdef0
- 0xfedcba9876543210
.registers:
- 0x2c0a (SPI_SHADER_PGM_RSRC1_PS): 0
- 0x2c0b (SPI_SHADER_PGM_RSRC2_PS): 0x42000000
- 0xa1b3 (SPI_PS_INPUT_ENA): 0x1
- 0xa1b4 (SPI_PS_INPUT_ADDR): 0x1
+ '0x2c0a (SPI_SHADER_PGM_RSRC1_PS)': 0
+ '0x2c0b (SPI_SHADER_PGM_RSRC2_PS)': 0x42000000
+ '0xa1b3 (SPI_PS_INPUT_ENA)': 0x1
+ '0xa1b4 (SPI_PS_INPUT_ADDR)': 0x1
...
.end_amdgpu_pal_metadata
@@ -34,10 +34,10 @@ amdpal.pipelines:
// ASM: - 0x123456789abcdef0
// ASM: - 0xfedcba9876543210
// ASM: .registers:
-// ASM: 0x2c0a (SPI_SHADER_PGM_RSRC1_PS): 0
-// ASM: 0x2c0b (SPI_SHADER_PGM_RSRC2_PS): 0x42000000
-// ASM: 0xa1b3 (SPI_PS_INPUT_ENA): 0x1
-// ASM: 0xa1b4 (SPI_PS_INPUT_ADDR): 0x1
+// ASM: '0x2c0a (SPI_SHADER_PGM_RSRC1_PS)': 0
+// ASM: '0x2c0b (SPI_SHADER_PGM_RSRC2_PS)': 0x42000000
+// ASM: '0xa1b3 (SPI_PS_INPUT_ENA)': 0x1
+// ASM: '0xa1b4 (SPI_PS_INPUT_ADDR)': 0x1
// ASM: ...
// ASM: .end_amdgpu_pal_metadata
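The register keys in the PAL metadata are now emitted single-quoted, presumably so that a key such as 0x2c0a (SPI_SHADER_PGM_RSRC1_PS) always parses as a plain string under a standard YAML loader. An illustrative check (assumes PyYAML; not part of the patch):

  # Illustrative: the quoted key parses cleanly as a string mapping
  # to an integer value under a standard YAML parser.
  import yaml

  doc = "'0x2c0a (SPI_SHADER_PGM_RSRC1_PS)': 0"
  assert yaml.safe_load(doc) == {"0x2c0a (SPI_SHADER_PGM_RSRC1_PS)": 0}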
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt
index 6acaa8152720..2c911777ef97 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt
@@ -1020,6 +1020,15 @@
# GFX12: v_cvt_sr_fp8_f32 v5, -|v255|, v4 ; encoding: [0x05,0x01,0x6b,0xd7,0xff,0x09,0x02,0x20]
0x05,0x01,0x6b,0xd7,0xff,0x09,0x02,0x20
+# GFX12: v_cvt_sr_fp8_f32 v1, v2, v3 byte_sel:1 ; encoding: [0x01,0x20,0x6b,0xd7,0x02,0x07,0x02,0x00]
+0x01,0x20,0x6b,0xd7,0x02,0x07,0x02,0x00
+
+# GFX12: v_cvt_sr_fp8_f32 v1, v2, v3 byte_sel:2 ; encoding: [0x01,0x40,0x6b,0xd7,0x02,0x07,0x02,0x00]
+0x01,0x40,0x6b,0xd7,0x02,0x07,0x02,0x00
+
+# GFX12: v_cvt_sr_fp8_f32 v1, v2, v3 byte_sel:3 ; encoding: [0x01,0x60,0x6b,0xd7,0x02,0x07,0x02,0x00]
+0x01,0x60,0x6b,0xd7,0x02,0x07,0x02,0x00
+
# GFX12: v_cvt_sr_bf8_f32 v1, v2, v3 ; encoding: [0x01,0x00,0x6c,0xd7,0x02,0x07,0x02,0x00]
0x01,0x00,0x6c,0xd7,0x02,0x07,0x02,0x00
@@ -1029,6 +1038,15 @@
# GFX12: v_cvt_sr_bf8_f32 v5, -|v255|, v4 ; encoding: [0x05,0x01,0x6c,0xd7,0xff,0x09,0x02,0x20]
0x05,0x01,0x6c,0xd7,0xff,0x09,0x02,0x20
+# GFX12: v_cvt_sr_bf8_f32 v1, v2, v3 byte_sel:1 ; encoding: [0x01,0x20,0x6c,0xd7,0x02,0x07,0x02,0x00]
+0x01,0x20,0x6c,0xd7,0x02,0x07,0x02,0x00
+
+# GFX12: v_cvt_sr_bf8_f32 v1, v2, v3 byte_sel:2 ; encoding: [0x01,0x40,0x6c,0xd7,0x02,0x07,0x02,0x00]
+0x01,0x40,0x6c,0xd7,0x02,0x07,0x02,0x00
+
+# GFX12: v_cvt_sr_bf8_f32 v1, v2, v3 byte_sel:3 ; encoding: [0x01,0x60,0x6c,0xd7,0x02,0x07,0x02,0x00]
+0x01,0x60,0x6c,0xd7,0x02,0x07,0x02,0x00
+
# GFX12: v_cvt_pk_i16_f32 v5, v1, v2 ; encoding: [0x05,0x00,0x06,0xd7,0x01,0x05,0x02,0x00]
0x05,0x00,0x06,0xd7,0x01,0x05,0x02,0x00
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt
index 0771e6449b62..f9b6c1b73ddc 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt
@@ -945,6 +945,15 @@
# GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1 ; encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed]
0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed
+# GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, v2, v3 byte_sel:1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x20,0x6c,0xd7,0xfa,0x06,0x02,0x00,0x02,0x1b,0x00,0xff]
+0x01,0x20,0x6c,0xd7,0xfa,0x06,0x02,0x00,0x02,0x1b,0x00,0xff
+
+# GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, v2, v3 byte_sel:2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x40,0x6c,0xd7,0xfa,0x06,0x02,0x00,0x02,0x1b,0x00,0xff]
+0x01,0x40,0x6c,0xd7,0xfa,0x06,0x02,0x00,0x02,0x1b,0x00,0xff
+
+# GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, v2, v3 byte_sel:3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x60,0x6c,0xd7,0xfa,0x06,0x02,0x00,0x02,0x1b,0x00,0xff]
+0x01,0x60,0x6c,0xd7,0xfa,0x06,0x02,0x00,0x02,0x1b,0x00,0xff
+
# GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed]
0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed
@@ -972,6 +981,15 @@
# GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1 ; encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed]
0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed
+# GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, v2, v3 byte_sel:1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x20,0x6b,0xd7,0xfa,0x06,0x02,0x00,0x02,0x1b,0x00,0xff]
+0x01,0x20,0x6b,0xd7,0xfa,0x06,0x02,0x00,0x02,0x1b,0x00,0xff
+
+# GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, v2, v3 byte_sel:2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x40,0x6b,0xd7,0xfa,0x06,0x02,0x00,0x02,0x1b,0x00,0xff]
+0x01,0x40,0x6b,0xd7,0xfa,0x06,0x02,0x00,0x02,0x1b,0x00,0xff
+
+# GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, v2, v3 byte_sel:3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x60,0x6b,0xd7,0xfa,0x06,0x02,0x00,0x02,0x1b,0x00,0xff]
+0x01,0x60,0x6b,0xd7,0xfa,0x06,0x02,0x00,0x02,0x1b,0x00,0xff
+
# GFX12: v_cvt_pk_i16_f32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x06,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
0x05,0x00,0x06,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt
index a836adafb31e..eedc6d491087 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt
@@ -570,6 +570,15 @@
# GFX12: v_cvt_sr_fp8_f32_e64_dpp v255, -|v255|, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x01,0x6b,0xd7,0xe9,0xfe,0x03,0x20,0xff,0x00,0x00,0x00]
0xff,0x01,0x6b,0xd7,0xe9,0xfe,0x03,0x20,0xff,0x00,0x00,0x00
+# GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, v2, v3 byte_sel:1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x20,0x6b,0xd7,0xe9,0x06,0x02,0x00,0x02,0x77,0x39,0x05]
+0x01,0x20,0x6b,0xd7,0xe9,0x06,0x02,0x00,0x02,0x77,0x39,0x05
+
+# GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, v2, v3 byte_sel:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x40,0x6b,0xd7,0xe9,0x06,0x02,0x00,0x02,0x77,0x39,0x05]
+0x01,0x40,0x6b,0xd7,0xe9,0x06,0x02,0x00,0x02,0x77,0x39,0x05
+
+# GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, v2, v3 byte_sel:3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x60,0x6b,0xd7,0xe9,0x06,0x02,0x00,0x02,0x77,0x39,0x05]
+0x01,0x60,0x6b,0xd7,0xe9,0x06,0x02,0x00,0x02,0x77,0x39,0x05
+
# GFX12: v_cvt_sr_bf8_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6c,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0x6c,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -582,6 +591,15 @@
# GFX12: v_cvt_sr_bf8_f32_e64_dpp v255, -|v255|, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x01,0x6c,0xd7,0xe9,0xfe,0x03,0x20,0xff,0x00,0x00,0x00]
0xff,0x01,0x6c,0xd7,0xe9,0xfe,0x03,0x20,0xff,0x00,0x00,0x00
+# GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, v2, v3 byte_sel:1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x20,0x6c,0xd7,0xe9,0x06,0x02,0x00,0x02,0x77,0x39,0x05]
+0x01,0x20,0x6c,0xd7,0xe9,0x06,0x02,0x00,0x02,0x77,0x39,0x05
+
+# GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, v2, v3 byte_sel:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x40,0x6c,0xd7,0xe9,0x06,0x02,0x00,0x02,0x77,0x39,0x05]
+0x01,0x40,0x6c,0xd7,0xe9,0x06,0x02,0x00,0x02,0x77,0x39,0x05
+
+# GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, v2, v3 byte_sel:3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x60,0x6c,0xd7,0xe9,0x06,0x02,0x00,0x02,0x77,0x39,0x05]
+0x01,0x60,0x6c,0xd7,0xe9,0x06,0x02,0x00,0x02,0x77,0x39,0x05
+
# GFX12: v_cvt_pk_i16_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x06,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0x06,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
diff --git a/llvm/test/MC/Disassembler/M68k/data.txt b/llvm/test/MC/Disassembler/M68k/data.txt
index 8e2fb3f13560..3951ea677f11 100644
--- a/llvm/test/MC/Disassembler/M68k/data.txt
+++ b/llvm/test/MC/Disassembler/M68k/data.txt
@@ -36,6 +36,12 @@
# CHECK: move.l (64,%sp,%a0), %d0
0x20 0x37 0x88 0x40
+# CHECK: move.b #234, %d2
+0x14 0x3c 0x00 0xea
+
+# CHECK: moveq #100, %d2
+0x74 0x64
+
# CHECK: move.l $f0000000, %a5
0x2a 0x79 0xf0 0x00 0x00 0x00
diff --git a/llvm/test/MC/M68k/Data/Classes/MxMove_RI.s b/llvm/test/MC/M68k/Data/Classes/MxMove_RI.s
index 091367a68256..2081924d7b17 100644
--- a/llvm/test/MC/M68k/Data/Classes/MxMove_RI.s
+++ b/llvm/test/MC/M68k/Data/Classes/MxMove_RI.s
@@ -9,3 +9,6 @@ move.l #42, %a1
; CHECK: move.l #-1, %a1
; CHECK-SAME: encoding: [0x22,0x7c,0xff,0xff,0xff,0xff]
move.l #-1, %a1
+; CHECK: moveq #-17, %d3
+; CHECK-SAME: encoding: [0x76,0xef]
+moveq #-17, %d3
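The two new moveq checks (here and in the M68k disassembler test above) pin down the encoding: the data register goes into bits 1-3 of the first byte and the 8-bit immediate into the second byte. A Python sketch reproducing both encodings, inferred from those CHECK lines:

  # Sketch of the M68k moveq encoding implied by the checks:
  # 0b0111_rrr0 iiiiiiii -> [0x70 | (reg << 1), imm & 0xFF].
  def encode_moveq(dreg: int, imm: int) -> list:
      assert 0 <= dreg <= 7 and -128 <= imm <= 127
      return [0x70 | (dreg << 1), imm & 0xFF]

  assert encode_moveq(2, 100) == [0x74, 0x64]  # moveq #100, %d2
  assert encode_moveq(3, -17) == [0x76, 0xEF]  # moveq #-17, %d3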
diff --git a/llvm/test/MC/RISCV/XTHeadVdot-valid.s b/llvm/test/MC/RISCV/XTHeadVdot-valid.s
index 2e00bd1cac3e..ab411dfac730 100644
--- a/llvm/test/MC/RISCV/XTHeadVdot-valid.s
+++ b/llvm/test/MC/RISCV/XTHeadVdot-valid.s
@@ -12,82 +12,82 @@ th.vmaqau.vv v8, v20, v4, v0.t
# CHECK-INST: th.vmaqau.vv v8, v20, v4, v0.t
# CHECK-ENCODING: [0x0b,0x64,0x4a,0x88]
# CHECK-ERROR: instruction requires the following: 'xtheadvdot' (T-Head Vector Extensions for Dot){{$}}
-# CHECK-UNKNOWN: 0b 64 4a 88 <unknown>
+# CHECK-UNKNOWN: 884a640b <unknown>
th.vmaqau.vv v8, v20, v4
# CHECK-INST: th.vmaqau.vv v8, v20, v4
# CHECK-ENCODING: [0x0b,0x64,0x4a,0x8a]
# CHECK-ERROR: instruction requires the following: 'xtheadvdot' (T-Head Vector Extensions for Dot){{$}}
-# CHECK-UNKNOWN: 0b 64 4a 8a <unknown>
+# CHECK-UNKNOWN: 8a4a640b <unknown>
th.vmaqau.vx v8, a0, v4, v0.t
# CHECK-INST: th.vmaqau.vx v8, a0, v4, v0.t
# CHECK-ENCODING: [0x0b,0x64,0x45,0x8c]
# CHECK-ERROR: instruction requires the following: 'xtheadvdot' (T-Head Vector Extensions for Dot){{$}}
-# CHECK-UNKNOWN: 0b 64 45 8c <unknown>
+# CHECK-UNKNOWN: 8c45640b <unknown>
th.vmaqau.vx v8, a0, v4
# CHECK-INST: th.vmaqau.vx v8, a0, v4
# CHECK-ENCODING: [0x0b,0x64,0x45,0x8e]
# CHECK-ERROR: instruction requires the following: 'xtheadvdot' (T-Head Vector Extensions for Dot){{$}}
-# CHECK-UNKNOWN: 0b 64 45 8e <unknown>
+# CHECK-UNKNOWN: 8e45640b <unknown>
th.vmaqa.vv v8, v20, v4, v0.t
# CHECK-INST: th.vmaqa.vv v8, v20, v4, v0.t
# CHECK-ENCODING: [0x0b,0x64,0x4a,0x80]
# CHECK-ERROR: instruction requires the following: 'xtheadvdot' (T-Head Vector Extensions for Dot){{$}}
-# CHECK-UNKNOWN: 0b 64 4a 80 <unknown>
+# CHECK-UNKNOWN: 804a640b <unknown>
th.vmaqa.vv v8, v20, v4
# CHECK-INST: th.vmaqa.vv v8, v20, v4
# CHECK-ENCODING: [0x0b,0x64,0x4a,0x82]
# CHECK-ERROR: instruction requires the following: 'xtheadvdot' (T-Head Vector Extensions for Dot){{$}}
-# CHECK-UNKNOWN: 0b 64 4a 82 <unknown>
+# CHECK-UNKNOWN: 824a640b <unknown>
th.vmaqa.vx v8, a0, v4, v0.t
# CHECK-INST: th.vmaqa.vx v8, a0, v4, v0.t
# CHECK-ENCODING: [0x0b,0x64,0x45,0x84]
# CHECK-ERROR: instruction requires the following: 'xtheadvdot' (T-Head Vector Extensions for Dot){{$}}
-# CHECK-UNKNOWN: 0b 64 45 84 <unknown>
+# CHECK-UNKNOWN: 8445640b <unknown>
th.vmaqa.vx v8, a0, v4
# CHECK-INST: th.vmaqa.vx v8, a0, v4
# CHECK-ENCODING: [0x0b,0x64,0x45,0x86]
# CHECK-ERROR: instruction requires the following: 'xtheadvdot' (T-Head Vector Extensions for Dot){{$}}
-# CHECK-UNKNOWN: 0b 64 45 86 <unknown>
+# CHECK-UNKNOWN: 8645640b <unknown>
th.vmaqasu.vv v8, v20, v4, v0.t
# CHECK-INST: th.vmaqasu.vv v8, v20, v4, v0.t
# CHECK-ENCODING: [0x0b,0x64,0x4a,0x90]
# CHECK-ERROR: instruction requires the following: 'xtheadvdot' (T-Head Vector Extensions for Dot){{$}}
-# CHECK-UNKNOWN: 0b 64 4a 90 <unknown>
+# CHECK-UNKNOWN: 904a640b <unknown>
th.vmaqasu.vv v8, v20, v4
# CHECK-INST: th.vmaqasu.vv v8, v20, v4
# CHECK-ENCODING: [0x0b,0x64,0x4a,0x92]
# CHECK-ERROR: instruction requires the following: 'xtheadvdot' (T-Head Vector Extensions for Dot){{$}}
-# CHECK-UNKNOWN: 0b 64 4a 92 <unknown>
+# CHECK-UNKNOWN: 924a640b <unknown>
th.vmaqasu.vx v8, a0, v4, v0.t
# CHECK-INST: th.vmaqasu.vx v8, a0, v4, v0.t
# CHECK-ENCODING: [0x0b,0x64,0x45,0x94]
# CHECK-ERROR: instruction requires the following: 'xtheadvdot' (T-Head Vector Extensions for Dot){{$}}
-# CHECK-UNKNOWN: 0b 64 45 94 <unknown>
+# CHECK-UNKNOWN: 9445640b <unknown>
th.vmaqasu.vx v8, a0, v4
# CHECK-INST: th.vmaqasu.vx v8, a0, v4
# CHECK-ENCODING: [0x0b,0x64,0x45,0x96]
# CHECK-ERROR: instruction requires the following: 'xtheadvdot' (T-Head Vector Extensions for Dot){{$}}
-# CHECK-UNKNOWN: 0b 64 45 96 <unknown>
+# CHECK-UNKNOWN: 9645640b <unknown>
th.vmaqaus.vx v8, a0, v4, v0.t
# CHECK-INST: th.vmaqaus.vx v8, a0, v4, v0.t
# CHECK-ENCODING: [0x0b,0x64,0x45,0x9c]
# CHECK-ERROR: instruction requires the following: 'xtheadvdot' (T-Head Vector Extensions for Dot){{$}}
-# CHECK-UNKNOWN: 0b 64 45 9c <unknown>
+# CHECK-UNKNOWN: 9c45640b <unknown>
th.vmaqaus.vx v8, a0, v4
# CHECK-INST: th.vmaqaus.vx v8, a0, v4
# CHECK-ENCODING: [0x0b,0x64,0x45,0x9e]
# CHECK-ERROR: instruction requires the following: 'xtheadvdot' (T-Head Vector Extensions for Dot){{$}}
-# CHECK-UNKNOWN: 0b 64 45 9e <unknown>
+# CHECK-UNKNOWN: 9e45640b <unknown>
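Every CHECK-UNKNOWN update in this file is the same mechanical change: the old four space-separated bytes and the new eight-digit value are identical data, with objdump now printing the little-endian 32-bit word. A Python sketch of the conversion, using the first case above:

  # Sketch: "0b 64 4a 88" (byte dump) and "884a640b" (word) are the
  # same encoding; the word is the little-endian interpretation.
  def old_to_new(byte_dump: str) -> str:
      data = bytes(int(b, 16) for b in byte_dump.split())
      return f"{int.from_bytes(data, 'little'):08x}"

  assert old_to_new("0b 64 4a 88") == "884a640b"

The RISC-V align, compress-*, and nop-slide updates below apply the same regrouping to 16-bit compressed encodings.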
diff --git a/llvm/test/MC/RISCV/align.s b/llvm/test/MC/RISCV/align.s
index 2eb7186d0de9..32cc071b613c 100644
--- a/llvm/test/MC/RISCV/align.s
+++ b/llvm/test/MC/RISCV/align.s
@@ -98,11 +98,11 @@ test:
# The behavior is the same as GNU assembler.
.p2align 4, 1
# RELAX-RELOC-NOT: R_RISCV_ALIGN - 0xC
-# RELAX-INST: 01 01
-# RELAX-INST: 01 01
+# RELAX-INST: 0101
+# RELAX-INST: 0101
# C-OR-ZCA-EXT-RELAX-RELOC-NOT: R_RISCV_ALIGN - 0xE
-# C-OR-ZCA-EXT-RELAX-INST: 01 01
-# C-EXT-INST: 01 01
+# C-OR-ZCA-EXT-RELAX-INST: 0101
+# C-EXT-INST: 0101
ret
# NORELAX-RELOC-NOT: R_RISCV
# C-OR-ZCA-EXT-NORELAX-RELOC-NOT: R_RISCV
diff --git a/llvm/test/MC/RISCV/compress-cjal.s b/llvm/test/MC/RISCV/compress-cjal.s
index 31b9c30c2b01..d55586b005c7 100644
--- a/llvm/test/MC/RISCV/compress-cjal.s
+++ b/llvm/test/MC/RISCV/compress-cjal.s
@@ -11,7 +11,7 @@
# c.jal is an rv32 only instruction.
jal ra, 2046
-# CHECK-BYTES: fd 2f
+# CHECK-BYTES: 2ffd
# CHECK-ALIASOBJ: jal 0x7fe
# CHECK-ALIAS: jal 2046
# CHECK-INST: c.jal 2046
diff --git a/llvm/test/MC/RISCV/compress-rv32d.s b/llvm/test/MC/RISCV/compress-rv32d.s
index bebc78ef8690..c41a08892862 100644
--- a/llvm/test/MC/RISCV/compress-rv32d.s
+++ b/llvm/test/MC/RISCV/compress-rv32d.s
@@ -43,22 +43,22 @@
# Tests double precision floating point instructions available in rv32 and in rv64.
fld ft0, 64(sp)
-# CHECK-BYTES: 06 20
+# CHECK-BYTES: 2006
# CHECK-ALIAS: fld ft0, 64(sp)
# CHECK-INST: c.fldsp ft0, 64(sp)
# CHECK: # encoding: [0x06,0x20]
fsd ft0, 64(sp)
-# CHECK-BYTES: 82 a0
+# CHECK-BYTES: a082
# CHECK-ALIAS: fsd ft0, 64(sp)
# CHECK-INST: c.fsdsp ft0, 64(sp)
# CHECK: # encoding: [0x82,0xa0]
fld fs0, 248(s0)
-# CHECK-BYTES: 60 3c
+# CHECK-BYTES: 3c60
# CHECK-ALIAS: fld fs0, 248(s0)
# CHECK-INST: c.fld fs0, 248(s0)
# CHECK: # encoding: [0x60,0x3c]
fsd fs0, 248(s0)
-# CHECK-BYTES: 60 bc
+# CHECK-BYTES: bc60
# CHECK-ALIAS: fsd fs0, 248(s0)
# CHECK-INST: c.fsd fs0, 248(s0)
# CHECK: # encoding: [0x60,0xbc]
diff --git a/llvm/test/MC/RISCV/compress-rv32f.s b/llvm/test/MC/RISCV/compress-rv32f.s
index 3f0c69fb9893..afe15c598bb6 100644
--- a/llvm/test/MC/RISCV/compress-rv32f.s
+++ b/llvm/test/MC/RISCV/compress-rv32f.s
@@ -21,22 +21,22 @@
# Instructions that are 32 bit only.
flw ft0, 124(sp)
-# CHECK-BYTES: 76 70
+# CHECK-BYTES: 7076
# CHECK-ALIAS: flw ft0, 124(sp)
# CHECK-INST: c.flwsp ft0, 124(sp)
# CHECK: # encoding: [0x76,0x70]
fsw ft0, 124(sp)
-# CHECK-BYTES: 82 fe
+# CHECK-BYTES: fe82
# CHECK-ALIAS: fsw ft0, 124(sp)
# CHECK-INST: c.fswsp ft0, 124(sp)
# CHECK: # encoding: [0x82,0xfe]
flw fs0, 124(s0)
-# CHECK-BYTES: 60 7c
+# CHECK-BYTES: 7c60
# CHECK-ALIAS: flw fs0, 124(s0)
# CHECK-INST: c.flw fs0, 124(s0)
# CHECK: # encoding: [0x60,0x7c]
fsw fs0, 124(s0)
-# CHECK-BYTES: 60 fc
+# CHECK-BYTES: fc60
# CHECK-ALIAS: fsw fs0, 124(s0)
# CHECK-INST: c.fsw fs0, 124(s0)
# CHECK: # encoding: [0x60,0xfc]
diff --git a/llvm/test/MC/RISCV/compress-rv32i.s b/llvm/test/MC/RISCV/compress-rv32i.s
index b4fd72a0f81c..a75bea32ac0c 100644
--- a/llvm/test/MC/RISCV/compress-rv32i.s
+++ b/llvm/test/MC/RISCV/compress-rv32i.s
@@ -20,121 +20,121 @@
# RUN: | llvm-objdump --triple=riscv64 --mattr=+c --no-print-imm-hex -d -M no-aliases - \
# RUN: | FileCheck -check-prefixes=CHECK-BYTES,CHECK-INST,CHECK-INSTOBJ64 %s
-# CHECK-BYTES: 2e 85
+# CHECK-BYTES: 852e
# CHECK-ALIAS: mv a0, a1
# CHECK-INST: c.mv a0, a1
# CHECK: # encoding: [0x2e,0x85]
addi a0, a1, 0
-# CHECK-BYTES: e0 1f
+# CHECK-BYTES: 1fe0
# CHECK-ALIAS: addi s0, sp, 1020
# CHECK-INST: c.addi4spn s0, sp, 1020
# CHECK: # encoding: [0xe0,0x1f]
addi s0, sp, 1020
-# CHECK-BYTES: e0 5f
+# CHECK-BYTES: 5fe0
# CHECK-ALIAS: lw s0, 124(a5)
# CHECK-INST: c.lw s0, 124(a5)
# CHECK: # encoding: [0xe0,0x5f]
lw s0, 124(a5)
-# CHECK-BYTES: e0 df
+# CHECK-BYTES: dfe0
# CHECK-ALIAS: sw s0, 124(a5)
# CHECK-INST: c.sw s0, 124(a5)
# CHECK: # encoding: [0xe0,0xdf]
sw s0, 124(a5)
-# CHECK-BYTES: 01 00
+# CHECK-BYTES: 0001
# CHECK-ALIAS: nop
# CHECK-INST: c.nop
# CHECK: # encoding: [0x01,0x00]
nop
-# CHECK-BYTES: 81 10
+# CHECK-BYTES: 1081
# CHECK-ALIAS: addi ra, ra, -32
# CHECK-INST: c.addi ra, -32
# CHECK: # encoding: [0x81,0x10]
addi ra, ra, -32
-# CHECK-BYTES: 85 50
+# CHECK-BYTES: 5085
# CHECK-ALIAS: li ra, -31
# CHECK-INST: c.li ra, -31
# CHECK: # encoding: [0x85,0x50]
li ra, -31
-# CHECK-BYTES: 39 71
+# CHECK-BYTES: 7139
# CHECK-ALIAS: addi sp, sp, -64
# CHECK-INST: c.addi16sp sp, -64
# CHECK: # encoding: [0x39,0x71]
addi sp, sp, -64
-# CHECK-BYTES: fd 61
+# CHECK-BYTES: 61fd
# CHECK-ALIAS: lui gp, 31
# CHECK-INST: c.lui gp, 31
# CHECK: # encoding: [0xfd,0x61]
lui gp, 31
-# CHECK-BYTES: 7d 80
+# CHECK-BYTES: 807d
# CHECK-ALIAS: srli s0, s0, 31
# CHECK-INST: c.srli s0, 31
# CHECK: # encoding: [0x7d,0x80]
srli s0, s0, 31
-# CHECK-BYTES: 7d 84
+# CHECK-BYTES: 847d
# CHECK-ALIAS: srai s0, s0, 31
# CHECK-INST: c.srai s0, 31
# CHECK: # encoding: [0x7d,0x84]
srai s0, s0, 31
-# CHECK-BYTES: 7d 88
+# CHECK-BYTES: 887d
# CHECK-ALIAS: andi s0, s0, 31
# CHECK-INST: c.andi s0, 31
# CHECK: # encoding: [0x7d,0x88]
andi s0, s0, 31
-# CHECK-BYTES: 1d 8c
+# CHECK-BYTES: 8c1d
# CHECK-ALIAS: sub s0, s0, a5
# CHECK-INST: c.sub s0, a5
# CHECK: # encoding: [0x1d,0x8c]
sub s0, s0, a5
-# CHECK-BYTES: 3d 8c
+# CHECK-BYTES: 8c3d
# CHECK-ALIAS: xor s0, s0, a5
# CHECK-INST: c.xor s0, a5
# CHECK: # encoding: [0x3d,0x8c]
xor s0, s0, a5
-# CHECK-BYTES: 3d 8c
+# CHECK-BYTES: 8c3d
# CHECK-ALIAS: xor s0, s0, a5
# CHECK-INST: c.xor s0, a5
# CHECK: # encoding: [0x3d,0x8c]
xor s0, a5, s0
-# CHECK-BYTES: 5d 8c
+# CHECK-BYTES: 8c5d
# CHECK-ALIAS: or s0, s0, a5
# CHECK-INST: c.or s0, a5
# CHECK: # encoding: [0x5d,0x8c]
or s0, s0, a5
-# CHECK-BYTES: 45 8c
+# CHECK-BYTES: 8c45
# CHECK-ALIAS: or s0, s0, s1
# CHECK-INST: c.or s0, s1
# CHECK: # encoding: [0x45,0x8c]
or s0, s1, s0
-# CHECK-BYTES: 7d 8c
+# CHECK-BYTES: 8c7d
# CHECK-ALIAS: and s0, s0, a5
# CHECK-INST: c.and s0, a5
# CHECK: # encoding: [0x7d,0x8c]
and s0, s0, a5
-# CHECK-BYTES: 7d 8c
+# CHECK-BYTES: 8c7d
# CHECK-ALIAS: and s0, s0, a5
# CHECK-INST: c.and s0, a5
# CHECK: # encoding: [0x7d,0x8c]
and s0, a5, s0
-# CHECK-BYTES: 01 b0
+# CHECK-BYTES: b001
# CHECK-ALIASASM: j -2048
# CHECK-ALIASOBJ32: j 0xfffff826
# CHECK-ALIASOBJ64: j 0xfffffffffffff826
@@ -144,7 +144,7 @@ and s0, a5, s0
# CHECK: # encoding: [0x01,0xb0]
jal zero, -2048
-# CHECK-BYTES: 01 d0
+# CHECK-BYTES: d001
# CHECK-ALIASASM: beqz s0, -256
# CHECK-ALIASOBJ32: beqz s0, 0xffffff28
# CHECK-ALIASOBJ64: beqz s0, 0xffffffffffffff28
@@ -154,7 +154,7 @@ jal zero, -2048
# CHECK: # encoding: [0x01,0xd0]
beq s0, zero, -256
-# CHECK-BYTES: 01 d0
+# CHECK-BYTES: d001
# CHECK-ALIASASM: beqz s0, -256
# CHECK-ALIASOBJ32: beqz s0, 0xffffff2a
# CHECK-ALIASOBJ64: beqz s0, 0xffffffffffffff2a
@@ -164,7 +164,7 @@ beq s0, zero, -256
# CHECK: # encoding: [0x01,0xd0]
beq zero, s0, -256
-# CHECK-BYTES: 7d ec
+# CHECK-BYTES: ec7d
# CHECK-ALIASASM: bnez s0, 254
# CHECK-ALIASOBJ32: bnez s0, 0x12a
# CHECK-ALIASOBJ64: bnez s0, 0x12a
@@ -174,7 +174,7 @@ beq zero, s0, -256
# CHECK: # encoding: [0x7d,0xec]
bne s0, zero, 254
-# CHECK-BYTES: 7d ec
+# CHECK-BYTES: ec7d
# CHECK-ALIASASM: bnez s0, 254
# CHECK-ALIASOBJ32: bnez s0, 0x12c
# CHECK-ALIASOBJ64: bnez s0, 0x12c
@@ -184,67 +184,67 @@ bne s0, zero, 254
# CHECK: # encoding: [0x7d,0xec]
bne zero, s0, 254
-# CHECK-BYTES: 7e 04
+# CHECK-BYTES: 047e
# CHECK-ALIAS: slli s0, s0, 31
# CHECK-INST: c.slli s0, 31
# CHECK: # encoding: [0x7e,0x04]
slli s0, s0, 31
-# CHECK-BYTES: fe 50
+# CHECK-BYTES: 50fe
# CHECK-ALIAS: lw ra, 252(sp)
# CHECK-INST: c.lwsp ra, 252(sp)
# CHECK: # encoding: [0xfe,0x50]
lw ra, 252(sp)
-# CHECK-BYTES: 82 80
+# CHECK-BYTES: 8082
# CHECK-ALIAS: ret
# CHECK-INST: c.jr ra
# CHECK: # encoding: [0x82,0x80]
jalr zero, 0(ra)
-# CHECK-BYTES: 92 80
+# CHECK-BYTES: 8092
# CHECK-ALIAS: mv ra, tp
# CHECK-INST: c.mv ra, tp
# CHECK: # encoding: [0x92,0x80]
add ra, zero, tp
-# CHECK-BYTES: 92 80
+# CHECK-BYTES: 8092
# CHECK-ALIAS: mv ra, tp
# CHECK-INST: c.mv ra, tp
# CHECK: # encoding: [0x92,0x80]
add ra, tp, zero
-# CHECK-BYTES: 02 90
+# CHECK-BYTES: 9002
# CHECK-ALIAS: ebreak
# CHECK-INST: c.ebreak
# CHECK: # encoding: [0x02,0x90]
ebreak
-# CHECK-BYTES: 02 94
+# CHECK-BYTES: 9402
# CHECK-ALIAS: jalr s0
# CHECK-INST: c.jalr s0
# CHECK: # encoding: [0x02,0x94]
jalr ra, 0(s0)
-# CHECK-BYTES: 3e 94
+# CHECK-BYTES: 943e
# CHECK-ALIAS: add s0, s0, a5
# CHECK-INST: c.add s0, a5
# CHECK: # encoding: [0x3e,0x94]
add s0, a5, s0
-# CHECK-BYTES: 3e 94
+# CHECK-BYTES: 943e
# CHECK-ALIAS: add s0, s0, a5
# CHECK-INST: c.add s0, a5
# CHECK: # encoding: [0x3e,0x94]
add s0, s0, a5
-# CHECK-BYTES: 82 df
+# CHECK-BYTES: df82
# CHECK-ALIAS: sw zero, 252(sp)
# CHECK-INST: c.swsp zero, 252(sp)
# CHECK: # encoding: [0x82,0xdf]
sw zero, 252(sp)
-# CHECK-BYTES: 00 00
+# CHECK-BYTES: 0000
# CHECK-ALIAS: unimp
# CHECK-INST: c.unimp
# CHECK: # encoding: [0x00,0x00]
diff --git a/llvm/test/MC/RISCV/compress-rv64i.s b/llvm/test/MC/RISCV/compress-rv64i.s
index 55d24f0d41c0..ab5b24307cd1 100644
--- a/llvm/test/MC/RISCV/compress-rv64i.s
+++ b/llvm/test/MC/RISCV/compress-rv64i.s
@@ -11,49 +11,49 @@
# Tests compressed instructions available in rv64 and not in rv32.
-# CHECK-BYTES: e0 7f
+# CHECK-BYTES: 7fe0
# CHECK-ALIAS: ld s0, 248(a5)
# CHECK-INST: c.ld s0, 248(a5)
# CHECK: # encoding: [0xe0,0x7f]
ld s0, 248(a5)
-# CHECK-BYTES: a0 e3
+# CHECK-BYTES: e3a0
# CHECK-ALIAS: sd s0, 64(a5)
# CHECK-INST: c.sd s0, 64(a5)
# CHECK: # encoding: [0xa0,0xe3]
sd s0, 64(a5)
-# CHECK-BYTES: 7d 22
+# CHECK-BYTES: 227d
# CHECK-ALIAS: addiw tp, tp, 31
# CHECK-INST: c.addiw tp, 31
# CHECK: # encoding: [0x7d,0x22]
addiw tp, tp, 31
-# CHECK-BYTES: 1d 9c
+# CHECK-BYTES: 9c1d
# CHECK-ALIAS: subw s0, s0, a5
# CHECK-INST: c.subw s0, a5
# CHECK: # encoding: [0x1d,0x9c]
subw s0, s0, a5
-# CHECK-BYTES: 3d 9c
+# CHECK-BYTES: 9c3d
# CHECK-ALIAS: addw s0, s0, a5
# CHECK-INST: c.addw s0, a5
# CHECK: # encoding: [0x3d,0x9c]
addw s0, s0, a5
-# CHECK-BYTES: 3d 9c
+# CHECK-BYTES: 9c3d
# CHECK-ALIAS: addw s0, s0, a5
# CHECK-INST: c.addw s0, a5
# CHECK: # encoding: [0x3d,0x9c]
addw s0, a5, s0
-# CHECK-BYTES: ee 70
+# CHECK-BYTES: 70ee
# CHECK-ALIAS: ld ra, 248(sp)
# CHECK-INST: c.ldsp ra, 248(sp)
# CHECK: # encoding: [0xee,0x70]
ld ra, 248(sp)
-# CHECK-BYTES: a2 e0
+# CHECK-BYTES: e0a2
# CHECK-ALIAS: sd s0, 64(sp)
# CHECK-INST: c.sdsp s0, 64(sp)
# CHECK: # encoding: [0xa2,0xe0]
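The updated CHECK-BYTES values in these hunks are the old encodings regrouped: llvm-objdump now prints each instruction as a single little-endian word rather than as individual bytes. A minimal sketch of the regrouping in Python (regroup is a hypothetical helper for illustration, not anything in llvm-objdump):

    # Turn an encoding byte list (as in the CHECK lines) into the new
    # single-word little-endian hex form used by CHECK-BYTES.
    def regroup(enc):
        return "".join("%02x" % b for b in reversed(enc))

    assert regroup([0xe0, 0x7f]) == "7fe0"  # c.ld s0, 248(a5)
    assert regroup([0xa2, 0xe0]) == "e0a2"  # c.sdsp s0, 64(sp)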
diff --git a/llvm/test/MC/RISCV/large-instructions.s b/llvm/test/MC/RISCV/large-instructions.s
new file mode 100644
index 000000000000..b50dbde17d38
--- /dev/null
+++ b/llvm/test/MC/RISCV/large-instructions.s
@@ -0,0 +1,29 @@
+# RUN: llvm-mc -filetype=obj -triple riscv32 < %s \
+# RUN: | llvm-objdump -d - | FileCheck %s
+
+# CHECK: 011f 4523 8967 <unknown>
+.byte 0x1f, 0x01, 0x23, 0x45, 0x67, 0x89
+
+# CHECK: 4523013f cdab8967 <unknown>
+.byte 0x3f, 0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd
+
+# CHECK: 007f 4523 8967 cdab feef <unknown>
+.byte 0x7f, 0x00, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef, 0xfe
+
+# CHECK: 4523107f cdab8967 badcfeef <unknown>
+.byte 0x7f, 0x10, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef, 0xfe, 0xdc, 0xba
+
+# CHECK: 207f 4523 8967 cdab feef badc 7698 <unknown>
+.byte 0x7f, 0x20, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef, 0xfe, 0xdc, 0xba, 0x98, 0x76
+
+# CHECK: 4523307f cdab8967 badcfeef 32547698 <unknown>
+.byte 0x7f, 0x30, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef, 0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32
+
+# CHECK: 407f 4523 8967 cdab feef badc 7698 3254 1210 <unknown>
+.byte 0x7f, 0x40, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef, 0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32, 0x10, 0x12
+
+# CHECK: 4523507f cdab8967 badcfeef 32547698 56341210 <unknown>
+.byte 0x7f, 0x50, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef, 0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32, 0x10, 0x12, 0x34, 0x56
+
+# CHECK: 607f 4523 8967 cdab feef badc 7698 3254 1210 5634 9a78 <unknown>
+.byte 0x7f, 0x60, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef, 0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32, 0x10, 0x12, 0x34, 0x56, 0x78, 0x9a
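The expectations in this new test follow the RISC-V variable-length encoding scheme: the low bits of the first halfword select the instruction length, and the disassembler prints instructions whose length is a multiple of 32 bits as 32-bit little-endian words and all others as 16-bit halfwords. A rough sketch of that logic, covering only the lengths exercised above (insn_length and render are hypothetical names):

    def insn_length(b0, b1):
        # Low bits of the first halfword encode the size in bytes.
        if b0 & 0b11 != 0b11:
            return 2                      # 16-bit compressed
        if b0 & 0b11111 != 0b11111:
            return 4                      # standard 32-bit
        if b0 & 0b111111 == 0b011111:
            return 6                      # 48-bit
        if b0 & 0b1111111 == 0b0111111:
            return 8                      # 64-bit
        nnn = (b1 >> 4) & 0b111           # bits [14:12]
        return 10 + 2 * nnn               # (80 + 16*nnn)-bit, nnn < 7

    def render(insn_bytes):
        # 32-bit multiples print as 32-bit LE words, others as halfwords.
        step = 4 if len(insn_bytes) % 4 == 0 else 2
        groups = [insn_bytes[i:i + step] for i in range(0, len(insn_bytes), step)]
        return " ".join("".join("%02x" % b for b in reversed(g)) for g in groups)

    # 48-bit example from above, printed as 16-bit halfwords:
    assert render([0x1f, 0x01, 0x23, 0x45, 0x67, 0x89]) == "011f 4523 8967"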
diff --git a/llvm/test/MC/RISCV/nop-slide.s b/llvm/test/MC/RISCV/nop-slide.s
index f280d6e521e3..4dc888b3ba77 100644
--- a/llvm/test/MC/RISCV/nop-slide.s
+++ b/llvm/test/MC/RISCV/nop-slide.s
@@ -10,18 +10,18 @@
auipc a0, 0
# CHECK-RVC-NORELAX: 0000000000000000 <.text>:
-# CHECK-RVC-NORELAX-NEXT: 0: 00 00 unimp
-# CHECK-RVC-NORELAX-NEXT: 2: 01 00 nop
-# CHECK-RVC-NORELAX-NEXT: 4: 17 05 00 00 auipc a0, 0x0
+# CHECK-RVC-NORELAX-NEXT: 0: 0000 unimp
+# CHECK-RVC-NORELAX-NEXT: 2: 0001 nop
+# CHECK-RVC-NORELAX-NEXT: 4: 00000517 auipc a0, 0x0
# CHECK-RVC-RELAX: 0000000000000000 <.text>:
-# CHECK-RVC-RELAX-NEXT: 0: 01 00 nop
-# CHECK-RVC-RELAX-NEXT: 2: 00 01 addi s0, sp, 0x80
-# CHECK-RVC-RELAX-NEXT: 4: 00 17 addi s0, sp, 0x3a0
-# CHECK-RVC-RELAX-NEXT: 6: 05 00 c.nop 0x1
-# CHECK-RVC-RELAX-NEXT: 8: 00 <unknown>
+# CHECK-RVC-RELAX-NEXT: 0: 0001 nop
+# CHECK-RVC-RELAX-NEXT: 2: 0100 addi s0, sp, 0x80
+# CHECK-RVC-RELAX-NEXT: 4: 1700 addi s0, sp, 0x3a0
+# CHECK-RVC-RELAX-NEXT: 6: 0005 c.nop 0x1
+# CHECK-RVC-RELAX-NEXT: 8: 00 <unknown>
# CHECK: 0000000000000000 <.text>:
-# CHECK-NEXT: 0: 00 00 <unknown>
-# CHECK-NEXT: 2: 00 00 <unknown>
-# CHECK-NEXT: 4: 17 05 00 00 auipc a0, 0x0
+# CHECK-NEXT: 0: 0000 <unknown>
+# CHECK-NEXT: 2: 0000 <unknown>
+# CHECK-NEXT: 4: 00000517 auipc a0, 0x0
diff --git a/llvm/test/MC/RISCV/option-pushpop.s b/llvm/test/MC/RISCV/option-pushpop.s
index c830d16e590b..9c61b5dab5f3 100644
--- a/llvm/test/MC/RISCV/option-pushpop.s
+++ b/llvm/test/MC/RISCV/option-pushpop.s
@@ -25,7 +25,7 @@
call foo
# CHECK-INST: addi s0, sp, 1020
-# CHECK-BYTES: 13 04 c1 3f
+# CHECK-BYTES: 3fc10413
# CHECK-ALIAS: addi s0, sp, 1020
addi s0, sp, 1020
@@ -45,14 +45,14 @@ call bar
.option rvc
# CHECK-INST: .option rvc
# CHECK-INST: c.addi4spn s0, sp, 1020
-# CHECK-BYTES: e0 1f
+# CHECK-BYTES: 1fe0
# CHECK-ALIAS: addi s0, sp, 1020
addi s0, sp, 1020
.option pop # Pop relax=true, rvc=false
# CHECK-INST: .option pop
# CHECK-INST: addi s0, sp, 1020
-# CHECK-BYTES: 13 04 c1 3f
+# CHECK-BYTES: 3fc10413
# CHECK-ALIAS: addi s0, sp, 1020
addi s0, sp, 1020
@@ -69,7 +69,7 @@ call bar
call baz
# CHECK-INST: addi s0, sp, 1020
-# CHECK-BYTES: 13 04 c1 3f
+# CHECK-BYTES: 3fc10413
# CHECK-ALIAS: addi s0, sp, 1020
addi s0, sp, 1020
diff --git a/llvm/test/MC/RISCV/option-rvc.s b/llvm/test/MC/RISCV/option-rvc.s
index 00c8ea167bcd..894fbab562d7 100644
--- a/llvm/test/MC/RISCV/option-rvc.s
+++ b/llvm/test/MC/RISCV/option-rvc.s
@@ -20,13 +20,13 @@
# RUN: | llvm-objdump --triple=riscv64 --mattr=+c --no-print-imm-hex -d -M no-aliases - \
# RUN: | FileCheck -check-prefixes=CHECK-BYTES,CHECK-INST %s
-# CHECK-BYTES: 13 85 05 00
+# CHECK-BYTES: 00058513
# CHECK-ALIAS: mv a0, a1
# CHECK-INST: addi a0, a1, 0
# CHECK: # encoding: [0x13,0x85,0x05,0x00]
addi a0, a1, 0
-# CHECK-BYTES: 13 04 c1 3f
+# CHECK-BYTES: 3fc10413
# CHECK-ALIAS: addi s0, sp, 1020
# CHECK-INST: addi s0, sp, 1020
# CHECK: # encoding: [0x13,0x04,0xc1,0x3f]
@@ -35,13 +35,13 @@ addi s0, sp, 1020
# CHECK: .option rvc
.option rvc
-# CHECK-BYTES: 2e 85
+# CHECK-BYTES: 852e
# CHECK-ALIAS: mv a0, a1
# CHECK-INST: c.mv a0, a1
# CHECK: # encoding: [0x2e,0x85]
addi a0, a1, 0
-# CHECK-BYTES: e0 1f
+# CHECK-BYTES: 1fe0
# CHECK-ALIAS: addi s0, sp, 1020
# CHECK-INST: c.addi4spn s0, sp, 1020
# CHECK: # encoding: [0xe0,0x1f]
@@ -49,13 +49,13 @@ addi s0, sp, 1020
# CHECK: .option norvc
.option norvc
-# CHECK-BYTES: 13 85 05 00
+# CHECK-BYTES: 00058513
# CHECK-ALIAS: mv a0, a1
# CHECK-INST: addi a0, a1, 0
# CHECK: # encoding: [0x13,0x85,0x05,0x00]
addi a0, a1, 0
-# CHECK-BYTES: 13 04 c1 3f
+# CHECK-BYTES: 3fc10413
# CHECK-ALIAS: addi s0, sp, 1020
# CHECK-INST: addi s0, sp, 1020
# CHECK: # encoding: [0x13,0x04,0xc1,0x3f]
@@ -63,13 +63,13 @@ addi s0, sp, 1020
# CHECK: .option rvc
.option rvc
-# CHECK-BYTES: 2e 85
+# CHECK-BYTES: 852e
# CHECK-ALIAS: mv a0, a1
# CHECK-INST: c.mv a0, a1
# CHECK: # encoding: [0x2e,0x85]
addi a0, a1, 0
-# CHECK-BYTES: e0 1f
+# CHECK-BYTES: 1fe0
# CHECK-ALIAS: addi s0, sp, 1020
# CHECK-INST: c.addi4spn s0, sp, 1020
# CHECK: # encoding: [0xe0,0x1f]
@@ -77,13 +77,13 @@ addi s0, sp, 1020
# CHECK: .option norvc
.option norvc
-# CHECK-BYTES: 13 85 05 00
+# CHECK-BYTES: 00058513
# CHECK-ALIAS: mv a0, a1
# CHECK-INST: addi a0, a1, 0
# CHECK: # encoding: [0x13,0x85,0x05,0x00]
addi a0, a1, 0
-# CHECK-BYTES: 13 04 c1 3f
+# CHECK-BYTES: 3fc10413
# CHECK-ALIAS: addi s0, sp, 1020
# CHECK-INST: addi s0, sp, 1020
# CHECK: # encoding: [0x13,0x04,0xc1,0x3f]
diff --git a/llvm/test/MC/RISCV/rv32e-invalid.s b/llvm/test/MC/RISCV/rv32e-invalid.s
index 9c19d3f40bcf..95dc156f250a 100644
--- a/llvm/test/MC/RISCV/rv32e-invalid.s
+++ b/llvm/test/MC/RISCV/rv32e-invalid.s
@@ -11,100 +11,100 @@
# are rejected for RV32E/RV64E, when both assembling and disassembling.
-# CHECK-DIS: 37 18 00 00 <unknown>
+# CHECK-DIS: 00001837 <unknown>
# CHECK: :[[@LINE+1]]:5: error: invalid operand for instruction
lui x16, 1
-# CHECK-DIS: b7 28 00 00 <unknown>
+# CHECK-DIS: 000028b7 <unknown>
# CHECK: :[[@LINE+1]]:5: error: invalid operand for instruction
lui x17, 2
-# CHECK-DIS: 37 39 00 00 <unknown>
+# CHECK-DIS: 00003937 <unknown>
# CHECK: :[[@LINE+1]]:5: error: invalid operand for instruction
lui x18, 3
-# CHECK-DIS: b7 49 00 00 <unknown>
+# CHECK-DIS: 000049b7 <unknown>
# CHECK: :[[@LINE+1]]:5: error: invalid operand for instruction
lui x19, 4
-# CHECK-DIS: 37 5a 00 00 <unknown>
+# CHECK-DIS: 00005a37 <unknown>
# CHECK: :[[@LINE+1]]:5: error: invalid operand for instruction
lui x20, 5
-# CHECK-DIS: b7 6a 00 00 <unknown>
+# CHECK-DIS: 00006ab7 <unknown>
# CHECK: :[[@LINE+1]]:5: error: invalid operand for instruction
lui x21, 6
-# CHECK-DIS: 37 7b 00 00 <unknown>
+# CHECK-DIS: 00007b37 <unknown>
# CHECK: :[[@LINE+1]]:5: error: invalid operand for instruction
lui x22, 7
-# CHECK-DIS: b7 8b 00 00 <unknown>
+# CHECK-DIS: 00008bb7 <unknown>
# CHECK: :[[@LINE+1]]:5: error: invalid operand for instruction
lui x23, 8
-# CHECK-DIS: 37 9c 00 00 <unknown>
+# CHECK-DIS: 00009c37 <unknown>
# CHECK: :[[@LINE+1]]:5: error: invalid operand for instruction
lui x24, 9
-# CHECK-DIS: b7 ac 00 00 <unknown>
+# CHECK-DIS: 0000acb7 <unknown>
# CHECK: :[[@LINE+1]]:5: error: invalid operand for instruction
lui x25, 10
-# CHECK-DIS: 37 bd 00 00 <unknown>
+# CHECK-DIS: 0000bd37 <unknown>
# CHECK: :[[@LINE+1]]:5: error: invalid operand for instruction
lui x26, 11
-# CHECK-DIS: b7 cd 00 00 <unknown>
+# CHECK-DIS: 0000cdb7 <unknown>
# CHECK: :[[@LINE+1]]:5: error: invalid operand for instruction
lui x27, 12
-# CHECK-DIS: 37 de 00 00 <unknown>
+# CHECK-DIS: 0000de37 <unknown>
# CHECK: :[[@LINE+1]]:5: error: invalid operand for instruction
lui x28, 13
-# CHECK-DIS: b7 ee 00 00 <unknown>
+# CHECK-DIS: 0000eeb7 <unknown>
# CHECK: :[[@LINE+1]]:5: error: invalid operand for instruction
lui x29, 14
-# CHECK-DIS: 37 ff 00 00 <unknown>
+# CHECK-DIS: 0000ff37 <unknown>
# CHECK: :[[@LINE+1]]:5: error: invalid operand for instruction
lui x30, 15
-# CHECK-DIS: b7 0f 01 00 <unknown>
+# CHECK-DIS: 00010fb7 <unknown>
# CHECK: :[[@LINE+1]]:5: error: invalid operand for instruction
lui x31, 16
-# CHECK-DIS: 17 18 01 00 <unknown>
+# CHECK-DIS: 00011817 <unknown>
# CHECK: :[[@LINE+1]]:7: error: invalid operand for instruction
auipc a6, 17
-# CHECK-DIS: 97 28 01 00 <unknown>
+# CHECK-DIS: 00012897 <unknown>
# CHECK: :[[@LINE+1]]:7: error: invalid operand for instruction
auipc a7, 18
-# CHECK-DIS: 17 39 01 00 <unknown>
+# CHECK-DIS: 00013917 <unknown>
# CHECK: :[[@LINE+1]]:7: error: invalid operand for instruction
auipc s2, 19
-# CHECK-DIS: 97 49 01 00 <unknown>
+# CHECK-DIS: 00014997 <unknown>
# CHECK: :[[@LINE+1]]:7: error: invalid operand for instruction
auipc s3, 20
-# CHECK-DIS: 17 5a 01 00 <unknown>
+# CHECK-DIS: 00015a17 <unknown>
# CHECK: :[[@LINE+1]]:7: error: invalid operand for instruction
auipc s4, 21
-# CHECK-DIS: 97 6a 01 00 <unknown>
+# CHECK-DIS: 00016a97 <unknown>
# CHECK: :[[@LINE+1]]:7: error: invalid operand for instruction
auipc s5, 22
-# CHECK-DIS: 17 7b 01 00 <unknown>
+# CHECK-DIS: 00017b17 <unknown>
# CHECK: :[[@LINE+1]]:7: error: invalid operand for instruction
auipc s6, 23
-# CHECK-DIS: 97 8b 01 00 <unknown>
+# CHECK-DIS: 00018b97 <unknown>
# CHECK: :[[@LINE+1]]:7: error: invalid operand for instruction
auipc s7, 24
-# CHECK-DIS: 17 9c 01 00 <unknown>
+# CHECK-DIS: 00019c17 <unknown>
# CHECK: :[[@LINE+1]]:7: error: invalid operand for instruction
auipc s8, 25
-# CHECK-DIS: 97 ac 01 00 <unknown>
+# CHECK-DIS: 0001ac97 <unknown>
# CHECK: :[[@LINE+1]]:7: error: invalid operand for instruction
auipc s9, 26
-# CHECK-DIS: 17 bd 01 00 <unknown>
+# CHECK-DIS: 0001bd17 <unknown>
# CHECK: :[[@LINE+1]]:7: error: invalid operand for instruction
auipc s10, 27
-# CHECK-DIS: 97 cd 01 00 <unknown>
+# CHECK-DIS: 0001cd97 <unknown>
# CHECK: :[[@LINE+1]]:7: error: invalid operand for instruction
auipc s11, 28
-# CHECK-DIS: 17 de 01 00 <unknown>
+# CHECK-DIS: 0001de17 <unknown>
# CHECK: :[[@LINE+1]]:7: error: invalid operand for instruction
auipc t3, 29
-# CHECK-DIS: 97 ee 01 00 <unknown>
+# CHECK-DIS: 0001ee97 <unknown>
# CHECK: :[[@LINE+1]]:7: error: invalid operand for instruction
auipc t4, 30
-# CHECK-DIS: 17 ff 01 00 <unknown>
+# CHECK-DIS: 0001ff17 <unknown>
# CHECK: :[[@LINE+1]]:7: error: invalid operand for instruction
auipc t5, 31
-# CHECK-DIS: 97 0f 02 00 <unknown>
+# CHECK-DIS: 00020f97 <unknown>
# CHECK: :[[@LINE+1]]:7: error: invalid operand for instruction
auipc t6, 32
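Each expected word above is a plain U-type encoding, opcode | rd << 7 | imm << 12, with opcode 0x37 for lui and 0x17 for auipc, so the regrouped values can be derived directly. A quick sanity check in Python (a sketch, not LLVM code):

    # U-type layout: imm[31:12] | rd[11:7] | opcode[6:0]
    def u_type(opcode, rd, imm):
        return opcode | (rd << 7) | (imm << 12)

    assert u_type(0x37, 16, 1) == 0x00001837   # lui x16, 1
    assert u_type(0x17, 16, 17) == 0x00011817  # auipc a6 (= x16), 17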
diff --git a/llvm/test/MC/RISCV/rvv/add.s b/llvm/test/MC/RISCV/rvv/add.s
index 89cef5dc0a4c..ebfe50f2d958 100644
--- a/llvm/test/MC/RISCV/rvv/add.s
+++ b/llvm/test/MC/RISCV/rvv/add.s
@@ -12,352 +12,352 @@ vadd.vv v8, v4, v20, v0.t
# CHECK-INST: vadd.vv v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x04,0x4a,0x00]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 00 <unknown>
+# CHECK-UNKNOWN: 004a0457 <unknown>
vadd.vv v8, v4, v20
# CHECK-INST: vadd.vv v8, v4, v20
# CHECK-ENCODING: [0x57,0x04,0x4a,0x02]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 02 <unknown>
+# CHECK-UNKNOWN: 024a0457 <unknown>
vadd.vx v8, v4, a0, v0.t
# CHECK-INST: vadd.vx v8, v4, a0, v0.t
# CHECK-ENCODING: [0x57,0x44,0x45,0x00]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 00 <unknown>
+# CHECK-UNKNOWN: 00454457 <unknown>
vadd.vx v8, v4, a0
# CHECK-INST: vadd.vx v8, v4, a0
# CHECK-ENCODING: [0x57,0x44,0x45,0x02]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 02 <unknown>
+# CHECK-UNKNOWN: 02454457 <unknown>
vadd.vi v8, v4, 15, v0.t
# CHECK-INST: vadd.vi v8, v4, 15, v0.t
# CHECK-ENCODING: [0x57,0xb4,0x47,0x00]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 47 00 <unknown>
+# CHECK-UNKNOWN: 0047b457 <unknown>
vadd.vi v8, v4, 15
# CHECK-INST: vadd.vi v8, v4, 15
# CHECK-ENCODING: [0x57,0xb4,0x47,0x02]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 47 02 <unknown>
+# CHECK-UNKNOWN: 0247b457 <unknown>
vwaddu.vv v8, v4, v20, v0.t
# CHECK-INST: vwaddu.vv v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x24,0x4a,0xc0]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a c0 <unknown>
+# CHECK-UNKNOWN: c04a2457 <unknown>
vwaddu.vv v8, v4, v20
# CHECK-INST: vwaddu.vv v8, v4, v20
# CHECK-ENCODING: [0x57,0x24,0x4a,0xc2]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a c2 <unknown>
+# CHECK-UNKNOWN: c24a2457 <unknown>
vwaddu.vx v8, v4, a0, v0.t
# CHECK-INST: vwaddu.vx v8, v4, a0, v0.t
# CHECK-ENCODING: [0x57,0x64,0x45,0xc0]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 c0 <unknown>
+# CHECK-UNKNOWN: c0456457 <unknown>
vwaddu.vx v8, v4, a0
# CHECK-INST: vwaddu.vx v8, v4, a0
# CHECK-ENCODING: [0x57,0x64,0x45,0xc2]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 c2 <unknown>
+# CHECK-UNKNOWN: c2456457 <unknown>
vwadd.vv v8, v4, v20, v0.t
# CHECK-INST: vwadd.vv v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x24,0x4a,0xc4]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a c4 <unknown>
+# CHECK-UNKNOWN: c44a2457 <unknown>
vwadd.vv v8, v4, v20
# CHECK-INST: vwadd.vv v8, v4, v20
# CHECK-ENCODING: [0x57,0x24,0x4a,0xc6]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a c6 <unknown>
+# CHECK-UNKNOWN: c64a2457 <unknown>
vwadd.vx v8, v4, a0, v0.t
# CHECK-INST: vwadd.vx v8, v4, a0, v0.t
# CHECK-ENCODING: [0x57,0x64,0x45,0xc4]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 c4 <unknown>
+# CHECK-UNKNOWN: c4456457 <unknown>
vwadd.vx v8, v4, a0
# CHECK-INST: vwadd.vx v8, v4, a0
# CHECK-ENCODING: [0x57,0x64,0x45,0xc6]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 c6 <unknown>
+# CHECK-UNKNOWN: c6456457 <unknown>
vwaddu.wv v8, v4, v20, v0.t
# CHECK-INST: vwaddu.wv v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x24,0x4a,0xd0]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a d0 <unknown>
+# CHECK-UNKNOWN: d04a2457 <unknown>
vwaddu.wv v8, v4, v20
# CHECK-INST: vwaddu.wv v8, v4, v20
# CHECK-ENCODING: [0x57,0x24,0x4a,0xd2]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a d2 <unknown>
+# CHECK-UNKNOWN: d24a2457 <unknown>
vwaddu.wx v8, v4, a0, v0.t
# CHECK-INST: vwaddu.wx v8, v4, a0, v0.t
# CHECK-ENCODING: [0x57,0x64,0x45,0xd0]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 d0 <unknown>
+# CHECK-UNKNOWN: d0456457 <unknown>
vwaddu.wx v8, v4, a0
# CHECK-INST: vwaddu.wx v8, v4, a0
# CHECK-ENCODING: [0x57,0x64,0x45,0xd2]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 d2 <unknown>
+# CHECK-UNKNOWN: d2456457 <unknown>
vwadd.wv v8, v4, v20, v0.t
# CHECK-INST: vwadd.wv v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x24,0x4a,0xd4]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a d4 <unknown>
+# CHECK-UNKNOWN: d44a2457 <unknown>
vwadd.wv v8, v4, v20
# CHECK-INST: vwadd.wv v8, v4, v20
# CHECK-ENCODING: [0x57,0x24,0x4a,0xd6]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a d6 <unknown>
+# CHECK-UNKNOWN: d64a2457 <unknown>
vwadd.wx v8, v4, a0, v0.t
# CHECK-INST: vwadd.wx v8, v4, a0, v0.t
# CHECK-ENCODING: [0x57,0x64,0x45,0xd4]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 d4 <unknown>
+# CHECK-UNKNOWN: d4456457 <unknown>
vwadd.wx v8, v4, a0
# CHECK-INST: vwadd.wx v8, v4, a0
# CHECK-ENCODING: [0x57,0x64,0x45,0xd6]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 d6 <unknown>
+# CHECK-UNKNOWN: d6456457 <unknown>
vadc.vvm v8, v4, v20, v0
# CHECK-INST: vadc.vvm v8, v4, v20, v0
# CHECK-ENCODING: [0x57,0x04,0x4a,0x40]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 40 <unknown>
+# CHECK-UNKNOWN: 404a0457 <unknown>
vadc.vvm v4, v4, v20, v0
# CHECK-INST: vadc.vvm v4, v4, v20, v0
# CHECK-ENCODING: [0x57,0x02,0x4a,0x40]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 02 4a 40 <unknown>
+# CHECK-UNKNOWN: 404a0257 <unknown>
vadc.vvm v8, v4, v8, v0
# CHECK-INST: vadc.vvm v8, v4, v8, v0
# CHECK-ENCODING: [0x57,0x04,0x44,0x40]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 44 40 <unknown>
+# CHECK-UNKNOWN: 40440457 <unknown>
vadc.vxm v8, v4, a0, v0
# CHECK-INST: vadc.vxm v8, v4, a0, v0
# CHECK-ENCODING: [0x57,0x44,0x45,0x40]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 40 <unknown>
+# CHECK-UNKNOWN: 40454457 <unknown>
vadc.vim v8, v4, 15, v0
# CHECK-INST: vadc.vim v8, v4, 15, v0
# CHECK-ENCODING: [0x57,0xb4,0x47,0x40]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 47 40 <unknown>
+# CHECK-UNKNOWN: 4047b457 <unknown>
vmadc.vvm v8, v4, v20, v0
# CHECK-INST: vmadc.vvm v8, v4, v20, v0
# CHECK-ENCODING: [0x57,0x04,0x4a,0x44]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 44 <unknown>
+# CHECK-UNKNOWN: 444a0457 <unknown>
vmadc.vvm v4, v4, v20, v0
# CHECK-INST: vmadc.vvm v4, v4, v20, v0
# CHECK-ENCODING: [0x57,0x02,0x4a,0x44]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 02 4a 44 <unknown>
+# CHECK-UNKNOWN: 444a0257 <unknown>
vmadc.vvm v8, v4, v8, v0
# CHECK-INST: vmadc.vvm v8, v4, v8, v0
# CHECK-ENCODING: [0x57,0x04,0x44,0x44]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 44 44 <unknown>
+# CHECK-UNKNOWN: 44440457 <unknown>
vmadc.vxm v8, v4, a0, v0
# CHECK-INST: vmadc.vxm v8, v4, a0, v0
# CHECK-ENCODING: [0x57,0x44,0x45,0x44]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 44 <unknown>
+# CHECK-UNKNOWN: 44454457 <unknown>
vmadc.vim v8, v4, 15, v0
# CHECK-INST: vmadc.vim v8, v4, 15, v0
# CHECK-ENCODING: [0x57,0xb4,0x47,0x44]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 47 44 <unknown>
+# CHECK-UNKNOWN: 4447b457 <unknown>
vmadc.vv v8, v4, v20
# CHECK-INST: vmadc.vv v8, v4, v20
# CHECK-ENCODING: [0x57,0x04,0x4a,0x46]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 46 <unknown>
+# CHECK-UNKNOWN: 464a0457 <unknown>
vmadc.vx v8, v4, a0
# CHECK-INST: vmadc.vx v8, v4, a0
# CHECK-ENCODING: [0x57,0x44,0x45,0x46]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 46 <unknown>
+# CHECK-UNKNOWN: 46454457 <unknown>
vmadc.vi v8, v4, 15
# CHECK-INST: vmadc.vi v8, v4, 15
# CHECK-ENCODING: [0x57,0xb4,0x47,0x46]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 47 46 <unknown>
+# CHECK-UNKNOWN: 4647b457 <unknown>
vsaddu.vv v8, v4, v20, v0.t
# CHECK-INST: vsaddu.vv v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x04,0x4a,0x80]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 80 <unknown>
+# CHECK-UNKNOWN: 804a0457 <unknown>
vsaddu.vv v8, v4, v20
# CHECK-INST: vsaddu.vv v8, v4, v20
# CHECK-ENCODING: [0x57,0x04,0x4a,0x82]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 82 <unknown>
+# CHECK-UNKNOWN: 824a0457 <unknown>
vsaddu.vx v8, v4, a0, v0.t
# CHECK-INST: vsaddu.vx v8, v4, a0, v0.t
# CHECK-ENCODING: [0x57,0x44,0x45,0x80]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 80 <unknown>
+# CHECK-UNKNOWN: 80454457 <unknown>
vsaddu.vx v8, v4, a0
# CHECK-INST: vsaddu.vx v8, v4, a0
# CHECK-ENCODING: [0x57,0x44,0x45,0x82]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 82 <unknown>
+# CHECK-UNKNOWN: 82454457 <unknown>
vsaddu.vi v8, v4, 15, v0.t
# CHECK-INST: vsaddu.vi v8, v4, 15, v0.t
# CHECK-ENCODING: [0x57,0xb4,0x47,0x80]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 47 80 <unknown>
+# CHECK-UNKNOWN: 8047b457 <unknown>
vsaddu.vi v8, v4, 15
# CHECK-INST: vsaddu.vi v8, v4, 15
# CHECK-ENCODING: [0x57,0xb4,0x47,0x82]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 47 82 <unknown>
+# CHECK-UNKNOWN: 8247b457 <unknown>
vsadd.vv v8, v4, v20, v0.t
# CHECK-INST: vsadd.vv v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x04,0x4a,0x84]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 84 <unknown>
+# CHECK-UNKNOWN: 844a0457 <unknown>
vsadd.vv v8, v4, v20
# CHECK-INST: vsadd.vv v8, v4, v20
# CHECK-ENCODING: [0x57,0x04,0x4a,0x86]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 86 <unknown>
+# CHECK-UNKNOWN: 864a0457 <unknown>
vsadd.vx v8, v4, a0, v0.t
# CHECK-INST: vsadd.vx v8, v4, a0, v0.t
# CHECK-ENCODING: [0x57,0x44,0x45,0x84]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 84 <unknown>
+# CHECK-UNKNOWN: 84454457 <unknown>
vsadd.vx v8, v4, a0
# CHECK-INST: vsadd.vx v8, v4, a0
# CHECK-ENCODING: [0x57,0x44,0x45,0x86]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 86 <unknown>
+# CHECK-UNKNOWN: 86454457 <unknown>
vsadd.vi v8, v4, 15, v0.t
# CHECK-INST: vsadd.vi v8, v4, 15, v0.t
# CHECK-ENCODING: [0x57,0xb4,0x47,0x84]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 47 84 <unknown>
+# CHECK-UNKNOWN: 8447b457 <unknown>
vsadd.vi v8, v4, 15
# CHECK-INST: vsadd.vi v8, v4, 15
# CHECK-ENCODING: [0x57,0xb4,0x47,0x86]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 47 86 <unknown>
+# CHECK-UNKNOWN: 8647b457 <unknown>
vaadd.vv v8, v4, v20, v0.t
# CHECK-INST: vaadd.vv v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x24,0x4a,0x24]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 24 <unknown>
+# CHECK-UNKNOWN: 244a2457 <unknown>
vaadd.vv v8, v4, v20
# CHECK-INST: vaadd.vv v8, v4, v20
# CHECK-ENCODING: [0x57,0x24,0x4a,0x26]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 26 <unknown>
+# CHECK-UNKNOWN: 264a2457 <unknown>
vaadd.vx v8, v4, a0, v0.t
# CHECK-INST: vaadd.vx v8, v4, a0, v0.t
# CHECK-ENCODING: [0x57,0x64,0x45,0x24]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 24 <unknown>
+# CHECK-UNKNOWN: 24456457 <unknown>
vaadd.vx v8, v4, a0
# CHECK-INST: vaadd.vx v8, v4, a0
# CHECK-ENCODING: [0x57,0x64,0x45,0x26]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 26 <unknown>
+# CHECK-UNKNOWN: 26456457 <unknown>
vaaddu.vv v8, v4, v20, v0.t
# CHECK-INST: vaaddu.vv v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x24,0x4a,0x20]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 20 <unknown>
+# CHECK-UNKNOWN: 204a2457 <unknown>
vaaddu.vv v8, v4, v20
# CHECK-INST: vaaddu.vv v8, v4, v20
# CHECK-ENCODING: [0x57,0x24,0x4a,0x22]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 22 <unknown>
+# CHECK-UNKNOWN: 224a2457 <unknown>
vaaddu.vx v8, v4, a0, v0.t
# CHECK-INST: vaaddu.vx v8, v4, a0, v0.t
# CHECK-ENCODING: [0x57,0x64,0x45,0x20]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 20 <unknown>
+# CHECK-UNKNOWN: 20456457 <unknown>
vaaddu.vx v8, v4, a0
# CHECK-INST: vaaddu.vx v8, v4, a0
# CHECK-ENCODING: [0x57,0x64,0x45,0x22]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 22 <unknown>
+# CHECK-UNKNOWN: 22456457 <unknown>
vwcvt.x.x.v v8, v4, v0.t
# CHECK-INST: vwcvt.x.x.v v8, v4, v0.t
# CHECK-ENCODING: [0x57,0x64,0x40,0xc4]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 40 c4 <unknown>
+# CHECK-UNKNOWN: c4406457 <unknown>
vwcvt.x.x.v v8, v4
# CHECK-INST: vwcvt.x.x.v v8, v4
# CHECK-ENCODING: [0x57,0x64,0x40,0xc6]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 40 c6 <unknown>
+# CHECK-UNKNOWN: c6406457 <unknown>
vwcvtu.x.x.v v8, v4, v0.t
# CHECK-INST: vwcvtu.x.x.v v8, v4, v0.t
# CHECK-ENCODING: [0x57,0x64,0x40,0xc0]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 40 c0 <unknown>
+# CHECK-UNKNOWN: c0406457 <unknown>
vwcvtu.x.x.v v8, v4
# CHECK-INST: vwcvtu.x.x.v v8, v4
# CHECK-ENCODING: [0x57,0x64,0x40,0xc2]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 40 c2 <unknown>
+# CHECK-UNKNOWN: c2406457 <unknown>
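With the bytes regrouped into one word, the OP-V fields can be read straight off the hex: funct6[31:26] | vm[25] | vs2[24:20] | (vs1/rs1/imm)[19:15] | funct3[14:12] | vd[11:7] | opcode 0x57. A sketch checking two of the expectations above (op_v is a hypothetical helper; field layout per the V spec):

    def op_v(funct6, vm, vs2, src1, funct3, vd):
        return ((funct6 << 26) | (vm << 25) | (vs2 << 20) |
                (src1 << 15) | (funct3 << 12) | (vd << 7) | 0x57)

    # vadd.vv v8, v4, v20, v0.t: masked (vm=0), OPIVV (funct3=0)
    assert op_v(0b000000, 0, 4, 20, 0b000, 8) == 0x004a0457
    # vadd.vx v8, v4, a0: unmasked (vm=1), OPIVX (funct3=4), rs1 = x10
    assert op_v(0b000000, 1, 4, 10, 0b100, 8) == 0x02454457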
diff --git a/llvm/test/MC/RISCV/rvv/and.s b/llvm/test/MC/RISCV/rvv/and.s
index 894263fe0152..b1182c175a5d 100644
--- a/llvm/test/MC/RISCV/rvv/and.s
+++ b/llvm/test/MC/RISCV/rvv/and.s
@@ -12,34 +12,34 @@ vand.vv v8, v4, v20, v0.t
# CHECK-INST: vand.vv v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x04,0x4a,0x24]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 24 <unknown>
+# CHECK-UNKNOWN: 244a0457 <unknown>
vand.vv v8, v4, v20
# CHECK-INST: vand.vv v8, v4, v20
# CHECK-ENCODING: [0x57,0x04,0x4a,0x26]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 26 <unknown>
+# CHECK-UNKNOWN: 264a0457 <unknown>
vand.vx v8, v4, a0, v0.t
# CHECK-INST: vand.vx v8, v4, a0, v0.t
# CHECK-ENCODING: [0x57,0x44,0x45,0x24]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 24 <unknown>
+# CHECK-UNKNOWN: 24454457 <unknown>
vand.vx v8, v4, a0
# CHECK-INST: vand.vx v8, v4, a0
# CHECK-ENCODING: [0x57,0x44,0x45,0x26]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 26 <unknown>
+# CHECK-UNKNOWN: 26454457 <unknown>
vand.vi v8, v4, 15, v0.t
# CHECK-INST: vand.vi v8, v4, 15, v0.t
# CHECK-ENCODING: [0x57,0xb4,0x47,0x24]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 47 24 <unknown>
+# CHECK-UNKNOWN: 2447b457 <unknown>
vand.vi v8, v4, 15
# CHECK-INST: vand.vi v8, v4, 15
# CHECK-ENCODING: [0x57,0xb4,0x47,0x26]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 47 26 <unknown>
+# CHECK-UNKNOWN: 2647b457 <unknown>
diff --git a/llvm/test/MC/RISCV/rvv/clip.s b/llvm/test/MC/RISCV/rvv/clip.s
index f4fb2c576b30..70c23d8ec0b1 100644
--- a/llvm/test/MC/RISCV/rvv/clip.s
+++ b/llvm/test/MC/RISCV/rvv/clip.s
@@ -12,70 +12,70 @@ vnclipu.wv v8, v4, v20, v0.t
# CHECK-INST: vnclipu.wv v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x04,0x4a,0xb8]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a b8 <unknown>
+# CHECK-UNKNOWN: b84a0457 <unknown>
vnclipu.wv v8, v4, v20
# CHECK-INST: vnclipu.wv v8, v4, v20
# CHECK-ENCODING: [0x57,0x04,0x4a,0xba]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a ba <unknown>
+# CHECK-UNKNOWN: ba4a0457 <unknown>
vnclipu.wx v8, v4, a0, v0.t
# CHECK-INST: vnclipu.wx v8, v4, a0, v0.t
# CHECK-ENCODING: [0x57,0x44,0x45,0xb8]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 b8 <unknown>
+# CHECK-UNKNOWN: b8454457 <unknown>
vnclipu.wx v8, v4, a0
# CHECK-INST: vnclipu.wx v8, v4, a0
# CHECK-ENCODING: [0x57,0x44,0x45,0xba]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 ba <unknown>
+# CHECK-UNKNOWN: ba454457 <unknown>
vnclipu.wi v8, v4, 31, v0.t
# CHECK-INST: vnclipu.wi v8, v4, 31, v0.t
# CHECK-ENCODING: [0x57,0xb4,0x4f,0xb8]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 4f b8 <unknown>
+# CHECK-UNKNOWN: b84fb457 <unknown>
vnclipu.wi v8, v4, 31
# CHECK-INST: vnclipu.wi v8, v4, 31
# CHECK-ENCODING: [0x57,0xb4,0x4f,0xba]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 4f ba <unknown>
+# CHECK-UNKNOWN: ba4fb457 <unknown>
vnclip.wv v8, v4, v20, v0.t
# CHECK-INST: vnclip.wv v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x04,0x4a,0xbc]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a bc <unknown>
+# CHECK-UNKNOWN: bc4a0457 <unknown>
vnclip.wv v8, v4, v20
# CHECK-INST: vnclip.wv v8, v4, v20
# CHECK-ENCODING: [0x57,0x04,0x4a,0xbe]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a be <unknown>
+# CHECK-UNKNOWN: be4a0457 <unknown>
vnclip.wx v8, v4, a0, v0.t
# CHECK-INST: vnclip.wx v8, v4, a0, v0.t
# CHECK-ENCODING: [0x57,0x44,0x45,0xbc]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 bc <unknown>
+# CHECK-UNKNOWN: bc454457 <unknown>
vnclip.wx v8, v4, a0
# CHECK-INST: vnclip.wx v8, v4, a0
# CHECK-ENCODING: [0x57,0x44,0x45,0xbe]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 be <unknown>
+# CHECK-UNKNOWN: be454457 <unknown>
vnclip.wi v8, v4, 31, v0.t
# CHECK-INST: vnclip.wi v8, v4, 31, v0.t
# CHECK-ENCODING: [0x57,0xb4,0x4f,0xbc]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 4f bc <unknown>
+# CHECK-UNKNOWN: bc4fb457 <unknown>
vnclip.wi v8, v4, 31
# CHECK-INST: vnclip.wi v8, v4, 31
# CHECK-ENCODING: [0x57,0xb4,0x4f,0xbe]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 4f be <unknown>
+# CHECK-UNKNOWN: be4fb457 <unknown>
diff --git a/llvm/test/MC/RISCV/rvv/compare.s b/llvm/test/MC/RISCV/rvv/compare.s
index fe7c1144a3c0..b1b9518a1d5a 100644
--- a/llvm/test/MC/RISCV/rvv/compare.s
+++ b/llvm/test/MC/RISCV/rvv/compare.s
@@ -12,367 +12,367 @@ vmslt.vv v0, v4, v20, v0.t
# CHECK-INST: vmslt.vv v0, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x00,0x4a,0x6c]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 00 4a 6c <unknown>
+# CHECK-UNKNOWN: 6c4a0057 <unknown>
vmseq.vv v8, v4, v20, v0.t
# CHECK-INST: vmseq.vv v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x04,0x4a,0x60]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 60 <unknown>
+# CHECK-UNKNOWN: 604a0457 <unknown>
vmseq.vv v8, v4, v20
# CHECK-INST: vmseq.vv v8, v4, v20
# CHECK-ENCODING: [0x57,0x04,0x4a,0x62]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 62 <unknown>
+# CHECK-UNKNOWN: 624a0457 <unknown>
vmseq.vx v8, v4, a0, v0.t
# CHECK-INST: vmseq.vx v8, v4, a0, v0.t
# CHECK-ENCODING: [0x57,0x44,0x45,0x60]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 60 <unknown>
+# CHECK-UNKNOWN: 60454457 <unknown>
vmseq.vx v8, v4, a0
# CHECK-INST: vmseq.vx v8, v4, a0
# CHECK-ENCODING: [0x57,0x44,0x45,0x62]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 62 <unknown>
+# CHECK-UNKNOWN: 62454457 <unknown>
vmseq.vi v8, v4, 15, v0.t
# CHECK-INST: vmseq.vi v8, v4, 0xf, v0.t
# CHECK-ENCODING: [0x57,0xb4,0x47,0x60]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 47 60 <unknown>
+# CHECK-UNKNOWN: 6047b457 <unknown>
vmseq.vi v8, v4, 15
# CHECK-INST: vmseq.vi v8, v4, 0xf
# CHECK-ENCODING: [0x57,0xb4,0x47,0x62]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 47 62 <unknown>
+# CHECK-UNKNOWN: 6247b457 <unknown>
vmsne.vv v8, v4, v20, v0.t
# CHECK-INST: vmsne.vv v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x04,0x4a,0x64]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 64 <unknown>
+# CHECK-UNKNOWN: 644a0457 <unknown>
vmsne.vv v8, v4, v20
# CHECK-INST: vmsne.vv v8, v4, v20
# CHECK-ENCODING: [0x57,0x04,0x4a,0x66]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 66 <unknown>
+# CHECK-UNKNOWN: 664a0457 <unknown>
vmsne.vx v8, v4, a0, v0.t
# CHECK-INST: vmsne.vx v8, v4, a0, v0.t
# CHECK-ENCODING: [0x57,0x44,0x45,0x64]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 64 <unknown>
+# CHECK-UNKNOWN: 64454457 <unknown>
vmsne.vx v8, v4, a0
# CHECK-INST: vmsne.vx v8, v4, a0
# CHECK-ENCODING: [0x57,0x44,0x45,0x66]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 66 <unknown>
+# CHECK-UNKNOWN: 66454457 <unknown>
vmsne.vi v8, v4, 15, v0.t
# CHECK-INST: vmsne.vi v8, v4, 0xf, v0.t
# CHECK-ENCODING: [0x57,0xb4,0x47,0x64]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 47 64 <unknown>
+# CHECK-UNKNOWN: 6447b457 <unknown>
vmsne.vi v8, v4, 15
# CHECK-INST: vmsne.vi v8, v4, 0xf
# CHECK-ENCODING: [0x57,0xb4,0x47,0x66]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 47 66 <unknown>
+# CHECK-UNKNOWN: 6647b457 <unknown>
vmsltu.vv v8, v4, v20, v0.t
# CHECK-INST: vmsltu.vv v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x04,0x4a,0x68]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 68 <unknown>
+# CHECK-UNKNOWN: 684a0457 <unknown>
vmsltu.vv v8, v4, v20
# CHECK-INST: vmsltu.vv v8, v4, v20
# CHECK-ENCODING: [0x57,0x04,0x4a,0x6a]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 6a <unknown>
+# CHECK-UNKNOWN: 6a4a0457 <unknown>
vmsltu.vx v8, v4, a0, v0.t
# CHECK-INST: vmsltu.vx v8, v4, a0, v0.t
# CHECK-ENCODING: [0x57,0x44,0x45,0x68]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 68 <unknown>
+# CHECK-UNKNOWN: 68454457 <unknown>
vmsltu.vx v8, v4, a0
# CHECK-INST: vmsltu.vx v8, v4, a0
# CHECK-ENCODING: [0x57,0x44,0x45,0x6a]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 6a <unknown>
+# CHECK-UNKNOWN: 6a454457 <unknown>
vmslt.vv v8, v4, v20, v0.t
# CHECK-INST: vmslt.vv v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x04,0x4a,0x6c]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 6c <unknown>
+# CHECK-UNKNOWN: 6c4a0457 <unknown>
vmslt.vv v8, v4, v20
# CHECK-INST: vmslt.vv v8, v4, v20
# CHECK-ENCODING: [0x57,0x04,0x4a,0x6e]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 6e <unknown>
+# CHECK-UNKNOWN: 6e4a0457 <unknown>
vmslt.vx v8, v4, a0, v0.t
# CHECK-INST: vmslt.vx v8, v4, a0, v0.t
# CHECK-ENCODING: [0x57,0x44,0x45,0x6c]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 6c <unknown>
+# CHECK-UNKNOWN: 6c454457 <unknown>
vmslt.vx v8, v4, a0
# CHECK-INST: vmslt.vx v8, v4, a0
# CHECK-ENCODING: [0x57,0x44,0x45,0x6e]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 6e <unknown>
+# CHECK-UNKNOWN: 6e454457 <unknown>
vmsleu.vv v8, v4, v20, v0.t
# CHECK-INST: vmsleu.vv v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x04,0x4a,0x70]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 70 <unknown>
+# CHECK-UNKNOWN: 704a0457 <unknown>
vmsleu.vv v8, v4, v20
# CHECK-INST: vmsleu.vv v8, v4, v20
# CHECK-ENCODING: [0x57,0x04,0x4a,0x72]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 72 <unknown>
+# CHECK-UNKNOWN: 724a0457 <unknown>
vmsleu.vx v8, v4, a0, v0.t
# CHECK-INST: vmsleu.vx v8, v4, a0, v0.t
# CHECK-ENCODING: [0x57,0x44,0x45,0x70]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 70 <unknown>
+# CHECK-UNKNOWN: 70454457 <unknown>
vmsleu.vx v8, v4, a0
# CHECK-INST: vmsleu.vx v8, v4, a0
# CHECK-ENCODING: [0x57,0x44,0x45,0x72]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 72 <unknown>
+# CHECK-UNKNOWN: 72454457 <unknown>
vmsleu.vi v8, v4, 15, v0.t
# CHECK-INST: vmsleu.vi v8, v4, 0xf, v0.t
# CHECK-ENCODING: [0x57,0xb4,0x47,0x70]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 47 70 <unknown>
+# CHECK-UNKNOWN: 7047b457 <unknown>
vmsleu.vi v8, v4, 15
# CHECK-INST: vmsleu.vi v8, v4, 0xf
# CHECK-ENCODING: [0x57,0xb4,0x47,0x72]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 47 72 <unknown>
+# CHECK-UNKNOWN: 7247b457 <unknown>
vmsle.vv v8, v4, v20, v0.t
# CHECK-INST: vmsle.vv v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x04,0x4a,0x74]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 74 <unknown>
+# CHECK-UNKNOWN: 744a0457 <unknown>
vmsle.vv v8, v4, v20
# CHECK-INST: vmsle.vv v8, v4, v20
# CHECK-ENCODING: [0x57,0x04,0x4a,0x76]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 76 <unknown>
+# CHECK-UNKNOWN: 764a0457 <unknown>
vmsle.vx v8, v4, a0, v0.t
# CHECK-INST: vmsle.vx v8, v4, a0, v0.t
# CHECK-ENCODING: [0x57,0x44,0x45,0x74]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 74 <unknown>
+# CHECK-UNKNOWN: 74454457 <unknown>
vmsle.vx v8, v4, a0
# CHECK-INST: vmsle.vx v8, v4, a0
# CHECK-ENCODING: [0x57,0x44,0x45,0x76]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 76 <unknown>
+# CHECK-UNKNOWN: 76454457 <unknown>
vmsle.vi v8, v4, 15, v0.t
# CHECK-INST: vmsle.vi v8, v4, 0xf, v0.t
# CHECK-ENCODING: [0x57,0xb4,0x47,0x74]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 47 74 <unknown>
+# CHECK-UNKNOWN: 7447b457 <unknown>
vmsle.vi v8, v4, 15
# CHECK-INST: vmsle.vi v8, v4, 0xf
# CHECK-ENCODING: [0x57,0xb4,0x47,0x76]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 47 76 <unknown>
+# CHECK-UNKNOWN: 7647b457 <unknown>
vmsgtu.vx v8, v4, a0, v0.t
# CHECK-INST: vmsgtu.vx v8, v4, a0, v0.t
# CHECK-ENCODING: [0x57,0x44,0x45,0x78]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 78 <unknown>
+# CHECK-UNKNOWN: 78454457 <unknown>
vmsgtu.vx v8, v4, a0
# CHECK-INST: vmsgtu.vx v8, v4, a0
# CHECK-ENCODING: [0x57,0x44,0x45,0x7a]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 7a <unknown>
+# CHECK-UNKNOWN: 7a454457 <unknown>
vmsgtu.vi v8, v4, 15, v0.t
# CHECK-INST: vmsgtu.vi v8, v4, 0xf, v0.t
# CHECK-ENCODING: [0x57,0xb4,0x47,0x78]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 47 78 <unknown>
+# CHECK-UNKNOWN: 7847b457 <unknown>
vmsgtu.vi v8, v4, 15
# CHECK-INST: vmsgtu.vi v8, v4, 0xf
# CHECK-ENCODING: [0x57,0xb4,0x47,0x7a]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 47 7a <unknown>
+# CHECK-UNKNOWN: 7a47b457 <unknown>
vmsgt.vx v8, v4, a0, v0.t
# CHECK-INST: vmsgt.vx v8, v4, a0, v0.t
# CHECK-ENCODING: [0x57,0x44,0x45,0x7c]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 7c <unknown>
+# CHECK-UNKNOWN: 7c454457 <unknown>
vmsgt.vx v8, v4, a0
# CHECK-INST: vmsgt.vx v8, v4, a0
# CHECK-ENCODING: [0x57,0x44,0x45,0x7e]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 7e <unknown>
+# CHECK-UNKNOWN: 7e454457 <unknown>
vmsgt.vi v8, v4, 15, v0.t
# CHECK-INST: vmsgt.vi v8, v4, 0xf, v0.t
# CHECK-ENCODING: [0x57,0xb4,0x47,0x7c]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 47 7c <unknown>
+# CHECK-UNKNOWN: 7c47b457 <unknown>
vmsgt.vi v8, v4, 15
# CHECK-INST: vmsgt.vi v8, v4, 0xf
# CHECK-ENCODING: [0x57,0xb4,0x47,0x7e]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 47 7e <unknown>
+# CHECK-UNKNOWN: 7e47b457 <unknown>
vmsgtu.vv v8, v20, v4, v0.t
# CHECK-INST: vmsltu.vv v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x04,0x4a,0x68]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 68 <unknown>
+# CHECK-UNKNOWN: 684a0457 <unknown>
vmsgtu.vv v8, v20, v4
# CHECK-INST: vmsltu.vv v8, v4, v20
# CHECK-ENCODING: [0x57,0x04,0x4a,0x6a]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 6a <unknown>
+# CHECK-UNKNOWN: 6a4a0457 <unknown>
vmsgt.vv v8, v20, v4, v0.t
# CHECK-INST: vmslt.vv v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x04,0x4a,0x6c]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 6c <unknown>
+# CHECK-UNKNOWN: 6c4a0457 <unknown>
vmsgt.vv v8, v20, v4
# CHECK-INST: vmslt.vv v8, v4, v20
# CHECK-ENCODING: [0x57,0x04,0x4a,0x6e]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 6e <unknown>
+# CHECK-UNKNOWN: 6e4a0457 <unknown>
vmsgeu.vv v8, v20, v4, v0.t
# CHECK-INST: vmsleu.vv v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x04,0x4a,0x70]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 70 <unknown>
+# CHECK-UNKNOWN: 704a0457 <unknown>
vmsgeu.vv v8, v20, v4
# CHECK-INST: vmsleu.vv v8, v4, v20
# CHECK-ENCODING: [0x57,0x04,0x4a,0x72]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 72 <unknown>
+# CHECK-UNKNOWN: 724a0457 <unknown>
vmsge.vv v8, v20, v4, v0.t
# CHECK-INST: vmsle.vv v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x04,0x4a,0x74]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 74 <unknown>
+# CHECK-UNKNOWN: 744a0457 <unknown>
vmsge.vv v8, v20, v4
# CHECK-INST: vmsle.vv v8, v4, v20
# CHECK-ENCODING: [0x57,0x04,0x4a,0x76]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 76 <unknown>
+# CHECK-UNKNOWN: 764a0457 <unknown>
vmsltu.vi v8, v4, 16, v0.t
# CHECK-INST: vmsleu.vi v8, v4, 0xf, v0.t
# CHECK-ENCODING: [0x57,0xb4,0x47,0x70]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 47 70 <unknown>
+# CHECK-UNKNOWN: 7047b457 <unknown>
vmsltu.vi v8, v4, 16
# CHECK-INST: vmsleu.vi v8, v4, 0xf
# CHECK-ENCODING: [0x57,0xb4,0x47,0x72]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 47 72 <unknown>
+# CHECK-UNKNOWN: 7247b457 <unknown>
vmsltu.vi v8, v4, 0, v0.t
# CHECK-INST: vmsne.vv v8, v4, v4, v0.t
# CHECK-ENCODING: [0x57,0x04,0x42,0x64]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 42 64 <unknown>
+# CHECK-UNKNOWN: 64420457 <unknown>
vmsltu.vi v8, v4, 0
# CHECK-INST: vmsne.vv v8, v4, v4
# CHECK-ENCODING: [0x57,0x04,0x42,0x66]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 42 66 <unknown>
+# CHECK-UNKNOWN: 66420457 <unknown>
vmslt.vi v8, v4, 16, v0.t
# CHECK-INST: vmsle.vi v8, v4, 0xf, v0.t
# CHECK-ENCODING: [0x57,0xb4,0x47,0x74]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 47 74 <unknown>
+# CHECK-UNKNOWN: 7447b457 <unknown>
vmslt.vi v8, v4, 16
# CHECK-INST: vmsle.vi v8, v4, 0xf
# CHECK-ENCODING: [0x57,0xb4,0x47,0x76]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 47 76 <unknown>
+# CHECK-UNKNOWN: 7647b457 <unknown>
vmsgeu.vi v8, v4, 16, v0.t
# CHECK-INST: vmsgtu.vi v8, v4, 0xf, v0.t
# CHECK-ENCODING: [0x57,0xb4,0x47,0x78]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 47 78 <unknown>
+# CHECK-UNKNOWN: 7847b457 <unknown>
vmsgeu.vi v8, v4, 16
# CHECK-INST: vmsgtu.vi v8, v4, 0xf
# CHECK-ENCODING: [0x57,0xb4,0x47,0x7a]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 47 7a <unknown>
+# CHECK-UNKNOWN: 7a47b457 <unknown>
vmsgeu.vi v8, v4, 0, v0.t
# CHECK-INST: vmseq.vv v8, v4, v4, v0.t
# CHECK-ENCODING: [0x57,0x04,0x42,0x60]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 42 60 <unknown>
+# CHECK-UNKNOWN: 60420457 <unknown>
vmsgeu.vi v8, v4, 0
# CHECK-INST: vmseq.vv v8, v4, v4
# CHECK-ENCODING: [0x57,0x04,0x42,0x62]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 42 62 <unknown>
+# CHECK-UNKNOWN: 62420457 <unknown>
vmsge.vi v8, v4, 16, v0.t
# CHECK-INST: vmsgt.vi v8, v4, 0xf, v0.t
# CHECK-ENCODING: [0x57,0xb4,0x47,0x7c]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 47 7c <unknown>
+# CHECK-UNKNOWN: 7c47b457 <unknown>
vmsge.vi v8, v4, 16
# CHECK-INST: vmsgt.vi v8, v4, 0xf
# CHECK-ENCODING: [0x57,0xb4,0x47,0x7e]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 47 7e <unknown>
+# CHECK-UNKNOWN: 7e47b457 <unknown>
vmsgeu.vx v8, v4, a0
# CHECK-INST: vmsltu.vx v8, v4, a0
@@ -380,8 +380,8 @@ vmsgeu.vx v8, v4, a0
# CHECK-ENCODING: [0x57,0x44,0x45,0x6a]
# CHECK-ENCODING: [0x57,0x24,0x84,0x76]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 6a <unknown>
-# CHECK-UNKNOWN: 57 24 84 76 <unknown>
+# CHECK-UNKNOWN: 6a454457 <unknown>
+# CHECK-UNKNOWN: 76842457 <unknown>
vmsge.vx v0, v4, a0
# CHECK-INST: vmslt.vx v0, v4, a0
@@ -389,8 +389,8 @@ vmsge.vx v0, v4, a0
# CHECK-ENCODING: [0x57,0x40,0x45,0x6e]
# CHECK-ENCODING: [0x57,0x20,0x00,0x76]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 40 45 6e <unknown>
-# CHECK-UNKNOWN: 57 20 00 76 <unknown>
+# CHECK-UNKNOWN: 6e454057 <unknown>
+# CHECK-UNKNOWN: 76002057 <unknown>
vmsge.vx v8, v4, a0
# CHECK-INST: vmslt.vx v8, v4, a0
@@ -398,8 +398,8 @@ vmsge.vx v8, v4, a0
# CHECK-ENCODING: [0x57,0x44,0x45,0x6e]
# CHECK-ENCODING: [0x57,0x24,0x84,0x76]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 6e <unknown>
-# CHECK-UNKNOWN: 57 24 84 76 <unknown>
+# CHECK-UNKNOWN: 6e454457 <unknown>
+# CHECK-UNKNOWN: 76842457 <unknown>
vmsgeu.vx v8, v4, a0, v0.t
# CHECK-INST: vmsltu.vx v8, v4, a0, v0.t
@@ -407,8 +407,8 @@ vmsgeu.vx v8, v4, a0, v0.t
# CHECK-ENCODING: [0x57,0x44,0x45,0x68]
# CHECK-ENCODING: [0x57,0x24,0x80,0x6e]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 68 <unknown>
-# CHECK-UNKNOWN: 57 24 80 6e <unknown>
+# CHECK-UNKNOWN: 68454457 <unknown>
+# CHECK-UNKNOWN: 6e802457 <unknown>
vmsge.vx v8, v4, a0, v0.t
# CHECK-INST: vmslt.vx v8, v4, a0, v0.t
@@ -416,8 +416,8 @@ vmsge.vx v8, v4, a0, v0.t
# CHECK-ENCODING: [0x57,0x44,0x45,0x6c]
# CHECK-ENCODING: [0x57,0x24,0x80,0x6e]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 6c <unknown>
-# CHECK-UNKNOWN: 57 24 80 6e <unknown>
+# CHECK-UNKNOWN: 6c454457 <unknown>
+# CHECK-UNKNOWN: 6e802457 <unknown>
vmsgeu.vx v0, v4, a0, v0.t, v2
# CHECK-INST: vmsltu.vx v2, v4, a0
@@ -425,8 +425,8 @@ vmsgeu.vx v0, v4, a0, v0.t, v2
# CHECK-ENCODING: [0x57,0x41,0x45,0x6a]
# CHECK-ENCODING: [0x57,0x20,0x01,0x62]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 41 45 6a <unknown>
-# CHECK-UNKNOWN: 57 20 01 62 <unknown>
+# CHECK-UNKNOWN: 6a454157 <unknown>
+# CHECK-UNKNOWN: 62012057 <unknown>
vmsge.vx v0, v4, a0, v0.t, v2
# CHECK-INST: vmslt.vx v2, v4, a0
@@ -434,8 +434,8 @@ vmsge.vx v0, v4, a0, v0.t, v2
# CHECK-ENCODING: [0x57,0x41,0x45,0x6e]
# CHECK-ENCODING: [0x57,0x20,0x01,0x62]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 41 45 6e <unknown>
-# CHECK-UNKNOWN: 57 20 01 62 <unknown>
+# CHECK-UNKNOWN: 6e454157 <unknown>
+# CHECK-UNKNOWN: 62012057 <unknown>
vmsgeu.vx v9, v4, a0, v0.t, v2
# CHECK-INST: vmsltu.vx v2, v4, a0
@@ -447,10 +447,10 @@ vmsgeu.vx v9, v4, a0, v0.t, v2
# CHECK-ENCODING: [0xd7,0x24,0x90,0x62]
# CHECK-ENCODING: [0xd7,0xa4,0x24,0x6a]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 41 45 6a <unknown>
-# CHECK-UNKNOWN: 57 21 01 62 <unknown>
-# CHECK-UNKNOWN: d7 24 90 62 <unknown>
-# CHECK-UNKNOWN: d7 a4 24 6a <unknown>
+# CHECK-UNKNOWN: 6a454157 <unknown>
+# CHECK-UNKNOWN: 62012157 <unknown>
+# CHECK-UNKNOWN: 629024d7 <unknown>
+# CHECK-UNKNOWN: 6a24a4d7 <unknown>
vmsge.vx v8, v4, a0, v0.t, v2
# CHECK-INST: vmslt.vx v2, v4, a0
@@ -462,7 +462,7 @@ vmsge.vx v8, v4, a0, v0.t, v2
# CHECK-ENCODING: [0x57,0x24,0x80,0x62]
# CHECK-ENCODING: [0x57,0x24,0x24,0x6a]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 41 45 6e <unknown>
-# CHECK-UNKNOWN: 57 21 01 62 <unknown>
-# CHECK-UNKNOWN: 57 24 80 62 <unknown>
-# CHECK-UNKNOWN: 57 24 24 6a <unknown>
+# CHECK-UNKNOWN: 6e454157 <unknown>
+# CHECK-UNKNOWN: 62012157 <unknown>
+# CHECK-UNKNOWN: 62802457 <unknown>
+# CHECK-UNKNOWN: 6a242457 <unknown>
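
Note on the hunks above: vmsge.vx and vmsgeu.vx have no direct hardware encoding, so the assembler expands each source line into a vmslt.vx/vmsltu.vx followed by a mask-negation or mask-merge sequence — which is why a single instruction above carries two or four CHECK-ENCODING/CHECK-UNKNOWN pairs. The mechanical change in every hunk of this patch is only the CHECK-UNKNOWN format: the four little-endian bytes from CHECK-ENCODING are reprinted as one 32-bit word. A minimal Python sketch of that rewrite (the byte list is taken from the last CHECK-ENCODING line above; the helper name is illustrative, not part of the patch):

    # Regroup a CHECK-ENCODING byte list into the new CHECK-UNKNOWN word.
    # RISC-V instruction streams are little-endian, so the first byte is
    # the least significant one.
    def unknown_word(encoding_bytes):
        return int.from_bytes(bytes(encoding_bytes), "little")

    assert unknown_word([0x57, 0x24, 0x24, 0x6a]) == 0x6a242457
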
diff --git a/llvm/test/MC/RISCV/rvv/convert.s b/llvm/test/MC/RISCV/rvv/convert.s
index 28c0a0fa837e..269c86cef473 100644
--- a/llvm/test/MC/RISCV/rvv/convert.s
+++ b/llvm/test/MC/RISCV/rvv/convert.s
@@ -15,256 +15,256 @@ vfcvt.xu.f.v v8, v4, v0.t
# CHECK-INST: vfcvt.xu.f.v v8, v4, v0.t
# CHECK-ENCODING: [0x57,0x14,0x40,0x48]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 40 48 <unknown>
+# CHECK-UNKNOWN: 48401457 <unknown>
vfcvt.xu.f.v v8, v4
# CHECK-INST: vfcvt.xu.f.v v8, v4
# CHECK-ENCODING: [0x57,0x14,0x40,0x4a]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 40 4a <unknown>
+# CHECK-UNKNOWN: 4a401457 <unknown>
vfcvt.x.f.v v8, v4, v0.t
# CHECK-INST: vfcvt.x.f.v v8, v4, v0.t
# CHECK-ENCODING: [0x57,0x94,0x40,0x48]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 94 40 48 <unknown>
+# CHECK-UNKNOWN: 48409457 <unknown>
vfcvt.x.f.v v8, v4
# CHECK-INST: vfcvt.x.f.v v8, v4
# CHECK-ENCODING: [0x57,0x94,0x40,0x4a]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 94 40 4a <unknown>
+# CHECK-UNKNOWN: 4a409457 <unknown>
vfcvt.f.xu.v v8, v4, v0.t
# CHECK-INST: vfcvt.f.xu.v v8, v4, v0.t
# CHECK-ENCODING: [0x57,0x14,0x41,0x48]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 41 48 <unknown>
+# CHECK-UNKNOWN: 48411457 <unknown>
vfcvt.f.xu.v v8, v4
# CHECK-INST: vfcvt.f.xu.v v8, v4
# CHECK-ENCODING: [0x57,0x14,0x41,0x4a]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 41 4a <unknown>
+# CHECK-UNKNOWN: 4a411457 <unknown>
vfcvt.f.x.v v8, v4, v0.t
# CHECK-INST: vfcvt.f.x.v v8, v4, v0.t
# CHECK-ENCODING: [0x57,0x94,0x41,0x48]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 94 41 48 <unknown>
+# CHECK-UNKNOWN: 48419457 <unknown>
vfcvt.f.x.v v8, v4
# CHECK-INST: vfcvt.f.x.v v8, v4
# CHECK-ENCODING: [0x57,0x94,0x41,0x4a]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 94 41 4a <unknown>
+# CHECK-UNKNOWN: 4a419457 <unknown>
vfcvt.rtz.xu.f.v v8, v4, v0.t
# CHECK-INST: vfcvt.rtz.xu.f.v v8, v4, v0.t
# CHECK-ENCODING: [0x57,0x14,0x43,0x48]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 43 48 <unknown>
+# CHECK-UNKNOWN: 48431457 <unknown>
vfcvt.rtz.xu.f.v v8, v4
# CHECK-INST: vfcvt.rtz.xu.f.v v8, v4
# CHECK-ENCODING: [0x57,0x14,0x43,0x4a]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 43 4a <unknown>
+# CHECK-UNKNOWN: 4a431457 <unknown>
vfcvt.rtz.x.f.v v8, v4, v0.t
# CHECK-INST: vfcvt.rtz.x.f.v v8, v4, v0.t
# CHECK-ENCODING: [0x57,0x94,0x43,0x48]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 94 43 48 <unknown>
+# CHECK-UNKNOWN: 48439457 <unknown>
vfcvt.rtz.x.f.v v8, v4
# CHECK-INST: vfcvt.rtz.x.f.v v8, v4
# CHECK-ENCODING: [0x57,0x94,0x43,0x4a]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 94 43 4a <unknown>
+# CHECK-UNKNOWN: 4a439457 <unknown>
vfwcvt.xu.f.v v8, v4, v0.t
# CHECK-INST: vfwcvt.xu.f.v v8, v4, v0.t
# CHECK-ENCODING: [0x57,0x14,0x44,0x48]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 44 48 <unknown>
+# CHECK-UNKNOWN: 48441457 <unknown>
vfwcvt.xu.f.v v8, v4
# CHECK-INST: vfwcvt.xu.f.v v8, v4
# CHECK-ENCODING: [0x57,0x14,0x44,0x4a]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 44 4a <unknown>
+# CHECK-UNKNOWN: 4a441457 <unknown>
vfwcvt.x.f.v v8, v4, v0.t
# CHECK-INST: vfwcvt.x.f.v v8, v4, v0.t
# CHECK-ENCODING: [0x57,0x94,0x44,0x48]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 94 44 48 <unknown>
+# CHECK-UNKNOWN: 48449457 <unknown>
vfwcvt.x.f.v v8, v4
# CHECK-INST: vfwcvt.x.f.v v8, v4
# CHECK-ENCODING: [0x57,0x94,0x44,0x4a]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 94 44 4a <unknown>
+# CHECK-UNKNOWN: 4a449457 <unknown>
vfwcvt.f.xu.v v8, v4, v0.t
# CHECK-INST: vfwcvt.f.xu.v v8, v4, v0.t
# CHECK-ENCODING: [0x57,0x14,0x45,0x48]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 45 48 <unknown>
+# CHECK-UNKNOWN: 48451457 <unknown>
vfwcvt.f.xu.v v8, v4
# CHECK-INST: vfwcvt.f.xu.v v8, v4
# CHECK-ENCODING: [0x57,0x14,0x45,0x4a]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 45 4a <unknown>
+# CHECK-UNKNOWN: 4a451457 <unknown>
vfwcvt.f.x.v v8, v4, v0.t
# CHECK-INST: vfwcvt.f.x.v v8, v4, v0.t
# CHECK-ENCODING: [0x57,0x94,0x45,0x48]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 94 45 48 <unknown>
+# CHECK-UNKNOWN: 48459457 <unknown>
vfwcvt.f.x.v v8, v4
# CHECK-INST: vfwcvt.f.x.v v8, v4
# CHECK-ENCODING: [0x57,0x94,0x45,0x4a]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 94 45 4a <unknown>
+# CHECK-UNKNOWN: 4a459457 <unknown>
vfwcvt.f.f.v v8, v4, v0.t
# CHECK-INST: vfwcvt.f.f.v v8, v4, v0.t
# CHECK-ENCODING: [0x57,0x14,0x46,0x48]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 46 48 <unknown>
+# CHECK-UNKNOWN: 48461457 <unknown>
vfwcvt.f.f.v v8, v4
# CHECK-INST: vfwcvt.f.f.v v8, v4
# CHECK-ENCODING: [0x57,0x14,0x46,0x4a]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 46 4a <unknown>
+# CHECK-UNKNOWN: 4a461457 <unknown>
vfwcvt.rtz.xu.f.v v8, v4, v0.t
# CHECK-INST: vfwcvt.rtz.xu.f.v v8, v4, v0.t
# CHECK-ENCODING: [0x57,0x14,0x47,0x48]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 47 48 <unknown>
+# CHECK-UNKNOWN: 48471457 <unknown>
vfwcvt.rtz.xu.f.v v8, v4
# CHECK-INST: vfwcvt.rtz.xu.f.v v8, v4
# CHECK-ENCODING: [0x57,0x14,0x47,0x4a]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 47 4a <unknown>
+# CHECK-UNKNOWN: 4a471457 <unknown>
vfwcvt.rtz.x.f.v v8, v4, v0.t
# CHECK-INST: vfwcvt.rtz.x.f.v v8, v4, v0.t
# CHECK-ENCODING: [0x57,0x94,0x47,0x48]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 94 47 48 <unknown>
+# CHECK-UNKNOWN: 48479457 <unknown>
vfwcvt.rtz.x.f.v v8, v4
# CHECK-INST: vfwcvt.rtz.x.f.v v8, v4
# CHECK-ENCODING: [0x57,0x94,0x47,0x4a]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 94 47 4a <unknown>
+# CHECK-UNKNOWN: 4a479457 <unknown>
vfncvt.xu.f.w v8, v4, v0.t
# CHECK-INST: vfncvt.xu.f.w v8, v4, v0.t
# CHECK-ENCODING: [0x57,0x14,0x48,0x48]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 48 48 <unknown>
+# CHECK-UNKNOWN: 48481457 <unknown>
vfncvt.xu.f.w v4, v4, v0.t
# CHECK-INST: vfncvt.xu.f.w v4, v4, v0.t
# CHECK-ENCODING: [0x57,0x12,0x48,0x48]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 12 48 48 <unknown>
+# CHECK-UNKNOWN: 48481257 <unknown>
vfncvt.xu.f.w v8, v4
# CHECK-INST: vfncvt.xu.f.w v8, v4
# CHECK-ENCODING: [0x57,0x14,0x48,0x4a]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 48 4a <unknown>
+# CHECK-UNKNOWN: 4a481457 <unknown>
vfncvt.x.f.w v8, v4, v0.t
# CHECK-INST: vfncvt.x.f.w v8, v4, v0.t
# CHECK-ENCODING: [0x57,0x94,0x48,0x48]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 94 48 48 <unknown>
+# CHECK-UNKNOWN: 48489457 <unknown>
vfncvt.x.f.w v8, v4
# CHECK-INST: vfncvt.x.f.w v8, v4
# CHECK-ENCODING: [0x57,0x94,0x48,0x4a]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 94 48 4a <unknown>
+# CHECK-UNKNOWN: 4a489457 <unknown>
vfncvt.f.xu.w v8, v4, v0.t
# CHECK-INST: vfncvt.f.xu.w v8, v4, v0.t
# CHECK-ENCODING: [0x57,0x14,0x49,0x48]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 49 48 <unknown>
+# CHECK-UNKNOWN: 48491457 <unknown>
vfncvt.f.xu.w v8, v4
# CHECK-INST: vfncvt.f.xu.w v8, v4
# CHECK-ENCODING: [0x57,0x14,0x49,0x4a]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 49 4a <unknown>
+# CHECK-UNKNOWN: 4a491457 <unknown>
vfncvt.f.x.w v8, v4, v0.t
# CHECK-INST: vfncvt.f.x.w v8, v4, v0.t
# CHECK-ENCODING: [0x57,0x94,0x49,0x48]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 94 49 48 <unknown>
+# CHECK-UNKNOWN: 48499457 <unknown>
vfncvt.f.x.w v8, v4
# CHECK-INST: vfncvt.f.x.w v8, v4
# CHECK-ENCODING: [0x57,0x94,0x49,0x4a]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 94 49 4a <unknown>
+# CHECK-UNKNOWN: 4a499457 <unknown>
vfncvt.f.f.w v8, v4, v0.t
# CHECK-INST: vfncvt.f.f.w v8, v4, v0.t
# CHECK-ENCODING: [0x57,0x14,0x4a,0x48]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a 48 <unknown>
+# CHECK-UNKNOWN: 484a1457 <unknown>
vfncvt.f.f.w v8, v4
# CHECK-INST: vfncvt.f.f.w v8, v4
# CHECK-ENCODING: [0x57,0x14,0x4a,0x4a]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a 4a <unknown>
+# CHECK-UNKNOWN: 4a4a1457 <unknown>
vfncvt.rod.f.f.w v8, v4, v0.t
# CHECK-INST: vfncvt.rod.f.f.w v8, v4, v0.t
# CHECK-ENCODING: [0x57,0x94,0x4a,0x48]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 94 4a 48 <unknown>
+# CHECK-UNKNOWN: 484a9457 <unknown>
vfncvt.rod.f.f.w v8, v4
# CHECK-INST: vfncvt.rod.f.f.w v8, v4
# CHECK-ENCODING: [0x57,0x94,0x4a,0x4a]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 94 4a 4a <unknown>
+# CHECK-UNKNOWN: 4a4a9457 <unknown>
vfncvt.rtz.xu.f.w v8, v4, v0.t
# CHECK-INST: vfncvt.rtz.xu.f.w v8, v4, v0.t
# CHECK-ENCODING: [0x57,0x14,0x4b,0x48]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4b 48 <unknown>
+# CHECK-UNKNOWN: 484b1457 <unknown>
vfncvt.rtz.xu.f.w v8, v4
# CHECK-INST: vfncvt.rtz.xu.f.w v8, v4
# CHECK-ENCODING: [0x57,0x14,0x4b,0x4a]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4b 4a <unknown>
+# CHECK-UNKNOWN: 4a4b1457 <unknown>
vfncvt.rtz.x.f.w v8, v4, v0.t
# CHECK-INST: vfncvt.rtz.x.f.w v8, v4, v0.t
# CHECK-ENCODING: [0x57,0x94,0x4b,0x48]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 94 4b 48 <unknown>
+# CHECK-UNKNOWN: 484b9457 <unknown>
vfncvt.rtz.x.f.w v8, v4
# CHECK-INST: vfncvt.rtz.x.f.w v8, v4
# CHECK-ENCODING: [0x57,0x94,0x4b,0x4a]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 94 4b 4a <unknown>
+# CHECK-UNKNOWN: 4a4b9457 <unknown>
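
A side effect of the new single-word dumps is that the instruction fields become readable directly: bits [6:0] of every word are 0x57, the OP-V major opcode, which is why nearly every CHECK-UNKNOWN word in this patch ends in 57 (the occasional d7 folds in the low bit of vd). A sketch, assuming the standard V-extension field layout; not part of the patch itself:

    # Decode the V-extension fields of a 32-bit word, assuming the usual
    # layout: funct6 | vm | vs2 | vs1 | funct3 | vd | opcode.
    def decode(w):
        return {
            "opcode": w & 0x7f,          # 0x57 for every OP-V instruction
            "vd":     (w >> 7)  & 0x1f,
            "funct3": (w >> 12) & 0x7,
            "vs1":    (w >> 15) & 0x1f,
            "vs2":    (w >> 20) & 0x1f,
            "vm":     (w >> 25) & 0x1,   # 0 when a v0.t mask is present
            "funct6": (w >> 26) & 0x3f,
        }

    # vfcvt.xu.f.v v8, v4, v0.t from above: vd=8, vs2=4, masked (vm=0).
    f = decode(0x48401457)
    assert (f["vd"], f["vs2"], f["vm"]) == (8, 4, 0)
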
diff --git a/llvm/test/MC/RISCV/rvv/div.s b/llvm/test/MC/RISCV/rvv/div.s
index 229124c671c6..aca04375f61e 100644
--- a/llvm/test/MC/RISCV/rvv/div.s
+++ b/llvm/test/MC/RISCV/rvv/div.s
@@ -12,94 +12,94 @@ vdivu.vv v8, v4, v20, v0.t
# CHECK-INST: vdivu.vv v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x24,0x4a,0x80]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 80 <unknown>
+# CHECK-UNKNOWN: 804a2457 <unknown>
vdivu.vv v8, v4, v20
# CHECK-INST: vdivu.vv v8, v4, v20
# CHECK-ENCODING: [0x57,0x24,0x4a,0x82]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 82 <unknown>
+# CHECK-UNKNOWN: 824a2457 <unknown>
vdivu.vx v8, v4, a0, v0.t
# CHECK-INST: vdivu.vx v8, v4, a0, v0.t
# CHECK-ENCODING: [0x57,0x64,0x45,0x80]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 80 <unknown>
+# CHECK-UNKNOWN: 80456457 <unknown>
vdivu.vx v8, v4, a0
# CHECK-INST: vdivu.vx v8, v4, a0
# CHECK-ENCODING: [0x57,0x64,0x45,0x82]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 82 <unknown>
+# CHECK-UNKNOWN: 82456457 <unknown>
vdiv.vv v8, v4, v20, v0.t
# CHECK-INST: vdiv.vv v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x24,0x4a,0x84]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 84 <unknown>
+# CHECK-UNKNOWN: 844a2457 <unknown>
vdiv.vv v8, v4, v20
# CHECK-INST: vdiv.vv v8, v4, v20
# CHECK-ENCODING: [0x57,0x24,0x4a,0x86]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 86 <unknown>
+# CHECK-UNKNOWN: 864a2457 <unknown>
vdiv.vx v8, v4, a0, v0.t
# CHECK-INST: vdiv.vx v8, v4, a0, v0.t
# CHECK-ENCODING: [0x57,0x64,0x45,0x84]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 84 <unknown>
+# CHECK-UNKNOWN: 84456457 <unknown>
vdiv.vx v8, v4, a0
# CHECK-INST: vdiv.vx v8, v4, a0
# CHECK-ENCODING: [0x57,0x64,0x45,0x86]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 86 <unknown>
+# CHECK-UNKNOWN: 86456457 <unknown>
vremu.vv v8, v4, v20, v0.t
# CHECK-INST: vremu.vv v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x24,0x4a,0x88]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 88 <unknown>
+# CHECK-UNKNOWN: 884a2457 <unknown>
vremu.vv v8, v4, v20
# CHECK-INST: vremu.vv v8, v4, v20
# CHECK-ENCODING: [0x57,0x24,0x4a,0x8a]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 8a <unknown>
+# CHECK-UNKNOWN: 8a4a2457 <unknown>
vremu.vx v8, v4, a0, v0.t
# CHECK-INST: vremu.vx v8, v4, a0, v0.t
# CHECK-ENCODING: [0x57,0x64,0x45,0x88]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 88 <unknown>
+# CHECK-UNKNOWN: 88456457 <unknown>
vremu.vx v8, v4, a0
# CHECK-INST: vremu.vx v8, v4, a0
# CHECK-ENCODING: [0x57,0x64,0x45,0x8a]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 8a <unknown>
+# CHECK-UNKNOWN: 8a456457 <unknown>
vrem.vv v8, v4, v20, v0.t
# CHECK-INST: vrem.vv v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x24,0x4a,0x8c]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 8c <unknown>
+# CHECK-UNKNOWN: 8c4a2457 <unknown>
vrem.vv v8, v4, v20
# CHECK-INST: vrem.vv v8, v4, v20
# CHECK-ENCODING: [0x57,0x24,0x4a,0x8e]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 8e <unknown>
+# CHECK-UNKNOWN: 8e4a2457 <unknown>
vrem.vx v8, v4, a0, v0.t
# CHECK-INST: vrem.vx v8, v4, a0, v0.t
# CHECK-ENCODING: [0x57,0x64,0x45,0x8c]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 8c <unknown>
+# CHECK-UNKNOWN: 8c456457 <unknown>
vrem.vx v8, v4, a0
# CHECK-INST: vrem.vx v8, v4, a0
# CHECK-ENCODING: [0x57,0x64,0x45,0x8e]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 8e <unknown>
+# CHECK-UNKNOWN: 8e456457 <unknown>
diff --git a/llvm/test/MC/RISCV/rvv/ext.s b/llvm/test/MC/RISCV/rvv/ext.s
index 80cadc096104..3bf1351d77a7 100644
--- a/llvm/test/MC/RISCV/rvv/ext.s
+++ b/llvm/test/MC/RISCV/rvv/ext.s
@@ -12,70 +12,70 @@ vzext.vf2 v8, v4, v0.t
# CHECK-INST: vzext.vf2 v8, v4, v0.t
# CHECK-ENCODING: [0x57,0x24,0x43,0x48]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 43 48 <unknown>
+# CHECK-UNKNOWN: 48432457 <unknown>
vzext.vf2 v8, v4
# CHECK-INST: vzext.vf2 v8, v4
# CHECK-ENCODING: [0x57,0x24,0x43,0x4a]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 43 4a <unknown>
+# CHECK-UNKNOWN: 4a432457 <unknown>
vsext.vf2 v8, v4, v0.t
# CHECK-INST: vsext.vf2 v8, v4, v0.t
# CHECK-ENCODING: [0x57,0xa4,0x43,0x48]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 a4 43 48 <unknown>
+# CHECK-UNKNOWN: 4843a457 <unknown>
vsext.vf2 v8, v4
# CHECK-INST: vsext.vf2 v8, v4
# CHECK-ENCODING: [0x57,0xa4,0x43,0x4a]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 a4 43 4a <unknown>
+# CHECK-UNKNOWN: 4a43a457 <unknown>
vzext.vf4 v8, v4, v0.t
# CHECK-INST: vzext.vf4 v8, v4, v0.t
# CHECK-ENCODING: [0x57,0x24,0x42,0x48]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 42 48 <unknown>
+# CHECK-UNKNOWN: 48422457 <unknown>
vzext.vf4 v8, v4
# CHECK-INST: vzext.vf4 v8, v4
# CHECK-ENCODING: [0x57,0x24,0x42,0x4a]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 42 4a <unknown>
+# CHECK-UNKNOWN: 4a422457 <unknown>
vsext.vf4 v8, v4, v0.t
# CHECK-INST: vsext.vf4 v8, v4, v0.t
# CHECK-ENCODING: [0x57,0xa4,0x42,0x48]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 a4 42 48 <unknown>
+# CHECK-UNKNOWN: 4842a457 <unknown>
vsext.vf4 v8, v4
# CHECK-INST: vsext.vf4 v8, v4
# CHECK-ENCODING: [0x57,0xa4,0x42,0x4a]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 a4 42 4a <unknown>
+# CHECK-UNKNOWN: 4a42a457 <unknown>
vzext.vf8 v8, v4, v0.t
# CHECK-INST: vzext.vf8 v8, v4, v0.t
# CHECK-ENCODING: [0x57,0x24,0x41,0x48]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 41 48 <unknown>
+# CHECK-UNKNOWN: 48412457 <unknown>
vzext.vf8 v8, v4
# CHECK-INST: vzext.vf8 v8, v4
# CHECK-ENCODING: [0x57,0x24,0x41,0x4a]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 41 4a <unknown>
+# CHECK-UNKNOWN: 4a412457 <unknown>
vsext.vf8 v8, v4, v0.t
# CHECK-INST: vsext.vf8 v8, v4, v0.t
# CHECK-ENCODING: [0x57,0xa4,0x41,0x48]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 a4 41 48 <unknown>
+# CHECK-UNKNOWN: 4841a457 <unknown>
vsext.vf8 v8, v4
# CHECK-INST: vsext.vf8 v8, v4
# CHECK-ENCODING: [0x57,0xa4,0x41,0x4a]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 a4 41 4a <unknown>
+# CHECK-UNKNOWN: 4a41a457 <unknown>
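
In ext.s above, the vzext/vsext forms all share one funct6 group, and the widening factor is carried in the vs1 field (bits 19:15) — a detail that can now be read straight off the new CHECK-UNKNOWN words. A small illustrative check, using words quoted from the hunks above:

    # The vs1 field selects the vzext variant; values read directly from
    # the masked vzext.vf2/.vf4/.vf8 words above are 6, 4, and 2.
    for w, name in [(0x48432457, "vzext.vf2"),
                    (0x48422457, "vzext.vf4"),
                    (0x48412457, "vzext.vf8")]:
        print(name, bin((w >> 15) & 0x1f))  # 0b110, 0b100, 0b10
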
diff --git a/llvm/test/MC/RISCV/rvv/fadd.s b/llvm/test/MC/RISCV/rvv/fadd.s
index 60ffaf62ca6b..890b2c0ad68b 100644
--- a/llvm/test/MC/RISCV/rvv/fadd.s
+++ b/llvm/test/MC/RISCV/rvv/fadd.s
@@ -15,70 +15,70 @@ vfadd.vv v8, v4, v20, v0.t
# CHECK-INST: vfadd.vv v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x14,0x4a,0x00]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a 00 <unknown>
+# CHECK-UNKNOWN: 004a1457 <unknown>
vfadd.vv v8, v4, v20
# CHECK-INST: vfadd.vv v8, v4, v20
# CHECK-ENCODING: [0x57,0x14,0x4a,0x02]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a 02 <unknown>
+# CHECK-UNKNOWN: 024a1457 <unknown>
vfadd.vf v8, v4, fa0, v0.t
# CHECK-INST: vfadd.vf v8, v4, fa0, v0.t
# CHECK-ENCODING: [0x57,0x54,0x45,0x00]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 00 <unknown>
+# CHECK-UNKNOWN: 00455457 <unknown>
vfadd.vf v8, v4, fa0
# CHECK-INST: vfadd.vf v8, v4, fa0
# CHECK-ENCODING: [0x57,0x54,0x45,0x02]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 02 <unknown>
+# CHECK-UNKNOWN: 02455457 <unknown>
vfwadd.vv v8, v4, v20, v0.t
# CHECK-INST: vfwadd.vv v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x14,0x4a,0xc0]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a c0 <unknown>
+# CHECK-UNKNOWN: c04a1457 <unknown>
vfwadd.vv v8, v4, v20
# CHECK-INST: vfwadd.vv v8, v4, v20
# CHECK-ENCODING: [0x57,0x14,0x4a,0xc2]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a c2 <unknown>
+# CHECK-UNKNOWN: c24a1457 <unknown>
vfwadd.vf v8, v4, fa0, v0.t
# CHECK-INST: vfwadd.vf v8, v4, fa0, v0.t
# CHECK-ENCODING: [0x57,0x54,0x45,0xc0]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 c0 <unknown>
+# CHECK-UNKNOWN: c0455457 <unknown>
vfwadd.vf v8, v4, fa0
# CHECK-INST: vfwadd.vf v8, v4, fa0
# CHECK-ENCODING: [0x57,0x54,0x45,0xc2]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 c2 <unknown>
+# CHECK-UNKNOWN: c2455457 <unknown>
vfwadd.wv v8, v4, v20, v0.t
# CHECK-INST: vfwadd.wv v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x14,0x4a,0xd0]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a d0 <unknown>
+# CHECK-UNKNOWN: d04a1457 <unknown>
vfwadd.wv v8, v4, v20
# CHECK-INST: vfwadd.wv v8, v4, v20
# CHECK-ENCODING: [0x57,0x14,0x4a,0xd2]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a d2 <unknown>
+# CHECK-UNKNOWN: d24a1457 <unknown>
vfwadd.wf v8, v4, fa0, v0.t
# CHECK-INST: vfwadd.wf v8, v4, fa0, v0.t
# CHECK-ENCODING: [0x57,0x54,0x45,0xd0]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 d0 <unknown>
+# CHECK-UNKNOWN: d0455457 <unknown>
vfwadd.wf v8, v4, fa0
# CHECK-INST: vfwadd.wf v8, v4, fa0
# CHECK-ENCODING: [0x57,0x54,0x45,0xd2]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 d2 <unknown>
+# CHECK-UNKNOWN: d2455457 <unknown>
diff --git a/llvm/test/MC/RISCV/rvv/fcompare.s b/llvm/test/MC/RISCV/rvv/fcompare.s
index 11dd7e05467b..3903bbdab650 100644
--- a/llvm/test/MC/RISCV/rvv/fcompare.s
+++ b/llvm/test/MC/RISCV/rvv/fcompare.s
@@ -15,148 +15,148 @@ vmfeq.vv v8, v4, v20, v0.t
# CHECK-INST: vmfeq.vv v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x14,0x4a,0x60]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a 60 <unknown>
+# CHECK-UNKNOWN: 604a1457 <unknown>
vmfeq.vv v8, v4, v20
# CHECK-INST: vmfeq.vv v8, v4, v20
# CHECK-ENCODING: [0x57,0x14,0x4a,0x62]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a 62 <unknown>
+# CHECK-UNKNOWN: 624a1457 <unknown>
vmfeq.vf v8, v4, fa0, v0.t
# CHECK-INST: vmfeq.vf v8, v4, fa0, v0.t
# CHECK-ENCODING: [0x57,0x54,0x45,0x60]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 60 <unknown>
+# CHECK-UNKNOWN: 60455457 <unknown>
vmfeq.vf v8, v4, fa0
# CHECK-INST: vmfeq.vf v8, v4, fa0
# CHECK-ENCODING: [0x57,0x54,0x45,0x62]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 62 <unknown>
+# CHECK-UNKNOWN: 62455457 <unknown>
vmfne.vv v8, v4, v20, v0.t
# CHECK-INST: vmfne.vv v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x14,0x4a,0x70]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a 70 <unknown>
+# CHECK-UNKNOWN: 704a1457 <unknown>
vmfne.vv v8, v4, v20
# CHECK-INST: vmfne.vv v8, v4, v20
# CHECK-ENCODING: [0x57,0x14,0x4a,0x72]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a 72 <unknown>
+# CHECK-UNKNOWN: 724a1457 <unknown>
vmfne.vf v8, v4, fa0, v0.t
# CHECK-INST: vmfne.vf v8, v4, fa0, v0.t
# CHECK-ENCODING: [0x57,0x54,0x45,0x70]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 70 <unknown>
+# CHECK-UNKNOWN: 70455457 <unknown>
vmfne.vf v8, v4, fa0
# CHECK-INST: vmfne.vf v8, v4, fa0
# CHECK-ENCODING: [0x57,0x54,0x45,0x72]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 72 <unknown>
+# CHECK-UNKNOWN: 72455457 <unknown>
vmflt.vv v8, v4, v20, v0.t
# CHECK-INST: vmflt.vv v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x14,0x4a,0x6c]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a 6c <unknown>
+# CHECK-UNKNOWN: 6c4a1457 <unknown>
vmflt.vv v8, v4, v20
# CHECK-INST: vmflt.vv v8, v4, v20
# CHECK-ENCODING: [0x57,0x14,0x4a,0x6e]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a 6e <unknown>
+# CHECK-UNKNOWN: 6e4a1457 <unknown>
vmflt.vf v8, v4, fa0, v0.t
# CHECK-INST: vmflt.vf v8, v4, fa0, v0.t
# CHECK-ENCODING: [0x57,0x54,0x45,0x6c]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 6c <unknown>
+# CHECK-UNKNOWN: 6c455457 <unknown>
vmflt.vf v8, v4, fa0
# CHECK-INST: vmflt.vf v8, v4, fa0
# CHECK-ENCODING: [0x57,0x54,0x45,0x6e]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 6e <unknown>
+# CHECK-UNKNOWN: 6e455457 <unknown>
vmfle.vv v8, v4, v20, v0.t
# CHECK-INST: vmfle.vv v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x14,0x4a,0x64]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a 64 <unknown>
+# CHECK-UNKNOWN: 644a1457 <unknown>
vmfle.vv v8, v4, v20
# CHECK-INST: vmfle.vv v8, v4, v20
# CHECK-ENCODING: [0x57,0x14,0x4a,0x66]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a 66 <unknown>
+# CHECK-UNKNOWN: 664a1457 <unknown>
vmfle.vf v8, v4, fa0, v0.t
# CHECK-INST: vmfle.vf v8, v4, fa0, v0.t
# CHECK-ENCODING: [0x57,0x54,0x45,0x64]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 64 <unknown>
+# CHECK-UNKNOWN: 64455457 <unknown>
vmfle.vf v8, v4, fa0
# CHECK-INST: vmfle.vf v8, v4, fa0
# CHECK-ENCODING: [0x57,0x54,0x45,0x66]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 66 <unknown>
+# CHECK-UNKNOWN: 66455457 <unknown>
vmfgt.vf v8, v4, fa0, v0.t
# CHECK-INST: vmfgt.vf v8, v4, fa0, v0.t
# CHECK-ENCODING: [0x57,0x54,0x45,0x74]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 74 <unknown>
+# CHECK-UNKNOWN: 74455457 <unknown>
vmfgt.vf v8, v4, fa0
# CHECK-INST: vmfgt.vf v8, v4, fa0
# CHECK-ENCODING: [0x57,0x54,0x45,0x76]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 76 <unknown>
+# CHECK-UNKNOWN: 76455457 <unknown>
vmfge.vf v8, v4, fa0, v0.t
# CHECK-INST: vmfge.vf v8, v4, fa0, v0.t
# CHECK-ENCODING: [0x57,0x54,0x45,0x7c]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 7c <unknown>
+# CHECK-UNKNOWN: 7c455457 <unknown>
vmfge.vf v8, v4, fa0
# CHECK-INST: vmfge.vf v8, v4, fa0
# CHECK-ENCODING: [0x57,0x54,0x45,0x7e]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 7e <unknown>
+# CHECK-UNKNOWN: 7e455457 <unknown>
vmfgt.vv v8, v20, v4, v0.t
# CHECK-INST: vmflt.vv v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x14,0x4a,0x6c]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a 6c <unknown>
+# CHECK-UNKNOWN: 6c4a1457 <unknown>
vmfgt.vv v8, v20, v4
# CHECK-INST: vmflt.vv v8, v4, v20
# CHECK-ENCODING: [0x57,0x14,0x4a,0x6e]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a 6e <unknown>
+# CHECK-UNKNOWN: 6e4a1457 <unknown>
vmfge.vv v8, v20, v4, v0.t
# CHECK-INST: vmfle.vv v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x14,0x4a,0x64]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a 64 <unknown>
+# CHECK-UNKNOWN: 644a1457 <unknown>
vmfge.vv v8, v20, v4
# CHECK-INST: vmfle.vv v8, v4, v20
# CHECK-ENCODING: [0x57,0x14,0x4a,0x66]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a 66 <unknown>
+# CHECK-UNKNOWN: 664a1457 <unknown>
vmfeq.vv v0, v4, v20, v0.t
# CHECK-INST: vmfeq.vv v0, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x10,0x4a,0x60]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 10 4a 60 <unknown>
+# CHECK-UNKNOWN: 604a1057 <unknown>
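
The fcompare hunks above also show that vmfgt.vv and vmfge.vv are pure assembler aliases: they are accepted with the operands swapped onto vmflt.vv and vmfle.vv, so their byte patterns coincide exactly with the canonical forms. A hypothetical helper mirroring that rewriting (names and signature are illustrative only):

    # vmfgt.vv vd, a, b encodes as vmflt.vv vd, b, a, and vmfge.vv
    # likewise maps onto vmfle.vv; the comparison is simply reversed.
    SWAPPED = {"vmfgt.vv": "vmflt.vv", "vmfge.vv": "vmfle.vv"}

    def canonicalize(mnemonic, vd, lhs, rhs):
        if mnemonic in SWAPPED:
            return SWAPPED[mnemonic], vd, rhs, lhs
        return mnemonic, vd, lhs, rhs

    assert canonicalize("vmfgt.vv", "v8", "v20", "v4") == \
        ("vmflt.vv", "v8", "v4", "v20")
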
diff --git a/llvm/test/MC/RISCV/rvv/fdiv.s b/llvm/test/MC/RISCV/rvv/fdiv.s
index 7eb048f3cd58..aa3aae5841a2 100644
--- a/llvm/test/MC/RISCV/rvv/fdiv.s
+++ b/llvm/test/MC/RISCV/rvv/fdiv.s
@@ -15,34 +15,34 @@ vfdiv.vv v8, v4, v20, v0.t
# CHECK-INST: vfdiv.vv v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x14,0x4a,0x80]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a 80 <unknown>
+# CHECK-UNKNOWN: 804a1457 <unknown>
vfdiv.vv v8, v4, v20
# CHECK-INST: vfdiv.vv v8, v4, v20
# CHECK-ENCODING: [0x57,0x14,0x4a,0x82]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a 82 <unknown>
+# CHECK-UNKNOWN: 824a1457 <unknown>
vfdiv.vf v8, v4, fa0, v0.t
# CHECK-INST: vfdiv.vf v8, v4, fa0, v0.t
# CHECK-ENCODING: [0x57,0x54,0x45,0x80]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 80 <unknown>
+# CHECK-UNKNOWN: 80455457 <unknown>
vfdiv.vf v8, v4, fa0
# CHECK-INST: vfdiv.vf v8, v4, fa0
# CHECK-ENCODING: [0x57,0x54,0x45,0x82]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 82 <unknown>
+# CHECK-UNKNOWN: 82455457 <unknown>
vfrdiv.vf v8, v4, fa0, v0.t
# CHECK-INST: vfrdiv.vf v8, v4, fa0, v0.t
# CHECK-ENCODING: [0x57,0x54,0x45,0x84]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 84 <unknown>
+# CHECK-UNKNOWN: 84455457 <unknown>
vfrdiv.vf v8, v4, fa0
# CHECK-INST: vfrdiv.vf v8, v4, fa0
# CHECK-ENCODING: [0x57,0x54,0x45,0x86]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 86 <unknown>
+# CHECK-UNKNOWN: 86455457 <unknown>
diff --git a/llvm/test/MC/RISCV/rvv/fmacc.s b/llvm/test/MC/RISCV/rvv/fmacc.s
index 129455901b93..8ca43da80961 100644
--- a/llvm/test/MC/RISCV/rvv/fmacc.s
+++ b/llvm/test/MC/RISCV/rvv/fmacc.s
@@ -15,286 +15,286 @@ vfmacc.vv v8, v20, v4, v0.t
# CHECK-INST: vfmacc.vv v8, v20, v4, v0.t
# CHECK-ENCODING: [0x57,0x14,0x4a,0xb0]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a b0 <unknown>
+# CHECK-UNKNOWN: b04a1457 <unknown>
vfmacc.vv v8, v20, v4
# CHECK-INST: vfmacc.vv v8, v20, v4
# CHECK-ENCODING: [0x57,0x14,0x4a,0xb2]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a b2 <unknown>
+# CHECK-UNKNOWN: b24a1457 <unknown>
vfmacc.vf v8, fa0, v4, v0.t
# CHECK-INST: vfmacc.vf v8, fa0, v4, v0.t
# CHECK-ENCODING: [0x57,0x54,0x45,0xb0]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 b0 <unknown>
+# CHECK-UNKNOWN: b0455457 <unknown>
vfmacc.vf v8, fa0, v4
# CHECK-INST: vfmacc.vf v8, fa0, v4
# CHECK-ENCODING: [0x57,0x54,0x45,0xb2]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 b2 <unknown>
+# CHECK-UNKNOWN: b2455457 <unknown>
vfnmacc.vv v8, v20, v4, v0.t
# CHECK-INST: vfnmacc.vv v8, v20, v4, v0.t
# CHECK-ENCODING: [0x57,0x14,0x4a,0xb4]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a b4 <unknown>
+# CHECK-UNKNOWN: b44a1457 <unknown>
vfnmacc.vv v8, v20, v4
# CHECK-INST: vfnmacc.vv v8, v20, v4
# CHECK-ENCODING: [0x57,0x14,0x4a,0xb6]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a b6 <unknown>
+# CHECK-UNKNOWN: b64a1457 <unknown>
vfnmacc.vf v8, fa0, v4, v0.t
# CHECK-INST: vfnmacc.vf v8, fa0, v4, v0.t
# CHECK-ENCODING: [0x57,0x54,0x45,0xb4]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 b4 <unknown>
+# CHECK-UNKNOWN: b4455457 <unknown>
vfnmacc.vf v8, fa0, v4
# CHECK-INST: vfnmacc.vf v8, fa0, v4
# CHECK-ENCODING: [0x57,0x54,0x45,0xb6]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 b6 <unknown>
+# CHECK-UNKNOWN: b6455457 <unknown>
vfmsac.vv v8, v20, v4, v0.t
# CHECK-INST: vfmsac.vv v8, v20, v4, v0.t
# CHECK-ENCODING: [0x57,0x14,0x4a,0xb8]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a b8 <unknown>
+# CHECK-UNKNOWN: b84a1457 <unknown>
vfmsac.vv v8, v20, v4
# CHECK-INST: vfmsac.vv v8, v20, v4
# CHECK-ENCODING: [0x57,0x14,0x4a,0xba]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a ba <unknown>
+# CHECK-UNKNOWN: ba4a1457 <unknown>
vfmsac.vf v8, fa0, v4, v0.t
# CHECK-INST: vfmsac.vf v8, fa0, v4, v0.t
# CHECK-ENCODING: [0x57,0x54,0x45,0xb8]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 b8 <unknown>
+# CHECK-UNKNOWN: b8455457 <unknown>
vfmsac.vf v8, fa0, v4
# CHECK-INST: vfmsac.vf v8, fa0, v4
# CHECK-ENCODING: [0x57,0x54,0x45,0xba]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 ba <unknown>
+# CHECK-UNKNOWN: ba455457 <unknown>
vfnmsac.vv v8, v20, v4, v0.t
# CHECK-INST: vfnmsac.vv v8, v20, v4, v0.t
# CHECK-ENCODING: [0x57,0x14,0x4a,0xbc]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a bc <unknown>
+# CHECK-UNKNOWN: bc4a1457 <unknown>
vfnmsac.vv v8, v20, v4
# CHECK-INST: vfnmsac.vv v8, v20, v4
# CHECK-ENCODING: [0x57,0x14,0x4a,0xbe]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a be <unknown>
+# CHECK-UNKNOWN: be4a1457 <unknown>
vfnmsac.vf v8, fa0, v4, v0.t
# CHECK-INST: vfnmsac.vf v8, fa0, v4, v0.t
# CHECK-ENCODING: [0x57,0x54,0x45,0xbc]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 bc <unknown>
+# CHECK-UNKNOWN: bc455457 <unknown>
vfnmsac.vf v8, fa0, v4
# CHECK-INST: vfnmsac.vf v8, fa0, v4
# CHECK-ENCODING: [0x57,0x54,0x45,0xbe]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 be <unknown>
+# CHECK-UNKNOWN: be455457 <unknown>
vfmadd.vv v8, v20, v4, v0.t
# CHECK-INST: vfmadd.vv v8, v20, v4, v0.t
# CHECK-ENCODING: [0x57,0x14,0x4a,0xa0]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a a0 <unknown>
+# CHECK-UNKNOWN: a04a1457 <unknown>
vfmadd.vv v8, v20, v4
# CHECK-INST: vfmadd.vv v8, v20, v4
# CHECK-ENCODING: [0x57,0x14,0x4a,0xa2]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a a2 <unknown>
+# CHECK-UNKNOWN: a24a1457 <unknown>
vfmadd.vf v8, fa0, v4, v0.t
# CHECK-INST: vfmadd.vf v8, fa0, v4, v0.t
# CHECK-ENCODING: [0x57,0x54,0x45,0xa0]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 a0 <unknown>
+# CHECK-UNKNOWN: a0455457 <unknown>
vfmadd.vf v8, fa0, v4
# CHECK-INST: vfmadd.vf v8, fa0, v4
# CHECK-ENCODING: [0x57,0x54,0x45,0xa2]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 a2 <unknown>
+# CHECK-UNKNOWN: a2455457 <unknown>
vfnmadd.vv v8, v20, v4, v0.t
# CHECK-INST: vfnmadd.vv v8, v20, v4, v0.t
# CHECK-ENCODING: [0x57,0x14,0x4a,0xa4]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a a4 <unknown>
+# CHECK-UNKNOWN: a44a1457 <unknown>
vfnmadd.vv v8, v20, v4
# CHECK-INST: vfnmadd.vv v8, v20, v4
# CHECK-ENCODING: [0x57,0x14,0x4a,0xa6]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a a6 <unknown>
+# CHECK-UNKNOWN: a64a1457 <unknown>
vfnmadd.vf v8, fa0, v4, v0.t
# CHECK-INST: vfnmadd.vf v8, fa0, v4, v0.t
# CHECK-ENCODING: [0x57,0x54,0x45,0xa4]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 a4 <unknown>
+# CHECK-UNKNOWN: a4455457 <unknown>
vfnmadd.vf v8, fa0, v4
# CHECK-INST: vfnmadd.vf v8, fa0, v4
# CHECK-ENCODING: [0x57,0x54,0x45,0xa6]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 a6 <unknown>
+# CHECK-UNKNOWN: a6455457 <unknown>
vfmsub.vv v8, v20, v4, v0.t
# CHECK-INST: vfmsub.vv v8, v20, v4, v0.t
# CHECK-ENCODING: [0x57,0x14,0x4a,0xa8]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a a8 <unknown>
+# CHECK-UNKNOWN: a84a1457 <unknown>
vfmsub.vv v8, v20, v4
# CHECK-INST: vfmsub.vv v8, v20, v4
# CHECK-ENCODING: [0x57,0x14,0x4a,0xaa]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a aa <unknown>
+# CHECK-UNKNOWN: aa4a1457 <unknown>
vfmsub.vf v8, fa0, v4, v0.t
# CHECK-INST: vfmsub.vf v8, fa0, v4, v0.t
# CHECK-ENCODING: [0x57,0x54,0x45,0xa8]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 a8 <unknown>
+# CHECK-UNKNOWN: a8455457 <unknown>
vfmsub.vf v8, fa0, v4
# CHECK-INST: vfmsub.vf v8, fa0, v4
# CHECK-ENCODING: [0x57,0x54,0x45,0xaa]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 aa <unknown>
+# CHECK-UNKNOWN: aa455457 <unknown>
vfnmsub.vv v8, v20, v4, v0.t
# CHECK-INST: vfnmsub.vv v8, v20, v4, v0.t
# CHECK-ENCODING: [0x57,0x14,0x4a,0xac]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a ac <unknown>
+# CHECK-UNKNOWN: ac4a1457 <unknown>
vfnmsub.vv v8, v20, v4
# CHECK-INST: vfnmsub.vv v8, v20, v4
# CHECK-ENCODING: [0x57,0x14,0x4a,0xae]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a ae <unknown>
+# CHECK-UNKNOWN: ae4a1457 <unknown>
vfnmsub.vf v8, fa0, v4, v0.t
# CHECK-INST: vfnmsub.vf v8, fa0, v4, v0.t
# CHECK-ENCODING: [0x57,0x54,0x45,0xac]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 ac <unknown>
+# CHECK-UNKNOWN: ac455457 <unknown>
vfnmsub.vf v8, fa0, v4
# CHECK-INST: vfnmsub.vf v8, fa0, v4
# CHECK-ENCODING: [0x57,0x54,0x45,0xae]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 ae <unknown>
+# CHECK-UNKNOWN: ae455457 <unknown>
vfwmacc.vv v8, v20, v4, v0.t
# CHECK-INST: vfwmacc.vv v8, v20, v4, v0.t
# CHECK-ENCODING: [0x57,0x14,0x4a,0xf0]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a f0 <unknown>
+# CHECK-UNKNOWN: f04a1457 <unknown>
vfwmacc.vv v8, v20, v4
# CHECK-INST: vfwmacc.vv v8, v20, v4
# CHECK-ENCODING: [0x57,0x14,0x4a,0xf2]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a f2 <unknown>
+# CHECK-UNKNOWN: f24a1457 <unknown>
vfwmacc.vf v8, fa0, v4, v0.t
# CHECK-INST: vfwmacc.vf v8, fa0, v4, v0.t
# CHECK-ENCODING: [0x57,0x54,0x45,0xf0]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 f0 <unknown>
+# CHECK-UNKNOWN: f0455457 <unknown>
vfwmacc.vf v8, fa0, v4
# CHECK-INST: vfwmacc.vf v8, fa0, v4
# CHECK-ENCODING: [0x57,0x54,0x45,0xf2]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 f2 <unknown>
+# CHECK-UNKNOWN: f2455457 <unknown>
vfwnmacc.vv v8, v20, v4, v0.t
# CHECK-INST: vfwnmacc.vv v8, v20, v4, v0.t
# CHECK-ENCODING: [0x57,0x14,0x4a,0xf4]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a f4 <unknown>
+# CHECK-UNKNOWN: f44a1457 <unknown>
vfwnmacc.vv v8, v20, v4
# CHECK-INST: vfwnmacc.vv v8, v20, v4
# CHECK-ENCODING: [0x57,0x14,0x4a,0xf6]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a f6 <unknown>
+# CHECK-UNKNOWN: f64a1457 <unknown>
vfwnmacc.vf v8, fa0, v4, v0.t
# CHECK-INST: vfwnmacc.vf v8, fa0, v4, v0.t
# CHECK-ENCODING: [0x57,0x54,0x45,0xf4]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 f4 <unknown>
+# CHECK-UNKNOWN: f4455457 <unknown>
vfwnmacc.vf v8, fa0, v4
# CHECK-INST: vfwnmacc.vf v8, fa0, v4
# CHECK-ENCODING: [0x57,0x54,0x45,0xf6]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 f6 <unknown>
+# CHECK-UNKNOWN: f6455457 <unknown>
vfwmsac.vv v8, v20, v4, v0.t
# CHECK-INST: vfwmsac.vv v8, v20, v4, v0.t
# CHECK-ENCODING: [0x57,0x14,0x4a,0xf8]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a f8 <unknown>
+# CHECK-UNKNOWN: f84a1457 <unknown>
vfwmsac.vv v8, v20, v4
# CHECK-INST: vfwmsac.vv v8, v20, v4
# CHECK-ENCODING: [0x57,0x14,0x4a,0xfa]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a fa <unknown>
+# CHECK-UNKNOWN: fa4a1457 <unknown>
vfwmsac.vf v8, fa0, v4, v0.t
# CHECK-INST: vfwmsac.vf v8, fa0, v4, v0.t
# CHECK-ENCODING: [0x57,0x54,0x45,0xf8]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 f8 <unknown>
+# CHECK-UNKNOWN: f8455457 <unknown>
vfwmsac.vf v8, fa0, v4
# CHECK-INST: vfwmsac.vf v8, fa0, v4
# CHECK-ENCODING: [0x57,0x54,0x45,0xfa]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 fa <unknown>
+# CHECK-UNKNOWN: fa455457 <unknown>
vfwnmsac.vv v8, v20, v4, v0.t
# CHECK-INST: vfwnmsac.vv v8, v20, v4, v0.t
# CHECK-ENCODING: [0x57,0x14,0x4a,0xfc]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a fc <unknown>
+# CHECK-UNKNOWN: fc4a1457 <unknown>
vfwnmsac.vv v8, v20, v4
# CHECK-INST: vfwnmsac.vv v8, v20, v4
# CHECK-ENCODING: [0x57,0x14,0x4a,0xfe]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a fe <unknown>
+# CHECK-UNKNOWN: fe4a1457 <unknown>
vfwnmsac.vf v8, fa0, v4, v0.t
# CHECK-INST: vfwnmsac.vf v8, fa0, v4, v0.t
# CHECK-ENCODING: [0x57,0x54,0x45,0xfc]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 fc <unknown>
+# CHECK-UNKNOWN: fc455457 <unknown>
vfwnmsac.vf v8, fa0, v4
# CHECK-INST: vfwnmsac.vf v8, fa0, v4
# CHECK-ENCODING: [0x57,0x54,0x45,0xfe]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 fe <unknown>
+# CHECK-UNKNOWN: fe455457 <unknown>
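
Every CHECK-UNKNOWN update in these test diffs follows the same mechanical rule: the four bytes on the matching CHECK-ENCODING line, which are listed in memory (little-endian) order, are regrouped into the single 32-bit instruction word that the disassembler now prints. A minimal Python sketch of that regrouping, shown here only to illustrate the pattern (the helper name is ours, not part of LLVM):

# Regroup a little-endian byte list, e.g. [0x57, 0x14, 0x4a, 0xf2],
# into the 32-bit word printed by the new-style output: "f24a1457".
def encoding_bytes_to_word(byte_list):
    word = 0
    for i, b in enumerate(byte_list):
        word |= b << (8 * i)  # byte 0 is the least-significant byte
    return f"{word:08x}"

# Matches the vfwmacc.vv update above.
assert encoding_bytes_to_word([0x57, 0x14, 0x4a, 0xf2]) == "f24a1457"

The same regrouping accounts for every +/- CHECK-UNKNOWN pair in the remaining files below.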
diff --git a/llvm/test/MC/RISCV/rvv/fminmax.s b/llvm/test/MC/RISCV/rvv/fminmax.s
index c8aab38e1fc8..f7e85ed31c33 100644
--- a/llvm/test/MC/RISCV/rvv/fminmax.s
+++ b/llvm/test/MC/RISCV/rvv/fminmax.s
@@ -15,46 +15,46 @@ vfmin.vv v8, v4, v20, v0.t
# CHECK-INST: vfmin.vv v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x14,0x4a,0x10]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a 10 <unknown>
+# CHECK-UNKNOWN: 104a1457 <unknown>
vfmin.vv v8, v4, v20
# CHECK-INST: vfmin.vv v8, v4, v20
# CHECK-ENCODING: [0x57,0x14,0x4a,0x12]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a 12 <unknown>
+# CHECK-UNKNOWN: 124a1457 <unknown>
vfmin.vf v8, v4, fa0, v0.t
# CHECK-INST: vfmin.vf v8, v4, fa0, v0.t
# CHECK-ENCODING: [0x57,0x54,0x45,0x10]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 10 <unknown>
+# CHECK-UNKNOWN: 10455457 <unknown>
vfmin.vf v8, v4, fa0
# CHECK-INST: vfmin.vf v8, v4, fa0
# CHECK-ENCODING: [0x57,0x54,0x45,0x12]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 12 <unknown>
+# CHECK-UNKNOWN: 12455457 <unknown>
vfmax.vv v8, v4, v20, v0.t
# CHECK-INST: vfmax.vv v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x14,0x4a,0x18]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a 18 <unknown>
+# CHECK-UNKNOWN: 184a1457 <unknown>
vfmax.vv v8, v4, v20
# CHECK-INST: vfmax.vv v8, v4, v20
# CHECK-ENCODING: [0x57,0x14,0x4a,0x1a]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a 1a <unknown>
+# CHECK-UNKNOWN: 1a4a1457 <unknown>
vfmax.vf v8, v4, fa0, v0.t
# CHECK-INST: vfmax.vf v8, v4, fa0, v0.t
# CHECK-ENCODING: [0x57,0x54,0x45,0x18]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 18 <unknown>
+# CHECK-UNKNOWN: 18455457 <unknown>
vfmax.vf v8, v4, fa0
# CHECK-INST: vfmax.vf v8, v4, fa0
# CHECK-ENCODING: [0x57,0x54,0x45,0x1a]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 1a <unknown>
+# CHECK-UNKNOWN: 1a455457 <unknown>
diff --git a/llvm/test/MC/RISCV/rvv/fmul.s b/llvm/test/MC/RISCV/rvv/fmul.s
index 42c37932043c..9cd6e5287750 100644
--- a/llvm/test/MC/RISCV/rvv/fmul.s
+++ b/llvm/test/MC/RISCV/rvv/fmul.s
@@ -15,46 +15,46 @@ vfmul.vv v8, v4, v20, v0.t
# CHECK-INST: vfmul.vv v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x14,0x4a,0x90]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a 90 <unknown>
+# CHECK-UNKNOWN: 904a1457 <unknown>
vfmul.vv v8, v4, v20
# CHECK-INST: vfmul.vv v8, v4, v20
# CHECK-ENCODING: [0x57,0x14,0x4a,0x92]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a 92 <unknown>
+# CHECK-UNKNOWN: 924a1457 <unknown>
vfmul.vf v8, v4, fa0, v0.t
# CHECK-INST: vfmul.vf v8, v4, fa0, v0.t
# CHECK-ENCODING: [0x57,0x54,0x45,0x90]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 90 <unknown>
+# CHECK-UNKNOWN: 90455457 <unknown>
vfmul.vf v8, v4, fa0
# CHECK-INST: vfmul.vf v8, v4, fa0
# CHECK-ENCODING: [0x57,0x54,0x45,0x92]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 92 <unknown>
+# CHECK-UNKNOWN: 92455457 <unknown>
vfwmul.vv v8, v4, v20, v0.t
# CHECK-INST: vfwmul.vv v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x14,0x4a,0xe0]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a e0 <unknown>
+# CHECK-UNKNOWN: e04a1457 <unknown>
vfwmul.vv v8, v4, v20
# CHECK-INST: vfwmul.vv v8, v4, v20
# CHECK-ENCODING: [0x57,0x14,0x4a,0xe2]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a e2 <unknown>
+# CHECK-UNKNOWN: e24a1457 <unknown>
vfwmul.vf v8, v4, fa0, v0.t
# CHECK-INST: vfwmul.vf v8, v4, fa0, v0.t
# CHECK-ENCODING: [0x57,0x54,0x45,0xe0]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 e0 <unknown>
+# CHECK-UNKNOWN: e0455457 <unknown>
vfwmul.vf v8, v4, fa0
# CHECK-INST: vfwmul.vf v8, v4, fa0
# CHECK-ENCODING: [0x57,0x54,0x45,0xe2]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 e2 <unknown>
+# CHECK-UNKNOWN: e2455457 <unknown>
diff --git a/llvm/test/MC/RISCV/rvv/fmv.s b/llvm/test/MC/RISCV/rvv/fmv.s
index a5b814f130fb..2534b5171b3e 100644
--- a/llvm/test/MC/RISCV/rvv/fmv.s
+++ b/llvm/test/MC/RISCV/rvv/fmv.s
@@ -15,16 +15,16 @@ vfmv.v.f v8, fa0
# CHECK-INST: vfmv.v.f v8, fa0
# CHECK-ENCODING: [0x57,0x54,0x05,0x5e]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 05 5e <unknown>
+# CHECK-UNKNOWN: 5e055457 <unknown>
vfmv.f.s fa0, v4
# CHECK-INST: vfmv.f.s fa0, v4
# CHECK-ENCODING: [0x57,0x15,0x40,0x42]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 15 40 42 <unknown>
+# CHECK-UNKNOWN: 42401557 <unknown>
vfmv.s.f v8, fa0
# CHECK-INST: vfmv.s.f v8, fa0
# CHECK-ENCODING: [0x57,0x54,0x05,0x42]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 05 42 <unknown>
+# CHECK-UNKNOWN: 42055457 <unknown>
diff --git a/llvm/test/MC/RISCV/rvv/fothers.s b/llvm/test/MC/RISCV/rvv/fothers.s
index 451f6ca39ccc..997115f96bd9 100644
--- a/llvm/test/MC/RISCV/rvv/fothers.s
+++ b/llvm/test/MC/RISCV/rvv/fothers.s
@@ -13,76 +13,76 @@ vfsqrt.v v8, v4, v0.t
# CHECK-INST: vfsqrt.v v8, v4, v0.t
# CHECK-ENCODING: [0x57,0x14,0x40,0x4c]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 40 4c <unknown>
+# CHECK-UNKNOWN: 4c401457 <unknown>
vfsqrt.v v8, v4
# CHECK-INST: vfsqrt.v v8, v4
# CHECK-ENCODING: [0x57,0x14,0x40,0x4e]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 40 4e <unknown>
+# CHECK-UNKNOWN: 4e401457 <unknown>
vfrsqrt7.v v8, v4, v0.t
# CHECK-INST: vfrsqrt7.v v8, v4, v0.t
# CHECK-ENCODING: [0x57,0x14,0x42,0x4c]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 42 4c <unknown>
+# CHECK-UNKNOWN: 4c421457 <unknown>
vfrsqrt7.v v8, v4
# CHECK-INST: vfrsqrt7.v v8, v4
# CHECK-ENCODING: [0x57,0x14,0x42,0x4e]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 42 4e <unknown>
+# CHECK-UNKNOWN: 4e421457 <unknown>
vfrec7.v v8, v4, v0.t
# CHECK-INST: vfrec7.v v8, v4, v0.t
# CHECK-ENCODING: [0x57,0x94,0x42,0x4c]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 94 42 4c <unknown>
+# CHECK-UNKNOWN: 4c429457 <unknown>
vfrec7.v v8, v4
# CHECK-INST: vfrec7.v v8, v4
# CHECK-ENCODING: [0x57,0x94,0x42,0x4e]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 94 42 4e <unknown>
+# CHECK-UNKNOWN: 4e429457 <unknown>
vfclass.v v8, v4, v0.t
# CHECK-INST: vfclass.v v8, v4, v0.t
# CHECK-ENCODING: [0x57,0x14,0x48,0x4c]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 48 4c <unknown>
+# CHECK-UNKNOWN: 4c481457 <unknown>
vfclass.v v8, v4
# CHECK-INST: vfclass.v v8, v4
# CHECK-ENCODING: [0x57,0x14,0x48,0x4e]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 48 4e <unknown>
+# CHECK-UNKNOWN: 4e481457 <unknown>
vfmerge.vfm v8, v4, fa0, v0
# CHECK-INST: vfmerge.vfm v8, v4, fa0, v0
# CHECK-ENCODING: [0x57,0x54,0x45,0x5c]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 5c <unknown>
+# CHECK-UNKNOWN: 5c455457 <unknown>
vfslide1up.vf v8, v4, fa0, v0.t
# CHECK-INST: vfslide1up.vf v8, v4, fa0, v0.t
# CHECK-ENCODING: [0x57,0x54,0x45,0x38]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 38 <unknown>
+# CHECK-UNKNOWN: 38455457 <unknown>
vfslide1up.vf v8, v4, fa0
# CHECK-INST: vfslide1up.vf v8, v4, fa0
# CHECK-ENCODING: [0x57,0x54,0x45,0x3a]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 3a <unknown>
+# CHECK-UNKNOWN: 3a455457 <unknown>
vfslide1down.vf v8, v4, fa0, v0.t
# CHECK-INST: vfslide1down.vf v8, v4, fa0, v0.t
# CHECK-ENCODING: [0x57,0x54,0x45,0x3c]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 3c <unknown>
+# CHECK-UNKNOWN: 3c455457 <unknown>
vfslide1down.vf v8, v4, fa0
# CHECK-INST: vfslide1down.vf v8, v4, fa0
# CHECK-ENCODING: [0x57,0x54,0x45,0x3e]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 3e <unknown>
+# CHECK-UNKNOWN: 3e455457 <unknown>
diff --git a/llvm/test/MC/RISCV/rvv/freduction.s b/llvm/test/MC/RISCV/rvv/freduction.s
index fca64372b15f..12326942e6e8 100644
--- a/llvm/test/MC/RISCV/rvv/freduction.s
+++ b/llvm/test/MC/RISCV/rvv/freduction.s
@@ -15,76 +15,76 @@ vfredosum.vs v8, v4, v20, v0.t
# CHECK-INST: vfredosum.vs v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x14,0x4a,0x0c]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a 0c <unknown>
+# CHECK-UNKNOWN: 0c4a1457 <unknown>
vfredosum.vs v8, v4, v20
# CHECK-INST: vfredosum.vs v8, v4, v20
# CHECK-ENCODING: [0x57,0x14,0x4a,0x0e]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a 0e <unknown>
+# CHECK-UNKNOWN: 0e4a1457 <unknown>
vfredusum.vs v8, v4, v20, v0.t
# CHECK-INST: vfredusum.vs v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x14,0x4a,0x04]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a 04 <unknown>
+# CHECK-UNKNOWN: 044a1457 <unknown>
vfredusum.vs v8, v4, v20
# CHECK-INST: vfredusum.vs v8, v4, v20
# CHECK-ENCODING: [0x57,0x14,0x4a,0x06]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a 06 <unknown>
+# CHECK-UNKNOWN: 064a1457 <unknown>
vfredmax.vs v8, v4, v20, v0.t
# CHECK-INST: vfredmax.vs v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x14,0x4a,0x1c]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a 1c <unknown>
+# CHECK-UNKNOWN: 1c4a1457 <unknown>
vfredmax.vs v8, v4, v20
# CHECK-INST: vfredmax.vs v8, v4, v20
# CHECK-ENCODING: [0x57,0x14,0x4a,0x1e]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a 1e <unknown>
+# CHECK-UNKNOWN: 1e4a1457 <unknown>
vfredmin.vs v8, v4, v20, v0.t
# CHECK-INST: vfredmin.vs v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x14,0x4a,0x14]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a 14 <unknown>
+# CHECK-UNKNOWN: 144a1457 <unknown>
vfredmin.vs v8, v4, v20
# CHECK-INST: vfredmin.vs v8, v4, v20
# CHECK-ENCODING: [0x57,0x14,0x4a,0x16]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a 16 <unknown>
+# CHECK-UNKNOWN: 164a1457 <unknown>
vfwredosum.vs v8, v4, v20, v0.t
# CHECK-INST: vfwredosum.vs v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x14,0x4a,0xcc]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a cc <unknown>
+# CHECK-UNKNOWN: cc4a1457 <unknown>
vfwredosum.vs v8, v4, v20
# CHECK-INST: vfwredosum.vs v8, v4, v20
# CHECK-ENCODING: [0x57,0x14,0x4a,0xce]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a ce <unknown>
+# CHECK-UNKNOWN: ce4a1457 <unknown>
vfwredusum.vs v8, v4, v20, v0.t
# CHECK-INST: vfwredusum.vs v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x14,0x4a,0xc4]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a c4 <unknown>
+# CHECK-UNKNOWN: c44a1457 <unknown>
vfwredusum.vs v8, v4, v20
# CHECK-INST: vfwredusum.vs v8, v4, v20
# CHECK-ENCODING: [0x57,0x14,0x4a,0xc6]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a c6 <unknown>
+# CHECK-UNKNOWN: c64a1457 <unknown>
vfredosum.vs v0, v4, v20, v0.t
# CHECK-INST: vfredosum.vs v0, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x10,0x4a,0x0c]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 10 4a 0c <unknown>
+# CHECK-UNKNOWN: 0c4a1057 <unknown>
diff --git a/llvm/test/MC/RISCV/rvv/fsub.s b/llvm/test/MC/RISCV/rvv/fsub.s
index a8f2bc6260b9..62ff2e744c78 100644
--- a/llvm/test/MC/RISCV/rvv/fsub.s
+++ b/llvm/test/MC/RISCV/rvv/fsub.s
@@ -15,82 +15,82 @@ vfsub.vv v8, v4, v20, v0.t
# CHECK-INST: vfsub.vv v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x14,0x4a,0x08]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a 08 <unknown>
+# CHECK-UNKNOWN: 084a1457 <unknown>
vfsub.vv v8, v4, v20
# CHECK-INST: vfsub.vv v8, v4, v20
# CHECK-ENCODING: [0x57,0x14,0x4a,0x0a]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a 0a <unknown>
+# CHECK-UNKNOWN: 0a4a1457 <unknown>
vfsub.vf v8, v4, fa0, v0.t
# CHECK-INST: vfsub.vf v8, v4, fa0, v0.t
# CHECK-ENCODING: [0x57,0x54,0x45,0x08]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 08 <unknown>
+# CHECK-UNKNOWN: 08455457 <unknown>
vfsub.vf v8, v4, fa0
# CHECK-INST: vfsub.vf v8, v4, fa0
# CHECK-ENCODING: [0x57,0x54,0x45,0x0a]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 0a <unknown>
+# CHECK-UNKNOWN: 0a455457 <unknown>
vfrsub.vf v8, v4, fa0, v0.t
# CHECK-INST: vfrsub.vf v8, v4, fa0, v0.t
# CHECK-ENCODING: [0x57,0x54,0x45,0x9c]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 9c <unknown>
+# CHECK-UNKNOWN: 9c455457 <unknown>
vfrsub.vf v8, v4, fa0
# CHECK-INST: vfrsub.vf v8, v4, fa0
# CHECK-ENCODING: [0x57,0x54,0x45,0x9e]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 9e <unknown>
+# CHECK-UNKNOWN: 9e455457 <unknown>
vfwsub.vv v8, v4, v20, v0.t
# CHECK-INST: vfwsub.vv v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x14,0x4a,0xc8]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a c8 <unknown>
+# CHECK-UNKNOWN: c84a1457 <unknown>
vfwsub.vv v8, v4, v20
# CHECK-INST: vfwsub.vv v8, v4, v20
# CHECK-ENCODING: [0x57,0x14,0x4a,0xca]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a ca <unknown>
+# CHECK-UNKNOWN: ca4a1457 <unknown>
vfwsub.vf v8, v4, fa0, v0.t
# CHECK-INST: vfwsub.vf v8, v4, fa0, v0.t
# CHECK-ENCODING: [0x57,0x54,0x45,0xc8]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 c8 <unknown>
+# CHECK-UNKNOWN: c8455457 <unknown>
vfwsub.vf v8, v4, fa0
# CHECK-INST: vfwsub.vf v8, v4, fa0
# CHECK-ENCODING: [0x57,0x54,0x45,0xca]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 ca <unknown>
+# CHECK-UNKNOWN: ca455457 <unknown>
vfwsub.wv v8, v4, v20, v0.t
# CHECK-INST: vfwsub.wv v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x14,0x4a,0xd8]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a d8 <unknown>
+# CHECK-UNKNOWN: d84a1457 <unknown>
vfwsub.wv v8, v4, v20
# CHECK-INST: vfwsub.wv v8, v4, v20
# CHECK-ENCODING: [0x57,0x14,0x4a,0xda]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a da <unknown>
+# CHECK-UNKNOWN: da4a1457 <unknown>
vfwsub.wf v8, v4, fa0, v0.t
# CHECK-INST: vfwsub.wf v8, v4, fa0, v0.t
# CHECK-ENCODING: [0x57,0x54,0x45,0xd8]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 d8 <unknown>
+# CHECK-UNKNOWN: d8455457 <unknown>
vfwsub.wf v8, v4, fa0
# CHECK-INST: vfwsub.wf v8, v4, fa0
# CHECK-ENCODING: [0x57,0x54,0x45,0xda]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 da <unknown>
+# CHECK-UNKNOWN: da455457 <unknown>
diff --git a/llvm/test/MC/RISCV/rvv/load.s b/llvm/test/MC/RISCV/rvv/load.s
index 23357df88d3f..3c251a3a8d75 100644
--- a/llvm/test/MC/RISCV/rvv/load.s
+++ b/llvm/test/MC/RISCV/rvv/load.s
@@ -12,382 +12,382 @@ vlm.v v0, (a0)
# CHECK-INST: vlm.v v0, (a0)
# CHECK-ENCODING: [0x07,0x00,0xb5,0x02]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 00 b5 02 <unknown>
+# CHECK-UNKNOWN: 02b50007 <unknown>
vlm.v v8, (a0)
# CHECK-INST: vlm.v v8, (a0)
# CHECK-ENCODING: [0x07,0x04,0xb5,0x02]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 b5 02 <unknown>
+# CHECK-UNKNOWN: 02b50407 <unknown>
vle8.v v8, (a0), v0.t
# CHECK-INST: vle8.v v8, (a0), v0.t
# CHECK-ENCODING: [0x07,0x04,0x05,0x00]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 05 00 <unknown>
+# CHECK-UNKNOWN: 00050407 <unknown>
vle8.v v8, (a0)
# CHECK-INST: vle8.v v8, (a0)
# CHECK-ENCODING: [0x07,0x04,0x05,0x02]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 05 02 <unknown>
+# CHECK-UNKNOWN: 02050407 <unknown>
vle16.v v8, (a0), v0.t
# CHECK-INST: vle16.v v8, (a0), v0.t
# CHECK-ENCODING: [0x07,0x54,0x05,0x00]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 05 00 <unknown>
+# CHECK-UNKNOWN: 00055407 <unknown>
vle16.v v8, (a0)
# CHECK-INST: vle16.v v8, (a0)
# CHECK-ENCODING: [0x07,0x54,0x05,0x02]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 05 02 <unknown>
+# CHECK-UNKNOWN: 02055407 <unknown>
vle32.v v8, (a0), v0.t
# CHECK-INST: vle32.v v8, (a0), v0.t
# CHECK-ENCODING: [0x07,0x64,0x05,0x00]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 05 00 <unknown>
+# CHECK-UNKNOWN: 00056407 <unknown>
vle32.v v8, (a0)
# CHECK-INST: vle32.v v8, (a0)
# CHECK-ENCODING: [0x07,0x64,0x05,0x02]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 05 02 <unknown>
+# CHECK-UNKNOWN: 02056407 <unknown>
vle64.v v8, (a0), v0.t
# CHECK-INST: vle64.v v8, (a0), v0.t
# CHECK-ENCODING: [0x07,0x74,0x05,0x00]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 05 00 <unknown>
+# CHECK-UNKNOWN: 00057407 <unknown>
vle64.v v8, (a0)
# CHECK-INST: vle64.v v8, (a0)
# CHECK-ENCODING: [0x07,0x74,0x05,0x02]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 05 02 <unknown>
+# CHECK-UNKNOWN: 02057407 <unknown>
vle8ff.v v8, (a0), v0.t
# CHECK-INST: vle8ff.v v8, (a0), v0.t
# CHECK-ENCODING: [0x07,0x04,0x05,0x01]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 05 01 <unknown>
+# CHECK-UNKNOWN: 01050407 <unknown>
vle8ff.v v8, (a0)
# CHECK-INST: vle8ff.v v8, (a0)
# CHECK-ENCODING: [0x07,0x04,0x05,0x03]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 05 03 <unknown>
+# CHECK-UNKNOWN: 03050407 <unknown>
vle16ff.v v8, (a0), v0.t
# CHECK-INST: vle16ff.v v8, (a0), v0.t
# CHECK-ENCODING: [0x07,0x54,0x05,0x01]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 05 01 <unknown>
+# CHECK-UNKNOWN: 01055407 <unknown>
vle16ff.v v8, (a0)
# CHECK-INST: vle16ff.v v8, (a0)
# CHECK-ENCODING: [0x07,0x54,0x05,0x03]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 05 03 <unknown>
+# CHECK-UNKNOWN: 03055407 <unknown>
vle32ff.v v8, (a0), v0.t
# CHECK-INST: vle32ff.v v8, (a0), v0.t
# CHECK-ENCODING: [0x07,0x64,0x05,0x01]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 05 01 <unknown>
+# CHECK-UNKNOWN: 01056407 <unknown>
vle32ff.v v8, (a0)
# CHECK-INST: vle32ff.v v8, (a0)
# CHECK-ENCODING: [0x07,0x64,0x05,0x03]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 05 03 <unknown>
+# CHECK-UNKNOWN: 03056407 <unknown>
vle64ff.v v8, (a0), v0.t
# CHECK-INST: vle64ff.v v8, (a0), v0.t
# CHECK-ENCODING: [0x07,0x74,0x05,0x01]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 05 01 <unknown>
+# CHECK-UNKNOWN: 01057407 <unknown>
vle64ff.v v8, (a0)
# CHECK-INST: vle64ff.v v8, (a0)
# CHECK-ENCODING: [0x07,0x74,0x05,0x03]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 05 03 <unknown>
+# CHECK-UNKNOWN: 03057407 <unknown>
vlse8.v v8, (a0), a1, v0.t
# CHECK-INST: vlse8.v v8, (a0), a1, v0.t
# CHECK-ENCODING: [0x07,0x04,0xb5,0x08]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 b5 08 <unknown>
+# CHECK-UNKNOWN: 08b50407 <unknown>
vlse8.v v8, (a0), a1
# CHECK-INST: vlse8.v v8, (a0), a1
# CHECK-ENCODING: [0x07,0x04,0xb5,0x0a]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 b5 0a <unknown>
+# CHECK-UNKNOWN: 0ab50407 <unknown>
vlse16.v v8, (a0), a1, v0.t
# CHECK-INST: vlse16.v v8, (a0), a1, v0.t
# CHECK-ENCODING: [0x07,0x54,0xb5,0x08]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 b5 08 <unknown>
+# CHECK-UNKNOWN: 08b55407 <unknown>
vlse16.v v8, (a0), a1
# CHECK-INST: vlse16.v v8, (a0), a1
# CHECK-ENCODING: [0x07,0x54,0xb5,0x0a]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 b5 0a <unknown>
+# CHECK-UNKNOWN: 0ab55407 <unknown>
vlse32.v v8, (a0), a1, v0.t
# CHECK-INST: vlse32.v v8, (a0), a1, v0.t
# CHECK-ENCODING: [0x07,0x64,0xb5,0x08]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 b5 08 <unknown>
+# CHECK-UNKNOWN: 08b56407 <unknown>
vlse32.v v8, (a0), a1
# CHECK-INST: vlse32.v v8, (a0), a1
# CHECK-ENCODING: [0x07,0x64,0xb5,0x0a]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 b5 0a <unknown>
+# CHECK-UNKNOWN: 0ab56407 <unknown>
vlse64.v v8, (a0), a1, v0.t
# CHECK-INST: vlse64.v v8, (a0), a1, v0.t
# CHECK-ENCODING: [0x07,0x74,0xb5,0x08]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 b5 08 <unknown>
+# CHECK-UNKNOWN: 08b57407 <unknown>
vlse64.v v8, (a0), a1
# CHECK-INST: vlse64.v v8, (a0), a1
# CHECK-ENCODING: [0x07,0x74,0xb5,0x0a]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 b5 0a <unknown>
+# CHECK-UNKNOWN: 0ab57407 <unknown>
vluxei8.v v8, (a0), v4, v0.t
# CHECK-INST: vluxei8.v v8, (a0), v4, v0.t
# CHECK-ENCODING: [0x07,0x04,0x45,0x04]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 45 04 <unknown>
+# CHECK-UNKNOWN: 04450407 <unknown>
vluxei8.v v8, (a0), v4
# CHECK-INST: vluxei8.v v8, (a0), v4
# CHECK-ENCODING: [0x07,0x04,0x45,0x06]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 45 06 <unknown>
+# CHECK-UNKNOWN: 06450407 <unknown>
vluxei16.v v8, (a0), v4, v0.t
# CHECK-INST: vluxei16.v v8, (a0), v4, v0.t
# CHECK-ENCODING: [0x07,0x54,0x45,0x04]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 45 04 <unknown>
+# CHECK-UNKNOWN: 04455407 <unknown>
vluxei16.v v8, (a0), v4
# CHECK-INST: vluxei16.v v8, (a0), v4
# CHECK-ENCODING: [0x07,0x54,0x45,0x06]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 45 06 <unknown>
+# CHECK-UNKNOWN: 06455407 <unknown>
vluxei32.v v8, (a0), v4, v0.t
# CHECK-INST: vluxei32.v v8, (a0), v4, v0.t
# CHECK-ENCODING: [0x07,0x64,0x45,0x04]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 45 04 <unknown>
+# CHECK-UNKNOWN: 04456407 <unknown>
vluxei32.v v8, (a0), v4
# CHECK-INST: vluxei32.v v8, (a0), v4
# CHECK-ENCODING: [0x07,0x64,0x45,0x06]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 45 06 <unknown>
+# CHECK-UNKNOWN: 06456407 <unknown>
vluxei64.v v8, (a0), v4, v0.t
# CHECK-INST: vluxei64.v v8, (a0), v4, v0.t
# CHECK-ENCODING: [0x07,0x74,0x45,0x04]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 45 04 <unknown>
+# CHECK-UNKNOWN: 04457407 <unknown>
vluxei64.v v8, (a0), v4
# CHECK-INST: vluxei64.v v8, (a0), v4
# CHECK-ENCODING: [0x07,0x74,0x45,0x06]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 45 06 <unknown>
+# CHECK-UNKNOWN: 06457407 <unknown>
vloxei8.v v8, (a0), v4, v0.t
# CHECK-INST: vloxei8.v v8, (a0), v4, v0.t
# CHECK-ENCODING: [0x07,0x04,0x45,0x0c]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 45 0c <unknown>
+# CHECK-UNKNOWN: 0c450407 <unknown>
vloxei8.v v8, (a0), v4
# CHECK-INST: vloxei8.v v8, (a0), v4
# CHECK-ENCODING: [0x07,0x04,0x45,0x0e]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 45 0e <unknown>
+# CHECK-UNKNOWN: 0e450407 <unknown>
vloxei16.v v8, (a0), v4, v0.t
# CHECK-INST: vloxei16.v v8, (a0), v4, v0.t
# CHECK-ENCODING: [0x07,0x54,0x45,0x0c]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 45 0c <unknown>
+# CHECK-UNKNOWN: 0c455407 <unknown>
vloxei16.v v8, (a0), v4
# CHECK-INST: vloxei16.v v8, (a0), v4
# CHECK-ENCODING: [0x07,0x54,0x45,0x0e]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 45 0e <unknown>
+# CHECK-UNKNOWN: 0e455407 <unknown>
vloxei32.v v8, (a0), v4, v0.t
# CHECK-INST: vloxei32.v v8, (a0), v4, v0.t
# CHECK-ENCODING: [0x07,0x64,0x45,0x0c]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 45 0c <unknown>
+# CHECK-UNKNOWN: 0c456407 <unknown>
vloxei32.v v8, (a0), v4
# CHECK-INST: vloxei32.v v8, (a0), v4
# CHECK-ENCODING: [0x07,0x64,0x45,0x0e]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 45 0e <unknown>
+# CHECK-UNKNOWN: 0e456407 <unknown>
vloxei64.v v8, (a0), v4, v0.t
# CHECK-INST: vloxei64.v v8, (a0), v4, v0.t
# CHECK-ENCODING: [0x07,0x74,0x45,0x0c]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 45 0c <unknown>
+# CHECK-UNKNOWN: 0c457407 <unknown>
vloxei64.v v8, (a0), v4
# CHECK-INST: vloxei64.v v8, (a0), v4
# CHECK-ENCODING: [0x07,0x74,0x45,0x0e]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 45 0e <unknown>
+# CHECK-UNKNOWN: 0e457407 <unknown>
vl1re8.v v8, (a0)
# CHECK-INST: vl1re8.v v8, (a0)
# CHECK-ENCODING: [0x07,0x04,0x85,0x02]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 85 02 <unknown>
+# CHECK-UNKNOWN: 02850407 <unknown>
vl1re16.v v8, (a0)
# CHECK-INST: vl1re16.v v8, (a0)
# CHECK-ENCODING: [0x07,0x54,0x85,0x02]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 85 02 <unknown>
+# CHECK-UNKNOWN: 02855407 <unknown>
vl1re32.v v8, (a0)
# CHECK-INST: vl1re32.v v8, (a0)
# CHECK-ENCODING: [0x07,0x64,0x85,0x02]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 85 02 <unknown>
+# CHECK-UNKNOWN: 02856407 <unknown>
vl1re64.v v8, (a0)
# CHECK-INST: vl1re64.v v8, (a0)
# CHECK-ENCODING: [0x07,0x74,0x85,0x02]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 85 02 <unknown>
+# CHECK-UNKNOWN: 02857407 <unknown>
vl2re8.v v8, (a0)
# CHECK-INST: vl2re8.v v8, (a0)
# CHECK-ENCODING: [0x07,0x04,0x85,0x22]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 85 22 <unknown>
+# CHECK-UNKNOWN: 22850407 <unknown>
vl2re16.v v8, (a0)
# CHECK-INST: vl2re16.v v8, (a0)
# CHECK-ENCODING: [0x07,0x54,0x85,0x22]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 85 22 <unknown>
+# CHECK-UNKNOWN: 22855407 <unknown>
vl2re32.v v8, (a0)
# CHECK-INST: vl2re32.v v8, (a0)
# CHECK-ENCODING: [0x07,0x64,0x85,0x22]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 85 22 <unknown>
+# CHECK-UNKNOWN: 22856407 <unknown>
vl2re64.v v8, (a0)
# CHECK-INST: vl2re64.v v8, (a0)
# CHECK-ENCODING: [0x07,0x74,0x85,0x22]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 85 22 <unknown>
+# CHECK-UNKNOWN: 22857407 <unknown>
vl4re8.v v8, (a0)
# CHECK-INST: vl4re8.v v8, (a0)
# CHECK-ENCODING: [0x07,0x04,0x85,0x62]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 85 62 <unknown>
+# CHECK-UNKNOWN: 62850407 <unknown>
vl4re16.v v8, (a0)
# CHECK-INST: vl4re16.v v8, (a0)
# CHECK-ENCODING: [0x07,0x54,0x85,0x62]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 85 62 <unknown>
+# CHECK-UNKNOWN: 62855407 <unknown>
vl4re32.v v8, (a0)
# CHECK-INST: vl4re32.v v8, (a0)
# CHECK-ENCODING: [0x07,0x64,0x85,0x62]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 85 62 <unknown>
+# CHECK-UNKNOWN: 62856407 <unknown>
vl4re64.v v8, (a0)
# CHECK-INST: vl4re64.v v8, (a0)
# CHECK-ENCODING: [0x07,0x74,0x85,0x62]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 85 62 <unknown>
+# CHECK-UNKNOWN: 62857407 <unknown>
vl8re8.v v8, (a0)
# CHECK-INST: vl8re8.v v8, (a0)
# CHECK-ENCODING: [0x07,0x04,0x85,0xe2]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 85 e2 <unknown>
+# CHECK-UNKNOWN: e2850407 <unknown>
vl8re16.v v8, (a0)
# CHECK-INST: vl8re16.v v8, (a0)
# CHECK-ENCODING: [0x07,0x54,0x85,0xe2]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 85 e2 <unknown>
+# CHECK-UNKNOWN: e2855407 <unknown>
vl8re32.v v8, (a0)
# CHECK-INST: vl8re32.v v8, (a0)
# CHECK-ENCODING: [0x07,0x64,0x85,0xe2]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 85 e2 <unknown>
+# CHECK-UNKNOWN: e2856407 <unknown>
vl8re64.v v8, (a0)
# CHECK-INST: vl8re64.v v8, (a0)
# CHECK-ENCODING: [0x07,0x74,0x85,0xe2]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 85 e2 <unknown>
+# CHECK-UNKNOWN: e2857407 <unknown>
vlm.v v0, 0(a0)
# CHECK-INST: vlm.v v0, (a0)
# CHECK-ENCODING: [0x07,0x00,0xb5,0x02]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 00 b5 02 <unknown>
+# CHECK-UNKNOWN: 02b50007 <unknown>
vle8.v v8, 0(a0)
# CHECK-INST: vle8.v v8, (a0)
# CHECK-ENCODING: [0x07,0x04,0x05,0x02]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 05 02 <unknown>
+# CHECK-UNKNOWN: 02050407 <unknown>
vle8ff.v v8, 0(a0), v0.t
# CHECK-INST: vle8ff.v v8, (a0), v0.t
# CHECK-ENCODING: [0x07,0x04,0x05,0x01]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 05 01 <unknown>
+# CHECK-UNKNOWN: 01050407 <unknown>
vlse16.v v8, 0(a0), a1, v0.t
# CHECK-INST: vlse16.v v8, (a0), a1, v0.t
# CHECK-ENCODING: [0x07,0x54,0xb5,0x08]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 b5 08 <unknown>
+# CHECK-UNKNOWN: 08b55407 <unknown>
vluxei32.v v8, 0(a0), v4
# CHECK-INST: vluxei32.v v8, (a0), v4
# CHECK-ENCODING: [0x07,0x64,0x45,0x06]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 45 06 <unknown>
+# CHECK-UNKNOWN: 06456407 <unknown>
vloxei64.v v8, 0(a0), v4
# CHECK-INST: vloxei64.v v8, (a0), v4
# CHECK-ENCODING: [0x07,0x74,0x45,0x0e]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 45 0e <unknown>
+# CHECK-UNKNOWN: 0e457407 <unknown>
diff --git a/llvm/test/MC/RISCV/rvv/macc.s b/llvm/test/MC/RISCV/rvv/macc.s
index 0662620b455a..ffdc2d646929 100644
--- a/llvm/test/MC/RISCV/rvv/macc.s
+++ b/llvm/test/MC/RISCV/rvv/macc.s
@@ -12,178 +12,178 @@ vmacc.vv v8, v20, v4, v0.t
# CHECK-INST: vmacc.vv v8, v20, v4, v0.t
# CHECK-ENCODING: [0x57,0x24,0x4a,0xb4]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a b4 <unknown>
+# CHECK-UNKNOWN: b44a2457 <unknown>
vmacc.vv v8, v20, v4
# CHECK-INST: vmacc.vv v8, v20, v4
# CHECK-ENCODING: [0x57,0x24,0x4a,0xb6]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a b6 <unknown>
+# CHECK-UNKNOWN: b64a2457 <unknown>
vmacc.vx v8, a0, v4, v0.t
# CHECK-INST: vmacc.vx v8, a0, v4, v0.t
# CHECK-ENCODING: [0x57,0x64,0x45,0xb4]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 b4 <unknown>
+# CHECK-UNKNOWN: b4456457 <unknown>
vmacc.vx v8, a0, v4
# CHECK-INST: vmacc.vx v8, a0, v4
# CHECK-ENCODING: [0x57,0x64,0x45,0xb6]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 b6 <unknown>
+# CHECK-UNKNOWN: b6456457 <unknown>
vnmsac.vv v8, v20, v4, v0.t
# CHECK-INST: vnmsac.vv v8, v20, v4, v0.t
# CHECK-ENCODING: [0x57,0x24,0x4a,0xbc]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a bc <unknown>
+# CHECK-UNKNOWN: bc4a2457 <unknown>
vnmsac.vv v8, v20, v4
# CHECK-INST: vnmsac.vv v8, v20, v4
# CHECK-ENCODING: [0x57,0x24,0x4a,0xbe]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a be <unknown>
+# CHECK-UNKNOWN: be4a2457 <unknown>
vnmsac.vx v8, a0, v4, v0.t
# CHECK-INST: vnmsac.vx v8, a0, v4, v0.t
# CHECK-ENCODING: [0x57,0x64,0x45,0xbc]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 bc <unknown>
+# CHECK-UNKNOWN: bc456457 <unknown>
vnmsac.vx v8, a0, v4
# CHECK-INST: vnmsac.vx v8, a0, v4
# CHECK-ENCODING: [0x57,0x64,0x45,0xbe]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 be <unknown>
+# CHECK-UNKNOWN: be456457 <unknown>
vmadd.vv v8, v20, v4, v0.t
# CHECK-INST: vmadd.vv v8, v20, v4, v0.t
# CHECK-ENCODING: [0x57,0x24,0x4a,0xa4]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a a4 <unknown>
+# CHECK-UNKNOWN: a44a2457 <unknown>
vmadd.vv v8, v20, v4
# CHECK-INST: vmadd.vv v8, v20, v4
# CHECK-ENCODING: [0x57,0x24,0x4a,0xa6]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a a6 <unknown>
+# CHECK-UNKNOWN: a64a2457 <unknown>
vmadd.vx v8, a0, v4, v0.t
# CHECK-INST: vmadd.vx v8, a0, v4, v0.t
# CHECK-ENCODING: [0x57,0x64,0x45,0xa4]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 a4 <unknown>
+# CHECK-UNKNOWN: a4456457 <unknown>
vmadd.vx v8, a0, v4
# CHECK-INST: vmadd.vx v8, a0, v4
# CHECK-ENCODING: [0x57,0x64,0x45,0xa6]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 a6 <unknown>
+# CHECK-UNKNOWN: a6456457 <unknown>
vnmsub.vv v8, v20, v4, v0.t
# CHECK-INST: vnmsub.vv v8, v20, v4, v0.t
# CHECK-ENCODING: [0x57,0x24,0x4a,0xac]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a ac <unknown>
+# CHECK-UNKNOWN: ac4a2457 <unknown>
vnmsub.vv v8, v20, v4
# CHECK-INST: vnmsub.vv v8, v20, v4
# CHECK-ENCODING: [0x57,0x24,0x4a,0xae]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a ae <unknown>
+# CHECK-UNKNOWN: ae4a2457 <unknown>
vnmsub.vx v8, a0, v4, v0.t
# CHECK-INST: vnmsub.vx v8, a0, v4, v0.t
# CHECK-ENCODING: [0x57,0x64,0x45,0xac]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 ac <unknown>
+# CHECK-UNKNOWN: ac456457 <unknown>
vnmsub.vx v8, a0, v4
# CHECK-INST: vnmsub.vx v8, a0, v4
# CHECK-ENCODING: [0x57,0x64,0x45,0xae]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 ae <unknown>
+# CHECK-UNKNOWN: ae456457 <unknown>
vwmaccu.vv v8, v20, v4, v0.t
# CHECK-INST: vwmaccu.vv v8, v20, v4, v0.t
# CHECK-ENCODING: [0x57,0x24,0x4a,0xf0]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a f0 <unknown>
+# CHECK-UNKNOWN: f04a2457 <unknown>
vwmaccu.vv v8, v20, v4
# CHECK-INST: vwmaccu.vv v8, v20, v4
# CHECK-ENCODING: [0x57,0x24,0x4a,0xf2]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a f2 <unknown>
+# CHECK-UNKNOWN: f24a2457 <unknown>
vwmaccu.vx v8, a0, v4, v0.t
# CHECK-INST: vwmaccu.vx v8, a0, v4, v0.t
# CHECK-ENCODING: [0x57,0x64,0x45,0xf0]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 f0 <unknown>
+# CHECK-UNKNOWN: f0456457 <unknown>
vwmaccu.vx v8, a0, v4
# CHECK-INST: vwmaccu.vx v8, a0, v4
# CHECK-ENCODING: [0x57,0x64,0x45,0xf2]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 f2 <unknown>
+# CHECK-UNKNOWN: f2456457 <unknown>
vwmacc.vv v8, v20, v4, v0.t
# CHECK-INST: vwmacc.vv v8, v20, v4, v0.t
# CHECK-ENCODING: [0x57,0x24,0x4a,0xf4]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a f4 <unknown>
+# CHECK-UNKNOWN: f44a2457 <unknown>
vwmacc.vv v8, v20, v4
# CHECK-INST: vwmacc.vv v8, v20, v4
# CHECK-ENCODING: [0x57,0x24,0x4a,0xf6]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a f6 <unknown>
+# CHECK-UNKNOWN: f64a2457 <unknown>
vwmacc.vx v8, a0, v4, v0.t
# CHECK-INST: vwmacc.vx v8, a0, v4, v0.t
# CHECK-ENCODING: [0x57,0x64,0x45,0xf4]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 f4 <unknown>
+# CHECK-UNKNOWN: f4456457 <unknown>
vwmacc.vx v8, a0, v4
# CHECK-INST: vwmacc.vx v8, a0, v4
# CHECK-ENCODING: [0x57,0x64,0x45,0xf6]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 f6 <unknown>
+# CHECK-UNKNOWN: f6456457 <unknown>
vwmaccsu.vv v8, v20, v4, v0.t
# CHECK-INST: vwmaccsu.vv v8, v20, v4, v0.t
# CHECK-ENCODING: [0x57,0x24,0x4a,0xfc]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a fc <unknown>
+# CHECK-UNKNOWN: fc4a2457 <unknown>
vwmaccsu.vv v8, v20, v4
# CHECK-INST: vwmaccsu.vv v8, v20, v4
# CHECK-ENCODING: [0x57,0x24,0x4a,0xfe]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a fe <unknown>
+# CHECK-UNKNOWN: fe4a2457 <unknown>
vwmaccsu.vx v8, a0, v4, v0.t
# CHECK-INST: vwmaccsu.vx v8, a0, v4, v0.t
# CHECK-ENCODING: [0x57,0x64,0x45,0xfc]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 fc <unknown>
+# CHECK-UNKNOWN: fc456457 <unknown>
vwmaccsu.vx v8, a0, v4
# CHECK-INST: vwmaccsu.vx v8, a0, v4
# CHECK-ENCODING: [0x57,0x64,0x45,0xfe]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 fe <unknown>
+# CHECK-UNKNOWN: fe456457 <unknown>
vwmaccus.vx v8, a0, v4, v0.t
# CHECK-INST: vwmaccus.vx v8, a0, v4, v0.t
# CHECK-ENCODING: [0x57,0x64,0x45,0xf8]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 f8 <unknown>
+# CHECK-UNKNOWN: f8456457 <unknown>
vwmaccus.vx v8, a0, v4
# CHECK-INST: vwmaccus.vx v8, a0, v4
# CHECK-ENCODING: [0x57,0x64,0x45,0xfa]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 fa <unknown>
+# CHECK-UNKNOWN: fa456457 <unknown>
diff --git a/llvm/test/MC/RISCV/rvv/mask.s b/llvm/test/MC/RISCV/rvv/mask.s
index c0691b97f1d9..c0dd44b6fc67 100644
--- a/llvm/test/MC/RISCV/rvv/mask.s
+++ b/llvm/test/MC/RISCV/rvv/mask.s
@@ -12,154 +12,154 @@ vmand.mm v8, v4, v20
# CHECK-INST: vmand.mm v8, v4, v20
# CHECK-ENCODING: [0x57,0x24,0x4a,0x66]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 66 <unknown>
+# CHECK-UNKNOWN: 664a2457 <unknown>
vmnand.mm v8, v4, v20
# CHECK-INST: vmnand.mm v8, v4, v20
# CHECK-ENCODING: [0x57,0x24,0x4a,0x76]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 76 <unknown>
+# CHECK-UNKNOWN: 764a2457 <unknown>
vmandn.mm v8, v4, v20
# CHECK-INST: vmandn.mm v8, v4, v20
# CHECK-ENCODING: [0x57,0x24,0x4a,0x62]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 62 <unknown>
+# CHECK-UNKNOWN: 624a2457 <unknown>
vmxor.mm v8, v4, v20
# CHECK-INST: vmxor.mm v8, v4, v20
# CHECK-ENCODING: [0x57,0x24,0x4a,0x6e]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 6e <unknown>
+# CHECK-UNKNOWN: 6e4a2457 <unknown>
vmor.mm v8, v4, v20
# CHECK-INST: vmor.mm v8, v4, v20
# CHECK-ENCODING: [0x57,0x24,0x4a,0x6a]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 6a <unknown>
+# CHECK-UNKNOWN: 6a4a2457 <unknown>
vmnor.mm v8, v4, v20
# CHECK-INST: vmnor.mm v8, v4, v20
# CHECK-ENCODING: [0x57,0x24,0x4a,0x7a]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 7a <unknown>
+# CHECK-UNKNOWN: 7a4a2457 <unknown>
vmorn.mm v8, v4, v20
# CHECK-INST: vmorn.mm v8, v4, v20
# CHECK-ENCODING: [0x57,0x24,0x4a,0x72]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 72 <unknown>
+# CHECK-UNKNOWN: 724a2457 <unknown>
vmxnor.mm v8, v4, v20
# CHECK-INST: vmxnor.mm v8, v4, v20
# CHECK-ENCODING: [0x57,0x24,0x4a,0x7e]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 7e <unknown>
+# CHECK-UNKNOWN: 7e4a2457 <unknown>
vcpop.m a2, v4, v0.t
# CHECK-INST: vcpop.m a2, v4, v0.t
# CHECK-ENCODING: [0x57,0x26,0x48,0x40]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 26 48 40 <unknown>
+# CHECK-UNKNOWN: 40482657 <unknown>
vcpop.m a2, v4
# CHECK-INST: vcpop.m a2, v4
# CHECK-ENCODING: [0x57,0x26,0x48,0x42]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 26 48 42 <unknown>
+# CHECK-UNKNOWN: 42482657 <unknown>
vfirst.m a2, v4, v0.t
# CHECK-INST: vfirst.m a2, v4, v0.t
# CHECK-ENCODING: [0x57,0xa6,0x48,0x40]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 a6 48 40 <unknown>
+# CHECK-UNKNOWN: 4048a657 <unknown>
vfirst.m a2, v4
# CHECK-INST: vfirst.m a2, v4
# CHECK-ENCODING: [0x57,0xa6,0x48,0x42]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 a6 48 42 <unknown>
+# CHECK-UNKNOWN: 4248a657 <unknown>
vmsbf.m v8, v4, v0.t
# CHECK-INST: vmsbf.m v8, v4, v0.t
# CHECK-ENCODING: [0x57,0xa4,0x40,0x50]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 a4 40 50 <unknown>
+# CHECK-UNKNOWN: 5040a457 <unknown>
vmsbf.m v8, v4
# CHECK-INST: vmsbf.m v8, v4
# CHECK-ENCODING: [0x57,0xa4,0x40,0x52]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 a4 40 52 <unknown>
+# CHECK-UNKNOWN: 5240a457 <unknown>
vmsif.m v8, v4, v0.t
# CHECK-INST: vmsif.m v8, v4, v0.t
# CHECK-ENCODING: [0x57,0xa4,0x41,0x50]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 a4 41 50 <unknown>
+# CHECK-UNKNOWN: 5041a457 <unknown>
vmsif.m v8, v4
# CHECK-INST: vmsif.m v8, v4
# CHECK-ENCODING: [0x57,0xa4,0x41,0x52]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 a4 41 52 <unknown>
+# CHECK-UNKNOWN: 5241a457 <unknown>
vmsof.m v8, v4, v0.t
# CHECK-INST: vmsof.m v8, v4, v0.t
# CHECK-ENCODING: [0x57,0x24,0x41,0x50]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 41 50 <unknown>
+# CHECK-UNKNOWN: 50412457 <unknown>
vmsof.m v8, v4
# CHECK-INST: vmsof.m v8, v4
# CHECK-ENCODING: [0x57,0x24,0x41,0x52]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 41 52 <unknown>
+# CHECK-UNKNOWN: 52412457 <unknown>
viota.m v8, v4, v0.t
# CHECK-INST: viota.m v8, v4, v0.t
# CHECK-ENCODING: [0x57,0x24,0x48,0x50]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 48 50 <unknown>
+# CHECK-UNKNOWN: 50482457 <unknown>
viota.m v8, v4
# CHECK-INST: viota.m v8, v4
# CHECK-ENCODING: [0x57,0x24,0x48,0x52]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 48 52 <unknown>
+# CHECK-UNKNOWN: 52482457 <unknown>
vid.v v8, v0.t
# CHECK-INST: vid.v v8, v0.t
# CHECK-ENCODING: [0x57,0xa4,0x08,0x50]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 a4 08 50 <unknown>
+# CHECK-UNKNOWN: 5008a457 <unknown>
vid.v v8
# CHECK-INST: vid.v v8
# CHECK-ENCODING: [0x57,0xa4,0x08,0x52]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 a4 08 52 <unknown>
+# CHECK-UNKNOWN: 5208a457 <unknown>
vmmv.m v8, v4
# CHECK-INST: vmmv.m v8, v4
# CHECK-ENCODING: [0x57,0x24,0x42,0x66]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 42 66 <unknown>
+# CHECK-UNKNOWN: 66422457 <unknown>
vmclr.m v8
# CHECK-INST: vmclr.m v8
# CHECK-ENCODING: [0x57,0x24,0x84,0x6e]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 84 6e <unknown>
+# CHECK-UNKNOWN: 6e842457 <unknown>
vmset.m v8
# CHECK-INST: vmset.m v8
# CHECK-ENCODING: [0x57,0x24,0x84,0x7e]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 84 7e <unknown>
+# CHECK-UNKNOWN: 7e842457 <unknown>
vmnot.m v8, v4
# CHECK-INST: vmnot.m v8, v4
# CHECK-ENCODING: [0x57,0x24,0x42,0x76]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 42 76 <unknown>
+# CHECK-UNKNOWN: 76422457 <unknown>
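
The pattern in every hunk above (and in the hunks that follow) is the same mechanical rewrite: each CHECK-UNKNOWN line stops spelling the encoding as four separate little-endian bytes and instead prints the instruction as one 32-bit word. A minimal Python sketch of that conversion, for reference only — the function name and the assert are illustrative and not part of the patch:

    # Reassemble the little-endian byte list, as printed on the
    # CHECK-ENCODING lines, into the 32-bit word that the updated
    # CHECK-UNKNOWN lines now expect.
    def encoding_to_word(enc):
        return int.from_bytes(bytes(enc), "little")

    # e.g. the vmor.mm encoding [0x57,0x24,0x4a,0x6a] above:
    assert format(encoding_to_word([0x57, 0x24, 0x4A, 0x6A]), "08x") == "6a4a2457"
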
diff --git a/llvm/test/MC/RISCV/rvv/minmax.s b/llvm/test/MC/RISCV/rvv/minmax.s
index 4eaf897b84c4..70fe040f07f9 100644
--- a/llvm/test/MC/RISCV/rvv/minmax.s
+++ b/llvm/test/MC/RISCV/rvv/minmax.s
@@ -12,94 +12,94 @@ vminu.vv v8, v4, v20, v0.t
# CHECK-INST: vminu.vv v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x04,0x4a,0x10]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 10 <unknown>
+# CHECK-UNKNOWN: 104a0457 <unknown>
vminu.vv v8, v4, v20
# CHECK-INST: vminu.vv v8, v4, v20
# CHECK-ENCODING: [0x57,0x04,0x4a,0x12]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 12 <unknown>
+# CHECK-UNKNOWN: 124a0457 <unknown>
vminu.vx v8, v4, a0, v0.t
# CHECK-INST: vminu.vx v8, v4, a0, v0.t
# CHECK-ENCODING: [0x57,0x44,0x45,0x10]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 10 <unknown>
+# CHECK-UNKNOWN: 10454457 <unknown>
vminu.vx v8, v4, a0
# CHECK-INST: vminu.vx v8, v4, a0
# CHECK-ENCODING: [0x57,0x44,0x45,0x12]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 12 <unknown>
+# CHECK-UNKNOWN: 12454457 <unknown>
vmin.vv v8, v4, v20, v0.t
# CHECK-INST: vmin.vv v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x04,0x4a,0x14]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 14 <unknown>
+# CHECK-UNKNOWN: 144a0457 <unknown>
vmin.vv v8, v4, v20
# CHECK-INST: vmin.vv v8, v4, v20
# CHECK-ENCODING: [0x57,0x04,0x4a,0x16]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 16 <unknown>
+# CHECK-UNKNOWN: 164a0457 <unknown>
vmin.vx v8, v4, a0, v0.t
# CHECK-INST: vmin.vx v8, v4, a0, v0.t
# CHECK-ENCODING: [0x57,0x44,0x45,0x14]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 14 <unknown>
+# CHECK-UNKNOWN: 14454457 <unknown>
vmin.vx v8, v4, a0
# CHECK-INST: vmin.vx v8, v4, a0
# CHECK-ENCODING: [0x57,0x44,0x45,0x16]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 16 <unknown>
+# CHECK-UNKNOWN: 16454457 <unknown>
vmaxu.vv v8, v4, v20, v0.t
# CHECK-INST: vmaxu.vv v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x04,0x4a,0x18]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 18 <unknown>
+# CHECK-UNKNOWN: 184a0457 <unknown>
vmaxu.vv v8, v4, v20
# CHECK-INST: vmaxu.vv v8, v4, v20
# CHECK-ENCODING: [0x57,0x04,0x4a,0x1a]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 1a <unknown>
+# CHECK-UNKNOWN: 1a4a0457 <unknown>
vmaxu.vx v8, v4, a0, v0.t
# CHECK-INST: vmaxu.vx v8, v4, a0, v0.t
# CHECK-ENCODING: [0x57,0x44,0x45,0x18]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 18 <unknown>
+# CHECK-UNKNOWN: 18454457 <unknown>
vmaxu.vx v8, v4, a0
# CHECK-INST: vmaxu.vx v8, v4, a0
# CHECK-ENCODING: [0x57,0x44,0x45,0x1a]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 1a <unknown>
+# CHECK-UNKNOWN: 1a454457 <unknown>
vmax.vv v8, v4, v20, v0.t
# CHECK-INST: vmax.vv v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x04,0x4a,0x1c]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 1c <unknown>
+# CHECK-UNKNOWN: 1c4a0457 <unknown>
vmax.vv v8, v4, v20
# CHECK-INST: vmax.vv v8, v4, v20
# CHECK-ENCODING: [0x57,0x04,0x4a,0x1e]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 1e <unknown>
+# CHECK-UNKNOWN: 1e4a0457 <unknown>
vmax.vx v8, v4, a0, v0.t
# CHECK-INST: vmax.vx v8, v4, a0, v0.t
# CHECK-ENCODING: [0x57,0x44,0x45,0x1c]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 1c <unknown>
+# CHECK-UNKNOWN: 1c454457 <unknown>
vmax.vx v8, v4, a0
# CHECK-INST: vmax.vx v8, v4, a0
# CHECK-ENCODING: [0x57,0x44,0x45,0x1e]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 1e <unknown>
+# CHECK-UNKNOWN: 1e454457 <unknown>
diff --git a/llvm/test/MC/RISCV/rvv/mul.s b/llvm/test/MC/RISCV/rvv/mul.s
index 9f7d6182d80f..2782ea683f9c 100644
--- a/llvm/test/MC/RISCV/rvv/mul.s
+++ b/llvm/test/MC/RISCV/rvv/mul.s
@@ -12,190 +12,190 @@ vmul.vv v8, v4, v20, v0.t
# CHECK-INST: vmul.vv v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x24,0x4a,0x94]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 94 <unknown>
+# CHECK-UNKNOWN: 944a2457 <unknown>
vmul.vv v8, v4, v20
# CHECK-INST: vmul.vv v8, v4, v20
# CHECK-ENCODING: [0x57,0x24,0x4a,0x96]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 96 <unknown>
+# CHECK-UNKNOWN: 964a2457 <unknown>
vmul.vx v8, v4, a0, v0.t
# CHECK-INST: vmul.vx v8, v4, a0, v0.t
# CHECK-ENCODING: [0x57,0x64,0x45,0x94]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 94 <unknown>
+# CHECK-UNKNOWN: 94456457 <unknown>
vmul.vx v8, v4, a0
# CHECK-INST: vmul.vx v8, v4, a0
# CHECK-ENCODING: [0x57,0x64,0x45,0x96]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 96 <unknown>
+# CHECK-UNKNOWN: 96456457 <unknown>
vmulh.vv v8, v4, v20, v0.t
# CHECK-INST: vmulh.vv v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x24,0x4a,0x9c]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 9c <unknown>
+# CHECK-UNKNOWN: 9c4a2457 <unknown>
vmulh.vv v8, v4, v20
# CHECK-INST: vmulh.vv v8, v4, v20
# CHECK-ENCODING: [0x57,0x24,0x4a,0x9e]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 9e <unknown>
+# CHECK-UNKNOWN: 9e4a2457 <unknown>
vmulh.vx v8, v4, a0, v0.t
# CHECK-INST: vmulh.vx v8, v4, a0, v0.t
# CHECK-ENCODING: [0x57,0x64,0x45,0x9c]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 9c <unknown>
+# CHECK-UNKNOWN: 9c456457 <unknown>
vmulh.vx v8, v4, a0
# CHECK-INST: vmulh.vx v8, v4, a0
# CHECK-ENCODING: [0x57,0x64,0x45,0x9e]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 9e <unknown>
+# CHECK-UNKNOWN: 9e456457 <unknown>
vmulhu.vv v8, v4, v20, v0.t
# CHECK-INST: vmulhu.vv v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x24,0x4a,0x90]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 90 <unknown>
+# CHECK-UNKNOWN: 904a2457 <unknown>
vmulhu.vv v8, v4, v20
# CHECK-INST: vmulhu.vv v8, v4, v20
# CHECK-ENCODING: [0x57,0x24,0x4a,0x92]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 92 <unknown>
+# CHECK-UNKNOWN: 924a2457 <unknown>
vmulhu.vx v8, v4, a0, v0.t
# CHECK-INST: vmulhu.vx v8, v4, a0, v0.t
# CHECK-ENCODING: [0x57,0x64,0x45,0x90]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 90 <unknown>
+# CHECK-UNKNOWN: 90456457 <unknown>
vmulhu.vx v8, v4, a0
# CHECK-INST: vmulhu.vx v8, v4, a0
# CHECK-ENCODING: [0x57,0x64,0x45,0x92]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 92 <unknown>
+# CHECK-UNKNOWN: 92456457 <unknown>
vmulhsu.vv v8, v4, v20, v0.t
# CHECK-INST: vmulhsu.vv v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x24,0x4a,0x98]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 98 <unknown>
+# CHECK-UNKNOWN: 984a2457 <unknown>
vmulhsu.vv v8, v4, v20
# CHECK-INST: vmulhsu.vv v8, v4, v20
# CHECK-ENCODING: [0x57,0x24,0x4a,0x9a]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 9a <unknown>
+# CHECK-UNKNOWN: 9a4a2457 <unknown>
vmulhsu.vx v8, v4, a0, v0.t
# CHECK-INST: vmulhsu.vx v8, v4, a0, v0.t
# CHECK-ENCODING: [0x57,0x64,0x45,0x98]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 98 <unknown>
+# CHECK-UNKNOWN: 98456457 <unknown>
vmulhsu.vx v8, v4, a0
# CHECK-INST: vmulhsu.vx v8, v4, a0
# CHECK-ENCODING: [0x57,0x64,0x45,0x9a]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 9a <unknown>
+# CHECK-UNKNOWN: 9a456457 <unknown>
vwmul.vv v8, v4, v20, v0.t
# CHECK-INST: vwmul.vv v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x24,0x4a,0xec]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a ec <unknown>
+# CHECK-UNKNOWN: ec4a2457 <unknown>
vwmul.vv v8, v4, v20
# CHECK-INST: vwmul.vv v8, v4, v20
# CHECK-ENCODING: [0x57,0x24,0x4a,0xee]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a ee <unknown>
+# CHECK-UNKNOWN: ee4a2457 <unknown>
vwmul.vx v8, v4, a0, v0.t
# CHECK-INST: vwmul.vx v8, v4, a0, v0.t
# CHECK-ENCODING: [0x57,0x64,0x45,0xec]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 ec <unknown>
+# CHECK-UNKNOWN: ec456457 <unknown>
vwmul.vx v8, v4, a0
# CHECK-INST: vwmul.vx v8, v4, a0
# CHECK-ENCODING: [0x57,0x64,0x45,0xee]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 ee <unknown>
+# CHECK-UNKNOWN: ee456457 <unknown>
vwmulu.vv v8, v4, v20, v0.t
# CHECK-INST: vwmulu.vv v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x24,0x4a,0xe0]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a e0 <unknown>
+# CHECK-UNKNOWN: e04a2457 <unknown>
vwmulu.vv v8, v4, v20
# CHECK-INST: vwmulu.vv v8, v4, v20
# CHECK-ENCODING: [0x57,0x24,0x4a,0xe2]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a e2 <unknown>
+# CHECK-UNKNOWN: e24a2457 <unknown>
vwmulu.vx v8, v4, a0, v0.t
# CHECK-INST: vwmulu.vx v8, v4, a0, v0.t
# CHECK-ENCODING: [0x57,0x64,0x45,0xe0]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 e0 <unknown>
+# CHECK-UNKNOWN: e0456457 <unknown>
vwmulu.vx v8, v4, a0
# CHECK-INST: vwmulu.vx v8, v4, a0
# CHECK-ENCODING: [0x57,0x64,0x45,0xe2]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 e2 <unknown>
+# CHECK-UNKNOWN: e2456457 <unknown>
vwmulsu.vv v8, v4, v20, v0.t
# CHECK-INST: vwmulsu.vv v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x24,0x4a,0xe8]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a e8 <unknown>
+# CHECK-UNKNOWN: e84a2457 <unknown>
vwmulsu.vv v8, v4, v20
# CHECK-INST: vwmulsu.vv v8, v4, v20
# CHECK-ENCODING: [0x57,0x24,0x4a,0xea]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a ea <unknown>
+# CHECK-UNKNOWN: ea4a2457 <unknown>
vwmulsu.vx v8, v4, a0, v0.t
# CHECK-INST: vwmulsu.vx v8, v4, a0, v0.t
# CHECK-ENCODING: [0x57,0x64,0x45,0xe8]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 e8 <unknown>
+# CHECK-UNKNOWN: e8456457 <unknown>
vwmulsu.vx v8, v4, a0
# CHECK-INST: vwmulsu.vx v8, v4, a0
# CHECK-ENCODING: [0x57,0x64,0x45,0xea]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 ea <unknown>
+# CHECK-UNKNOWN: ea456457 <unknown>
vsmul.vv v8, v4, v20, v0.t
# CHECK-INST: vsmul.vv v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x04,0x4a,0x9c]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 9c <unknown>
+# CHECK-UNKNOWN: 9c4a0457 <unknown>
vsmul.vv v8, v4, v20
# CHECK-INST: vsmul.vv v8, v4, v20
# CHECK-ENCODING: [0x57,0x04,0x4a,0x9e]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 9e <unknown>
+# CHECK-UNKNOWN: 9e4a0457 <unknown>
vsmul.vx v8, v4, a0, v0.t
# CHECK-INST: vsmul.vx v8, v4, a0, v0.t
# CHECK-ENCODING: [0x57,0x44,0x45,0x9c]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 9c <unknown>
+# CHECK-UNKNOWN: 9c454457 <unknown>
vsmul.vx v8, v4, a0
# CHECK-INST: vsmul.vx v8, v4, a0
# CHECK-ENCODING: [0x57,0x44,0x45,0x9e]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 9e <unknown>
+# CHECK-UNKNOWN: 9e454457 <unknown>
diff --git a/llvm/test/MC/RISCV/rvv/mv.s b/llvm/test/MC/RISCV/rvv/mv.s
index cd02aa94d18e..f96e14932ead 100644
--- a/llvm/test/MC/RISCV/rvv/mv.s
+++ b/llvm/test/MC/RISCV/rvv/mv.s
@@ -12,52 +12,52 @@ vmv.v.v v8, v20
# CHECK-INST: vmv.v.v v8, v20
# CHECK-ENCODING: [0x57,0x04,0x0a,0x5e]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 0a 5e <unknown>
+# CHECK-UNKNOWN: 5e0a0457 <unknown>
vmv.v.x v8, a0
# CHECK-INST: vmv.v.x v8, a0
# CHECK-ENCODING: [0x57,0x44,0x05,0x5e]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 05 5e <unknown>
+# CHECK-UNKNOWN: 5e054457 <unknown>
vmv.v.i v8, 15
# CHECK-INST: vmv.v.i v8, 15
# CHECK-ENCODING: [0x57,0xb4,0x07,0x5e]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 07 5e <unknown>
+# CHECK-UNKNOWN: 5e07b457 <unknown>
vmv.x.s a2, v4
# CHECK-INST: vmv.x.s a2, v4
# CHECK-ENCODING: [0x57,0x26,0x40,0x42]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 26 40 42 <unknown>
+# CHECK-UNKNOWN: 42402657 <unknown>
vmv.s.x v8, a0
# CHECK-INST: vmv.s.x v8, a0
# CHECK-ENCODING: [0x57,0x64,0x05,0x42]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 05 42 <unknown>
+# CHECK-UNKNOWN: 42056457 <unknown>
vmv1r.v v8, v4
# CHECK-INST: vmv1r.v v8, v4
# CHECK-ENCODING: [0x57,0x34,0x40,0x9e]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 34 40 9e <unknown>
+# CHECK-UNKNOWN: 9e403457 <unknown>
vmv2r.v v8, v4
# CHECK-INST: vmv2r.v v8, v4
# CHECK-ENCODING: [0x57,0xb4,0x40,0x9e]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 40 9e <unknown>
+# CHECK-UNKNOWN: 9e40b457 <unknown>
vmv4r.v v8, v4
# CHECK-INST: vmv4r.v v8, v4
# CHECK-ENCODING: [0x57,0xb4,0x41,0x9e]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 41 9e <unknown>
+# CHECK-UNKNOWN: 9e41b457 <unknown>
vmv8r.v v8, v24
# CHECK-INST: vmv8r.v v8, v24
# CHECK-ENCODING: [0x57,0xb4,0x83,0x9f]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 83 9f <unknown>
+# CHECK-UNKNOWN: 9f83b457 <unknown>
diff --git a/llvm/test/MC/RISCV/rvv/or.s b/llvm/test/MC/RISCV/rvv/or.s
index ef281fe80bd4..306d7ae81442 100644
--- a/llvm/test/MC/RISCV/rvv/or.s
+++ b/llvm/test/MC/RISCV/rvv/or.s
@@ -12,34 +12,34 @@ vor.vv v8, v4, v20, v0.t
# CHECK-INST: vor.vv v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x04,0x4a,0x28]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 28 <unknown>
+# CHECK-UNKNOWN: 284a0457 <unknown>
vor.vv v8, v4, v20
# CHECK-INST: vor.vv v8, v4, v20
# CHECK-ENCODING: [0x57,0x04,0x4a,0x2a]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 2a <unknown>
+# CHECK-UNKNOWN: 2a4a0457 <unknown>
vor.vx v8, v4, a0, v0.t
# CHECK-INST: vor.vx v8, v4, a0, v0.t
# CHECK-ENCODING: [0x57,0x44,0x45,0x28]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 28 <unknown>
+# CHECK-UNKNOWN: 28454457 <unknown>
vor.vx v8, v4, a0
# CHECK-INST: vor.vx v8, v4, a0
# CHECK-ENCODING: [0x57,0x44,0x45,0x2a]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 2a <unknown>
+# CHECK-UNKNOWN: 2a454457 <unknown>
vor.vi v8, v4, 15, v0.t
# CHECK-INST: vor.vi v8, v4, 15, v0.t
# CHECK-ENCODING: [0x57,0xb4,0x47,0x28]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 47 28 <unknown>
+# CHECK-UNKNOWN: 2847b457 <unknown>
vor.vi v8, v4, 15
# CHECK-INST: vor.vi v8, v4, 15
# CHECK-ENCODING: [0x57,0xb4,0x47,0x2a]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 47 2a <unknown>
+# CHECK-UNKNOWN: 2a47b457 <unknown>
diff --git a/llvm/test/MC/RISCV/rvv/others.s b/llvm/test/MC/RISCV/rvv/others.s
index d1845e0bb238..cc16a8774b82 100644
--- a/llvm/test/MC/RISCV/rvv/others.s
+++ b/llvm/test/MC/RISCV/rvv/others.s
@@ -12,142 +12,142 @@ vmerge.vvm v8, v4, v20, v0
# CHECK-INST: vmerge.vvm v8, v4, v20, v0
# CHECK-ENCODING: [0x57,0x04,0x4a,0x5c]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 5c <unknown>
+# CHECK-UNKNOWN: 5c4a0457 <unknown>
vmerge.vxm v8, v4, a0, v0
# CHECK-INST: vmerge.vxm v8, v4, a0, v0
# CHECK-ENCODING: [0x57,0x44,0x45,0x5c]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 5c <unknown>
+# CHECK-UNKNOWN: 5c454457 <unknown>
vmerge.vim v8, v4, 15, v0
# CHECK-INST: vmerge.vim v8, v4, 15, v0
# CHECK-ENCODING: [0x57,0xb4,0x47,0x5c]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 47 5c <unknown>
+# CHECK-UNKNOWN: 5c47b457 <unknown>
vslideup.vx v8, v4, a0, v0.t
# CHECK-INST: vslideup.vx v8, v4, a0, v0.t
# CHECK-ENCODING: [0x57,0x44,0x45,0x38]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 38 <unknown>
+# CHECK-UNKNOWN: 38454457 <unknown>
vslideup.vx v8, v4, a0
# CHECK-INST: vslideup.vx v8, v4, a0
# CHECK-ENCODING: [0x57,0x44,0x45,0x3a]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 3a <unknown>
+# CHECK-UNKNOWN: 3a454457 <unknown>
vslideup.vi v8, v4, 31, v0.t
# CHECK-INST: vslideup.vi v8, v4, 31, v0.t
# CHECK-ENCODING: [0x57,0xb4,0x4f,0x38]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 4f 38 <unknown>
+# CHECK-UNKNOWN: 384fb457 <unknown>
vslideup.vi v8, v4, 31
# CHECK-INST: vslideup.vi v8, v4, 31
# CHECK-ENCODING: [0x57,0xb4,0x4f,0x3a]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 4f 3a <unknown>
+# CHECK-UNKNOWN: 3a4fb457 <unknown>
vslidedown.vx v8, v4, a0, v0.t
# CHECK-INST: vslidedown.vx v8, v4, a0, v0.t
# CHECK-ENCODING: [0x57,0x44,0x45,0x3c]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 3c <unknown>
+# CHECK-UNKNOWN: 3c454457 <unknown>
vslidedown.vx v8, v4, a0
# CHECK-INST: vslidedown.vx v8, v4, a0
# CHECK-ENCODING: [0x57,0x44,0x45,0x3e]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 3e <unknown>
+# CHECK-UNKNOWN: 3e454457 <unknown>
vslidedown.vi v8, v4, 31, v0.t
# CHECK-INST: vslidedown.vi v8, v4, 31, v0.t
# CHECK-ENCODING: [0x57,0xb4,0x4f,0x3c]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 4f 3c <unknown>
+# CHECK-UNKNOWN: 3c4fb457 <unknown>
vslidedown.vi v8, v4, 31
# CHECK-INST: vslidedown.vi v8, v4, 31
# CHECK-ENCODING: [0x57,0xb4,0x4f,0x3e]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 4f 3e <unknown>
+# CHECK-UNKNOWN: 3e4fb457 <unknown>
vslide1up.vx v8, v4, a0, v0.t
# CHECK-INST: vslide1up.vx v8, v4, a0, v0.t
# CHECK-ENCODING: [0x57,0x64,0x45,0x38]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 38 <unknown>
+# CHECK-UNKNOWN: 38456457 <unknown>
vslide1up.vx v8, v4, a0
# CHECK-INST: vslide1up.vx v8, v4, a0
# CHECK-ENCODING: [0x57,0x64,0x45,0x3a]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 3a <unknown>
+# CHECK-UNKNOWN: 3a456457 <unknown>
vslide1down.vx v8, v4, a0, v0.t
# CHECK-INST: vslide1down.vx v8, v4, a0, v0.t
# CHECK-ENCODING: [0x57,0x64,0x45,0x3c]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 3c <unknown>
+# CHECK-UNKNOWN: 3c456457 <unknown>
vslide1down.vx v8, v4, a0
# CHECK-INST: vslide1down.vx v8, v4, a0
# CHECK-ENCODING: [0x57,0x64,0x45,0x3e]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 3e <unknown>
+# CHECK-UNKNOWN: 3e456457 <unknown>
vrgather.vv v8, v4, v20, v0.t
# CHECK-INST: vrgather.vv v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x04,0x4a,0x30]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 30 <unknown>
+# CHECK-UNKNOWN: 304a0457 <unknown>
vrgather.vv v8, v4, v20
# CHECK-INST: vrgather.vv v8, v4, v20
# CHECK-ENCODING: [0x57,0x04,0x4a,0x32]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 32 <unknown>
+# CHECK-UNKNOWN: 324a0457 <unknown>
vrgather.vx v8, v4, a0, v0.t
# CHECK-INST: vrgather.vx v8, v4, a0, v0.t
# CHECK-ENCODING: [0x57,0x44,0x45,0x30]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 30 <unknown>
+# CHECK-UNKNOWN: 30454457 <unknown>
vrgather.vx v8, v4, a0
# CHECK-INST: vrgather.vx v8, v4, a0
# CHECK-ENCODING: [0x57,0x44,0x45,0x32]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 32 <unknown>
+# CHECK-UNKNOWN: 32454457 <unknown>
vrgather.vi v8, v4, 31, v0.t
# CHECK-INST: vrgather.vi v8, v4, 31, v0.t
# CHECK-ENCODING: [0x57,0xb4,0x4f,0x30]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 4f 30 <unknown>
+# CHECK-UNKNOWN: 304fb457 <unknown>
vrgather.vi v8, v4, 31
# CHECK-INST: vrgather.vi v8, v4, 31
# CHECK-ENCODING: [0x57,0xb4,0x4f,0x32]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 4f 32 <unknown>
+# CHECK-UNKNOWN: 324fb457 <unknown>
vrgatherei16.vv v8, v4, v20, v0.t
# CHECK-INST: vrgatherei16.vv v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x04,0x4a,0x38]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 38 <unknown>
+# CHECK-UNKNOWN: 384a0457 <unknown>
vrgatherei16.vv v8, v4, v20
# CHECK-INST: vrgatherei16.vv v8, v4, v20
# CHECK-ENCODING: [0x57,0x04,0x4a,0x3a]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 3a <unknown>
+# CHECK-UNKNOWN: 3a4a0457 <unknown>
vcompress.vm v8, v4, v20
# CHECK-INST: vcompress.vm v8, v4, v20
# CHECK-ENCODING: [0x57,0x24,0x4a,0x5e]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 5e <unknown>
+# CHECK-UNKNOWN: 5e4a2457 <unknown>
diff --git a/llvm/test/MC/RISCV/rvv/reduction.s b/llvm/test/MC/RISCV/rvv/reduction.s
index 2172589b7c38..006f54d9b545 100644
--- a/llvm/test/MC/RISCV/rvv/reduction.s
+++ b/llvm/test/MC/RISCV/rvv/reduction.s
@@ -12,124 +12,124 @@ vredsum.vs v8, v4, v20, v0.t
# CHECK-INST: vredsum.vs v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x24,0x4a,0x00]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 00 <unknown>
+# CHECK-UNKNOWN: 004a2457 <unknown>
vredsum.vs v8, v4, v20
# CHECK-INST: vredsum.vs v8, v4, v20
# CHECK-ENCODING: [0x57,0x24,0x4a,0x02]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 02 <unknown>
+# CHECK-UNKNOWN: 024a2457 <unknown>
vredmaxu.vs v8, v4, v20, v0.t
# CHECK-INST: vredmaxu.vs v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x24,0x4a,0x18]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 18 <unknown>
+# CHECK-UNKNOWN: 184a2457 <unknown>
vredmaxu.vs v8, v4, v20
# CHECK-INST: vredmaxu.vs v8, v4, v20
# CHECK-ENCODING: [0x57,0x24,0x4a,0x1a]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 1a <unknown>
+# CHECK-UNKNOWN: 1a4a2457 <unknown>
vredmax.vs v8, v4, v20, v0.t
# CHECK-INST: vredmax.vs v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x24,0x4a,0x1c]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 1c <unknown>
+# CHECK-UNKNOWN: 1c4a2457 <unknown>
vredmax.vs v8, v4, v20
# CHECK-INST: vredmax.vs v8, v4, v20
# CHECK-ENCODING: [0x57,0x24,0x4a,0x1e]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 1e <unknown>
+# CHECK-UNKNOWN: 1e4a2457 <unknown>
vredminu.vs v8, v4, v20, v0.t
# CHECK-INST: vredminu.vs v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x24,0x4a,0x10]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 10 <unknown>
+# CHECK-UNKNOWN: 104a2457 <unknown>
vredminu.vs v8, v4, v20
# CHECK-INST: vredminu.vs v8, v4, v20
# CHECK-ENCODING: [0x57,0x24,0x4a,0x12]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 12 <unknown>
+# CHECK-UNKNOWN: 124a2457 <unknown>
vredmin.vs v8, v4, v20, v0.t
# CHECK-INST: vredmin.vs v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x24,0x4a,0x14]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 14 <unknown>
+# CHECK-UNKNOWN: 144a2457 <unknown>
vredmin.vs v8, v4, v20
# CHECK-INST: vredmin.vs v8, v4, v20
# CHECK-ENCODING: [0x57,0x24,0x4a,0x16]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 16 <unknown>
+# CHECK-UNKNOWN: 164a2457 <unknown>
vredand.vs v8, v4, v20, v0.t
# CHECK-INST: vredand.vs v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x24,0x4a,0x04]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 04 <unknown>
+# CHECK-UNKNOWN: 044a2457 <unknown>
vredand.vs v8, v4, v20
# CHECK-INST: vredand.vs v8, v4, v20
# CHECK-ENCODING: [0x57,0x24,0x4a,0x06]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 06 <unknown>
+# CHECK-UNKNOWN: 064a2457 <unknown>
vredor.vs v8, v4, v20, v0.t
# CHECK-INST: vredor.vs v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x24,0x4a,0x08]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 08 <unknown>
+# CHECK-UNKNOWN: 084a2457 <unknown>
vredor.vs v8, v4, v20
# CHECK-INST: vredor.vs v8, v4, v20
# CHECK-ENCODING: [0x57,0x24,0x4a,0x0a]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 0a <unknown>
+# CHECK-UNKNOWN: 0a4a2457 <unknown>
vredxor.vs v8, v4, v20, v0.t
# CHECK-INST: vredxor.vs v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x24,0x4a,0x0c]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 0c <unknown>
+# CHECK-UNKNOWN: 0c4a2457 <unknown>
vredxor.vs v8, v4, v20
# CHECK-INST: vredxor.vs v8, v4, v20
# CHECK-ENCODING: [0x57,0x24,0x4a,0x0e]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 0e <unknown>
+# CHECK-UNKNOWN: 0e4a2457 <unknown>
vwredsumu.vs v8, v4, v20, v0.t
# CHECK-INST: vwredsumu.vs v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x04,0x4a,0xc0]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a c0 <unknown>
+# CHECK-UNKNOWN: c04a0457 <unknown>
vwredsumu.vs v8, v4, v20
# CHECK-INST: vwredsumu.vs v8, v4, v20
# CHECK-ENCODING: [0x57,0x04,0x4a,0xc2]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a c2 <unknown>
+# CHECK-UNKNOWN: c24a0457 <unknown>
vwredsum.vs v8, v4, v20, v0.t
# CHECK-INST: vwredsum.vs v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x04,0x4a,0xc4]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a c4 <unknown>
+# CHECK-UNKNOWN: c44a0457 <unknown>
vwredsum.vs v8, v4, v20
# CHECK-INST: vwredsum.vs v8, v4, v20
# CHECK-ENCODING: [0x57,0x04,0x4a,0xc6]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a c6 <unknown>
+# CHECK-UNKNOWN: c64a0457 <unknown>
vredsum.vs v0, v4, v20, v0.t
# CHECK-INST: vredsum.vs v0, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x20,0x4a,0x00]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 20 4a 00 <unknown>
+# CHECK-UNKNOWN: 004a2057 <unknown>
diff --git a/llvm/test/MC/RISCV/rvv/shift.s b/llvm/test/MC/RISCV/rvv/shift.s
index 8a2e82f3c8fe..017e12499dc2 100644
--- a/llvm/test/MC/RISCV/rvv/shift.s
+++ b/llvm/test/MC/RISCV/rvv/shift.s
@@ -12,256 +12,256 @@ vsll.vv v8, v4, v20, v0.t
# CHECK-INST: vsll.vv v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x04,0x4a,0x94]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 94 <unknown>
+# CHECK-UNKNOWN: 944a0457 <unknown>
vsll.vv v8, v4, v20
# CHECK-INST: vsll.vv v8, v4, v20
# CHECK-ENCODING: [0x57,0x04,0x4a,0x96]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 96 <unknown>
+# CHECK-UNKNOWN: 964a0457 <unknown>
vsll.vx v8, v4, a0, v0.t
# CHECK-INST: vsll.vx v8, v4, a0, v0.t
# CHECK-ENCODING: [0x57,0x44,0x45,0x94]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 94 <unknown>
+# CHECK-UNKNOWN: 94454457 <unknown>
vsll.vx v8, v4, a0
# CHECK-INST: vsll.vx v8, v4, a0
# CHECK-ENCODING: [0x57,0x44,0x45,0x96]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 96 <unknown>
+# CHECK-UNKNOWN: 96454457 <unknown>
vsll.vi v8, v4, 31, v0.t
# CHECK-INST: vsll.vi v8, v4, 31, v0.t
# CHECK-ENCODING: [0x57,0xb4,0x4f,0x94]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 4f 94 <unknown>
+# CHECK-UNKNOWN: 944fb457 <unknown>
vsll.vi v8, v4, 31
# CHECK-INST: vsll.vi v8, v4, 31
# CHECK-ENCODING: [0x57,0xb4,0x4f,0x96]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 4f 96 <unknown>
+# CHECK-UNKNOWN: 964fb457 <unknown>
vsrl.vv v8, v4, v20, v0.t
# CHECK-INST: vsrl.vv v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x04,0x4a,0xa0]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a a0 <unknown>
+# CHECK-UNKNOWN: a04a0457 <unknown>
vsrl.vv v8, v4, v20
# CHECK-INST: vsrl.vv v8, v4, v20
# CHECK-ENCODING: [0x57,0x04,0x4a,0xa2]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a a2 <unknown>
+# CHECK-UNKNOWN: a24a0457 <unknown>
vsrl.vx v8, v4, a0, v0.t
# CHECK-INST: vsrl.vx v8, v4, a0, v0.t
# CHECK-ENCODING: [0x57,0x44,0x45,0xa0]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 a0 <unknown>
+# CHECK-UNKNOWN: a0454457 <unknown>
vsrl.vx v8, v4, a0
# CHECK-INST: vsrl.vx v8, v4, a0
# CHECK-ENCODING: [0x57,0x44,0x45,0xa2]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 a2 <unknown>
+# CHECK-UNKNOWN: a2454457 <unknown>
vsrl.vi v8, v4, 31, v0.t
# CHECK-INST: vsrl.vi v8, v4, 31, v0.t
# CHECK-ENCODING: [0x57,0xb4,0x4f,0xa0]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 4f a0 <unknown>
+# CHECK-UNKNOWN: a04fb457 <unknown>
vsrl.vi v8, v4, 31
# CHECK-INST: vsrl.vi v8, v4, 31
# CHECK-ENCODING: [0x57,0xb4,0x4f,0xa2]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 4f a2 <unknown>
+# CHECK-UNKNOWN: a24fb457 <unknown>
vsra.vv v8, v4, v20, v0.t
# CHECK-INST: vsra.vv v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x04,0x4a,0xa4]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a a4 <unknown>
+# CHECK-UNKNOWN: a44a0457 <unknown>
vsra.vv v8, v4, v20
# CHECK-INST: vsra.vv v8, v4, v20
# CHECK-ENCODING: [0x57,0x04,0x4a,0xa6]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a a6 <unknown>
+# CHECK-UNKNOWN: a64a0457 <unknown>
vsra.vx v8, v4, a0, v0.t
# CHECK-INST: vsra.vx v8, v4, a0, v0.t
# CHECK-ENCODING: [0x57,0x44,0x45,0xa4]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 a4 <unknown>
+# CHECK-UNKNOWN: a4454457 <unknown>
vsra.vx v8, v4, a0
# CHECK-INST: vsra.vx v8, v4, a0
# CHECK-ENCODING: [0x57,0x44,0x45,0xa6]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 a6 <unknown>
+# CHECK-UNKNOWN: a6454457 <unknown>
vsra.vi v8, v4, 31, v0.t
# CHECK-INST: vsra.vi v8, v4, 31, v0.t
# CHECK-ENCODING: [0x57,0xb4,0x4f,0xa4]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 4f a4 <unknown>
+# CHECK-UNKNOWN: a44fb457 <unknown>
vsra.vi v8, v4, 31
# CHECK-INST: vsra.vi v8, v4, 31
# CHECK-ENCODING: [0x57,0xb4,0x4f,0xa6]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 4f a6 <unknown>
+# CHECK-UNKNOWN: a64fb457 <unknown>
vnsrl.wv v8, v4, v20, v0.t
# CHECK-INST: vnsrl.wv v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x04,0x4a,0xb0]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a b0 <unknown>
+# CHECK-UNKNOWN: b04a0457 <unknown>
vnsrl.wv v4, v4, v20, v0.t
# CHECK-INST: vnsrl.wv v4, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x02,0x4a,0xb0]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 02 4a b0 <unknown>
+# CHECK-UNKNOWN: b04a0257 <unknown>
vnsrl.wv v8, v4, v20
# CHECK-INST: vnsrl.wv v8, v4, v20
# CHECK-ENCODING: [0x57,0x04,0x4a,0xb2]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a b2 <unknown>
+# CHECK-UNKNOWN: b24a0457 <unknown>
vnsrl.wx v8, v4, a0, v0.t
# CHECK-INST: vnsrl.wx v8, v4, a0, v0.t
# CHECK-ENCODING: [0x57,0x44,0x45,0xb0]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 b0 <unknown>
+# CHECK-UNKNOWN: b0454457 <unknown>
vnsrl.wx v8, v4, a0
# CHECK-INST: vnsrl.wx v8, v4, a0
# CHECK-ENCODING: [0x57,0x44,0x45,0xb2]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 b2 <unknown>
+# CHECK-UNKNOWN: b2454457 <unknown>
vnsrl.wi v8, v4, 31, v0.t
# CHECK-INST: vnsrl.wi v8, v4, 31, v0.t
# CHECK-ENCODING: [0x57,0xb4,0x4f,0xb0]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 4f b0 <unknown>
+# CHECK-UNKNOWN: b04fb457 <unknown>
vnsrl.wi v8, v4, 31
# CHECK-INST: vnsrl.wi v8, v4, 31
# CHECK-ENCODING: [0x57,0xb4,0x4f,0xb2]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 4f b2 <unknown>
+# CHECK-UNKNOWN: b24fb457 <unknown>
vnsra.wv v8, v4, v20, v0.t
# CHECK-INST: vnsra.wv v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x04,0x4a,0xb4]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a b4 <unknown>
+# CHECK-UNKNOWN: b44a0457 <unknown>
vnsra.wv v8, v4, v20
# CHECK-INST: vnsra.wv v8, v4, v20
# CHECK-ENCODING: [0x57,0x04,0x4a,0xb6]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a b6 <unknown>
+# CHECK-UNKNOWN: b64a0457 <unknown>
vnsra.wx v8, v4, a0, v0.t
# CHECK-INST: vnsra.wx v8, v4, a0, v0.t
# CHECK-ENCODING: [0x57,0x44,0x45,0xb4]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 b4 <unknown>
+# CHECK-UNKNOWN: b4454457 <unknown>
vnsra.wx v8, v4, a0
# CHECK-INST: vnsra.wx v8, v4, a0
# CHECK-ENCODING: [0x57,0x44,0x45,0xb6]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 b6 <unknown>
+# CHECK-UNKNOWN: b6454457 <unknown>
vnsra.wi v8, v4, 31, v0.t
# CHECK-INST: vnsra.wi v8, v4, 31, v0.t
# CHECK-ENCODING: [0x57,0xb4,0x4f,0xb4]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 4f b4 <unknown>
+# CHECK-UNKNOWN: b44fb457 <unknown>
vnsra.wi v8, v4, 31
# CHECK-INST: vnsra.wi v8, v4, 31
# CHECK-ENCODING: [0x57,0xb4,0x4f,0xb6]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 4f b6 <unknown>
+# CHECK-UNKNOWN: b64fb457 <unknown>
vssrl.vv v8, v4, v20, v0.t
# CHECK-INST: vssrl.vv v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x04,0x4a,0xa8]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a a8 <unknown>
+# CHECK-UNKNOWN: a84a0457 <unknown>
vssrl.vv v8, v4, v20
# CHECK-INST: vssrl.vv v8, v4, v20
# CHECK-ENCODING: [0x57,0x04,0x4a,0xaa]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a aa <unknown>
+# CHECK-UNKNOWN: aa4a0457 <unknown>
vssrl.vx v8, v4, a0, v0.t
# CHECK-INST: vssrl.vx v8, v4, a0, v0.t
# CHECK-ENCODING: [0x57,0x44,0x45,0xa8]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 a8 <unknown>
+# CHECK-UNKNOWN: a8454457 <unknown>
vssrl.vx v8, v4, a0
# CHECK-INST: vssrl.vx v8, v4, a0
# CHECK-ENCODING: [0x57,0x44,0x45,0xaa]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 aa <unknown>
+# CHECK-UNKNOWN: aa454457 <unknown>
vssrl.vi v8, v4, 31, v0.t
# CHECK-INST: vssrl.vi v8, v4, 31, v0.t
# CHECK-ENCODING: [0x57,0xb4,0x4f,0xa8]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 4f a8 <unknown>
+# CHECK-UNKNOWN: a84fb457 <unknown>
vssrl.vi v8, v4, 31
# CHECK-INST: vssrl.vi v8, v4, 31
# CHECK-ENCODING: [0x57,0xb4,0x4f,0xaa]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 4f aa <unknown>
+# CHECK-UNKNOWN: aa4fb457 <unknown>
vssra.vv v8, v4, v20, v0.t
# CHECK-INST: vssra.vv v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x04,0x4a,0xac]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a ac <unknown>
+# CHECK-UNKNOWN: ac4a0457 <unknown>
vssra.vv v8, v4, v20
# CHECK-INST: vssra.vv v8, v4, v20
# CHECK-ENCODING: [0x57,0x04,0x4a,0xae]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a ae <unknown>
+# CHECK-UNKNOWN: ae4a0457 <unknown>
vssra.vx v8, v4, a0, v0.t
# CHECK-INST: vssra.vx v8, v4, a0, v0.t
# CHECK-ENCODING: [0x57,0x44,0x45,0xac]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 ac <unknown>
+# CHECK-UNKNOWN: ac454457 <unknown>
vssra.vx v8, v4, a0
# CHECK-INST: vssra.vx v8, v4, a0
# CHECK-ENCODING: [0x57,0x44,0x45,0xae]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 ae <unknown>
+# CHECK-UNKNOWN: ae454457 <unknown>
vssra.vi v8, v4, 31, v0.t
# CHECK-INST: vssra.vi v8, v4, 31, v0.t
# CHECK-ENCODING: [0x57,0xb4,0x4f,0xac]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 4f ac <unknown>
+# CHECK-UNKNOWN: ac4fb457 <unknown>
vssra.vi v8, v4, 31
# CHECK-INST: vssra.vi v8, v4, 31
# CHECK-ENCODING: [0x57,0xb4,0x4f,0xae]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 4f ae <unknown>
+# CHECK-UNKNOWN: ae4fb457 <unknown>
diff --git a/llvm/test/MC/RISCV/rvv/sign-injection.s b/llvm/test/MC/RISCV/rvv/sign-injection.s
index 96d37c42d208..23e9be868a42 100644
--- a/llvm/test/MC/RISCV/rvv/sign-injection.s
+++ b/llvm/test/MC/RISCV/rvv/sign-injection.s
@@ -15,70 +15,70 @@ vfsgnj.vv v8, v4, v20, v0.t
# CHECK-INST: vfsgnj.vv v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x14,0x4a,0x20]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a 20 <unknown>
+# CHECK-UNKNOWN: 204a1457 <unknown>
vfsgnj.vv v8, v4, v20
# CHECK-INST: vfsgnj.vv v8, v4, v20
# CHECK-ENCODING: [0x57,0x14,0x4a,0x22]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a 22 <unknown>
+# CHECK-UNKNOWN: 224a1457 <unknown>
vfsgnj.vf v8, v4, fa0, v0.t
# CHECK-INST: vfsgnj.vf v8, v4, fa0, v0.t
# CHECK-ENCODING: [0x57,0x54,0x45,0x20]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 20 <unknown>
+# CHECK-UNKNOWN: 20455457 <unknown>
vfsgnj.vf v8, v4, fa0
# CHECK-INST: vfsgnj.vf v8, v4, fa0
# CHECK-ENCODING: [0x57,0x54,0x45,0x22]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 22 <unknown>
+# CHECK-UNKNOWN: 22455457 <unknown>
vfsgnjn.vv v8, v4, v20, v0.t
# CHECK-INST: vfsgnjn.vv v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x14,0x4a,0x24]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a 24 <unknown>
+# CHECK-UNKNOWN: 244a1457 <unknown>
vfsgnjn.vv v8, v4, v20
# CHECK-INST: vfsgnjn.vv v8, v4, v20
# CHECK-ENCODING: [0x57,0x14,0x4a,0x26]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a 26 <unknown>
+# CHECK-UNKNOWN: 264a1457 <unknown>
vfsgnjn.vf v8, v4, fa0, v0.t
# CHECK-INST: vfsgnjn.vf v8, v4, fa0, v0.t
# CHECK-ENCODING: [0x57,0x54,0x45,0x24]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 24 <unknown>
+# CHECK-UNKNOWN: 24455457 <unknown>
vfsgnjn.vf v8, v4, fa0
# CHECK-INST: vfsgnjn.vf v8, v4, fa0
# CHECK-ENCODING: [0x57,0x54,0x45,0x26]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 26 <unknown>
+# CHECK-UNKNOWN: 26455457 <unknown>
vfsgnjx.vv v8, v4, v20, v0.t
# CHECK-INST: vfsgnjx.vv v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x14,0x4a,0x28]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a 28 <unknown>
+# CHECK-UNKNOWN: 284a1457 <unknown>
vfsgnjx.vv v8, v4, v20
# CHECK-INST: vfsgnjx.vv v8, v4, v20
# CHECK-ENCODING: [0x57,0x14,0x4a,0x2a]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 14 4a 2a <unknown>
+# CHECK-UNKNOWN: 2a4a1457 <unknown>
vfsgnjx.vf v8, v4, fa0, v0.t
# CHECK-INST: vfsgnjx.vf v8, v4, fa0, v0.t
# CHECK-ENCODING: [0x57,0x54,0x45,0x28]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 28 <unknown>
+# CHECK-UNKNOWN: 28455457 <unknown>
vfsgnjx.vf v8, v4, fa0
# CHECK-INST: vfsgnjx.vf v8, v4, fa0
# CHECK-ENCODING: [0x57,0x54,0x45,0x2a]
# CHECK-ERROR: instruction requires the following: 'V'{{.*}}'Zve32f' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 54 45 2a <unknown>
+# CHECK-UNKNOWN: 2a455457 <unknown>
diff --git a/llvm/test/MC/RISCV/rvv/snippet.s b/llvm/test/MC/RISCV/rvv/snippet.s
index c032e468dede..c3e57e193f84 100644
--- a/llvm/test/MC/RISCV/rvv/snippet.s
+++ b/llvm/test/MC/RISCV/rvv/snippet.s
@@ -6,27 +6,27 @@
loop:
vsetvli a3, a0, e16,m4,ta,ma # vtype = 16-bit integer vectors
-# CHECK-INST: d7 76 a5 0c vsetvli a3, a0, e16, m4, ta, ma
+# CHECK-INST: 0ca576d7 vsetvli a3, a0, e16, m4, ta, ma
vle16.v v4, (a1) # Get 16b vector
-# CHECK-INST: 07 d2 05 02 vle16.v v4, (a1)
+# CHECK-INST: 0205d207 vle16.v v4, (a1)
slli t1, a3, 1 # Multiply length by two bytes/element
-# CHECK-INST: 13 93 16 00 slli t1, a3, 0x1
+# CHECK-INST: 00169313 slli t1, a3, 0x1
add a1, a1, t1 # Bump pointer
-# CHECK-INST: b3 85 65 00 add a1, a1, t1
+# CHECK-INST: 006585b3 add a1, a1, t1
vwmul.vx v8, v4, x10 # 32b in <v8--v15>
-# CHECK-INST: 57 64 45 ee vwmul.vx v8, v4, a0
+# CHECK-INST: ee456457 vwmul.vx v8, v4, a0
vsetvli x0, a0, e32,m8,ta,ma # Operate on 32b values
-# CHECK-INST: 57 70 35 0d vsetvli zero, a0, e32, m8, ta, ma
+# CHECK-INST: 0d357057 vsetvli zero, a0, e32, m8, ta, ma
vsrl.vi v8, v8, 3
-# CHECK-INST: 57 b4 81 a2 vsrl.vi v8, v8, 0x3
+# CHECK-INST: a281b457 vsrl.vi v8, v8, 0x3
vse32.v v8, (a2) # Store vector of 32b
-# CHECK-INST: 27 64 06 02 vse32.v v8, (a2)
+# CHECK-INST: 02066427 vse32.v v8, (a2)
slli t1, a3, 2 # Multiply length by four bytes/element
-# CHECK-INST: 13 93 26 00 slli t1, a3, 0x2
+# CHECK-INST: 00269313 slli t1, a3, 0x2
add a2, a2, t1 # Bump pointer
-# CHECK-INST: 33 06 66 00 add a2, a2, t1
+# CHECK-INST: 00660633 add a2, a2, t1
sub a0, a0, a3 # Decrement count
-# CHECK-INST: 33 05 d5 40 sub a0, a0, a3
+# CHECK-INST: 40d50533 sub a0, a0, a3
bnez a0, loop # Any more?
-# CHECK-INST: e3 1a 05 fc bnez a0, 0x0
+# CHECK-INST: fc051ae3 bnez a0, 0x0
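Beyond the reformatting, snippet.s is the canonical RVV stripmining loop: vsetvli returns how many elements this pass may process, the body widen-multiplies 16-bit inputs into 32-bit results, shifts and stores them, and the pointers and remaining count advance by that amount until a0 reaches zero. A rough scalar model in Python, assuming a hypothetical vlmax and scalar multiplier (both illustrative; the real vl depends on VLEN and the e16,m4 configuration):

    def stripmine(src, dst, n, scalar, vlmax=16):
        # Mirrors the loop: vsetvli / vle16.v / vwmul.vx / vsrl.vi / vse32.v
        i = 0
        while n > 0:
            vl = min(n, vlmax)                  # vsetvli a3, a0, e16,m4,...
            for j in range(i, i + vl):          # one vector pass, element-wise
                dst[j] = ((src[j] & 0xffff) * scalar) >> 3
            i += vl                             # slli/add: bump pointers
            n -= vl                             # sub a0, a0, a3; bnez a0, loop
        return dst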
diff --git a/llvm/test/MC/RISCV/rvv/store.s b/llvm/test/MC/RISCV/rvv/store.s
index a38f19f266fa..c6a34705fa4a 100644
--- a/llvm/test/MC/RISCV/rvv/store.s
+++ b/llvm/test/MC/RISCV/rvv/store.s
@@ -12,250 +12,250 @@ vsm.v v24, (a0)
# CHECK-INST: vsm.v v24, (a0)
# CHECK-ENCODING: [0x27,0x0c,0xb5,0x02]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c b5 02 <unknown>
+# CHECK-UNKNOWN: 02b50c27 <unknown>
vse8.v v24, (a0), v0.t
# CHECK-INST: vse8.v v24, (a0), v0.t
# CHECK-ENCODING: [0x27,0x0c,0x05,0x00]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 05 00 <unknown>
+# CHECK-UNKNOWN: 00050c27 <unknown>
vse8.v v24, (a0)
# CHECK-INST: vse8.v v24, (a0)
# CHECK-ENCODING: [0x27,0x0c,0x05,0x02]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 05 02 <unknown>
+# CHECK-UNKNOWN: 02050c27 <unknown>
vse16.v v24, (a0), v0.t
# CHECK-INST: vse16.v v24, (a0), v0.t
# CHECK-ENCODING: [0x27,0x5c,0x05,0x00]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 05 00 <unknown>
+# CHECK-UNKNOWN: 00055c27 <unknown>
vse16.v v24, (a0)
# CHECK-INST: vse16.v v24, (a0)
# CHECK-ENCODING: [0x27,0x5c,0x05,0x02]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 05 02 <unknown>
+# CHECK-UNKNOWN: 02055c27 <unknown>
vse32.v v24, (a0), v0.t
# CHECK-INST: vse32.v v24, (a0), v0.t
# CHECK-ENCODING: [0x27,0x6c,0x05,0x00]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 05 00 <unknown>
+# CHECK-UNKNOWN: 00056c27 <unknown>
vse32.v v24, (a0)
# CHECK-INST: vse32.v v24, (a0)
# CHECK-ENCODING: [0x27,0x6c,0x05,0x02]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 05 02 <unknown>
+# CHECK-UNKNOWN: 02056c27 <unknown>
vse64.v v24, (a0), v0.t
# CHECK-INST: vse64.v v24, (a0), v0.t
# CHECK-ENCODING: [0x27,0x7c,0x05,0x00]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 05 00 <unknown>
+# CHECK-UNKNOWN: 00057c27 <unknown>
vse64.v v24, (a0)
# CHECK-INST: vse64.v v24, (a0)
# CHECK-ENCODING: [0x27,0x7c,0x05,0x02]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 05 02 <unknown>
+# CHECK-UNKNOWN: 02057c27 <unknown>
vsse8.v v24, (a0), a1, v0.t
# CHECK-INST: vsse8.v v24, (a0), a1, v0.t
# CHECK-ENCODING: [0x27,0x0c,0xb5,0x08]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c b5 08 <unknown>
+# CHECK-UNKNOWN: 08b50c27 <unknown>
vsse8.v v24, (a0), a1
# CHECK-INST: vsse8.v v24, (a0), a1
# CHECK-ENCODING: [0x27,0x0c,0xb5,0x0a]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c b5 0a <unknown>
+# CHECK-UNKNOWN: 0ab50c27 <unknown>
vsse16.v v24, (a0), a1, v0.t
# CHECK-INST: vsse16.v v24, (a0), a1, v0.t
# CHECK-ENCODING: [0x27,0x5c,0xb5,0x08]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c b5 08 <unknown>
+# CHECK-UNKNOWN: 08b55c27 <unknown>
vsse16.v v24, (a0), a1
# CHECK-INST: vsse16.v v24, (a0), a1
# CHECK-ENCODING: [0x27,0x5c,0xb5,0x0a]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c b5 0a <unknown>
+# CHECK-UNKNOWN: 0ab55c27 <unknown>
vsse32.v v24, (a0), a1, v0.t
# CHECK-INST: vsse32.v v24, (a0), a1, v0.t
# CHECK-ENCODING: [0x27,0x6c,0xb5,0x08]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c b5 08 <unknown>
+# CHECK-UNKNOWN: 08b56c27 <unknown>
vsse32.v v24, (a0), a1
# CHECK-INST: vsse32.v v24, (a0), a1
# CHECK-ENCODING: [0x27,0x6c,0xb5,0x0a]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c b5 0a <unknown>
+# CHECK-UNKNOWN: 0ab56c27 <unknown>
vsse64.v v24, (a0), a1, v0.t
# CHECK-INST: vsse64.v v24, (a0), a1, v0.t
# CHECK-ENCODING: [0x27,0x7c,0xb5,0x08]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c b5 08 <unknown>
+# CHECK-UNKNOWN: 08b57c27 <unknown>
vsse64.v v24, (a0), a1
# CHECK-INST: vsse64.v v24, (a0), a1
# CHECK-ENCODING: [0x27,0x7c,0xb5,0x0a]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c b5 0a <unknown>
+# CHECK-UNKNOWN: 0ab57c27 <unknown>
vsuxei8.v v24, (a0), v4, v0.t
# CHECK-INST: vsuxei8.v v24, (a0), v4, v0.t
# CHECK-ENCODING: [0x27,0x0c,0x45,0x04]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 45 04 <unknown>
+# CHECK-UNKNOWN: 04450c27 <unknown>
vsuxei8.v v24, (a0), v4
# CHECK-INST: vsuxei8.v v24, (a0), v4
# CHECK-ENCODING: [0x27,0x0c,0x45,0x06]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 45 06 <unknown>
+# CHECK-UNKNOWN: 06450c27 <unknown>
vsuxei16.v v24, (a0), v4, v0.t
# CHECK-INST: vsuxei16.v v24, (a0), v4, v0.t
# CHECK-ENCODING: [0x27,0x5c,0x45,0x04]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 45 04 <unknown>
+# CHECK-UNKNOWN: 04455c27 <unknown>
vsuxei16.v v24, (a0), v4
# CHECK-INST: vsuxei16.v v24, (a0), v4
# CHECK-ENCODING: [0x27,0x5c,0x45,0x06]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 45 06 <unknown>
+# CHECK-UNKNOWN: 06455c27 <unknown>
vsuxei32.v v24, (a0), v4, v0.t
# CHECK-INST: vsuxei32.v v24, (a0), v4, v0.t
# CHECK-ENCODING: [0x27,0x6c,0x45,0x04]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 45 04 <unknown>
+# CHECK-UNKNOWN: 04456c27 <unknown>
vsuxei32.v v24, (a0), v4
# CHECK-INST: vsuxei32.v v24, (a0), v4
# CHECK-ENCODING: [0x27,0x6c,0x45,0x06]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 45 06 <unknown>
+# CHECK-UNKNOWN: 06456c27 <unknown>
vsuxei64.v v24, (a0), v4, v0.t
# CHECK-INST: vsuxei64.v v24, (a0), v4, v0.t
# CHECK-ENCODING: [0x27,0x7c,0x45,0x04]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 45 04 <unknown>
+# CHECK-UNKNOWN: 04457c27 <unknown>
vsuxei64.v v24, (a0), v4
# CHECK-INST: vsuxei64.v v24, (a0), v4
# CHECK-ENCODING: [0x27,0x7c,0x45,0x06]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 45 06 <unknown>
+# CHECK-UNKNOWN: 06457c27 <unknown>
vsoxei8.v v24, (a0), v4, v0.t
# CHECK-INST: vsoxei8.v v24, (a0), v4, v0.t
# CHECK-ENCODING: [0x27,0x0c,0x45,0x0c]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 45 0c <unknown>
+# CHECK-UNKNOWN: 0c450c27 <unknown>
vsoxei8.v v24, (a0), v4
# CHECK-INST: vsoxei8.v v24, (a0), v4
# CHECK-ENCODING: [0x27,0x0c,0x45,0x0e]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 45 0e <unknown>
+# CHECK-UNKNOWN: 0e450c27 <unknown>
vsoxei16.v v24, (a0), v4, v0.t
# CHECK-INST: vsoxei16.v v24, (a0), v4, v0.t
# CHECK-ENCODING: [0x27,0x5c,0x45,0x0c]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 45 0c <unknown>
+# CHECK-UNKNOWN: 0c455c27 <unknown>
vsoxei16.v v24, (a0), v4
# CHECK-INST: vsoxei16.v v24, (a0), v4
# CHECK-ENCODING: [0x27,0x5c,0x45,0x0e]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 45 0e <unknown>
+# CHECK-UNKNOWN: 0e455c27 <unknown>
vsoxei32.v v24, (a0), v4, v0.t
# CHECK-INST: vsoxei32.v v24, (a0), v4, v0.t
# CHECK-ENCODING: [0x27,0x6c,0x45,0x0c]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 45 0c <unknown>
+# CHECK-UNKNOWN: 0c456c27 <unknown>
vsoxei32.v v24, (a0), v4
# CHECK-INST: vsoxei32.v v24, (a0), v4
# CHECK-ENCODING: [0x27,0x6c,0x45,0x0e]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 45 0e <unknown>
+# CHECK-UNKNOWN: 0e456c27 <unknown>
vsoxei64.v v24, (a0), v4, v0.t
# CHECK-INST: vsoxei64.v v24, (a0), v4, v0.t
# CHECK-ENCODING: [0x27,0x7c,0x45,0x0c]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 45 0c <unknown>
+# CHECK-UNKNOWN: 0c457c27 <unknown>
vsoxei64.v v24, (a0), v4
# CHECK-INST: vsoxei64.v v24, (a0), v4
# CHECK-ENCODING: [0x27,0x7c,0x45,0x0e]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 45 0e <unknown>
+# CHECK-UNKNOWN: 0e457c27 <unknown>
vs1r.v v24, (a0)
# CHECK-INST: vs1r.v v24, (a0)
# CHECK-ENCODING: [0x27,0x0c,0x85,0x02]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 85 02 <unknown>
+# CHECK-UNKNOWN: 02850c27 <unknown>
vs2r.v v24, (a0)
# CHECK-INST: vs2r.v v24, (a0)
# CHECK-ENCODING: [0x27,0x0c,0x85,0x22]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 85 22 <unknown>
+# CHECK-UNKNOWN: 22850c27 <unknown>
vs4r.v v24, (a0)
# CHECK-INST: vs4r.v v24, (a0)
# CHECK-ENCODING: [0x27,0x0c,0x85,0x62]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 85 62 <unknown>
+# CHECK-UNKNOWN: 62850c27 <unknown>
vs8r.v v24, (a0)
# CHECK-INST: vs8r.v v24, (a0)
# CHECK-ENCODING: [0x27,0x0c,0x85,0xe2]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 85 e2 <unknown>
+# CHECK-UNKNOWN: e2850c27 <unknown>
vsm.v v24, 0(a0)
# CHECK-INST: vsm.v v24, (a0)
# CHECK-ENCODING: [0x27,0x0c,0xb5,0x02]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c b5 02 <unknown>
+# CHECK-UNKNOWN: 02b50c27 <unknown>
vse8.v v24, 0(a0), v0.t
# CHECK-INST: vse8.v v24, (a0), v0.t
# CHECK-ENCODING: [0x27,0x0c,0x05,0x00]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 05 00 <unknown>
+# CHECK-UNKNOWN: 00050c27 <unknown>
vsse16.v v24, 0(a0), a1, v0.t
# CHECK-INST: vsse16.v v24, (a0), a1, v0.t
# CHECK-ENCODING: [0x27,0x5c,0xb5,0x08]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c b5 08 <unknown>
+# CHECK-UNKNOWN: 08b55c27 <unknown>
vsuxei8.v v24, 0(a0), v4, v0.t
# CHECK-INST: vsuxei8.v v24, (a0), v4, v0.t
# CHECK-ENCODING: [0x27,0x0c,0x45,0x04]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 45 04 <unknown>
+# CHECK-UNKNOWN: 04450c27 <unknown>
vsoxei32.v v24, 0(a0), v4, v0.t
# CHECK-INST: vsoxei32.v v24, (a0), v4, v0.t
# CHECK-ENCODING: [0x27,0x6c,0x45,0x0c]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 45 0c <unknown>
+# CHECK-UNKNOWN: 0c456c27 <unknown>
diff --git a/llvm/test/MC/RISCV/rvv/sub.s b/llvm/test/MC/RISCV/rvv/sub.s
index 3cc75ae730e7..6a637d9207ee 100644
--- a/llvm/test/MC/RISCV/rvv/sub.s
+++ b/llvm/test/MC/RISCV/rvv/sub.s
@@ -12,298 +12,298 @@ vsub.vv v8, v4, v20, v0.t
# CHECK-INST: vsub.vv v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x04,0x4a,0x08]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 08 <unknown>
+# CHECK-UNKNOWN: 084a0457 <unknown>
vsub.vv v8, v4, v20
# CHECK-INST: vsub.vv v8, v4, v20
# CHECK-ENCODING: [0x57,0x04,0x4a,0x0a]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 0a <unknown>
+# CHECK-UNKNOWN: 0a4a0457 <unknown>
vsub.vx v8, v4, a0, v0.t
# CHECK-INST: vsub.vx v8, v4, a0, v0.t
# CHECK-ENCODING: [0x57,0x44,0x45,0x08]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 08 <unknown>
+# CHECK-UNKNOWN: 08454457 <unknown>
vsub.vx v8, v4, a0
# CHECK-INST: vsub.vx v8, v4, a0
# CHECK-ENCODING: [0x57,0x44,0x45,0x0a]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 0a <unknown>
+# CHECK-UNKNOWN: 0a454457 <unknown>
vrsub.vx v8, v4, a0, v0.t
# CHECK-INST: vrsub.vx v8, v4, a0, v0.t
# CHECK-ENCODING: [0x57,0x44,0x45,0x0c]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 0c <unknown>
+# CHECK-UNKNOWN: 0c454457 <unknown>
vrsub.vx v8, v4, a0
# CHECK-INST: vrsub.vx v8, v4, a0
# CHECK-ENCODING: [0x57,0x44,0x45,0x0e]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 0e <unknown>
+# CHECK-UNKNOWN: 0e454457 <unknown>
vrsub.vi v8, v4, 15, v0.t
# CHECK-INST: vrsub.vi v8, v4, 15, v0.t
# CHECK-ENCODING: [0x57,0xb4,0x47,0x0c]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 47 0c <unknown>
+# CHECK-UNKNOWN: 0c47b457 <unknown>
vrsub.vi v8, v4, 15
# CHECK-INST: vrsub.vi v8, v4, 15
# CHECK-ENCODING: [0x57,0xb4,0x47,0x0e]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 47 0e <unknown>
+# CHECK-UNKNOWN: 0e47b457 <unknown>
vwsubu.vv v8, v4, v20, v0.t
# CHECK-INST: vwsubu.vv v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x24,0x4a,0xc8]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a c8 <unknown>
+# CHECK-UNKNOWN: c84a2457 <unknown>
vwsubu.vv v8, v4, v20
# CHECK-INST: vwsubu.vv v8, v4, v20
# CHECK-ENCODING: [0x57,0x24,0x4a,0xca]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a ca <unknown>
+# CHECK-UNKNOWN: ca4a2457 <unknown>
vwsubu.vx v8, v4, a0, v0.t
# CHECK-INST: vwsubu.vx v8, v4, a0, v0.t
# CHECK-ENCODING: [0x57,0x64,0x45,0xc8]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 c8 <unknown>
+# CHECK-UNKNOWN: c8456457 <unknown>
vwsubu.vx v8, v4, a0
# CHECK-INST: vwsubu.vx v8, v4, a0
# CHECK-ENCODING: [0x57,0x64,0x45,0xca]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 ca <unknown>
+# CHECK-UNKNOWN: ca456457 <unknown>
vwsub.vv v8, v4, v20, v0.t
# CHECK-INST: vwsub.vv v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x24,0x4a,0xcc]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a cc <unknown>
+# CHECK-UNKNOWN: cc4a2457 <unknown>
vwsub.vv v8, v4, v20
# CHECK-INST: vwsub.vv v8, v4, v20
# CHECK-ENCODING: [0x57,0x24,0x4a,0xce]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a ce <unknown>
+# CHECK-UNKNOWN: ce4a2457 <unknown>
vwsub.vx v8, v4, a0, v0.t
# CHECK-INST: vwsub.vx v8, v4, a0, v0.t
# CHECK-ENCODING: [0x57,0x64,0x45,0xcc]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 cc <unknown>
+# CHECK-UNKNOWN: cc456457 <unknown>
vwsub.vx v8, v4, a0
# CHECK-INST: vwsub.vx v8, v4, a0
# CHECK-ENCODING: [0x57,0x64,0x45,0xce]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 ce <unknown>
+# CHECK-UNKNOWN: ce456457 <unknown>
vwsubu.wv v8, v4, v20, v0.t
# CHECK-INST: vwsubu.wv v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x24,0x4a,0xd8]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a d8 <unknown>
+# CHECK-UNKNOWN: d84a2457 <unknown>
vwsubu.wv v8, v4, v20
# CHECK-INST: vwsubu.wv v8, v4, v20
# CHECK-ENCODING: [0x57,0x24,0x4a,0xda]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a da <unknown>
+# CHECK-UNKNOWN: da4a2457 <unknown>
vwsubu.wx v8, v4, a0, v0.t
# CHECK-INST: vwsubu.wx v8, v4, a0, v0.t
# CHECK-ENCODING: [0x57,0x64,0x45,0xd8]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 d8 <unknown>
+# CHECK-UNKNOWN: d8456457 <unknown>
vwsubu.wx v8, v4, a0
# CHECK-INST: vwsubu.wx v8, v4, a0
# CHECK-ENCODING: [0x57,0x64,0x45,0xda]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 da <unknown>
+# CHECK-UNKNOWN: da456457 <unknown>
vwsub.wv v8, v4, v20, v0.t
# CHECK-INST: vwsub.wv v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x24,0x4a,0xdc]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a dc <unknown>
+# CHECK-UNKNOWN: dc4a2457 <unknown>
vwsub.wv v8, v4, v20
# CHECK-INST: vwsub.wv v8, v4, v20
# CHECK-ENCODING: [0x57,0x24,0x4a,0xde]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a de <unknown>
+# CHECK-UNKNOWN: de4a2457 <unknown>
vwsub.wx v8, v4, a0, v0.t
# CHECK-INST: vwsub.wx v8, v4, a0, v0.t
# CHECK-ENCODING: [0x57,0x64,0x45,0xdc]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 dc <unknown>
+# CHECK-UNKNOWN: dc456457 <unknown>
vwsub.wx v8, v4, a0
# CHECK-INST: vwsub.wx v8, v4, a0
# CHECK-ENCODING: [0x57,0x64,0x45,0xde]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 de <unknown>
+# CHECK-UNKNOWN: de456457 <unknown>
vsbc.vvm v8, v4, v20, v0
# CHECK-INST: vsbc.vvm v8, v4, v20, v0
# CHECK-ENCODING: [0x57,0x04,0x4a,0x48]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 48 <unknown>
+# CHECK-UNKNOWN: 484a0457 <unknown>
vsbc.vvm v4, v4, v20, v0
# CHECK-INST: vsbc.vvm v4, v4, v20, v0
# CHECK-ENCODING: [0x57,0x02,0x4a,0x48]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 02 4a 48 <unknown>
+# CHECK-UNKNOWN: 484a0257 <unknown>
vsbc.vvm v8, v4, v8, v0
# CHECK-INST: vsbc.vvm v8, v4, v8, v0
# CHECK-ENCODING: [0x57,0x04,0x44,0x48]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 44 48 <unknown>
+# CHECK-UNKNOWN: 48440457 <unknown>
vsbc.vxm v8, v4, a0, v0
# CHECK-INST: vsbc.vxm v8, v4, a0, v0
# CHECK-ENCODING: [0x57,0x44,0x45,0x48]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 48 <unknown>
+# CHECK-UNKNOWN: 48454457 <unknown>
vmsbc.vvm v8, v4, v20, v0
# CHECK-INST: vmsbc.vvm v8, v4, v20, v0
# CHECK-ENCODING: [0x57,0x04,0x4a,0x4c]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 4c <unknown>
+# CHECK-UNKNOWN: 4c4a0457 <unknown>
vmsbc.vvm v4, v4, v20, v0
# CHECK-INST: vmsbc.vvm v4, v4, v20, v0
# CHECK-ENCODING: [0x57,0x02,0x4a,0x4c]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 02 4a 4c <unknown>
+# CHECK-UNKNOWN: 4c4a0257 <unknown>
vmsbc.vvm v8, v4, v8, v0
# CHECK-INST: vmsbc.vvm v8, v4, v8, v0
# CHECK-ENCODING: [0x57,0x04,0x44,0x4c]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 44 4c <unknown>
+# CHECK-UNKNOWN: 4c440457 <unknown>
vmsbc.vxm v8, v4, a0, v0
# CHECK-INST: vmsbc.vxm v8, v4, a0, v0
# CHECK-ENCODING: [0x57,0x44,0x45,0x4c]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 4c <unknown>
+# CHECK-UNKNOWN: 4c454457 <unknown>
vmsbc.vv v8, v4, v20
# CHECK-INST: vmsbc.vv v8, v4, v20
# CHECK-ENCODING: [0x57,0x04,0x4a,0x4e]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 4e <unknown>
+# CHECK-UNKNOWN: 4e4a0457 <unknown>
vmsbc.vx v8, v4, a0
# CHECK-INST: vmsbc.vx v8, v4, a0
# CHECK-ENCODING: [0x57,0x44,0x45,0x4e]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 4e <unknown>
+# CHECK-UNKNOWN: 4e454457 <unknown>
vssubu.vv v8, v4, v20, v0.t
# CHECK-INST: vssubu.vv v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x04,0x4a,0x88]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 88 <unknown>
+# CHECK-UNKNOWN: 884a0457 <unknown>
vssubu.vv v8, v4, v20
# CHECK-INST: vssubu.vv v8, v4, v20
# CHECK-ENCODING: [0x57,0x04,0x4a,0x8a]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 8a <unknown>
+# CHECK-UNKNOWN: 8a4a0457 <unknown>
vssubu.vx v8, v4, a0, v0.t
# CHECK-INST: vssubu.vx v8, v4, a0, v0.t
# CHECK-ENCODING: [0x57,0x44,0x45,0x88]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 88 <unknown>
+# CHECK-UNKNOWN: 88454457 <unknown>
vssubu.vx v8, v4, a0
# CHECK-INST: vssubu.vx v8, v4, a0
# CHECK-ENCODING: [0x57,0x44,0x45,0x8a]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 8a <unknown>
+# CHECK-UNKNOWN: 8a454457 <unknown>
vssub.vv v8, v4, v20, v0.t
# CHECK-INST: vssub.vv v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x04,0x4a,0x8c]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 8c <unknown>
+# CHECK-UNKNOWN: 8c4a0457 <unknown>
vssub.vv v8, v4, v20
# CHECK-INST: vssub.vv v8, v4, v20
# CHECK-ENCODING: [0x57,0x04,0x4a,0x8e]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 8e <unknown>
+# CHECK-UNKNOWN: 8e4a0457 <unknown>
vssub.vx v8, v4, a0, v0.t
# CHECK-INST: vssub.vx v8, v4, a0, v0.t
# CHECK-ENCODING: [0x57,0x44,0x45,0x8c]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 8c <unknown>
+# CHECK-UNKNOWN: 8c454457 <unknown>
vssub.vx v8, v4, a0
# CHECK-INST: vssub.vx v8, v4, a0
# CHECK-ENCODING: [0x57,0x44,0x45,0x8e]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 8e <unknown>
+# CHECK-UNKNOWN: 8e454457 <unknown>
vasub.vv v8, v4, v20, v0.t
# CHECK-INST: vasub.vv v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x24,0x4a,0x2c]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 2c <unknown>
+# CHECK-UNKNOWN: 2c4a2457 <unknown>
vasub.vv v8, v4, v20
# CHECK-INST: vasub.vv v8, v4, v20
# CHECK-ENCODING: [0x57,0x24,0x4a,0x2e]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 2e <unknown>
+# CHECK-UNKNOWN: 2e4a2457 <unknown>
vasub.vx v8, v4, a0, v0.t
# CHECK-INST: vasub.vx v8, v4, a0, v0.t
# CHECK-ENCODING: [0x57,0x64,0x45,0x2c]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 2c <unknown>
+# CHECK-UNKNOWN: 2c456457 <unknown>
vasub.vx v8, v4, a0
# CHECK-INST: vasub.vx v8, v4, a0
# CHECK-ENCODING: [0x57,0x64,0x45,0x2e]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 2e <unknown>
+# CHECK-UNKNOWN: 2e456457 <unknown>
vasubu.vv v8, v4, v20, v0.t
# CHECK-INST: vasubu.vv v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x24,0x4a,0x28]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 28 <unknown>
+# CHECK-UNKNOWN: 284a2457 <unknown>
vasubu.vv v8, v4, v20
# CHECK-INST: vasubu.vv v8, v4, v20
# CHECK-ENCODING: [0x57,0x24,0x4a,0x2a]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 24 4a 2a <unknown>
+# CHECK-UNKNOWN: 2a4a2457 <unknown>
vasubu.vx v8, v4, a0, v0.t
# CHECK-INST: vasubu.vx v8, v4, a0, v0.t
# CHECK-ENCODING: [0x57,0x64,0x45,0x28]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 28 <unknown>
+# CHECK-UNKNOWN: 28456457 <unknown>
vasubu.vx v8, v4, a0
# CHECK-INST: vasubu.vx v8, v4, a0
# CHECK-ENCODING: [0x57,0x64,0x45,0x2a]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 64 45 2a <unknown>
+# CHECK-UNKNOWN: 2a456457 <unknown>
diff --git a/llvm/test/MC/RISCV/rvv/vsetvl.s b/llvm/test/MC/RISCV/rvv/vsetvl.s
index 69a48d24f190..c9197d8917a4 100644
--- a/llvm/test/MC/RISCV/rvv/vsetvl.s
+++ b/llvm/test/MC/RISCV/rvv/vsetvl.s
@@ -13,149 +13,149 @@ vsetvli a2, a0, 0x224
# CHECK-INST: vsetvli a2, a0, 548
# CHECK-ENCODING: [0x57,0x76,0x45,0x22]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 76 45 22 <unknown>
+# CHECK-UNKNOWN: 22457657 <unknown>
vsetvli a2, a0, 0xd0
# CHECK-INST: vsetvli a2, a0, e32, m1, ta, ma
# CHECK-ENCODING: [0x57,0x76,0x05,0x0d]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 76 05 0d <unknown>
+# CHECK-UNKNOWN: 0d057657 <unknown>
vsetvli a2, a0, 0xd1
# CHECK-INST: vsetvli a2, a0, e32, m2, ta, ma
# CHECK-ENCODING: [0x57,0x76,0x15,0x0d]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 76 15 0d <unknown>
+# CHECK-UNKNOWN: 0d157657 <unknown>
vsetvli a2, a0, 0x50
# CHECK-INST: vsetvli a2, a0, e32, m1, ta, mu
# CHECK-ENCODING: [0x57,0x76,0x05,0x05]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 76 05 05 <unknown>
+# CHECK-UNKNOWN: 05057657 <unknown>
vsetvli a2, a0, 0x90
# CHECK-INST: vsetvli a2, a0, e32, m1, tu, ma
# CHECK-ENCODING: [0x57,0x76,0x05,0x09]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 76 05 09 <unknown>
+# CHECK-UNKNOWN: 09057657 <unknown>
vsetvli a2, a0, 144
# CHECK-INST: vsetvli a2, a0, e32, m1, tu, ma
# CHECK-ENCODING: [0x57,0x76,0x05,0x09]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 76 05 09 <unknown>
+# CHECK-UNKNOWN: 09057657 <unknown>
vsetvli a2, a0, e32, m1, ta, ma
# CHECK-INST: vsetvli a2, a0, e32, m1, ta, ma
# CHECK-ENCODING: [0x57,0x76,0x05,0x0d]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 76 05 0d <unknown>
+# CHECK-UNKNOWN: 0d057657 <unknown>
vsetvli a2, a0, e32, m2, ta, ma
# CHECK-INST: vsetvli a2, a0, e32, m2, ta, ma
# CHECK-ENCODING: [0x57,0x76,0x15,0x0d]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 76 15 0d <unknown>
+# CHECK-UNKNOWN: 0d157657 <unknown>
vsetvli a2, a0, e32, m4, ta, ma
# CHECK-INST: vsetvli a2, a0, e32, m4, ta, ma
# CHECK-ENCODING: [0x57,0x76,0x25,0x0d]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 76 25 0d <unknown>
+# CHECK-UNKNOWN: 0d257657 <unknown>
vsetvli a2, a0, e32, m8, ta, ma
# CHECK-INST: vsetvli a2, a0, e32, m8, ta, ma
# CHECK-ENCODING: [0x57,0x76,0x35,0x0d]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 76 35 0d <unknown>
+# CHECK-UNKNOWN: 0d357657 <unknown>
vsetvli a2, a0, e32, mf2, ta, ma
# CHECK-INST: vsetvli a2, a0, e32, mf2, ta, ma
# CHECK-ENCODING: [0x57,0x76,0x75,0x0d]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 76 75 0d <unknown>
+# CHECK-UNKNOWN: 0d757657 <unknown>
vsetvli a2, a0, e32, mf4, ta, ma
# CHECK-INST: vsetvli a2, a0, e32, mf4, ta, ma
# CHECK-ENCODING: [0x57,0x76,0x65,0x0d]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 76 65 0d <unknown>
+# CHECK-UNKNOWN: 0d657657 <unknown>
vsetvli a2, a0, e32, mf8, ta, ma
# CHECK-INST: vsetvli a2, a0, e32, mf8, ta, ma
# CHECK-ENCODING: [0x57,0x76,0x55,0x0d]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 76 55 0d <unknown>
+# CHECK-UNKNOWN: 0d557657 <unknown>
vsetvli a2, a0, e32, m1, ta, ma
# CHECK-INST: vsetvli a2, a0, e32, m1, ta, ma
# CHECK-ENCODING: [0x57,0x76,0x05,0x0d]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 76 05 0d <unknown>
+# CHECK-UNKNOWN: 0d057657 <unknown>
vsetvli a2, a0, e32, m1, tu, ma
# CHECK-INST: vsetvli a2, a0, e32, m1, tu, ma
# CHECK-ENCODING: [0x57,0x76,0x05,0x09]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 76 05 09 <unknown>
+# CHECK-UNKNOWN: 09057657 <unknown>
vsetvli a2, a0, e32, m1, ta, mu
# CHECK-INST: vsetvli a2, a0, e32, m1, ta, mu
# CHECK-ENCODING: [0x57,0x76,0x05,0x05]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 76 05 05 <unknown>
+# CHECK-UNKNOWN: 05057657 <unknown>
vsetvli a2, a0, e32, m1, tu, mu
# CHECK-INST: vsetvli a2, a0, e32, m1
# CHECK-ENCODING: [0x57,0x76,0x05,0x01]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 76 05 01 <unknown>
+# CHECK-UNKNOWN: 01057657 <unknown>
vsetvl a2, a0, a1
# CHECK-INST: vsetvl a2, a0, a1
# CHECK-ENCODING: [0x57,0x76,0xb5,0x80]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 76 b5 80 <unknown>
+# CHECK-UNKNOWN: 80b57657 <unknown>
# reserved field: vlmul[2:0]=4, vsew[2:0]=0b1xx, non-zero bits 8/9/10.
vsetivli a2, 0, 0x224
# CHECK-INST: vsetivli a2, 0, 548
# CHECK-ENCODING: [0x57,0x76,0x40,0xe2]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 76 40 e2 <unknown>
+# CHECK-UNKNOWN: e2407657 <unknown>
vsetivli a2, 0, 0xd0
# CHECK-INST: vsetivli a2, 0, e32, m1, ta, ma
# CHECK-ENCODING: [0x57,0x76,0x00,0xcd]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 76 00 cd <unknown>
+# CHECK-UNKNOWN: cd007657 <unknown>
vsetivli a2, 15, 0xd0
# CHECK-INST: vsetivli a2, 15, e32, m1, ta, ma
# CHECK-ENCODING: [0x57,0xf6,0x07,0xcd]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 f6 07 cd <unknown>
+# CHECK-UNKNOWN: cd07f657 <unknown>
vsetivli a2, 15, 208
# CHECK-INST: vsetivli a2, 15, e32, m1, ta, ma
# CHECK-ENCODING: [0x57,0xf6,0x07,0xcd]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 f6 07 cd <unknown>
+# CHECK-UNKNOWN: cd07f657 <unknown>
vsetivli a2, 0, e32, m1, ta, ma
# CHECK-INST: vsetivli a2, 0, e32, m1, ta, ma
# CHECK-ENCODING: [0x57,0x76,0x00,0xcd]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 76 00 cd <unknown>
+# CHECK-UNKNOWN: cd007657 <unknown>
vsetivli a2, 15, e32, m1, ta, ma
# CHECK-INST: vsetivli a2, 15, e32, m1, ta, ma
# CHECK-ENCODING: [0x57,0xf6,0x07,0xcd]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 f6 07 cd <unknown>
+# CHECK-UNKNOWN: cd07f657 <unknown>
vsetivli a2, 31, e32, m1, ta, ma
# CHECK-INST: vsetivli a2, 31, e32, m1, ta, ma
# CHECK-ENCODING: [0x57,0xf6,0x0f,0xcd]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 f6 0f cd <unknown>
+# CHECK-UNKNOWN: cd0ff657 <unknown>
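The "reserved field" comment in the hunk above summarizes the vtype layout behind these raw immediates: vlmul sits in bits [2:0], vsew in [5:3], vta at bit 6, vma at bit 7, and any set bit in [10:8] (or vlmul=4, vsew=0b1xx) makes the value reserved, which is why 0x224 disassembles as the bare integer 548 while 0xd0 pretty-prints as e32, m1, ta, ma. A hedged Python sketch of that decoding (function name illustrative):

    def decode_vtype(z):
        # vsetvli zimm fields per the RVV spec; reserved -> None,
        # which the disassembler renders as the raw integer.
        vlmul, vsew = z & 7, (z >> 3) & 7
        if z >> 8 or vlmul == 4 or vsew >= 4:
            return None
        lmul = {0: "m1", 1: "m2", 2: "m4", 3: "m8",
                5: "mf8", 6: "mf4", 7: "mf2"}[vlmul]
        return "e%d, %s, %s, %s" % (8 << vsew, lmul,
                                    "ta" if z & 0x40 else "tu",
                                    "ma" if z & 0x80 else "mu")

    assert decode_vtype(0xd0) == "e32, m1, ta, ma"
    assert decode_vtype(0x224) is None   # reserved, printed as 548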
diff --git a/llvm/test/MC/RISCV/rvv/xor.s b/llvm/test/MC/RISCV/rvv/xor.s
index 5ea0f694e0d6..572388ed2267 100644
--- a/llvm/test/MC/RISCV/rvv/xor.s
+++ b/llvm/test/MC/RISCV/rvv/xor.s
@@ -12,46 +12,46 @@ vxor.vv v8, v4, v20, v0.t
# CHECK-INST: vxor.vv v8, v4, v20, v0.t
# CHECK-ENCODING: [0x57,0x04,0x4a,0x2c]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 2c <unknown>
+# CHECK-UNKNOWN: 2c4a0457 <unknown>
vxor.vv v8, v4, v20
# CHECK-INST: vxor.vv v8, v4, v20
# CHECK-ENCODING: [0x57,0x04,0x4a,0x2e]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 04 4a 2e <unknown>
+# CHECK-UNKNOWN: 2e4a0457 <unknown>
vxor.vx v8, v4, a0, v0.t
# CHECK-INST: vxor.vx v8, v4, a0, v0.t
# CHECK-ENCODING: [0x57,0x44,0x45,0x2c]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 2c <unknown>
+# CHECK-UNKNOWN: 2c454457 <unknown>
vxor.vx v8, v4, a0
# CHECK-INST: vxor.vx v8, v4, a0
# CHECK-ENCODING: [0x57,0x44,0x45,0x2e]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 44 45 2e <unknown>
+# CHECK-UNKNOWN: 2e454457 <unknown>
vxor.vi v8, v4, 15, v0.t
# CHECK-INST: vxor.vi v8, v4, 15, v0.t
# CHECK-ENCODING: [0x57,0xb4,0x47,0x2c]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 47 2c <unknown>
+# CHECK-UNKNOWN: 2c47b457 <unknown>
vxor.vi v8, v4, 15
# CHECK-INST: vxor.vi v8, v4, 15
# CHECK-ENCODING: [0x57,0xb4,0x47,0x2e]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 47 2e <unknown>
+# CHECK-UNKNOWN: 2e47b457 <unknown>
vnot.v v8, v4, v0.t
# CHECK-INST: vnot.v v8, v4, v0.t
# CHECK-ENCODING: [0x57,0xb4,0x4f,0x2c]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 4f 2c <unknown>
+# CHECK-UNKNOWN: 2c4fb457 <unknown>
vnot.v v8, v4
# CHECK-INST: vnot.v v8, v4
# CHECK-ENCODING: [0x57,0xb4,0x4f,0x2e]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 57 b4 4f 2e <unknown>
+# CHECK-UNKNOWN: 2e4fb457 <unknown>
diff --git a/llvm/test/MC/RISCV/rvv/xsfvcp.s b/llvm/test/MC/RISCV/rvv/xsfvcp.s
index a137311f575a..4298bf7a7b7a 100644
--- a/llvm/test/MC/RISCV/rvv/xsfvcp.s
+++ b/llvm/test/MC/RISCV/rvv/xsfvcp.s
@@ -21,166 +21,166 @@ sf.vc.x 0x3, 0xf, 0x1f, a1
# CHECK-INST: sf.vc.x 3, 15, 31, a1
# CHECK-ENCODING: [0xdb,0xcf,0xf5,0x0e]
# CHECK-ERROR: instruction requires the following: 'XSfvcp' (SiFive Custom Vector Coprocessor Interface Instructions){{$}}
-# CHECK-UNKNOWN: db cf f5 0e <unknown>
+# CHECK-UNKNOWN: 0ef5cfdb <unknown>
sf.vc.i 0x3, 0xf, 0x1f, 15
# CHECK-INST: sf.vc.i 3, 15, 31, 15
# CHECK-ENCODING: [0xdb,0xbf,0xf7,0x0e]
# CHECK-ERROR: instruction requires the following: 'XSfvcp' (SiFive Custom Vector Coprocessor Interface Instructions){{$}}
-# CHECK-UNKNOWN: db bf f7 0e <unknown>
+# CHECK-UNKNOWN: 0ef7bfdb <unknown>
sf.vc.vv 0x3, 0x1f, v2, v1
# CHECK-INST: sf.vc.vv 3, 31, v2, v1
# CHECK-ENCODING: [0xdb,0x8f,0x20,0x2e]
# CHECK-ERROR: instruction requires the following: 'XSfvcp' (SiFive Custom Vector Coprocessor Interface Instructions){{$}}
-# CHECK-UNKNOWN: db 8f 20 2e <unknown>
+# CHECK-UNKNOWN: 2e208fdb <unknown>
sf.vc.xv 0x3, 0x1f, v2, a1
# CHECK-INST: sf.vc.xv 3, 31, v2, a1
# CHECK-ENCODING: [0xdb,0xcf,0x25,0x2e]
# CHECK-ERROR: instruction requires the following: 'XSfvcp' (SiFive Custom Vector Coprocessor Interface Instructions){{$}}
-# CHECK-UNKNOWN: db cf 25 2e <unknown>
+# CHECK-UNKNOWN: 2e25cfdb <unknown>
sf.vc.iv 0x3, 0x1f, v2, 15
# CHECK-INST: sf.vc.iv 3, 31, v2, 15
# CHECK-ENCODING: [0xdb,0xbf,0x27,0x2e]
# CHECK-ERROR: instruction requires the following: 'XSfvcp' (SiFive Custom Vector Coprocessor Interface Instructions){{$}}
-# CHECK-UNKNOWN: db bf 27 2e <unknown>
+# CHECK-UNKNOWN: 2e27bfdb <unknown>
sf.vc.fv 0x1, 0x1f, v2, fa1
# CHECK-INST: sf.vc.fv 1, 31, v2, fa1
# CHECK-ENCODING: [0xdb,0xdf,0x25,0x2e]
# CHECK-ERROR: instruction requires the following: 'XSfvcp' (SiFive Custom Vector Coprocessor Interface Instructions){{$}}
-# CHECK-UNKNOWN: db df 25 2e <unknown>
+# CHECK-UNKNOWN: 2e25dfdb <unknown>
sf.vc.vvv 0x3, v0, v2, v1
# CHECK-INST: sf.vc.vvv 3, v0, v2, v1
# CHECK-ENCODING: [0x5b,0x80,0x20,0xae]
# CHECK-ERROR: instruction requires the following: 'XSfvcp' (SiFive Custom Vector Coprocessor Interface Instructions){{$}}
-# CHECK-UNKNOWN: 5b 80 20 ae <unknown>
+# CHECK-UNKNOWN: ae20805b <unknown>
sf.vc.xvv 0x3, v0, v2, a1
# CHECK-INST: sf.vc.xvv 3, v0, v2, a1
# CHECK-ENCODING: [0x5b,0xc0,0x25,0xae]
# CHECK-ERROR: instruction requires the following: 'XSfvcp' (SiFive Custom Vector Coprocessor Interface Instructions){{$}}
-# CHECK-UNKNOWN: 5b c0 25 ae <unknown>
+# CHECK-UNKNOWN: ae25c05b <unknown>
sf.vc.ivv 0x3, v0, v2, 15
# CHECK-INST: sf.vc.ivv 3, v0, v2, 15
# CHECK-ENCODING: [0x5b,0xb0,0x27,0xae]
# CHECK-ERROR: instruction requires the following: 'XSfvcp' (SiFive Custom Vector Coprocessor Interface Instructions){{$}}
-# CHECK-UNKNOWN: 5b b0 27 ae <unknown>
+# CHECK-UNKNOWN: ae27b05b <unknown>
sf.vc.fvv 0x1, v0, v2, fa1
# CHECK-INST: sf.vc.fvv 1, v0, v2, fa1
# CHECK-ENCODING: [0x5b,0xd0,0x25,0xae]
# CHECK-ERROR: instruction requires the following: 'XSfvcp' (SiFive Custom Vector Coprocessor Interface Instructions){{$}}
-# CHECK-UNKNOWN: 5b d0 25 ae <unknown>
+# CHECK-UNKNOWN: ae25d05b <unknown>
sf.vc.vvw 0x3, v0, v2, v1
# CHECK-INST: sf.vc.vvw 3, v0, v2, v1
# CHECK-ENCODING: [0x5b,0x80,0x20,0xfe]
# CHECK-ERROR: instruction requires the following: 'XSfvcp' (SiFive Custom Vector Coprocessor Interface Instructions){{$}}
-# CHECK-UNKNOWN: 5b 80 20 fe <unknown>
+# CHECK-UNKNOWN: fe20805b <unknown>
sf.vc.xvw 0x3, v0, v2, a1
# CHECK-INST: sf.vc.xvw 3, v0, v2, a1
# CHECK-ENCODING: [0x5b,0xc0,0x25,0xfe]
# CHECK-ERROR: instruction requires the following: 'XSfvcp' (SiFive Custom Vector Coprocessor Interface Instructions){{$}}
-# CHECK-UNKNOWN: 5b c0 25 fe <unknown>
+# CHECK-UNKNOWN: fe25c05b <unknown>
sf.vc.ivw 0x3, v0, v2, 15
# CHECK-INST: sf.vc.ivw 3, v0, v2, 15
# CHECK-ENCODING: [0x5b,0xb0,0x27,0xfe]
# CHECK-ERROR: instruction requires the following: 'XSfvcp' (SiFive Custom Vector Coprocessor Interface Instructions){{$}}
-# CHECK-UNKNOWN: 5b b0 27 fe <unknown>
+# CHECK-UNKNOWN: fe27b05b <unknown>
sf.vc.fvw 0x1, v0, v2, fa1
# CHECK-INST: sf.vc.fvw 1, v0, v2, fa1
# CHECK-ENCODING: [0x5b,0xd0,0x25,0xfe]
# CHECK-ERROR: instruction requires the following: 'XSfvcp' (SiFive Custom Vector Coprocessor Interface Instructions){{$}}
-# CHECK-UNKNOWN: 5b d0 25 fe <unknown>
+# CHECK-UNKNOWN: fe25d05b <unknown>
sf.vc.v.x 0x3, 0xf, v0, a1
# CHECK-INST: sf.vc.v.x 3, 15, v0, a1
# CHECK-ENCODING: [0x5b,0xc0,0xf5,0x0c]
# CHECK-ERROR: instruction requires the following: 'XSfvcp' (SiFive Custom Vector Coprocessor Interface Instructions){{$}}
-# CHECK-UNKNOWN: 5b c0 f5 0c <unknown>
+# CHECK-UNKNOWN: 0cf5c05b <unknown>
sf.vc.v.i 0x3, 0xf, v0, 15
# CHECK-INST: sf.vc.v.i 3, 15, v0, 15
# CHECK-ENCODING: [0x5b,0xb0,0xf7,0x0c]
# CHECK-ERROR: instruction requires the following: 'XSfvcp' (SiFive Custom Vector Coprocessor Interface Instructions){{$}}
-# CHECK-UNKNOWN: 5b b0 f7 0c <unknown>
+# CHECK-UNKNOWN: 0cf7b05b <unknown>
sf.vc.v.vv 0x3, v0, v2, v1
# CHECK-INST: sf.vc.v.vv 3, v0, v2, v1
# CHECK-ENCODING: [0x5b,0x80,0x20,0x2c]
# CHECK-ERROR: instruction requires the following: 'XSfvcp' (SiFive Custom Vector Coprocessor Interface Instructions){{$}}
-# CHECK-UNKNOWN: 5b 80 20 2c <unknown>
+# CHECK-UNKNOWN: 2c20805b <unknown>
sf.vc.v.xv 0x3, v0, v2, a1
# CHECK-INST: sf.vc.v.xv 3, v0, v2, a1
# CHECK-ENCODING: [0x5b,0xc0,0x25,0x2c]
# CHECK-ERROR: instruction requires the following: 'XSfvcp' (SiFive Custom Vector Coprocessor Interface Instructions){{$}}
-# CHECK-UNKNOWN: 5b c0 25 2c <unknown>
+# CHECK-UNKNOWN: 2c25c05b <unknown>
sf.vc.v.iv 0x3, v0, v2, 15
# CHECK-INST: sf.vc.v.iv 3, v0, v2, 15
# CHECK-ENCODING: [0x5b,0xb0,0x27,0x2c]
# CHECK-ERROR: instruction requires the following: 'XSfvcp' (SiFive Custom Vector Coprocessor Interface Instructions){{$}}
-# CHECK-UNKNOWN: 5b b0 27 2c <unknown>
+# CHECK-UNKNOWN: 2c27b05b <unknown>
sf.vc.v.fv 0x1, v0, v2, fa1
# CHECK-INST: sf.vc.v.fv 1, v0, v2, fa1
# CHECK-ENCODING: [0x5b,0xd0,0x25,0x2c]
# CHECK-ERROR: instruction requires the following: 'XSfvcp' (SiFive Custom Vector Coprocessor Interface Instructions){{$}}
-# CHECK-UNKNOWN: 5b d0 25 2c <unknown>
+# CHECK-UNKNOWN: 2c25d05b <unknown>
sf.vc.v.vvv 0x3, v0, v2, v1
# CHECK-INST: sf.vc.v.vvv 3, v0, v2, v1
# CHECK-ENCODING: [0x5b,0x80,0x20,0xac]
# CHECK-ERROR: instruction requires the following: 'XSfvcp' (SiFive Custom Vector Coprocessor Interface Instructions){{$}}
-# CHECK-UNKNOWN: 5b 80 20 ac <unknown>
+# CHECK-UNKNOWN: ac20805b <unknown>
sf.vc.v.xvv 0x3, v0, v2, a1
# CHECK-INST: sf.vc.v.xvv 3, v0, v2, a1
# CHECK-ENCODING: [0x5b,0xc0,0x25,0xac]
# CHECK-ERROR: instruction requires the following: 'XSfvcp' (SiFive Custom Vector Coprocessor Interface Instructions){{$}}
-# CHECK-UNKNOWN: 5b c0 25 ac <unknown>
+# CHECK-UNKNOWN: ac25c05b <unknown>
sf.vc.v.ivv 0x3, v0, v2, 15
# CHECK-INST: sf.vc.v.ivv 3, v0, v2, 15
# CHECK-ENCODING: [0x5b,0xb0,0x27,0xac]
# CHECK-ERROR: instruction requires the following: 'XSfvcp' (SiFive Custom Vector Coprocessor Interface Instructions){{$}}
-# CHECK-UNKNOWN: 5b b0 27 ac <unknown>
+# CHECK-UNKNOWN: ac27b05b <unknown>
sf.vc.v.fvv 0x1, v0, v2, fa1
# CHECK-INST: sf.vc.v.fvv 1, v0, v2, fa1
# CHECK-ENCODING: [0x5b,0xd0,0x25,0xac]
# CHECK-ERROR: instruction requires the following: 'XSfvcp' (SiFive Custom Vector Coprocessor Interface Instructions){{$}}
-# CHECK-UNKNOWN: 5b d0 25 ac <unknown>
+# CHECK-UNKNOWN: ac25d05b <unknown>
sf.vc.v.vvw 0x3, v0, v2, v1
# CHECK-INST: sf.vc.v.vvw 3, v0, v2, v1
# CHECK-ENCODING: [0x5b,0x80,0x20,0xfc]
# CHECK-ERROR: instruction requires the following: 'XSfvcp' (SiFive Custom Vector Coprocessor Interface Instructions){{$}}
-# CHECK-UNKNOWN: 5b 80 20 fc <unknown>
+# CHECK-UNKNOWN: fc20805b <unknown>
sf.vc.v.xvw 0x3, v0, v2, a1
# CHECK-INST: sf.vc.v.xvw 3, v0, v2, a1
# CHECK-ENCODING: [0x5b,0xc0,0x25,0xfc]
# CHECK-ERROR: instruction requires the following: 'XSfvcp' (SiFive Custom Vector Coprocessor Interface Instructions){{$}}
-# CHECK-UNKNOWN: 5b c0 25 fc <unknown>
+# CHECK-UNKNOWN: fc25c05b <unknown>
sf.vc.v.ivw 0x3, v0, v2, 15
# CHECK-INST: sf.vc.v.ivw 3, v0, v2, 15
# CHECK-ENCODING: [0x5b,0xb0,0x27,0xfc]
# CHECK-ERROR: instruction requires the following: 'XSfvcp' (SiFive Custom Vector Coprocessor Interface Instructions){{$}}
-# CHECK-UNKNOWN: 5b b0 27 fc <unknown>
+# CHECK-UNKNOWN: fc27b05b <unknown>
sf.vc.v.fvw 0x1, v0, v2, fa1
# CHECK-INST: sf.vc.v.fvw 1, v0, v2, fa1
# CHECK-ENCODING: [0x5b,0xd0,0x25,0xfc]
# CHECK-ERROR: instruction requires the following: 'XSfvcp' (SiFive Custom Vector Coprocessor Interface Instructions){{$}}
-# CHECK-UNKNOWN: 5b d0 25 fc <unknown>
+# CHECK-UNKNOWN: fc25d05b <unknown>
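
The new CHECK-UNKNOWN values carry the same encodings as before, printed as a
single 32-bit word rather than four separate bytes: RISC-V instructions are
stored little-endian, so the word is the CHECK-ENCODING byte list read
back-to-front. A minimal sketch of the conversion (Python; the helper name is
illustrative, not part of LLVM):

    def encoding_to_word(enc):
        """Fold a little-endian CHECK-ENCODING byte list into the 32-bit
        word printed by the new CHECK-UNKNOWN lines."""
        assert len(enc) == 4  # all instructions in these tests are 32-bit
        return "{:08x}".format(int.from_bytes(bytes(enc), "little"))

    # First pair in the hunk above: [0x5b,0xb0,0xf7,0x0c] -> 0cf7b05b
    assert encoding_to_word([0x5b, 0xb0, 0xf7, 0x0c]) == "0cf7b05b"
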
diff --git a/llvm/test/MC/RISCV/rvv/xsfvfnrclip.s b/llvm/test/MC/RISCV/rvv/xsfvfnrclip.s
index d8b184659ac4..7508d44bc916 100644
--- a/llvm/test/MC/RISCV/rvv/xsfvfnrclip.s
+++ b/llvm/test/MC/RISCV/rvv/xsfvfnrclip.s
@@ -12,22 +12,22 @@ sf.vfnrclip.xu.f.qf v4, v8, fa2
# CHECK-INST: sf.vfnrclip.xu.f.qf v4, v8, fa2
# CHECK-ENCODING: [0x5b,0x52,0x86,0x8a]
# CHECK-ERROR: instruction requires the following: 'XSfvfnrclipxfqf' (SiFive FP32-to-int8 Ranged Clip Instructions)
-# CHECK-UNKNOWN: 5b 52 86 8a <unknown>
+# CHECK-UNKNOWN: 8a86525b <unknown>
sf.vfnrclip.xu.f.qf v4, v8, fa2, v0.t
# CHECK-INST: sf.vfnrclip.xu.f.qf v4, v8, fa2
# CHECK-ENCODING: [0x5b,0x52,0x86,0x88]
# CHECK-ERROR: instruction requires the following: 'XSfvfnrclipxfqf' (SiFive FP32-to-int8 Ranged Clip Instructions)
-# CHECK-UNKNOWN: 5b 52 86 88 <unknown>
+# CHECK-UNKNOWN: 8886525b <unknown>
sf.vfnrclip.x.f.qf v4, v8, fa2
# CHECK-INST: sf.vfnrclip.x.f.qf v4, v8, fa2
# CHECK-ENCODING: [0x5b,0x52,0x86,0x8e]
# CHECK-ERROR: instruction requires the following: 'XSfvfnrclipxfqf' (SiFive FP32-to-int8 Ranged Clip Instructions)
-# CHECK-UNKNOWN: 5b 52 86 8e <unknown>
+# CHECK-UNKNOWN: 8e86525b <unknown>
sf.vfnrclip.x.f.qf v4, v8, fa2, v0.t
# CHECK-INST: sf.vfnrclip.x.f.qf v4, v8, fa2
# CHECK-ENCODING: [0x5b,0x52,0x86,0x8c]
# CHECK-ERROR: instruction requires the following: 'XSfvfnrclipxfqf' (SiFive FP32-to-int8 Ranged Clip Instructions)
-# CHECK-UNKNOWN: 5b 52 86 8c <unknown>
+# CHECK-UNKNOWN: 8c86525b <unknown>
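
Within each masked/unmasked pair above, the two words differ only in bit 25,
the standard RVV vm field: 8886525b (with v0.t) versus 8a86525b (without).
A quick check, under the same illustrative conventions as the sketch above:

    def vm_bit(word):
        """vm = inst[25]; 0 means masked (v0.t operand), 1 means unmasked."""
        return (word >> 25) & 1

    assert vm_bit(0x8886525B) == 0  # sf.vfnrclip.xu.f.qf ..., v0.t
    assert vm_bit(0x8A86525B) == 1  # sf.vfnrclip.xu.f.qf (no mask)
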
diff --git a/llvm/test/MC/RISCV/rvv/xsfvfwmacc.s b/llvm/test/MC/RISCV/rvv/xsfvfwmacc.s
index ba054fff2bd8..a9843c350fc8 100644
--- a/llvm/test/MC/RISCV/rvv/xsfvfwmacc.s
+++ b/llvm/test/MC/RISCV/rvv/xsfvfwmacc.s
@@ -12,4 +12,4 @@ sf.vfwmacc.4x4x4 v8, v4, v20
# CHECK-INST: sf.vfwmacc.4x4x4 v8, v4, v20
# CHECK-ENCODING: [0x5b,0x14,0x42,0xf3]
# CHECK-ERROR: instruction requires the following: 'XSfvfwmaccqqq' (SiFive Matrix Multiply Accumulate Instruction (4-by-4))
-# CHECK-UNKNOWN: 5b 14 42 f3 <unknown>
+# CHECK-UNKNOWN: f342145b <unknown>
diff --git a/llvm/test/MC/RISCV/rvv/xsfvqmacc.s b/llvm/test/MC/RISCV/rvv/xsfvqmacc.s
index ba19f2184486..81703c847d74 100644
--- a/llvm/test/MC/RISCV/rvv/xsfvqmacc.s
+++ b/llvm/test/MC/RISCV/rvv/xsfvqmacc.s
@@ -12,46 +12,46 @@ sf.vqmaccu.2x8x2 v8, v4, v20
# CHECK-INST: sf.vqmaccu.2x8x2 v8, v4, v20
# CHECK-ENCODING: [0x5b,0x24,0x42,0xb3]
# CHECK-ERROR: instruction requires the following: 'XSfvqmaccdod' (SiFive Int8 Matrix Multiplication Instructions (2-by-8 and 8-by-2))
-# CHECK-UNKNOWN: 5b 24 42 b3 <unknown>
+# CHECK-UNKNOWN: b342245b <unknown>
sf.vqmacc.2x8x2 v8, v4, v20
# CHECK-INST: sf.vqmacc.2x8x2 v8, v4, v20
# CHECK-ENCODING: [0x5b,0x24,0x42,0xb7]
# CHECK-ERROR: instruction requires the following: 'XSfvqmaccdod' (SiFive Int8 Matrix Multiplication Instructions (2-by-8 and 8-by-2))
-# CHECK-UNKNOWN: 5b 24 42 b7 <unknown>
+# CHECK-UNKNOWN: b742245b <unknown>
sf.vqmaccus.2x8x2 v8, v4, v20
# CHECK-INST: sf.vqmaccus.2x8x2 v8, v4, v20
# CHECK-ENCODING: [0x5b,0x24,0x42,0xbb]
# CHECK-ERROR: instruction requires the following: 'XSfvqmaccdod' (SiFive Int8 Matrix Multiplication Instructions (2-by-8 and 8-by-2))
-# CHECK-UNKNOWN: 5b 24 42 bb <unknown>
+# CHECK-UNKNOWN: bb42245b <unknown>
sf.vqmaccsu.2x8x2 v8, v4, v20
# CHECK-INST: sf.vqmaccsu.2x8x2 v8, v4, v20
# CHECK-ENCODING: [0x5b,0x24,0x42,0xbf]
# CHECK-ERROR: instruction requires the following: 'XSfvqmaccdod' (SiFive Int8 Matrix Multiplication Instructions (2-by-8 and 8-by-2))
-# CHECK-UNKNOWN: 5b 24 42 bf <unknown>
+# CHECK-UNKNOWN: bf42245b <unknown>
sf.vqmaccu.4x8x4 v8, v4, v20
# CHECK-INST: sf.vqmaccu.4x8x4 v8, v4, v20
# CHECK-ENCODING: [0x5b,0x24,0x42,0xf3]
# CHECK-ERROR: instruction requires the following: 'XSfvqmaccqoq' (SiFive Int8 Matrix Multiplication Instructions (4-by-8 and 8-by-4))
-# CHECK-UNKNOWN: 5b 24 42 f3 <unknown>
+# CHECK-UNKNOWN: f342245b <unknown>
sf.vqmacc.4x8x4 v8, v4, v20
# CHECK-INST: sf.vqmacc.4x8x4 v8, v4, v20
# CHECK-ENCODING: [0x5b,0x24,0x42,0xf7]
# CHECK-ERROR: instruction requires the following: 'XSfvqmaccqoq' (SiFive Int8 Matrix Multiplication Instructions (4-by-8 and 8-by-4))
-# CHECK-UNKNOWN: 5b 24 42 f7 <unknown>
+# CHECK-UNKNOWN: f742245b <unknown>
sf.vqmaccus.4x8x4 v8, v4, v20
# CHECK-INST: sf.vqmaccus.4x8x4 v8, v4, v20
# CHECK-ENCODING: [0x5b,0x24,0x42,0xfb]
# CHECK-ERROR: instruction requires the following: 'XSfvqmaccqoq' (SiFive Int8 Matrix Multiplication Instructions (4-by-8 and 8-by-4))
-# CHECK-UNKNOWN: 5b 24 42 fb <unknown>
+# CHECK-UNKNOWN: fb42245b <unknown>
sf.vqmaccsu.4x8x4 v8, v4, v20
# CHECK-INST: sf.vqmaccsu.4x8x4 v8, v4, v20
# CHECK-ENCODING: [0x5b,0x24,0x42,0xff]
# CHECK-ERROR: instruction requires the following: 'XSfvqmaccqoq' (SiFive Int8 Matrix Multiplication Instructions (4-by-8 and 8-by-4))
-# CHECK-UNKNOWN: 5b 24 42 ff <unknown>
+# CHECK-UNKNOWN: ff42245b <unknown>
diff --git a/llvm/test/MC/RISCV/rvv/zvbb.s b/llvm/test/MC/RISCV/rvv/zvbb.s
index 04e5ad6e03f7..d9d1f6f42d32 100644
--- a/llvm/test/MC/RISCV/rvv/zvbb.s
+++ b/llvm/test/MC/RISCV/rvv/zvbb.s
@@ -12,40 +12,40 @@ vbrev.v v10, v9, v0.t
# CHECK-INST: vbrev.v v10, v9, v0.t
# CHECK-ENCODING: [0x57,0x25,0x95,0x48]
# CHECK-ERROR: instruction requires the following: 'Zvbb' (Vector basic bit-manipulation instructions){{$}}
-# CHECK-UNKNOWN: 57 25 95 48 <unknown>
+# CHECK-UNKNOWN: 48952557 <unknown>
vclz.v v10, v9, v0.t
# CHECK-INST: vclz.v v10, v9, v0.t
# CHECK-ENCODING: [0x57,0x25,0x96,0x48]
# CHECK-ERROR: instruction requires the following: 'Zvbb' (Vector basic bit-manipulation instructions){{$}}
-# CHECK-UNKNOWN: 57 25 96 48 <unknown>
+# CHECK-UNKNOWN: 48962557 <unknown>
vcpop.v v10, v9, v0.t
# CHECK-INST: vcpop.v v10, v9, v0.t
# CHECK-ENCODING: [0x57,0x25,0x97,0x48]
# CHECK-ERROR: instruction requires the following: 'Zvbb' (Vector basic bit-manipulation instructions){{$}}
-# CHECK-UNKNOWN: 57 25 97 48 <unknown>
+# CHECK-UNKNOWN: 48972557 <unknown>
vctz.v v10, v9, v0.t
# CHECK-INST: vctz.v v10, v9, v0.t
# CHECK-ENCODING: [0x57,0xa5,0x96,0x48]
# CHECK-ERROR: instruction requires the following: 'Zvbb' (Vector basic bit-manipulation instructions){{$}}
-# CHECK-UNKNOWN: 57 a5 96 48 <unknown>
+# CHECK-UNKNOWN: 4896a557 <unknown>
vwsll.vv v10, v9, v8, v0.t
# CHECK-INST: vwsll.vv v10, v9, v8, v0.t
# CHECK-ENCODING: [0x57,0x05,0x94,0xd4]
# CHECK-ERROR: instruction requires the following: 'Zvbb' (Vector basic bit-manipulation instructions){{$}}
-# CHECK-UNKNOWN: 57 05 94 d4 <unknown>
+# CHECK-UNKNOWN: d4940557 <unknown>
vwsll.vx v10, v9, a0, v0.t
# CHECK-INST: vwsll.vx v10, v9, a0, v0.t
# CHECK-ENCODING: [0x57,0x45,0x95,0xd4]
# CHECK-ERROR: instruction requires the following: 'Zvbb' (Vector basic bit-manipulation instructions){{$}}
-# CHECK-UNKNOWN: 57 45 95 d4 <unknown>
+# CHECK-UNKNOWN: d4954557 <unknown>
vwsll.vi v10, v9, 29, v0.t
# CHECK-INST: vwsll.vi v10, v9, 29, v0.t
# CHECK-ENCODING: [0x57,0xb5,0x9e,0xd4]
# CHECK-ERROR: instruction requires the following: 'Zvbb' (Vector basic bit-manipulation instructions){{$}}
-# CHECK-UNKNOWN: 57 b5 9e d4 <unknown>
+# CHECK-UNKNOWN: d49eb557 <unknown>
diff --git a/llvm/test/MC/RISCV/rvv/zvbc.s b/llvm/test/MC/RISCV/rvv/zvbc.s
index b32349a2db13..0eb02d153b79 100644
--- a/llvm/test/MC/RISCV/rvv/zvbc.s
+++ b/llvm/test/MC/RISCV/rvv/zvbc.s
@@ -12,22 +12,22 @@ vclmul.vv v10, v9, v8
# CHECK-INST: vclmul.vv v10, v9, v8
# CHECK-ENCODING: [0x57,0x25,0x94,0x32]
# CHECK-ERROR: instruction requires the following: 'Zvbc' (Vector Carryless Multiplication){{$}}
-# CHECK-UNKNOWN: 57 25 94 32 <unknown>
+# CHECK-UNKNOWN: 32942557 <unknown>
vclmul.vx v10, v9, a0
# CHECK-INST: vclmul.vx v10, v9, a0
# CHECK-ENCODING: [0x57,0x65,0x95,0x32]
# CHECK-ERROR: instruction requires the following: 'Zvbc' (Vector Carryless Multiplication){{$}}
-# CHECK-UNKNOWN: 57 65 95 32 <unknown>
+# CHECK-UNKNOWN: 32956557 <unknown>
vclmulh.vv v10, v9, v8
# CHECK-INST: vclmulh.vv v10, v9, v8
# CHECK-ENCODING: [0x57,0x25,0x94,0x36]
# CHECK-ERROR: instruction requires the following: 'Zvbc' (Vector Carryless Multiplication){{$}}
-# CHECK-UNKNOWN: 57 25 94 36 <unknown>
+# CHECK-UNKNOWN: 36942557 <unknown>
vclmulh.vx v10, v9, a0
# CHECK-INST: vclmulh.vx v10, v9, a0
# CHECK-ENCODING: [0x57,0x65,0x95,0x36]
# CHECK-ERROR: instruction requires the following: 'Zvbc' (Vector Carryless Multiplication){{$}}
-# CHECK-UNKNOWN: 57 65 95 36 <unknown>
+# CHECK-UNKNOWN: 36956557 <unknown>
diff --git a/llvm/test/MC/RISCV/rvv/zvfbfmin.s b/llvm/test/MC/RISCV/rvv/zvfbfmin.s
index 1cbe027ef26c..7965c2482b00 100644
--- a/llvm/test/MC/RISCV/rvv/zvfbfmin.s
+++ b/llvm/test/MC/RISCV/rvv/zvfbfmin.s
@@ -20,23 +20,23 @@
# CHECK-INST: vfncvtbf16.f.f.w v8, v4, v0.t
# CHECK-ENCODING: [0x57,0x94,0x4e,0x48]
# CHECK-ERROR: instruction requires the following: 'Zvfbfmin' (Vector BF16 Converts){{$}}
-# CHECK-UNKNOWN: 57 94 4e 48 <unknown>
+# CHECK-UNKNOWN: 484e9457 <unknown>
vfncvtbf16.f.f.w v8, v4, v0.t
# CHECK-INST: vfncvtbf16.f.f.w v8, v4
# CHECK-ENCODING: [0x57,0x94,0x4e,0x4a]
# CHECK-ERROR: instruction requires the following: 'Zvfbfmin' (Vector BF16 Converts){{$}}
-# CHECK-UNKNOWN: 57 94 4e 4a <unknown>
+# CHECK-UNKNOWN: 4a4e9457 <unknown>
vfncvtbf16.f.f.w v8, v4
# CHECK-INST: vfwcvtbf16.f.f.v v8, v4, v0.t
# CHECK-ENCODING: [0x57,0x94,0x46,0x48]
# CHECK-ERROR: instruction requires the following: 'Zvfbfmin' (Vector BF16 Converts){{$}}
-# CHECK-UNKNOWN: 57 94 46 48 <unknown>
+# CHECK-UNKNOWN: 48469457 <unknown>
vfwcvtbf16.f.f.v v8, v4, v0.t
# CHECK-INST: vfwcvtbf16.f.f.v v8, v4
# CHECK-ENCODING: [0x57,0x94,0x46,0x4a]
# CHECK-ERROR: instruction requires the following: 'Zvfbfmin' (Vector BF16 Converts){{$}}
-# CHECK-UNKNOWN: 57 94 46 4a <unknown>
+# CHECK-UNKNOWN: 4a469457 <unknown>
vfwcvtbf16.f.f.v v8, v4
diff --git a/llvm/test/MC/RISCV/rvv/zvfbfwma.s b/llvm/test/MC/RISCV/rvv/zvfbfwma.s
index 5a30d9f19ab6..330dee58d836 100644
--- a/llvm/test/MC/RISCV/rvv/zvfbfwma.s
+++ b/llvm/test/MC/RISCV/rvv/zvfbfwma.s
@@ -20,25 +20,25 @@
# CHECK-INST: vfwmaccbf16.vv v8, v20, v4, v0.t
# CHECK-ENCODING: [0x57,0x14,0x4a,0xec]
# CHECK-ERROR: instruction requires the following: 'Zvfbfwma' (Vector BF16 widening mul-add){{$}}
-# CHECK-UNKNOWN: 57 14 4a ec <unknown>
+# CHECK-UNKNOWN: ec4a1457 <unknown>
vfwmaccbf16.vv v8, v20, v4, v0.t
# CHECK-INST: vfwmaccbf16.vv v8, v20, v4
# CHECK-ENCODING: [0x57,0x14,0x4a,0xee]
# CHECK-ERROR: instruction requires the following: 'Zvfbfwma' (Vector BF16 widening mul-add){{$}}
-# CHECK-UNKNOWN: 57 14 4a ee <unknown>
+# CHECK-UNKNOWN: ee4a1457 <unknown>
vfwmaccbf16.vv v8, v20, v4
# CHECK-INST: vfwmaccbf16.vf v8, fa0, v4, v0.t
# CHECK-ENCODING: [0x57,0x54,0x45,0xec]
# CHECK-ERROR: instruction requires the following: 'Zvfbfwma' (Vector BF16 widening mul-add){{$}}
-# CHECK-UNKNOWN: 57 54 45 ec <unknown>
+# CHECK-UNKNOWN: ec455457 <unknown>
vfwmaccbf16.vf v8, fa0, v4, v0.t
# CHECK-INST: vfwmaccbf16.vf v8, fa0, v4
# CHECK-ENCODING: [0x57,0x54,0x45,0xee]
# CHECK-ERROR: instruction requires the following: 'Zvfbfwma' (Vector BF16 widening mul-add){{$}}
-# CHECK-UNKNOWN: 57 54 45 ee <unknown>
+# CHECK-UNKNOWN: ee455457 <unknown>
vfwmaccbf16.vf v8, fa0, v4
# Check scalar half FP load/store/move included in this extension.
@@ -46,23 +46,23 @@ vfwmaccbf16.vf v8, fa0, v4
# CHECK-INST: flh ft0, 12(a0)
# CHECK-ENCODING: [0x07,0x10,0xc5,0x00]
# CHECK-ERROR: instruction requires the following: 'Zfh' (Half-Precision Floating-Point) or 'Zfhmin' (Half-Precision Floating-Point Minimal) or 'Zfbfmin' (Scalar BF16 Converts){{$}}
-# CHECK-UNKNOWN: 07 10 c5 00 <unknown>
+# CHECK-UNKNOWN: 00c51007 <unknown>
flh f0, 12(a0)
# CHECK-INST: fsh ft6, 2047(s4)
# CHECK-ENCODING: [0xa7,0x1f,0x6a,0x7e]
# CHECK-ERROR: instruction requires the following: 'Zfh' (Half-Precision Floating-Point) or 'Zfhmin' (Half-Precision Floating-Point Minimal) or 'Zfbfmin' (Scalar BF16 Converts){{$}}
-# CHECK-UNKNOWN: a7 1f 6a 7e <unknown>
+# CHECK-UNKNOWN: 7e6a1fa7 <unknown>
fsh f6, 2047(s4)
# CHECK-INST: fmv.x.h a2, fs7
# CHECK-ENCODING: [0x53,0x86,0x0b,0xe4]
# CHECK-ERROR: instruction requires the following: 'Zfh' (Half-Precision Floating-Point) or 'Zfhmin' (Half-Precision Floating-Point Minimal) or 'Zfbfmin' (Scalar BF16 Converts){{$}}
-# CHECK-UNKNOWN: 53 86 0b e4 <unknown>
+# CHECK-UNKNOWN: e40b8653 <unknown>
fmv.x.h a2, fs7
# CHECK-INST: fmv.h.x ft1, a6
# CHECK-ENCODING: [0xd3,0x00,0x08,0xf4]
# CHECK-ERROR: instruction requires the following: 'Zfh' (Half-Precision Floating-Point) or 'Zfhmin' (Half-Precision Floating-Point Minimal) or 'Zfbfmin' (Scalar BF16 Converts){{$}}
-# CHECK-UNKNOWN: d3 00 08 f4 <unknown>
+# CHECK-UNKNOWN: f40800d3 <unknown>
fmv.h.x ft1, a6
diff --git a/llvm/test/MC/RISCV/rvv/zvkb.s b/llvm/test/MC/RISCV/rvv/zvkb.s
index ae2dec18d33c..1833ba860c90 100644
--- a/llvm/test/MC/RISCV/rvv/zvkb.s
+++ b/llvm/test/MC/RISCV/rvv/zvkb.s
@@ -12,52 +12,52 @@ vandn.vv v10, v9, v8, v0.t
# CHECK-INST: vandn.vv v10, v9, v8, v0.t
# CHECK-ENCODING: [0x57,0x05,0x94,0x04]
# CHECK-ERROR: instruction requires the following: 'Zvkb' (Vector Bit-manipulation used in Cryptography){{$}}
-# CHECK-UNKNOWN: 57 05 94 04 <unknown>
+# CHECK-UNKNOWN: 04940557 <unknown>
vandn.vx v10, v9, a0, v0.t
# CHECK-INST: vandn.vx v10, v9, a0, v0.t
# CHECK-ENCODING: [0x57,0x45,0x95,0x04]
# CHECK-ERROR: instruction requires the following: 'Zvkb' (Vector Bit-manipulation used in Cryptography){{$}}
-# CHECK-UNKNOWN: 57 45 95 04 <unknown>
+# CHECK-UNKNOWN: 04954557 <unknown>
vbrev8.v v10, v9, v0.t
# CHECK-INST: vbrev8.v v10, v9, v0.t
# CHECK-ENCODING: [0x57,0x25,0x94,0x48]
# CHECK-ERROR: instruction requires the following: 'Zvkb' (Vector Bit-manipulation used in Cryptography){{$}}
-# CHECK-UNKNOWN: 57 25 94 48 <unknown>
+# CHECK-UNKNOWN: 48942557 <unknown>
vrev8.v v10, v9, v0.t
# CHECK-INST: vrev8.v v10, v9, v0.t
# CHECK-ENCODING: [0x57,0xa5,0x94,0x48]
# CHECK-ERROR: instruction requires the following: 'Zvkb' (Vector Bit-manipulation used in Cryptography){{$}}
-# CHECK-UNKNOWN: 57 a5 94 48 <unknown>
+# CHECK-UNKNOWN: 4894a557 <unknown>
vrol.vv v10, v9, v8, v0.t
# CHECK-INST: vrol.vv v10, v9, v8, v0.t
# CHECK-ENCODING: [0x57,0x05,0x94,0x54]
# CHECK-ERROR: instruction requires the following: 'Zvkb' (Vector Bit-manipulation used in Cryptography){{$}}
-# CHECK-UNKNOWN: 57 05 94 54 <unknown>
+# CHECK-UNKNOWN: 54940557 <unknown>
vrol.vx v10, v9, a0, v0.t
# CHECK-INST: vrol.vx v10, v9, a0, v0.t
# CHECK-ENCODING: [0x57,0x45,0x95,0x54]
# CHECK-ERROR: instruction requires the following: 'Zvkb' (Vector Bit-manipulation used in Cryptography){{$}}
-# CHECK-UNKNOWN: 57 45 95 54 <unknown>
+# CHECK-UNKNOWN: 54954557 <unknown>
vror.vv v10, v9, v8, v0.t
# CHECK-INST: vror.vv v10, v9, v8, v0.t
# CHECK-ENCODING: [0x57,0x05,0x94,0x50]
# CHECK-ERROR: instruction requires the following: 'Zvkb' (Vector Bit-manipulation used in Cryptography){{$}}
-# CHECK-UNKNOWN: 57 05 94 50 <unknown>
+# CHECK-UNKNOWN: 50940557 <unknown>
vror.vx v10, v9, a0, v0.t
# CHECK-INST: vror.vx v10, v9, a0, v0.t
# CHECK-ENCODING: [0x57,0x45,0x95,0x50]
# CHECK-ERROR: instruction requires the following: 'Zvkb' (Vector Bit-manipulation used in Cryptography){{$}}
-# CHECK-UNKNOWN: 57 45 95 50 <unknown>
+# CHECK-UNKNOWN: 50954557 <unknown>
vror.vi v10, v9, 33, v0.t
# CHECK-INST: vror.vi v10, v9, 33, v0.t
# CHECK-ENCODING: [0x57,0xb5,0x90,0x54]
# CHECK-ERROR: instruction requires the following: 'Zvkb' (Vector Bit-manipulation used in Cryptography){{$}}
-# CHECK-UNKNOWN: 57 b5 90 54 <unknown>
+# CHECK-UNKNOWN: 5490b557 <unknown>
diff --git a/llvm/test/MC/RISCV/rvv/zvkg.s b/llvm/test/MC/RISCV/rvv/zvkg.s
index f2016bc116b6..48b84659e0ae 100644
--- a/llvm/test/MC/RISCV/rvv/zvkg.s
+++ b/llvm/test/MC/RISCV/rvv/zvkg.s
@@ -12,10 +12,10 @@ vghsh.vv v10, v9, v8
# CHECK-INST: vghsh.vv v10, v9, v8
# CHECK-ENCODING: [0x77,0x25,0x94,0xb2]
# CHECK-ERROR: instruction requires the following: 'Zvkg' (Vector GCM instructions for Cryptography){{$}}
-# CHECK-UNKNOWN: 77 25 94 b2 <unknown>
+# CHECK-UNKNOWN: b2942577 <unknown>
vgmul.vv v10, v9
# CHECK-INST: vgmul.vv v10, v9
# CHECK-ENCODING: [0x77,0xa5,0x98,0xa2]
# CHECK-ERROR: instruction requires the following: 'Zvkg' (Vector GCM instructions for Cryptography){{$}}
-# CHECK-UNKNOWN: 77 a5 98 a2 <unknown>
+# CHECK-UNKNOWN: a298a577 <unknown>
diff --git a/llvm/test/MC/RISCV/rvv/zvkned.s b/llvm/test/MC/RISCV/rvv/zvkned.s
index e51a9cc562f1..bee3d74ee88d 100644
--- a/llvm/test/MC/RISCV/rvv/zvkned.s
+++ b/llvm/test/MC/RISCV/rvv/zvkned.s
@@ -12,76 +12,76 @@ vaesdf.vv v10, v9
# CHECK-INST: vaesdf.vv v10, v9
# CHECK-ENCODING: [0x77,0xa5,0x90,0xa2]
# CHECK-ERROR: instruction requires the following: 'Zvkned' (Vector AES Encryption & Decryption (Single Round)){{$}}
-# CHECK-UNKNOWN: 77 a5 90 a2 <unknown>
+# CHECK-UNKNOWN: a290a577 <unknown>
vaesdf.vs v10, v9
# CHECK-INST: vaesdf.vs v10, v9
# CHECK-ENCODING: [0x77,0xa5,0x90,0xa6]
# CHECK-ERROR: instruction requires the following: 'Zvkned' (Vector AES Encryption & Decryption (Single Round)){{$}}
-# CHECK-UNKNOWN: 77 a5 90 a6 <unknown>
+# CHECK-UNKNOWN: a690a577 <unknown>
vaesef.vv v10, v9
# CHECK-INST: vaesef.vv v10, v9
# CHECK-ENCODING: [0x77,0xa5,0x91,0xa2]
# CHECK-ERROR: instruction requires the following: 'Zvkned' (Vector AES Encryption & Decryption (Single Round)){{$}}
-# CHECK-UNKNOWN: 77 a5 91 a2 <unknown>
+# CHECK-UNKNOWN: a291a577 <unknown>
vaesef.vs v10, v9
# CHECK-INST: vaesef.vs v10, v9
# CHECK-ENCODING: [0x77,0xa5,0x91,0xa6]
# CHECK-ERROR: instruction requires the following: 'Zvkned' (Vector AES Encryption & Decryption (Single Round)){{$}}
-# CHECK-UNKNOWN: 77 a5 91 a6 <unknown>
+# CHECK-UNKNOWN: a691a577 <unknown>
vaesdm.vv v10, v9
# CHECK-INST: vaesdm.vv v10, v9
# CHECK-ENCODING: [0x77,0x25,0x90,0xa2]
# CHECK-ERROR: instruction requires the following: 'Zvkned' (Vector AES Encryption & Decryption (Single Round)){{$}}
-# CHECK-UNKNOWN: 77 25 90 a2 <unknown>
+# CHECK-UNKNOWN: a2902577 <unknown>
vaesdm.vs v10, v9
# CHECK-INST: vaesdm.vs v10, v9
# CHECK-ENCODING: [0x77,0x25,0x90,0xa6]
# CHECK-ERROR: instruction requires the following: 'Zvkned' (Vector AES Encryption & Decryption (Single Round)){{$}}
-# CHECK-UNKNOWN: 77 25 90 a6 <unknown>
+# CHECK-UNKNOWN: a6902577 <unknown>
vaesem.vv v10, v9
# CHECK-INST: vaesem.vv v10, v9
# CHECK-ENCODING: [0x77,0x25,0x91,0xa2]
# CHECK-ERROR: instruction requires the following: 'Zvkned' (Vector AES Encryption & Decryption (Single Round)){{$}}
-# CHECK-UNKNOWN: 77 25 91 a2 <unknown>
+# CHECK-UNKNOWN: a2912577 <unknown>
vaesem.vs v10, v9
# CHECK-INST: vaesem.vs v10, v9
# CHECK-ENCODING: [0x77,0x25,0x91,0xa6]
# CHECK-ERROR: instruction requires the following: 'Zvkned' (Vector AES Encryption & Decryption (Single Round)){{$}}
-# CHECK-UNKNOWN: 77 25 91 a6 <unknown>
+# CHECK-UNKNOWN: a6912577 <unknown>
vaeskf1.vi v10, v9, 1
# CHECK-INST: vaeskf1.vi v10, v9, 1
# CHECK-ENCODING: [0x77,0xa5,0x90,0x8a]
# CHECK-ERROR: instruction requires the following: 'Zvkned' (Vector AES Encryption & Decryption (Single Round)){{$}}
-# CHECK-UNKNOWN: 77 a5 90 8a <unknown>
+# CHECK-UNKNOWN: 8a90a577 <unknown>
vaeskf1.vi v10, v9, 31
# CHECK-INST: vaeskf1.vi v10, v9, 31
# CHECK-ENCODING: [0x77,0xa5,0x9f,0x8a]
# CHECK-ERROR: instruction requires the following: 'Zvkned' (Vector AES Encryption & Decryption (Single Round)){{$}}
-# CHECK-UNKNOWN: 77 a5 9f 8a <unknown>
+# CHECK-UNKNOWN: 8a9fa577 <unknown>
vaeskf2.vi v10, v9, 2
# CHECK-INST: vaeskf2.vi v10, v9, 2
# CHECK-ENCODING: [0x77,0x25,0x91,0xaa]
# CHECK-ERROR: instruction requires the following: 'Zvkned' (Vector AES Encryption & Decryption (Single Round)){{$}}
-# CHECK-UNKNOWN: 77 25 91 aa <unknown>
+# CHECK-UNKNOWN: aa912577 <unknown>
vaeskf2.vi v10, v9, 31
# CHECK-INST: vaeskf2.vi v10, v9, 31
# CHECK-ENCODING: [0x77,0xa5,0x9f,0xaa]
# CHECK-ERROR: instruction requires the following: 'Zvkned' (Vector AES Encryption & Decryption (Single Round)){{$}}
-# CHECK-UNKNOWN: 77 a5 9f aa <unknown>
+# CHECK-UNKNOWN: aa9fa577 <unknown>
vaesz.vs v10, v9
# CHECK-INST: vaesz.vs v10, v9
# CHECK-ENCODING: [0x77,0xa5,0x93,0xa6]
# CHECK-ERROR: instruction requires the following: 'Zvkned' (Vector AES Encryption & Decryption (Single Round)){{$}}
-# CHECK-UNKNOWN: 77 a5 93 a6 <unknown>
+# CHECK-UNKNOWN: a693a577 <unknown>
diff --git a/llvm/test/MC/RISCV/rvv/zvknh.s b/llvm/test/MC/RISCV/rvv/zvknh.s
index aa8033a5f217..b16b9081f7e6 100644
--- a/llvm/test/MC/RISCV/rvv/zvknh.s
+++ b/llvm/test/MC/RISCV/rvv/zvknh.s
@@ -18,17 +18,17 @@
vsha2ms.vv v10, v9, v8
# CHECK-INST: vsha2ms.vv v10, v9, v8
# CHECK-ENCODING: [0x77,0x25,0x94,0xb6]
-# CHECK-UNKNOWN: 77 25 94 b6 <unknown>
+# CHECK-UNKNOWN: b6942577 <unknown>
# CHECK-ERROR: instruction requires the following: 'Zvknha' or 'Zvknhb' (Vector SHA-2){{$}}
vsha2ch.vv v10, v9, v8
# CHECK-INST: vsha2ch.vv v10, v9, v8
# CHECK-ENCODING: [0x77,0x25,0x94,0xba]
-# CHECK-UNKNOWN: 77 25 94 ba <unknown>
+# CHECK-UNKNOWN: ba942577 <unknown>
# CHECK-ERROR: instruction requires the following: 'Zvknha' or 'Zvknhb' (Vector SHA-2){{$}}
vsha2cl.vv v10, v9, v8
# CHECK-INST: vsha2cl.vv v10, v9, v8
# CHECK-ENCODING: [0x77,0x25,0x94,0xbe]
-# CHECK-UNKNOWN: 77 25 94 be <unknown>
+# CHECK-UNKNOWN: be942577 <unknown>
# CHECK-ERROR: instruction requires the following: 'Zvknha' or 'Zvknhb' (Vector SHA-2){{$}}
diff --git a/llvm/test/MC/RISCV/rvv/zvksed.s b/llvm/test/MC/RISCV/rvv/zvksed.s
index 87c9713f8c65..f7a0949272ff 100644
--- a/llvm/test/MC/RISCV/rvv/zvksed.s
+++ b/llvm/test/MC/RISCV/rvv/zvksed.s
@@ -12,22 +12,22 @@ vsm4k.vi v10, v9, 7
# CHECK-INST: vsm4k.vi v10, v9, 7
# CHECK-ENCODING: [0x77,0xa5,0x93,0x86]
# CHECK-ERROR: instruction requires the following: 'Zvksed' (SM4 Block Cipher Instructions){{$}}
-# CHECK-UNKNOWN: 77 a5 93 86 <unknown>
+# CHECK-UNKNOWN: 8693a577 <unknown>
vsm4k.vi v10, v9, 31
# CHECK-INST: vsm4k.vi v10, v9, 31
# CHECK-ENCODING: [0x77,0xa5,0x9f,0x86]
# CHECK-ERROR: instruction requires the following: 'Zvksed' (SM4 Block Cipher Instructions){{$}}
-# CHECK-UNKNOWN: 77 a5 9f 86 <unknown>
+# CHECK-UNKNOWN: 869fa577 <unknown>
vsm4r.vv v10, v9
# CHECK-INST: vsm4r.vv v10, v9
# CHECK-ENCODING: [0x77,0x25,0x98,0xa2]
# CHECK-ERROR: instruction requires the following: 'Zvksed' (SM4 Block Cipher Instructions){{$}}
-# CHECK-UNKNOWN: 77 25 98 a2 <unknown>
+# CHECK-UNKNOWN: a2982577 <unknown>
vsm4r.vs v10, v9
# CHECK-INST: vsm4r.vs v10, v9
# CHECK-ENCODING: [0x77,0x25,0x98,0xa6]
# CHECK-ERROR: instruction requires the following: 'Zvksed' (SM4 Block Cipher Instructions){{$}}
-# CHECK-UNKNOWN: 77 25 98 a6 <unknown>
+# CHECK-UNKNOWN: a6982577 <unknown>
diff --git a/llvm/test/MC/RISCV/rvv/zvksh.s b/llvm/test/MC/RISCV/rvv/zvksh.s
index 06251ff6efe5..ef1c654b4605 100644
--- a/llvm/test/MC/RISCV/rvv/zvksh.s
+++ b/llvm/test/MC/RISCV/rvv/zvksh.s
@@ -12,17 +12,17 @@ vsm3c.vi v10, v9, 7
# CHECK-INST: vsm3c.vi v10, v9, 7
# CHECK-ENCODING: [0x77,0xa5,0x93,0xae]
# CHECK-ERROR: instruction requires the following: 'Zvksh' (SM3 Hash Function Instructions){{$}}
-# CHECK-UNKNOWN: 77 a5 93 ae <unknown>
+# CHECK-UNKNOWN: ae93a577 <unknown>
vsm3me.vv v10, v9, v8
# CHECK-INST: vsm3me.vv v10, v9, v8
# CHECK-ENCODING: [0x77,0x25,0x94,0x82]
# CHECK-ERROR: instruction requires the following: 'Zvksh' (SM3 Hash Function Instructions){{$}}
-# CHECK-UNKNOWN: 77 25 94 82 <unknown>
+# CHECK-UNKNOWN: 82942577 <unknown>
# vs1 is allowed to overlap, but not vs2.
vsm3me.vv v10, v9, v10
# CHECK-INST: vsm3me.vv v10, v9, v10
# CHECK-ENCODING: [0x77,0x25,0x95,0x82]
# CHECK-ERROR: instruction requires the following: 'Zvksh' (SM3 Hash Function Instructions){{$}}
-# CHECK-UNKNOWN: 77 25 95 82 <unknown>
+# CHECK-UNKNOWN: 82952577 <unknown>
diff --git a/llvm/test/MC/RISCV/rvv/zvlsseg.s b/llvm/test/MC/RISCV/rvv/zvlsseg.s
index 9a83ea9f8721..65089e2261be 100644
--- a/llvm/test/MC/RISCV/rvv/zvlsseg.s
+++ b/llvm/test/MC/RISCV/rvv/zvlsseg.s
@@ -13,3076 +13,3076 @@ vlseg2e8.v v8, (a0), v0.t
# CHECK-INST: vlseg2e8.v v8, (a0), v0.t
# CHECK-ENCODING: [0x07,0x04,0x05,0x20]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 05 20 <unknown>
+# CHECK-UNKNOWN: 20050407 <unknown>
vlseg2e8.v v8, (a0)
# CHECK-INST: vlseg2e8.v v8, (a0)
# CHECK-ENCODING: [0x07,0x04,0x05,0x22]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 05 22 <unknown>
+# CHECK-UNKNOWN: 22050407 <unknown>
vlseg2e16.v v8, (a0), v0.t
# CHECK-INST: vlseg2e16.v v8, (a0), v0.t
# CHECK-ENCODING: [0x07,0x54,0x05,0x20]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 05 20 <unknown>
+# CHECK-UNKNOWN: 20055407 <unknown>
vlseg2e16.v v8, (a0)
# CHECK-INST: vlseg2e16.v v8, (a0)
# CHECK-ENCODING: [0x07,0x54,0x05,0x22]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 05 22 <unknown>
+# CHECK-UNKNOWN: 22055407 <unknown>
vlseg2e32.v v8, (a0), v0.t
# CHECK-INST: vlseg2e32.v v8, (a0), v0.t
# CHECK-ENCODING: [0x07,0x64,0x05,0x20]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 05 20 <unknown>
+# CHECK-UNKNOWN: 20056407 <unknown>
vlseg2e32.v v8, (a0)
# CHECK-INST: vlseg2e32.v v8, (a0)
# CHECK-ENCODING: [0x07,0x64,0x05,0x22]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 05 22 <unknown>
+# CHECK-UNKNOWN: 22056407 <unknown>
vlseg2e64.v v8, (a0), v0.t
# CHECK-INST: vlseg2e64.v v8, (a0), v0.t
# CHECK-ENCODING: [0x07,0x74,0x05,0x20]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 05 20 <unknown>
+# CHECK-UNKNOWN: 20057407 <unknown>
vlseg2e64.v v8, (a0)
# CHECK-INST: vlseg2e64.v v8, (a0)
# CHECK-ENCODING: [0x07,0x74,0x05,0x22]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 05 22 <unknown>
+# CHECK-UNKNOWN: 22057407 <unknown>
vlseg2e8ff.v v8, (a0), v0.t
# CHECK-INST: vlseg2e8ff.v v8, (a0), v0.t
# CHECK-ENCODING: [0x07,0x04,0x05,0x21]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 05 21 <unknown>
+# CHECK-UNKNOWN: 21050407 <unknown>
vlseg2e8ff.v v8, (a0)
# CHECK-INST: vlseg2e8ff.v v8, (a0)
# CHECK-ENCODING: [0x07,0x04,0x05,0x23]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 05 23 <unknown>
+# CHECK-UNKNOWN: 23050407 <unknown>
vlseg2e16ff.v v8, (a0), v0.t
# CHECK-INST: vlseg2e16ff.v v8, (a0), v0.t
# CHECK-ENCODING: [0x07,0x54,0x05,0x21]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 05 21 <unknown>
+# CHECK-UNKNOWN: 21055407 <unknown>
vlseg2e16ff.v v8, (a0)
# CHECK-INST: vlseg2e16ff.v v8, (a0)
# CHECK-ENCODING: [0x07,0x54,0x05,0x23]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 05 23 <unknown>
+# CHECK-UNKNOWN: 23055407 <unknown>
vlseg2e32ff.v v8, (a0), v0.t
# CHECK-INST: vlseg2e32ff.v v8, (a0), v0.t
# CHECK-ENCODING: [0x07,0x64,0x05,0x21]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 05 21 <unknown>
+# CHECK-UNKNOWN: 21056407 <unknown>
vlseg2e32ff.v v8, (a0)
# CHECK-INST: vlseg2e32ff.v v8, (a0)
# CHECK-ENCODING: [0x07,0x64,0x05,0x23]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 05 23 <unknown>
+# CHECK-UNKNOWN: 23056407 <unknown>
vlseg2e64ff.v v8, (a0), v0.t
# CHECK-INST: vlseg2e64ff.v v8, (a0), v0.t
# CHECK-ENCODING: [0x07,0x74,0x05,0x21]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 05 21 <unknown>
+# CHECK-UNKNOWN: 21057407 <unknown>
vlseg2e64ff.v v8, (a0)
# CHECK-INST: vlseg2e64ff.v v8, (a0)
# CHECK-ENCODING: [0x07,0x74,0x05,0x23]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 05 23 <unknown>
+# CHECK-UNKNOWN: 23057407 <unknown>
vlsseg2e8.v v8, (a0), a1, v0.t
# CHECK-INST: vlsseg2e8.v v8, (a0), a1, v0.t
# CHECK-ENCODING: [0x07,0x04,0xb5,0x28]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 b5 28 <unknown>
+# CHECK-UNKNOWN: 28b50407 <unknown>
vlsseg2e8.v v8, (a0), a1
# CHECK-INST: vlsseg2e8.v v8, (a0), a1
# CHECK-ENCODING: [0x07,0x04,0xb5,0x2a]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 b5 2a <unknown>
+# CHECK-UNKNOWN: 2ab50407 <unknown>
vlsseg2e16.v v8, (a0), a1, v0.t
# CHECK-INST: vlsseg2e16.v v8, (a0), a1, v0.t
# CHECK-ENCODING: [0x07,0x54,0xb5,0x28]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 b5 28 <unknown>
+# CHECK-UNKNOWN: 28b55407 <unknown>
vlsseg2e16.v v8, (a0), a1
# CHECK-INST: vlsseg2e16.v v8, (a0), a1
# CHECK-ENCODING: [0x07,0x54,0xb5,0x2a]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 b5 2a <unknown>
+# CHECK-UNKNOWN: 2ab55407 <unknown>
vlsseg2e32.v v8, (a0), a1, v0.t
# CHECK-INST: vlsseg2e32.v v8, (a0), a1, v0.t
# CHECK-ENCODING: [0x07,0x64,0xb5,0x28]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 b5 28 <unknown>
+# CHECK-UNKNOWN: 28b56407 <unknown>
vlsseg2e32.v v8, (a0), a1
# CHECK-INST: vlsseg2e32.v v8, (a0), a1
# CHECK-ENCODING: [0x07,0x64,0xb5,0x2a]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 b5 2a <unknown>
+# CHECK-UNKNOWN: 2ab56407 <unknown>
vlsseg2e64.v v8, (a0), a1, v0.t
# CHECK-INST: vlsseg2e64.v v8, (a0), a1, v0.t
# CHECK-ENCODING: [0x07,0x74,0xb5,0x28]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 b5 28 <unknown>
+# CHECK-UNKNOWN: 28b57407 <unknown>
vlsseg2e64.v v8, (a0), a1
# CHECK-INST: vlsseg2e64.v v8, (a0), a1
# CHECK-ENCODING: [0x07,0x74,0xb5,0x2a]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 b5 2a <unknown>
+# CHECK-UNKNOWN: 2ab57407 <unknown>
vluxseg2ei8.v v8, (a0), v4, v0.t
# CHECK-INST: vluxseg2ei8.v v8, (a0), v4, v0.t
# CHECK-ENCODING: [0x07,0x04,0x45,0x24]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 45 24 <unknown>
+# CHECK-UNKNOWN: 24450407 <unknown>
vluxseg2ei8.v v8, (a0), v4
# CHECK-INST: vluxseg2ei8.v v8, (a0), v4
# CHECK-ENCODING: [0x07,0x04,0x45,0x26]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 45 26 <unknown>
+# CHECK-UNKNOWN: 26450407 <unknown>
vluxseg2ei16.v v8, (a0), v4, v0.t
# CHECK-INST: vluxseg2ei16.v v8, (a0), v4, v0.t
# CHECK-ENCODING: [0x07,0x54,0x45,0x24]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 45 24 <unknown>
+# CHECK-UNKNOWN: 24455407 <unknown>
vluxseg2ei16.v v8, (a0), v4
# CHECK-INST: vluxseg2ei16.v v8, (a0), v4
# CHECK-ENCODING: [0x07,0x54,0x45,0x26]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 45 26 <unknown>
+# CHECK-UNKNOWN: 26455407 <unknown>
vluxseg2ei32.v v8, (a0), v4, v0.t
# CHECK-INST: vluxseg2ei32.v v8, (a0), v4, v0.t
# CHECK-ENCODING: [0x07,0x64,0x45,0x24]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 45 24 <unknown>
+# CHECK-UNKNOWN: 24456407 <unknown>
vluxseg2ei32.v v8, (a0), v4
# CHECK-INST: vluxseg2ei32.v v8, (a0), v4
# CHECK-ENCODING: [0x07,0x64,0x45,0x26]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 45 26 <unknown>
+# CHECK-UNKNOWN: 26456407 <unknown>
vluxseg2ei64.v v8, (a0), v4, v0.t
# CHECK-INST: vluxseg2ei64.v v8, (a0), v4, v0.t
# CHECK-ENCODING: [0x07,0x74,0x45,0x24]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 45 24 <unknown>
+# CHECK-UNKNOWN: 24457407 <unknown>
vluxseg2ei64.v v8, (a0), v4
# CHECK-INST: vluxseg2ei64.v v8, (a0), v4
# CHECK-ENCODING: [0x07,0x74,0x45,0x26]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 45 26 <unknown>
+# CHECK-UNKNOWN: 26457407 <unknown>
vloxseg2ei8.v v8, (a0), v4, v0.t
# CHECK-INST: vloxseg2ei8.v v8, (a0), v4, v0.t
# CHECK-ENCODING: [0x07,0x04,0x45,0x2c]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 45 2c <unknown>
+# CHECK-UNKNOWN: 2c450407 <unknown>
vloxseg2ei8.v v8, (a0), v4
# CHECK-INST: vloxseg2ei8.v v8, (a0), v4
# CHECK-ENCODING: [0x07,0x04,0x45,0x2e]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 45 2e <unknown>
+# CHECK-UNKNOWN: 2e450407 <unknown>
vloxseg2ei16.v v8, (a0), v4, v0.t
# CHECK-INST: vloxseg2ei16.v v8, (a0), v4, v0.t
# CHECK-ENCODING: [0x07,0x54,0x45,0x2c]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 45 2c <unknown>
+# CHECK-UNKNOWN: 2c455407 <unknown>
vloxseg2ei16.v v8, (a0), v4
# CHECK-INST: vloxseg2ei16.v v8, (a0), v4
# CHECK-ENCODING: [0x07,0x54,0x45,0x2e]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 45 2e <unknown>
+# CHECK-UNKNOWN: 2e455407 <unknown>
vloxseg2ei32.v v8, (a0), v4, v0.t
# CHECK-INST: vloxseg2ei32.v v8, (a0), v4, v0.t
# CHECK-ENCODING: [0x07,0x64,0x45,0x2c]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 45 2c <unknown>
+# CHECK-UNKNOWN: 2c456407 <unknown>
vloxseg2ei32.v v8, (a0), v4
# CHECK-INST: vloxseg2ei32.v v8, (a0), v4
# CHECK-ENCODING: [0x07,0x64,0x45,0x2e]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 45 2e <unknown>
+# CHECK-UNKNOWN: 2e456407 <unknown>
vloxseg2ei64.v v8, (a0), v4, v0.t
# CHECK-INST: vloxseg2ei64.v v8, (a0), v4, v0.t
# CHECK-ENCODING: [0x07,0x74,0x45,0x2c]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 45 2c <unknown>
+# CHECK-UNKNOWN: 2c457407 <unknown>
vloxseg2ei64.v v8, (a0), v4
# CHECK-INST: vloxseg2ei64.v v8, (a0), v4
# CHECK-ENCODING: [0x07,0x74,0x45,0x2e]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 45 2e <unknown>
+# CHECK-UNKNOWN: 2e457407 <unknown>
vlseg3e8.v v8, (a0), v0.t
# CHECK-INST: vlseg3e8.v v8, (a0), v0.t
# CHECK-ENCODING: [0x07,0x04,0x05,0x40]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 05 40 <unknown>
+# CHECK-UNKNOWN: 40050407 <unknown>
vlseg3e8.v v8, (a0)
# CHECK-INST: vlseg3e8.v v8, (a0)
# CHECK-ENCODING: [0x07,0x04,0x05,0x42]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 05 42 <unknown>
+# CHECK-UNKNOWN: 42050407 <unknown>
vlseg3e16.v v8, (a0), v0.t
# CHECK-INST: vlseg3e16.v v8, (a0), v0.t
# CHECK-ENCODING: [0x07,0x54,0x05,0x40]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 05 40 <unknown>
+# CHECK-UNKNOWN: 40055407 <unknown>
vlseg3e16.v v8, (a0)
# CHECK-INST: vlseg3e16.v v8, (a0)
# CHECK-ENCODING: [0x07,0x54,0x05,0x42]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 05 42 <unknown>
+# CHECK-UNKNOWN: 42055407 <unknown>
vlseg3e32.v v8, (a0), v0.t
# CHECK-INST: vlseg3e32.v v8, (a0), v0.t
# CHECK-ENCODING: [0x07,0x64,0x05,0x40]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 05 40 <unknown>
+# CHECK-UNKNOWN: 40056407 <unknown>
vlseg3e32.v v8, (a0)
# CHECK-INST: vlseg3e32.v v8, (a0)
# CHECK-ENCODING: [0x07,0x64,0x05,0x42]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 05 42 <unknown>
+# CHECK-UNKNOWN: 42056407 <unknown>
vlseg3e64.v v8, (a0), v0.t
# CHECK-INST: vlseg3e64.v v8, (a0), v0.t
# CHECK-ENCODING: [0x07,0x74,0x05,0x40]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 05 40 <unknown>
+# CHECK-UNKNOWN: 40057407 <unknown>
vlseg3e64.v v8, (a0)
# CHECK-INST: vlseg3e64.v v8, (a0)
# CHECK-ENCODING: [0x07,0x74,0x05,0x42]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 05 42 <unknown>
+# CHECK-UNKNOWN: 42057407 <unknown>
vlseg3e8ff.v v8, (a0), v0.t
# CHECK-INST: vlseg3e8ff.v v8, (a0), v0.t
# CHECK-ENCODING: [0x07,0x04,0x05,0x41]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 05 41 <unknown>
+# CHECK-UNKNOWN: 41050407 <unknown>
vlseg3e8ff.v v8, (a0)
# CHECK-INST: vlseg3e8ff.v v8, (a0)
# CHECK-ENCODING: [0x07,0x04,0x05,0x43]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 05 43 <unknown>
+# CHECK-UNKNOWN: 43050407 <unknown>
vlseg3e16ff.v v8, (a0), v0.t
# CHECK-INST: vlseg3e16ff.v v8, (a0), v0.t
# CHECK-ENCODING: [0x07,0x54,0x05,0x41]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 05 41 <unknown>
+# CHECK-UNKNOWN: 41055407 <unknown>
vlseg3e16ff.v v8, (a0)
# CHECK-INST: vlseg3e16ff.v v8, (a0)
# CHECK-ENCODING: [0x07,0x54,0x05,0x43]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 05 43 <unknown>
+# CHECK-UNKNOWN: 43055407 <unknown>
vlseg3e32ff.v v8, (a0), v0.t
# CHECK-INST: vlseg3e32ff.v v8, (a0), v0.t
# CHECK-ENCODING: [0x07,0x64,0x05,0x41]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 05 41 <unknown>
+# CHECK-UNKNOWN: 41056407 <unknown>
vlseg3e32ff.v v8, (a0)
# CHECK-INST: vlseg3e32ff.v v8, (a0)
# CHECK-ENCODING: [0x07,0x64,0x05,0x43]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 05 43 <unknown>
+# CHECK-UNKNOWN: 43056407 <unknown>
vlseg3e64ff.v v8, (a0), v0.t
# CHECK-INST: vlseg3e64ff.v v8, (a0), v0.t
# CHECK-ENCODING: [0x07,0x74,0x05,0x41]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 05 41 <unknown>
+# CHECK-UNKNOWN: 41057407 <unknown>
vlseg3e64ff.v v8, (a0)
# CHECK-INST: vlseg3e64ff.v v8, (a0)
# CHECK-ENCODING: [0x07,0x74,0x05,0x43]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 05 43 <unknown>
+# CHECK-UNKNOWN: 43057407 <unknown>
vlsseg3e8.v v8, (a0), a1, v0.t
# CHECK-INST: vlsseg3e8.v v8, (a0), a1, v0.t
# CHECK-ENCODING: [0x07,0x04,0xb5,0x48]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 b5 48 <unknown>
+# CHECK-UNKNOWN: 48b50407 <unknown>
vlsseg3e8.v v8, (a0), a1
# CHECK-INST: vlsseg3e8.v v8, (a0), a1
# CHECK-ENCODING: [0x07,0x04,0xb5,0x4a]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 b5 4a <unknown>
+# CHECK-UNKNOWN: 4ab50407 <unknown>
vlsseg3e16.v v8, (a0), a1, v0.t
# CHECK-INST: vlsseg3e16.v v8, (a0), a1, v0.t
# CHECK-ENCODING: [0x07,0x54,0xb5,0x48]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 b5 48 <unknown>
+# CHECK-UNKNOWN: 48b55407 <unknown>
vlsseg3e16.v v8, (a0), a1
# CHECK-INST: vlsseg3e16.v v8, (a0), a1
# CHECK-ENCODING: [0x07,0x54,0xb5,0x4a]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 b5 4a <unknown>
+# CHECK-UNKNOWN: 4ab55407 <unknown>
vlsseg3e32.v v8, (a0), a1, v0.t
# CHECK-INST: vlsseg3e32.v v8, (a0), a1, v0.t
# CHECK-ENCODING: [0x07,0x64,0xb5,0x48]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 b5 48 <unknown>
+# CHECK-UNKNOWN: 48b56407 <unknown>
vlsseg3e32.v v8, (a0), a1
# CHECK-INST: vlsseg3e32.v v8, (a0), a1
# CHECK-ENCODING: [0x07,0x64,0xb5,0x4a]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 b5 4a <unknown>
+# CHECK-UNKNOWN: 4ab56407 <unknown>
vlsseg3e64.v v8, (a0), a1, v0.t
# CHECK-INST: vlsseg3e64.v v8, (a0), a1, v0.t
# CHECK-ENCODING: [0x07,0x74,0xb5,0x48]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 b5 48 <unknown>
+# CHECK-UNKNOWN: 48b57407 <unknown>
vlsseg3e64.v v8, (a0), a1
# CHECK-INST: vlsseg3e64.v v8, (a0), a1
# CHECK-ENCODING: [0x07,0x74,0xb5,0x4a]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 b5 4a <unknown>
+# CHECK-UNKNOWN: 4ab57407 <unknown>
vluxseg3ei8.v v8, (a0), v4, v0.t
# CHECK-INST: vluxseg3ei8.v v8, (a0), v4, v0.t
# CHECK-ENCODING: [0x07,0x04,0x45,0x44]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 45 44 <unknown>
+# CHECK-UNKNOWN: 44450407 <unknown>
vluxseg3ei8.v v8, (a0), v4
# CHECK-INST: vluxseg3ei8.v v8, (a0), v4
# CHECK-ENCODING: [0x07,0x04,0x45,0x46]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 45 46 <unknown>
+# CHECK-UNKNOWN: 46450407 <unknown>
vluxseg3ei16.v v8, (a0), v4, v0.t
# CHECK-INST: vluxseg3ei16.v v8, (a0), v4, v0.t
# CHECK-ENCODING: [0x07,0x54,0x45,0x44]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 45 44 <unknown>
+# CHECK-UNKNOWN: 44455407 <unknown>
vluxseg3ei16.v v8, (a0), v4
# CHECK-INST: vluxseg3ei16.v v8, (a0), v4
# CHECK-ENCODING: [0x07,0x54,0x45,0x46]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 45 46 <unknown>
+# CHECK-UNKNOWN: 46455407 <unknown>
vluxseg3ei32.v v8, (a0), v4, v0.t
# CHECK-INST: vluxseg3ei32.v v8, (a0), v4, v0.t
# CHECK-ENCODING: [0x07,0x64,0x45,0x44]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 45 44 <unknown>
+# CHECK-UNKNOWN: 44456407 <unknown>
vluxseg3ei32.v v8, (a0), v4
# CHECK-INST: vluxseg3ei32.v v8, (a0), v4
# CHECK-ENCODING: [0x07,0x64,0x45,0x46]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 45 46 <unknown>
+# CHECK-UNKNOWN: 46456407 <unknown>
vluxseg3ei64.v v8, (a0), v4, v0.t
# CHECK-INST: vluxseg3ei64.v v8, (a0), v4, v0.t
# CHECK-ENCODING: [0x07,0x74,0x45,0x44]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 45 44 <unknown>
+# CHECK-UNKNOWN: 44457407 <unknown>
vluxseg3ei64.v v8, (a0), v4
# CHECK-INST: vluxseg3ei64.v v8, (a0), v4
# CHECK-ENCODING: [0x07,0x74,0x45,0x46]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 45 46 <unknown>
+# CHECK-UNKNOWN: 46457407 <unknown>
vloxseg3ei8.v v8, (a0), v4, v0.t
# CHECK-INST: vloxseg3ei8.v v8, (a0), v4, v0.t
# CHECK-ENCODING: [0x07,0x04,0x45,0x4c]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 45 4c <unknown>
+# CHECK-UNKNOWN: 4c450407 <unknown>
vloxseg3ei8.v v8, (a0), v4
# CHECK-INST: vloxseg3ei8.v v8, (a0), v4
# CHECK-ENCODING: [0x07,0x04,0x45,0x4e]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 45 4e <unknown>
+# CHECK-UNKNOWN: 4e450407 <unknown>
vloxseg3ei16.v v8, (a0), v4, v0.t
# CHECK-INST: vloxseg3ei16.v v8, (a0), v4, v0.t
# CHECK-ENCODING: [0x07,0x54,0x45,0x4c]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 45 4c <unknown>
+# CHECK-UNKNOWN: 4c455407 <unknown>
vloxseg3ei16.v v8, (a0), v4
# CHECK-INST: vloxseg3ei16.v v8, (a0), v4
# CHECK-ENCODING: [0x07,0x54,0x45,0x4e]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 45 4e <unknown>
+# CHECK-UNKNOWN: 4e455407 <unknown>
vloxseg3ei32.v v8, (a0), v4, v0.t
# CHECK-INST: vloxseg3ei32.v v8, (a0), v4, v0.t
# CHECK-ENCODING: [0x07,0x64,0x45,0x4c]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 45 4c <unknown>
+# CHECK-UNKNOWN: 4c456407 <unknown>
vloxseg3ei32.v v8, (a0), v4
# CHECK-INST: vloxseg3ei32.v v8, (a0), v4
# CHECK-ENCODING: [0x07,0x64,0x45,0x4e]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 45 4e <unknown>
+# CHECK-UNKNOWN: 4e456407 <unknown>
vloxseg3ei64.v v8, (a0), v4, v0.t
# CHECK-INST: vloxseg3ei64.v v8, (a0), v4, v0.t
# CHECK-ENCODING: [0x07,0x74,0x45,0x4c]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 45 4c <unknown>
+# CHECK-UNKNOWN: 4c457407 <unknown>
vloxseg3ei64.v v8, (a0), v4
# CHECK-INST: vloxseg3ei64.v v8, (a0), v4
# CHECK-ENCODING: [0x07,0x74,0x45,0x4e]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 45 4e <unknown>
+# CHECK-UNKNOWN: 4e457407 <unknown>
vlseg4e8.v v8, (a0), v0.t
# CHECK-INST: vlseg4e8.v v8, (a0), v0.t
# CHECK-ENCODING: [0x07,0x04,0x05,0x60]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 05 60 <unknown>
+# CHECK-UNKNOWN: 60050407 <unknown>
vlseg4e8.v v8, (a0)
# CHECK-INST: vlseg4e8.v v8, (a0)
# CHECK-ENCODING: [0x07,0x04,0x05,0x62]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 05 62 <unknown>
+# CHECK-UNKNOWN: 62050407 <unknown>
vlseg4e16.v v8, (a0), v0.t
# CHECK-INST: vlseg4e16.v v8, (a0), v0.t
# CHECK-ENCODING: [0x07,0x54,0x05,0x60]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 05 60 <unknown>
+# CHECK-UNKNOWN: 60055407 <unknown>
vlseg4e16.v v8, (a0)
# CHECK-INST: vlseg4e16.v v8, (a0)
# CHECK-ENCODING: [0x07,0x54,0x05,0x62]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 05 62 <unknown>
+# CHECK-UNKNOWN: 62055407 <unknown>
vlseg4e32.v v8, (a0), v0.t
# CHECK-INST: vlseg4e32.v v8, (a0), v0.t
# CHECK-ENCODING: [0x07,0x64,0x05,0x60]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 05 60 <unknown>
+# CHECK-UNKNOWN: 60056407 <unknown>
vlseg4e32.v v8, (a0)
# CHECK-INST: vlseg4e32.v v8, (a0)
# CHECK-ENCODING: [0x07,0x64,0x05,0x62]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 05 62 <unknown>
+# CHECK-UNKNOWN: 62056407 <unknown>
vlseg4e64.v v8, (a0), v0.t
# CHECK-INST: vlseg4e64.v v8, (a0), v0.t
# CHECK-ENCODING: [0x07,0x74,0x05,0x60]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 05 60 <unknown>
+# CHECK-UNKNOWN: 60057407 <unknown>
vlseg4e64.v v8, (a0)
# CHECK-INST: vlseg4e64.v v8, (a0)
# CHECK-ENCODING: [0x07,0x74,0x05,0x62]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 05 62 <unknown>
+# CHECK-UNKNOWN: 62057407 <unknown>
vlseg4e8ff.v v8, (a0), v0.t
# CHECK-INST: vlseg4e8ff.v v8, (a0), v0.t
# CHECK-ENCODING: [0x07,0x04,0x05,0x61]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 05 61 <unknown>
+# CHECK-UNKNOWN: 61050407 <unknown>
vlseg4e8ff.v v8, (a0)
# CHECK-INST: vlseg4e8ff.v v8, (a0)
# CHECK-ENCODING: [0x07,0x04,0x05,0x63]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 05 63 <unknown>
+# CHECK-UNKNOWN: 63050407 <unknown>
vlseg4e16ff.v v8, (a0), v0.t
# CHECK-INST: vlseg4e16ff.v v8, (a0), v0.t
# CHECK-ENCODING: [0x07,0x54,0x05,0x61]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 05 61 <unknown>
+# CHECK-UNKNOWN: 61055407 <unknown>
vlseg4e16ff.v v8, (a0)
# CHECK-INST: vlseg4e16ff.v v8, (a0)
# CHECK-ENCODING: [0x07,0x54,0x05,0x63]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 05 63 <unknown>
+# CHECK-UNKNOWN: 63055407 <unknown>
vlseg4e32ff.v v8, (a0), v0.t
# CHECK-INST: vlseg4e32ff.v v8, (a0), v0.t
# CHECK-ENCODING: [0x07,0x64,0x05,0x61]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 05 61 <unknown>
+# CHECK-UNKNOWN: 61056407 <unknown>
vlseg4e32ff.v v8, (a0)
# CHECK-INST: vlseg4e32ff.v v8, (a0)
# CHECK-ENCODING: [0x07,0x64,0x05,0x63]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 05 63 <unknown>
+# CHECK-UNKNOWN: 63056407 <unknown>
vlseg4e64ff.v v8, (a0), v0.t
# CHECK-INST: vlseg4e64ff.v v8, (a0), v0.t
# CHECK-ENCODING: [0x07,0x74,0x05,0x61]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 05 61 <unknown>
+# CHECK-UNKNOWN: 61057407 <unknown>
vlseg4e64ff.v v8, (a0)
# CHECK-INST: vlseg4e64ff.v v8, (a0)
# CHECK-ENCODING: [0x07,0x74,0x05,0x63]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 05 63 <unknown>
+# CHECK-UNKNOWN: 63057407 <unknown>
vlsseg4e8.v v8, (a0), a1, v0.t
# CHECK-INST: vlsseg4e8.v v8, (a0), a1, v0.t
# CHECK-ENCODING: [0x07,0x04,0xb5,0x68]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 b5 68 <unknown>
+# CHECK-UNKNOWN: 68b50407 <unknown>
vlsseg4e8.v v8, (a0), a1
# CHECK-INST: vlsseg4e8.v v8, (a0), a1
# CHECK-ENCODING: [0x07,0x04,0xb5,0x6a]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 b5 6a <unknown>
+# CHECK-UNKNOWN: 6ab50407 <unknown>
vlsseg4e16.v v8, (a0), a1, v0.t
# CHECK-INST: vlsseg4e16.v v8, (a0), a1, v0.t
# CHECK-ENCODING: [0x07,0x54,0xb5,0x68]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 b5 68 <unknown>
+# CHECK-UNKNOWN: 68b55407 <unknown>
vlsseg4e16.v v8, (a0), a1
# CHECK-INST: vlsseg4e16.v v8, (a0), a1
# CHECK-ENCODING: [0x07,0x54,0xb5,0x6a]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 b5 6a <unknown>
+# CHECK-UNKNOWN: 6ab55407 <unknown>
vlsseg4e32.v v8, (a0), a1, v0.t
# CHECK-INST: vlsseg4e32.v v8, (a0), a1, v0.t
# CHECK-ENCODING: [0x07,0x64,0xb5,0x68]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 b5 68 <unknown>
+# CHECK-UNKNOWN: 68b56407 <unknown>
vlsseg4e32.v v8, (a0), a1
# CHECK-INST: vlsseg4e32.v v8, (a0), a1
# CHECK-ENCODING: [0x07,0x64,0xb5,0x6a]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 b5 6a <unknown>
+# CHECK-UNKNOWN: 6ab56407 <unknown>
vlsseg4e64.v v8, (a0), a1, v0.t
# CHECK-INST: vlsseg4e64.v v8, (a0), a1, v0.t
# CHECK-ENCODING: [0x07,0x74,0xb5,0x68]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 b5 68 <unknown>
+# CHECK-UNKNOWN: 68b57407 <unknown>
vlsseg4e64.v v8, (a0), a1
# CHECK-INST: vlsseg4e64.v v8, (a0), a1
# CHECK-ENCODING: [0x07,0x74,0xb5,0x6a]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 b5 6a <unknown>
+# CHECK-UNKNOWN: 6ab57407 <unknown>
vluxseg4ei8.v v8, (a0), v4, v0.t
# CHECK-INST: vluxseg4ei8.v v8, (a0), v4, v0.t
# CHECK-ENCODING: [0x07,0x04,0x45,0x64]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 45 64 <unknown>
+# CHECK-UNKNOWN: 64450407 <unknown>
vluxseg4ei8.v v8, (a0), v4
# CHECK-INST: vluxseg4ei8.v v8, (a0), v4
# CHECK-ENCODING: [0x07,0x04,0x45,0x66]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 45 66 <unknown>
+# CHECK-UNKNOWN: 66450407 <unknown>
vluxseg4ei16.v v8, (a0), v4, v0.t
# CHECK-INST: vluxseg4ei16.v v8, (a0), v4, v0.t
# CHECK-ENCODING: [0x07,0x54,0x45,0x64]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 45 64 <unknown>
+# CHECK-UNKNOWN: 64455407 <unknown>
vluxseg4ei16.v v8, (a0), v4
# CHECK-INST: vluxseg4ei16.v v8, (a0), v4
# CHECK-ENCODING: [0x07,0x54,0x45,0x66]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 45 66 <unknown>
+# CHECK-UNKNOWN: 66455407 <unknown>
vluxseg4ei32.v v8, (a0), v4, v0.t
# CHECK-INST: vluxseg4ei32.v v8, (a0), v4, v0.t
# CHECK-ENCODING: [0x07,0x64,0x45,0x64]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 45 64 <unknown>
+# CHECK-UNKNOWN: 64456407 <unknown>
vluxseg4ei32.v v8, (a0), v4
# CHECK-INST: vluxseg4ei32.v v8, (a0), v4
# CHECK-ENCODING: [0x07,0x64,0x45,0x66]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 45 66 <unknown>
+# CHECK-UNKNOWN: 66456407 <unknown>
vluxseg4ei64.v v8, (a0), v4, v0.t
# CHECK-INST: vluxseg4ei64.v v8, (a0), v4, v0.t
# CHECK-ENCODING: [0x07,0x74,0x45,0x64]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 45 64 <unknown>
+# CHECK-UNKNOWN: 64457407 <unknown>
vluxseg4ei64.v v8, (a0), v4
# CHECK-INST: vluxseg4ei64.v v8, (a0), v4
# CHECK-ENCODING: [0x07,0x74,0x45,0x66]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 45 66 <unknown>
+# CHECK-UNKNOWN: 66457407 <unknown>
vloxseg4ei8.v v8, (a0), v4, v0.t
# CHECK-INST: vloxseg4ei8.v v8, (a0), v4, v0.t
# CHECK-ENCODING: [0x07,0x04,0x45,0x6c]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 45 6c <unknown>
+# CHECK-UNKNOWN: 6c450407 <unknown>
vloxseg4ei8.v v8, (a0), v4
# CHECK-INST: vloxseg4ei8.v v8, (a0), v4
# CHECK-ENCODING: [0x07,0x04,0x45,0x6e]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 45 6e <unknown>
+# CHECK-UNKNOWN: 6e450407 <unknown>
vloxseg4ei16.v v8, (a0), v4, v0.t
# CHECK-INST: vloxseg4ei16.v v8, (a0), v4, v0.t
# CHECK-ENCODING: [0x07,0x54,0x45,0x6c]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 45 6c <unknown>
+# CHECK-UNKNOWN: 6c455407 <unknown>
vloxseg4ei16.v v8, (a0), v4
# CHECK-INST: vloxseg4ei16.v v8, (a0), v4
# CHECK-ENCODING: [0x07,0x54,0x45,0x6e]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 45 6e <unknown>
+# CHECK-UNKNOWN: 6e455407 <unknown>
vloxseg4ei32.v v8, (a0), v4, v0.t
# CHECK-INST: vloxseg4ei32.v v8, (a0), v4, v0.t
# CHECK-ENCODING: [0x07,0x64,0x45,0x6c]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 45 6c <unknown>
+# CHECK-UNKNOWN: 6c456407 <unknown>
vloxseg4ei32.v v8, (a0), v4
# CHECK-INST: vloxseg4ei32.v v8, (a0), v4
# CHECK-ENCODING: [0x07,0x64,0x45,0x6e]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 45 6e <unknown>
+# CHECK-UNKNOWN: 6e456407 <unknown>
vloxseg4ei64.v v8, (a0), v4, v0.t
# CHECK-INST: vloxseg4ei64.v v8, (a0), v4, v0.t
# CHECK-ENCODING: [0x07,0x74,0x45,0x6c]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 45 6c <unknown>
+# CHECK-UNKNOWN: 6c457407 <unknown>
vloxseg4ei64.v v8, (a0), v4
# CHECK-INST: vloxseg4ei64.v v8, (a0), v4
# CHECK-ENCODING: [0x07,0x74,0x45,0x6e]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 45 6e <unknown>
+# CHECK-UNKNOWN: 6e457407 <unknown>
vlseg5e8.v v8, (a0), v0.t
# CHECK-INST: vlseg5e8.v v8, (a0), v0.t
# CHECK-ENCODING: [0x07,0x04,0x05,0x80]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 05 80 <unknown>
+# CHECK-UNKNOWN: 80050407 <unknown>
vlseg5e8.v v8, (a0)
# CHECK-INST: vlseg5e8.v v8, (a0)
# CHECK-ENCODING: [0x07,0x04,0x05,0x82]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 05 82 <unknown>
+# CHECK-UNKNOWN: 82050407 <unknown>
vlseg5e16.v v8, (a0), v0.t
# CHECK-INST: vlseg5e16.v v8, (a0), v0.t
# CHECK-ENCODING: [0x07,0x54,0x05,0x80]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 05 80 <unknown>
+# CHECK-UNKNOWN: 80055407 <unknown>
vlseg5e16.v v8, (a0)
# CHECK-INST: vlseg5e16.v v8, (a0)
# CHECK-ENCODING: [0x07,0x54,0x05,0x82]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 05 82 <unknown>
+# CHECK-UNKNOWN: 82055407 <unknown>
vlseg5e32.v v8, (a0), v0.t
# CHECK-INST: vlseg5e32.v v8, (a0), v0.t
# CHECK-ENCODING: [0x07,0x64,0x05,0x80]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 05 80 <unknown>
+# CHECK-UNKNOWN: 80056407 <unknown>
vlseg5e32.v v8, (a0)
# CHECK-INST: vlseg5e32.v v8, (a0)
# CHECK-ENCODING: [0x07,0x64,0x05,0x82]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 05 82 <unknown>
+# CHECK-UNKNOWN: 82056407 <unknown>
vlseg5e64.v v8, (a0), v0.t
# CHECK-INST: vlseg5e64.v v8, (a0), v0.t
# CHECK-ENCODING: [0x07,0x74,0x05,0x80]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 05 80 <unknown>
+# CHECK-UNKNOWN: 80057407 <unknown>
vlseg5e64.v v8, (a0)
# CHECK-INST: vlseg5e64.v v8, (a0)
# CHECK-ENCODING: [0x07,0x74,0x05,0x82]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 05 82 <unknown>
+# CHECK-UNKNOWN: 82057407 <unknown>
vlseg5e8ff.v v8, (a0), v0.t
# CHECK-INST: vlseg5e8ff.v v8, (a0), v0.t
# CHECK-ENCODING: [0x07,0x04,0x05,0x81]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 05 81 <unknown>
+# CHECK-UNKNOWN: 81050407 <unknown>
vlseg5e8ff.v v8, (a0)
# CHECK-INST: vlseg5e8ff.v v8, (a0)
# CHECK-ENCODING: [0x07,0x04,0x05,0x83]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 05 83 <unknown>
+# CHECK-UNKNOWN: 83050407 <unknown>
vlseg5e16ff.v v8, (a0), v0.t
# CHECK-INST: vlseg5e16ff.v v8, (a0), v0.t
# CHECK-ENCODING: [0x07,0x54,0x05,0x81]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 05 81 <unknown>
+# CHECK-UNKNOWN: 81055407 <unknown>
vlseg5e16ff.v v8, (a0)
# CHECK-INST: vlseg5e16ff.v v8, (a0)
# CHECK-ENCODING: [0x07,0x54,0x05,0x83]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 05 83 <unknown>
+# CHECK-UNKNOWN: 83055407 <unknown>
vlseg5e32ff.v v8, (a0), v0.t
# CHECK-INST: vlseg5e32ff.v v8, (a0), v0.t
# CHECK-ENCODING: [0x07,0x64,0x05,0x81]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 05 81 <unknown>
+# CHECK-UNKNOWN: 81056407 <unknown>
vlseg5e32ff.v v8, (a0)
# CHECK-INST: vlseg5e32ff.v v8, (a0)
# CHECK-ENCODING: [0x07,0x64,0x05,0x83]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 05 83 <unknown>
+# CHECK-UNKNOWN: 83056407 <unknown>
vlseg5e64ff.v v8, (a0), v0.t
# CHECK-INST: vlseg5e64ff.v v8, (a0), v0.t
# CHECK-ENCODING: [0x07,0x74,0x05,0x81]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 05 81 <unknown>
+# CHECK-UNKNOWN: 81057407 <unknown>
vlseg5e64ff.v v8, (a0)
# CHECK-INST: vlseg5e64ff.v v8, (a0)
# CHECK-ENCODING: [0x07,0x74,0x05,0x83]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 05 83 <unknown>
+# CHECK-UNKNOWN: 83057407 <unknown>
vlsseg5e8.v v8, (a0), a1, v0.t
# CHECK-INST: vlsseg5e8.v v8, (a0), a1, v0.t
# CHECK-ENCODING: [0x07,0x04,0xb5,0x88]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 b5 88 <unknown>
+# CHECK-UNKNOWN: 88b50407 <unknown>
vlsseg5e8.v v8, (a0), a1
# CHECK-INST: vlsseg5e8.v v8, (a0), a1
# CHECK-ENCODING: [0x07,0x04,0xb5,0x8a]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 b5 8a <unknown>
+# CHECK-UNKNOWN: 8ab50407 <unknown>
vlsseg5e16.v v8, (a0), a1, v0.t
# CHECK-INST: vlsseg5e16.v v8, (a0), a1, v0.t
# CHECK-ENCODING: [0x07,0x54,0xb5,0x88]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 b5 88 <unknown>
+# CHECK-UNKNOWN: 88b55407 <unknown>
vlsseg5e16.v v8, (a0), a1
# CHECK-INST: vlsseg5e16.v v8, (a0), a1
# CHECK-ENCODING: [0x07,0x54,0xb5,0x8a]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 b5 8a <unknown>
+# CHECK-UNKNOWN: 8ab55407 <unknown>
vlsseg5e32.v v8, (a0), a1, v0.t
# CHECK-INST: vlsseg5e32.v v8, (a0), a1, v0.t
# CHECK-ENCODING: [0x07,0x64,0xb5,0x88]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 b5 88 <unknown>
+# CHECK-UNKNOWN: 88b56407 <unknown>
vlsseg5e32.v v8, (a0), a1
# CHECK-INST: vlsseg5e32.v v8, (a0), a1
# CHECK-ENCODING: [0x07,0x64,0xb5,0x8a]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 b5 8a <unknown>
+# CHECK-UNKNOWN: 8ab56407 <unknown>
vlsseg5e64.v v8, (a0), a1, v0.t
# CHECK-INST: vlsseg5e64.v v8, (a0), a1, v0.t
# CHECK-ENCODING: [0x07,0x74,0xb5,0x88]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 b5 88 <unknown>
+# CHECK-UNKNOWN: 88b57407 <unknown>
vlsseg5e64.v v8, (a0), a1
# CHECK-INST: vlsseg5e64.v v8, (a0), a1
# CHECK-ENCODING: [0x07,0x74,0xb5,0x8a]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 b5 8a <unknown>
+# CHECK-UNKNOWN: 8ab57407 <unknown>
vluxseg5ei8.v v8, (a0), v4, v0.t
# CHECK-INST: vluxseg5ei8.v v8, (a0), v4, v0.t
# CHECK-ENCODING: [0x07,0x04,0x45,0x84]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 45 84 <unknown>
+# CHECK-UNKNOWN: 84450407 <unknown>
vluxseg5ei8.v v8, (a0), v4
# CHECK-INST: vluxseg5ei8.v v8, (a0), v4
# CHECK-ENCODING: [0x07,0x04,0x45,0x86]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 45 86 <unknown>
+# CHECK-UNKNOWN: 86450407 <unknown>
vluxseg5ei16.v v8, (a0), v4, v0.t
# CHECK-INST: vluxseg5ei16.v v8, (a0), v4, v0.t
# CHECK-ENCODING: [0x07,0x54,0x45,0x84]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 45 84 <unknown>
+# CHECK-UNKNOWN: 84455407 <unknown>
vluxseg5ei16.v v8, (a0), v4
# CHECK-INST: vluxseg5ei16.v v8, (a0), v4
# CHECK-ENCODING: [0x07,0x54,0x45,0x86]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 45 86 <unknown>
+# CHECK-UNKNOWN: 86455407 <unknown>
vluxseg5ei32.v v8, (a0), v4, v0.t
# CHECK-INST: vluxseg5ei32.v v8, (a0), v4, v0.t
# CHECK-ENCODING: [0x07,0x64,0x45,0x84]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 45 84 <unknown>
+# CHECK-UNKNOWN: 84456407 <unknown>
vluxseg5ei32.v v8, (a0), v4
# CHECK-INST: vluxseg5ei32.v v8, (a0), v4
# CHECK-ENCODING: [0x07,0x64,0x45,0x86]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 45 86 <unknown>
+# CHECK-UNKNOWN: 86456407 <unknown>
vluxseg5ei64.v v8, (a0), v4, v0.t
# CHECK-INST: vluxseg5ei64.v v8, (a0), v4, v0.t
# CHECK-ENCODING: [0x07,0x74,0x45,0x84]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 45 84 <unknown>
+# CHECK-UNKNOWN: 84457407 <unknown>
vluxseg5ei64.v v8, (a0), v4
# CHECK-INST: vluxseg5ei64.v v8, (a0), v4
# CHECK-ENCODING: [0x07,0x74,0x45,0x86]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 45 86 <unknown>
+# CHECK-UNKNOWN: 86457407 <unknown>
vloxseg5ei8.v v8, (a0), v4, v0.t
# CHECK-INST: vloxseg5ei8.v v8, (a0), v4, v0.t
# CHECK-ENCODING: [0x07,0x04,0x45,0x8c]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 45 8c <unknown>
+# CHECK-UNKNOWN: 8c450407 <unknown>
vloxseg5ei8.v v8, (a0), v4
# CHECK-INST: vloxseg5ei8.v v8, (a0), v4
# CHECK-ENCODING: [0x07,0x04,0x45,0x8e]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 45 8e <unknown>
+# CHECK-UNKNOWN: 8e450407 <unknown>
vloxseg5ei16.v v8, (a0), v4, v0.t
# CHECK-INST: vloxseg5ei16.v v8, (a0), v4, v0.t
# CHECK-ENCODING: [0x07,0x54,0x45,0x8c]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 45 8c <unknown>
+# CHECK-UNKNOWN: 8c455407 <unknown>
vloxseg5ei16.v v8, (a0), v4
# CHECK-INST: vloxseg5ei16.v v8, (a0), v4
# CHECK-ENCODING: [0x07,0x54,0x45,0x8e]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 45 8e <unknown>
+# CHECK-UNKNOWN: 8e455407 <unknown>
vloxseg5ei32.v v8, (a0), v4, v0.t
# CHECK-INST: vloxseg5ei32.v v8, (a0), v4, v0.t
# CHECK-ENCODING: [0x07,0x64,0x45,0x8c]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 45 8c <unknown>
+# CHECK-UNKNOWN: 8c456407 <unknown>
vloxseg5ei32.v v8, (a0), v4
# CHECK-INST: vloxseg5ei32.v v8, (a0), v4
# CHECK-ENCODING: [0x07,0x64,0x45,0x8e]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 45 8e <unknown>
+# CHECK-UNKNOWN: 8e456407 <unknown>
vloxseg5ei64.v v8, (a0), v4, v0.t
# CHECK-INST: vloxseg5ei64.v v8, (a0), v4, v0.t
# CHECK-ENCODING: [0x07,0x74,0x45,0x8c]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 45 8c <unknown>
+# CHECK-UNKNOWN: 8c457407 <unknown>
vloxseg5ei64.v v8, (a0), v4
# CHECK-INST: vloxseg5ei64.v v8, (a0), v4
# CHECK-ENCODING: [0x07,0x74,0x45,0x8e]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 45 8e <unknown>
+# CHECK-UNKNOWN: 8e457407 <unknown>
vlseg6e8.v v8, (a0), v0.t
# CHECK-INST: vlseg6e8.v v8, (a0), v0.t
# CHECK-ENCODING: [0x07,0x04,0x05,0xa0]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 05 a0 <unknown>
+# CHECK-UNKNOWN: a0050407 <unknown>
vlseg6e8.v v8, (a0)
# CHECK-INST: vlseg6e8.v v8, (a0)
# CHECK-ENCODING: [0x07,0x04,0x05,0xa2]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 05 a2 <unknown>
+# CHECK-UNKNOWN: a2050407 <unknown>
vlseg6e16.v v8, (a0), v0.t
# CHECK-INST: vlseg6e16.v v8, (a0), v0.t
# CHECK-ENCODING: [0x07,0x54,0x05,0xa0]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 05 a0 <unknown>
+# CHECK-UNKNOWN: a0055407 <unknown>
vlseg6e16.v v8, (a0)
# CHECK-INST: vlseg6e16.v v8, (a0)
# CHECK-ENCODING: [0x07,0x54,0x05,0xa2]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 05 a2 <unknown>
+# CHECK-UNKNOWN: a2055407 <unknown>
vlseg6e32.v v8, (a0), v0.t
# CHECK-INST: vlseg6e32.v v8, (a0), v0.t
# CHECK-ENCODING: [0x07,0x64,0x05,0xa0]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 05 a0 <unknown>
+# CHECK-UNKNOWN: a0056407 <unknown>
vlseg6e32.v v8, (a0)
# CHECK-INST: vlseg6e32.v v8, (a0)
# CHECK-ENCODING: [0x07,0x64,0x05,0xa2]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 05 a2 <unknown>
+# CHECK-UNKNOWN: a2056407 <unknown>
vlseg6e64.v v8, (a0), v0.t
# CHECK-INST: vlseg6e64.v v8, (a0), v0.t
# CHECK-ENCODING: [0x07,0x74,0x05,0xa0]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 05 a0 <unknown>
+# CHECK-UNKNOWN: a0057407 <unknown>
vlseg6e64.v v8, (a0)
# CHECK-INST: vlseg6e64.v v8, (a0)
# CHECK-ENCODING: [0x07,0x74,0x05,0xa2]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 05 a2 <unknown>
+# CHECK-UNKNOWN: a2057407 <unknown>
vlseg6e8ff.v v8, (a0), v0.t
# CHECK-INST: vlseg6e8ff.v v8, (a0), v0.t
# CHECK-ENCODING: [0x07,0x04,0x05,0xa1]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 05 a1 <unknown>
+# CHECK-UNKNOWN: a1050407 <unknown>
vlseg6e8ff.v v8, (a0)
# CHECK-INST: vlseg6e8ff.v v8, (a0)
# CHECK-ENCODING: [0x07,0x04,0x05,0xa3]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 05 a3 <unknown>
+# CHECK-UNKNOWN: a3050407 <unknown>
vlseg6e16ff.v v8, (a0), v0.t
# CHECK-INST: vlseg6e16ff.v v8, (a0), v0.t
# CHECK-ENCODING: [0x07,0x54,0x05,0xa1]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 05 a1 <unknown>
+# CHECK-UNKNOWN: a1055407 <unknown>
vlseg6e16ff.v v8, (a0)
# CHECK-INST: vlseg6e16ff.v v8, (a0)
# CHECK-ENCODING: [0x07,0x54,0x05,0xa3]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 05 a3 <unknown>
+# CHECK-UNKNOWN: a3055407 <unknown>
vlseg6e32ff.v v8, (a0), v0.t
# CHECK-INST: vlseg6e32ff.v v8, (a0), v0.t
# CHECK-ENCODING: [0x07,0x64,0x05,0xa1]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 05 a1 <unknown>
+# CHECK-UNKNOWN: a1056407 <unknown>
vlseg6e32ff.v v8, (a0)
# CHECK-INST: vlseg6e32ff.v v8, (a0)
# CHECK-ENCODING: [0x07,0x64,0x05,0xa3]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 05 a3 <unknown>
+# CHECK-UNKNOWN: a3056407 <unknown>
vlseg6e64ff.v v8, (a0), v0.t
# CHECK-INST: vlseg6e64ff.v v8, (a0), v0.t
# CHECK-ENCODING: [0x07,0x74,0x05,0xa1]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 05 a1 <unknown>
+# CHECK-UNKNOWN: a1057407 <unknown>
vlseg6e64ff.v v8, (a0)
# CHECK-INST: vlseg6e64ff.v v8, (a0)
# CHECK-ENCODING: [0x07,0x74,0x05,0xa3]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 05 a3 <unknown>
+# CHECK-UNKNOWN: a3057407 <unknown>
vlsseg6e8.v v8, (a0), a1, v0.t
# CHECK-INST: vlsseg6e8.v v8, (a0), a1, v0.t
# CHECK-ENCODING: [0x07,0x04,0xb5,0xa8]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 b5 a8 <unknown>
+# CHECK-UNKNOWN: a8b50407 <unknown>
vlsseg6e8.v v8, (a0), a1
# CHECK-INST: vlsseg6e8.v v8, (a0), a1
# CHECK-ENCODING: [0x07,0x04,0xb5,0xaa]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 b5 aa <unknown>
+# CHECK-UNKNOWN: aab50407 <unknown>
vlsseg6e16.v v8, (a0), a1, v0.t
# CHECK-INST: vlsseg6e16.v v8, (a0), a1, v0.t
# CHECK-ENCODING: [0x07,0x54,0xb5,0xa8]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 b5 a8 <unknown>
+# CHECK-UNKNOWN: a8b55407 <unknown>
vlsseg6e16.v v8, (a0), a1
# CHECK-INST: vlsseg6e16.v v8, (a0), a1
# CHECK-ENCODING: [0x07,0x54,0xb5,0xaa]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 b5 aa <unknown>
+# CHECK-UNKNOWN: aab55407 <unknown>
vlsseg6e32.v v8, (a0), a1, v0.t
# CHECK-INST: vlsseg6e32.v v8, (a0), a1, v0.t
# CHECK-ENCODING: [0x07,0x64,0xb5,0xa8]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 b5 a8 <unknown>
+# CHECK-UNKNOWN: a8b56407 <unknown>
vlsseg6e32.v v8, (a0), a1
# CHECK-INST: vlsseg6e32.v v8, (a0), a1
# CHECK-ENCODING: [0x07,0x64,0xb5,0xaa]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 b5 aa <unknown>
+# CHECK-UNKNOWN: aab56407 <unknown>
vlsseg6e64.v v8, (a0), a1, v0.t
# CHECK-INST: vlsseg6e64.v v8, (a0), a1, v0.t
# CHECK-ENCODING: [0x07,0x74,0xb5,0xa8]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 b5 a8 <unknown>
+# CHECK-UNKNOWN: a8b57407 <unknown>
vlsseg6e64.v v8, (a0), a1
# CHECK-INST: vlsseg6e64.v v8, (a0), a1
# CHECK-ENCODING: [0x07,0x74,0xb5,0xaa]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 b5 aa <unknown>
+# CHECK-UNKNOWN: aab57407 <unknown>
vluxseg6ei8.v v8, (a0), v4, v0.t
# CHECK-INST: vluxseg6ei8.v v8, (a0), v4, v0.t
# CHECK-ENCODING: [0x07,0x04,0x45,0xa4]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 45 a4 <unknown>
+# CHECK-UNKNOWN: a4450407 <unknown>
vluxseg6ei8.v v8, (a0), v4
# CHECK-INST: vluxseg6ei8.v v8, (a0), v4
# CHECK-ENCODING: [0x07,0x04,0x45,0xa6]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 45 a6 <unknown>
+# CHECK-UNKNOWN: a6450407 <unknown>
vluxseg6ei16.v v8, (a0), v4, v0.t
# CHECK-INST: vluxseg6ei16.v v8, (a0), v4, v0.t
# CHECK-ENCODING: [0x07,0x54,0x45,0xa4]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 45 a4 <unknown>
+# CHECK-UNKNOWN: a4455407 <unknown>
vluxseg6ei16.v v8, (a0), v4
# CHECK-INST: vluxseg6ei16.v v8, (a0), v4
# CHECK-ENCODING: [0x07,0x54,0x45,0xa6]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 45 a6 <unknown>
+# CHECK-UNKNOWN: a6455407 <unknown>
vluxseg6ei32.v v8, (a0), v4, v0.t
# CHECK-INST: vluxseg6ei32.v v8, (a0), v4, v0.t
# CHECK-ENCODING: [0x07,0x64,0x45,0xa4]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 45 a4 <unknown>
+# CHECK-UNKNOWN: a4456407 <unknown>
vluxseg6ei32.v v8, (a0), v4
# CHECK-INST: vluxseg6ei32.v v8, (a0), v4
# CHECK-ENCODING: [0x07,0x64,0x45,0xa6]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 45 a6 <unknown>
+# CHECK-UNKNOWN: a6456407 <unknown>
vluxseg6ei64.v v8, (a0), v4, v0.t
# CHECK-INST: vluxseg6ei64.v v8, (a0), v4, v0.t
# CHECK-ENCODING: [0x07,0x74,0x45,0xa4]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 45 a4 <unknown>
+# CHECK-UNKNOWN: a4457407 <unknown>
vluxseg6ei64.v v8, (a0), v4
# CHECK-INST: vluxseg6ei64.v v8, (a0), v4
# CHECK-ENCODING: [0x07,0x74,0x45,0xa6]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 45 a6 <unknown>
+# CHECK-UNKNOWN: a6457407 <unknown>
vloxseg6ei8.v v8, (a0), v4, v0.t
# CHECK-INST: vloxseg6ei8.v v8, (a0), v4, v0.t
# CHECK-ENCODING: [0x07,0x04,0x45,0xac]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 45 ac <unknown>
+# CHECK-UNKNOWN: ac450407 <unknown>
vloxseg6ei8.v v8, (a0), v4
# CHECK-INST: vloxseg6ei8.v v8, (a0), v4
# CHECK-ENCODING: [0x07,0x04,0x45,0xae]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 45 ae <unknown>
+# CHECK-UNKNOWN: ae450407 <unknown>
vloxseg6ei16.v v8, (a0), v4, v0.t
# CHECK-INST: vloxseg6ei16.v v8, (a0), v4, v0.t
# CHECK-ENCODING: [0x07,0x54,0x45,0xac]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 45 ac <unknown>
+# CHECK-UNKNOWN: ac455407 <unknown>
vloxseg6ei16.v v8, (a0), v4
# CHECK-INST: vloxseg6ei16.v v8, (a0), v4
# CHECK-ENCODING: [0x07,0x54,0x45,0xae]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 45 ae <unknown>
+# CHECK-UNKNOWN: ae455407 <unknown>
vloxseg6ei32.v v8, (a0), v4, v0.t
# CHECK-INST: vloxseg6ei32.v v8, (a0), v4, v0.t
# CHECK-ENCODING: [0x07,0x64,0x45,0xac]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 45 ac <unknown>
+# CHECK-UNKNOWN: ac456407 <unknown>
vloxseg6ei32.v v8, (a0), v4
# CHECK-INST: vloxseg6ei32.v v8, (a0), v4
# CHECK-ENCODING: [0x07,0x64,0x45,0xae]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 45 ae <unknown>
+# CHECK-UNKNOWN: ae456407 <unknown>
vloxseg6ei64.v v8, (a0), v4, v0.t
# CHECK-INST: vloxseg6ei64.v v8, (a0), v4, v0.t
# CHECK-ENCODING: [0x07,0x74,0x45,0xac]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 45 ac <unknown>
+# CHECK-UNKNOWN: ac457407 <unknown>
vloxseg6ei64.v v8, (a0), v4
# CHECK-INST: vloxseg6ei64.v v8, (a0), v4
# CHECK-ENCODING: [0x07,0x74,0x45,0xae]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 45 ae <unknown>
+# CHECK-UNKNOWN: ae457407 <unknown>
vlseg7e8.v v8, (a0), v0.t
# CHECK-INST: vlseg7e8.v v8, (a0), v0.t
# CHECK-ENCODING: [0x07,0x04,0x05,0xc0]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 05 c0 <unknown>
+# CHECK-UNKNOWN: c0050407 <unknown>
vlseg7e8.v v8, (a0)
# CHECK-INST: vlseg7e8.v v8, (a0)
# CHECK-ENCODING: [0x07,0x04,0x05,0xc2]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 05 c2 <unknown>
+# CHECK-UNKNOWN: c2050407 <unknown>
vlseg7e16.v v8, (a0), v0.t
# CHECK-INST: vlseg7e16.v v8, (a0), v0.t
# CHECK-ENCODING: [0x07,0x54,0x05,0xc0]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 05 c0 <unknown>
+# CHECK-UNKNOWN: c0055407 <unknown>
vlseg7e16.v v8, (a0)
# CHECK-INST: vlseg7e16.v v8, (a0)
# CHECK-ENCODING: [0x07,0x54,0x05,0xc2]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 05 c2 <unknown>
+# CHECK-UNKNOWN: c2055407 <unknown>
vlseg7e32.v v8, (a0), v0.t
# CHECK-INST: vlseg7e32.v v8, (a0), v0.t
# CHECK-ENCODING: [0x07,0x64,0x05,0xc0]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 05 c0 <unknown>
+# CHECK-UNKNOWN: c0056407 <unknown>
vlseg7e32.v v8, (a0)
# CHECK-INST: vlseg7e32.v v8, (a0)
# CHECK-ENCODING: [0x07,0x64,0x05,0xc2]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 05 c2 <unknown>
+# CHECK-UNKNOWN: c2056407 <unknown>
vlseg7e64.v v8, (a0), v0.t
# CHECK-INST: vlseg7e64.v v8, (a0), v0.t
# CHECK-ENCODING: [0x07,0x74,0x05,0xc0]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 05 c0 <unknown>
+# CHECK-UNKNOWN: c0057407 <unknown>
vlseg7e64.v v8, (a0)
# CHECK-INST: vlseg7e64.v v8, (a0)
# CHECK-ENCODING: [0x07,0x74,0x05,0xc2]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 05 c2 <unknown>
+# CHECK-UNKNOWN: c2057407 <unknown>
vlseg7e8ff.v v8, (a0), v0.t
# CHECK-INST: vlseg7e8ff.v v8, (a0), v0.t
# CHECK-ENCODING: [0x07,0x04,0x05,0xc1]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 05 c1 <unknown>
+# CHECK-UNKNOWN: c1050407 <unknown>
vlseg7e8ff.v v8, (a0)
# CHECK-INST: vlseg7e8ff.v v8, (a0)
# CHECK-ENCODING: [0x07,0x04,0x05,0xc3]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 05 c3 <unknown>
+# CHECK-UNKNOWN: c3050407 <unknown>
vlseg7e16ff.v v8, (a0), v0.t
# CHECK-INST: vlseg7e16ff.v v8, (a0), v0.t
# CHECK-ENCODING: [0x07,0x54,0x05,0xc1]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 05 c1 <unknown>
+# CHECK-UNKNOWN: c1055407 <unknown>
vlseg7e16ff.v v8, (a0)
# CHECK-INST: vlseg7e16ff.v v8, (a0)
# CHECK-ENCODING: [0x07,0x54,0x05,0xc3]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 05 c3 <unknown>
+# CHECK-UNKNOWN: c3055407 <unknown>
vlseg7e32ff.v v8, (a0), v0.t
# CHECK-INST: vlseg7e32ff.v v8, (a0), v0.t
# CHECK-ENCODING: [0x07,0x64,0x05,0xc1]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 05 c1 <unknown>
+# CHECK-UNKNOWN: c1056407 <unknown>
vlseg7e32ff.v v8, (a0)
# CHECK-INST: vlseg7e32ff.v v8, (a0)
# CHECK-ENCODING: [0x07,0x64,0x05,0xc3]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 05 c3 <unknown>
+# CHECK-UNKNOWN: c3056407 <unknown>
vlseg7e64ff.v v8, (a0), v0.t
# CHECK-INST: vlseg7e64ff.v v8, (a0), v0.t
# CHECK-ENCODING: [0x07,0x74,0x05,0xc1]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 05 c1 <unknown>
+# CHECK-UNKNOWN: c1057407 <unknown>
vlseg7e64ff.v v8, (a0)
# CHECK-INST: vlseg7e64ff.v v8, (a0)
# CHECK-ENCODING: [0x07,0x74,0x05,0xc3]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 05 c3 <unknown>
+# CHECK-UNKNOWN: c3057407 <unknown>
vlsseg7e8.v v8, (a0), a1, v0.t
# CHECK-INST: vlsseg7e8.v v8, (a0), a1, v0.t
# CHECK-ENCODING: [0x07,0x04,0xb5,0xc8]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 b5 c8 <unknown>
+# CHECK-UNKNOWN: c8b50407 <unknown>
vlsseg7e8.v v8, (a0), a1
# CHECK-INST: vlsseg7e8.v v8, (a0), a1
# CHECK-ENCODING: [0x07,0x04,0xb5,0xca]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 b5 ca <unknown>
+# CHECK-UNKNOWN: cab50407 <unknown>
vlsseg7e16.v v8, (a0), a1, v0.t
# CHECK-INST: vlsseg7e16.v v8, (a0), a1, v0.t
# CHECK-ENCODING: [0x07,0x54,0xb5,0xc8]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 b5 c8 <unknown>
+# CHECK-UNKNOWN: c8b55407 <unknown>
vlsseg7e16.v v8, (a0), a1
# CHECK-INST: vlsseg7e16.v v8, (a0), a1
# CHECK-ENCODING: [0x07,0x54,0xb5,0xca]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 b5 ca <unknown>
+# CHECK-UNKNOWN: cab55407 <unknown>
vlsseg7e32.v v8, (a0), a1, v0.t
# CHECK-INST: vlsseg7e32.v v8, (a0), a1, v0.t
# CHECK-ENCODING: [0x07,0x64,0xb5,0xc8]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 b5 c8 <unknown>
+# CHECK-UNKNOWN: c8b56407 <unknown>
vlsseg7e32.v v8, (a0), a1
# CHECK-INST: vlsseg7e32.v v8, (a0), a1
# CHECK-ENCODING: [0x07,0x64,0xb5,0xca]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 b5 ca <unknown>
+# CHECK-UNKNOWN: cab56407 <unknown>
vlsseg7e64.v v8, (a0), a1, v0.t
# CHECK-INST: vlsseg7e64.v v8, (a0), a1, v0.t
# CHECK-ENCODING: [0x07,0x74,0xb5,0xc8]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 b5 c8 <unknown>
+# CHECK-UNKNOWN: c8b57407 <unknown>
vlsseg7e64.v v8, (a0), a1
# CHECK-INST: vlsseg7e64.v v8, (a0), a1
# CHECK-ENCODING: [0x07,0x74,0xb5,0xca]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 b5 ca <unknown>
+# CHECK-UNKNOWN: cab57407 <unknown>
vluxseg7ei8.v v8, (a0), v4, v0.t
# CHECK-INST: vluxseg7ei8.v v8, (a0), v4, v0.t
# CHECK-ENCODING: [0x07,0x04,0x45,0xc4]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 45 c4 <unknown>
+# CHECK-UNKNOWN: c4450407 <unknown>
vluxseg7ei8.v v8, (a0), v4
# CHECK-INST: vluxseg7ei8.v v8, (a0), v4
# CHECK-ENCODING: [0x07,0x04,0x45,0xc6]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 45 c6 <unknown>
+# CHECK-UNKNOWN: c6450407 <unknown>
vluxseg7ei16.v v8, (a0), v4, v0.t
# CHECK-INST: vluxseg7ei16.v v8, (a0), v4, v0.t
# CHECK-ENCODING: [0x07,0x54,0x45,0xc4]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 45 c4 <unknown>
+# CHECK-UNKNOWN: c4455407 <unknown>
vluxseg7ei16.v v8, (a0), v4
# CHECK-INST: vluxseg7ei16.v v8, (a0), v4
# CHECK-ENCODING: [0x07,0x54,0x45,0xc6]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 45 c6 <unknown>
+# CHECK-UNKNOWN: c6455407 <unknown>
vluxseg7ei32.v v8, (a0), v4, v0.t
# CHECK-INST: vluxseg7ei32.v v8, (a0), v4, v0.t
# CHECK-ENCODING: [0x07,0x64,0x45,0xc4]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 45 c4 <unknown>
+# CHECK-UNKNOWN: c4456407 <unknown>
vluxseg7ei32.v v8, (a0), v4
# CHECK-INST: vluxseg7ei32.v v8, (a0), v4
# CHECK-ENCODING: [0x07,0x64,0x45,0xc6]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 45 c6 <unknown>
+# CHECK-UNKNOWN: c6456407 <unknown>
vluxseg7ei64.v v8, (a0), v4, v0.t
# CHECK-INST: vluxseg7ei64.v v8, (a0), v4, v0.t
# CHECK-ENCODING: [0x07,0x74,0x45,0xc4]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 45 c4 <unknown>
+# CHECK-UNKNOWN: c4457407 <unknown>
vluxseg7ei64.v v8, (a0), v4
# CHECK-INST: vluxseg7ei64.v v8, (a0), v4
# CHECK-ENCODING: [0x07,0x74,0x45,0xc6]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 45 c6 <unknown>
+# CHECK-UNKNOWN: c6457407 <unknown>
vloxseg7ei8.v v8, (a0), v4, v0.t
# CHECK-INST: vloxseg7ei8.v v8, (a0), v4, v0.t
# CHECK-ENCODING: [0x07,0x04,0x45,0xcc]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 45 cc <unknown>
+# CHECK-UNKNOWN: cc450407 <unknown>
vloxseg7ei8.v v8, (a0), v4
# CHECK-INST: vloxseg7ei8.v v8, (a0), v4
# CHECK-ENCODING: [0x07,0x04,0x45,0xce]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 45 ce <unknown>
+# CHECK-UNKNOWN: ce450407 <unknown>
vloxseg7ei16.v v8, (a0), v4, v0.t
# CHECK-INST: vloxseg7ei16.v v8, (a0), v4, v0.t
# CHECK-ENCODING: [0x07,0x54,0x45,0xcc]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 45 cc <unknown>
+# CHECK-UNKNOWN: cc455407 <unknown>
vloxseg7ei16.v v8, (a0), v4
# CHECK-INST: vloxseg7ei16.v v8, (a0), v4
# CHECK-ENCODING: [0x07,0x54,0x45,0xce]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 45 ce <unknown>
+# CHECK-UNKNOWN: ce455407 <unknown>
vloxseg7ei32.v v8, (a0), v4, v0.t
# CHECK-INST: vloxseg7ei32.v v8, (a0), v4, v0.t
# CHECK-ENCODING: [0x07,0x64,0x45,0xcc]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 45 cc <unknown>
+# CHECK-UNKNOWN: cc456407 <unknown>
vloxseg7ei32.v v8, (a0), v4
# CHECK-INST: vloxseg7ei32.v v8, (a0), v4
# CHECK-ENCODING: [0x07,0x64,0x45,0xce]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 45 ce <unknown>
+# CHECK-UNKNOWN: ce456407 <unknown>
vloxseg7ei64.v v8, (a0), v4, v0.t
# CHECK-INST: vloxseg7ei64.v v8, (a0), v4, v0.t
# CHECK-ENCODING: [0x07,0x74,0x45,0xcc]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 45 cc <unknown>
+# CHECK-UNKNOWN: cc457407 <unknown>
vloxseg7ei64.v v8, (a0), v4
# CHECK-INST: vloxseg7ei64.v v8, (a0), v4
# CHECK-ENCODING: [0x07,0x74,0x45,0xce]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 45 ce <unknown>
+# CHECK-UNKNOWN: ce457407 <unknown>
vlseg8e8.v v8, (a0), v0.t
# CHECK-INST: vlseg8e8.v v8, (a0), v0.t
# CHECK-ENCODING: [0x07,0x04,0x05,0xe0]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 05 e0 <unknown>
+# CHECK-UNKNOWN: e0050407 <unknown>
vlseg8e8.v v8, (a0)
# CHECK-INST: vlseg8e8.v v8, (a0)
# CHECK-ENCODING: [0x07,0x04,0x05,0xe2]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 05 e2 <unknown>
+# CHECK-UNKNOWN: e2050407 <unknown>
vlseg8e16.v v8, (a0), v0.t
# CHECK-INST: vlseg8e16.v v8, (a0), v0.t
# CHECK-ENCODING: [0x07,0x54,0x05,0xe0]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 05 e0 <unknown>
+# CHECK-UNKNOWN: e0055407 <unknown>
vlseg8e16.v v8, (a0)
# CHECK-INST: vlseg8e16.v v8, (a0)
# CHECK-ENCODING: [0x07,0x54,0x05,0xe2]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 05 e2 <unknown>
+# CHECK-UNKNOWN: e2055407 <unknown>
vlseg8e32.v v8, (a0), v0.t
# CHECK-INST: vlseg8e32.v v8, (a0), v0.t
# CHECK-ENCODING: [0x07,0x64,0x05,0xe0]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 05 e0 <unknown>
+# CHECK-UNKNOWN: e0056407 <unknown>
vlseg8e32.v v8, (a0)
# CHECK-INST: vlseg8e32.v v8, (a0)
# CHECK-ENCODING: [0x07,0x64,0x05,0xe2]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 05 e2 <unknown>
+# CHECK-UNKNOWN: e2056407 <unknown>
vlseg8e64.v v8, (a0), v0.t
# CHECK-INST: vlseg8e64.v v8, (a0), v0.t
# CHECK-ENCODING: [0x07,0x74,0x05,0xe0]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 05 e0 <unknown>
+# CHECK-UNKNOWN: e0057407 <unknown>
vlseg8e64.v v8, (a0)
# CHECK-INST: vlseg8e64.v v8, (a0)
# CHECK-ENCODING: [0x07,0x74,0x05,0xe2]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 05 e2 <unknown>
+# CHECK-UNKNOWN: e2057407 <unknown>
vlseg8e8ff.v v8, (a0), v0.t
# CHECK-INST: vlseg8e8ff.v v8, (a0), v0.t
# CHECK-ENCODING: [0x07,0x04,0x05,0xe1]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 05 e1 <unknown>
+# CHECK-UNKNOWN: e1050407 <unknown>
vlseg8e8ff.v v8, (a0)
# CHECK-INST: vlseg8e8ff.v v8, (a0)
# CHECK-ENCODING: [0x07,0x04,0x05,0xe3]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 05 e3 <unknown>
+# CHECK-UNKNOWN: e3050407 <unknown>
vlseg8e16ff.v v8, (a0), v0.t
# CHECK-INST: vlseg8e16ff.v v8, (a0), v0.t
# CHECK-ENCODING: [0x07,0x54,0x05,0xe1]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 05 e1 <unknown>
+# CHECK-UNKNOWN: e1055407 <unknown>
vlseg8e16ff.v v8, (a0)
# CHECK-INST: vlseg8e16ff.v v8, (a0)
# CHECK-ENCODING: [0x07,0x54,0x05,0xe3]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 05 e3 <unknown>
+# CHECK-UNKNOWN: e3055407 <unknown>
vlseg8e32ff.v v8, (a0), v0.t
# CHECK-INST: vlseg8e32ff.v v8, (a0), v0.t
# CHECK-ENCODING: [0x07,0x64,0x05,0xe1]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 05 e1 <unknown>
+# CHECK-UNKNOWN: e1056407 <unknown>
vlseg8e32ff.v v8, (a0)
# CHECK-INST: vlseg8e32ff.v v8, (a0)
# CHECK-ENCODING: [0x07,0x64,0x05,0xe3]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 05 e3 <unknown>
+# CHECK-UNKNOWN: e3056407 <unknown>
vlseg8e64ff.v v8, (a0), v0.t
# CHECK-INST: vlseg8e64ff.v v8, (a0), v0.t
# CHECK-ENCODING: [0x07,0x74,0x05,0xe1]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 05 e1 <unknown>
+# CHECK-UNKNOWN: e1057407 <unknown>
vlseg8e64ff.v v8, (a0)
# CHECK-INST: vlseg8e64ff.v v8, (a0)
# CHECK-ENCODING: [0x07,0x74,0x05,0xe3]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 05 e3 <unknown>
+# CHECK-UNKNOWN: e3057407 <unknown>
vlsseg8e8.v v8, (a0), a1, v0.t
# CHECK-INST: vlsseg8e8.v v8, (a0), a1, v0.t
# CHECK-ENCODING: [0x07,0x04,0xb5,0xe8]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 b5 e8 <unknown>
+# CHECK-UNKNOWN: e8b50407 <unknown>
vlsseg8e8.v v8, (a0), a1
# CHECK-INST: vlsseg8e8.v v8, (a0), a1
# CHECK-ENCODING: [0x07,0x04,0xb5,0xea]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 b5 ea <unknown>
+# CHECK-UNKNOWN: eab50407 <unknown>
vlsseg8e16.v v8, (a0), a1, v0.t
# CHECK-INST: vlsseg8e16.v v8, (a0), a1, v0.t
# CHECK-ENCODING: [0x07,0x54,0xb5,0xe8]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 b5 e8 <unknown>
+# CHECK-UNKNOWN: e8b55407 <unknown>
vlsseg8e16.v v8, (a0), a1
# CHECK-INST: vlsseg8e16.v v8, (a0), a1
# CHECK-ENCODING: [0x07,0x54,0xb5,0xea]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 b5 ea <unknown>
+# CHECK-UNKNOWN: eab55407 <unknown>
vlsseg8e32.v v8, (a0), a1, v0.t
# CHECK-INST: vlsseg8e32.v v8, (a0), a1, v0.t
# CHECK-ENCODING: [0x07,0x64,0xb5,0xe8]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 b5 e8 <unknown>
+# CHECK-UNKNOWN: e8b56407 <unknown>
vlsseg8e32.v v8, (a0), a1
# CHECK-INST: vlsseg8e32.v v8, (a0), a1
# CHECK-ENCODING: [0x07,0x64,0xb5,0xea]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 b5 ea <unknown>
+# CHECK-UNKNOWN: eab56407 <unknown>
vlsseg8e64.v v8, (a0), a1, v0.t
# CHECK-INST: vlsseg8e64.v v8, (a0), a1, v0.t
# CHECK-ENCODING: [0x07,0x74,0xb5,0xe8]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 b5 e8 <unknown>
+# CHECK-UNKNOWN: e8b57407 <unknown>
vlsseg8e64.v v8, (a0), a1
# CHECK-INST: vlsseg8e64.v v8, (a0), a1
# CHECK-ENCODING: [0x07,0x74,0xb5,0xea]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 b5 ea <unknown>
+# CHECK-UNKNOWN: eab57407 <unknown>
vluxseg8ei8.v v8, (a0), v4, v0.t
# CHECK-INST: vluxseg8ei8.v v8, (a0), v4, v0.t
# CHECK-ENCODING: [0x07,0x04,0x45,0xe4]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 45 e4 <unknown>
+# CHECK-UNKNOWN: e4450407 <unknown>
vluxseg8ei8.v v8, (a0), v4
# CHECK-INST: vluxseg8ei8.v v8, (a0), v4
# CHECK-ENCODING: [0x07,0x04,0x45,0xe6]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 45 e6 <unknown>
+# CHECK-UNKNOWN: e6450407 <unknown>
vluxseg8ei16.v v8, (a0), v4, v0.t
# CHECK-INST: vluxseg8ei16.v v8, (a0), v4, v0.t
# CHECK-ENCODING: [0x07,0x54,0x45,0xe4]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 45 e4 <unknown>
+# CHECK-UNKNOWN: e4455407 <unknown>
vluxseg8ei16.v v8, (a0), v4
# CHECK-INST: vluxseg8ei16.v v8, (a0), v4
# CHECK-ENCODING: [0x07,0x54,0x45,0xe6]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 45 e6 <unknown>
+# CHECK-UNKNOWN: e6455407 <unknown>
vluxseg8ei32.v v8, (a0), v4, v0.t
# CHECK-INST: vluxseg8ei32.v v8, (a0), v4, v0.t
# CHECK-ENCODING: [0x07,0x64,0x45,0xe4]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 45 e4 <unknown>
+# CHECK-UNKNOWN: e4456407 <unknown>
vluxseg8ei32.v v8, (a0), v4
# CHECK-INST: vluxseg8ei32.v v8, (a0), v4
# CHECK-ENCODING: [0x07,0x64,0x45,0xe6]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 45 e6 <unknown>
+# CHECK-UNKNOWN: e6456407 <unknown>
vluxseg8ei64.v v8, (a0), v4, v0.t
# CHECK-INST: vluxseg8ei64.v v8, (a0), v4, v0.t
# CHECK-ENCODING: [0x07,0x74,0x45,0xe4]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 45 e4 <unknown>
+# CHECK-UNKNOWN: e4457407 <unknown>
vluxseg8ei64.v v8, (a0), v4
# CHECK-INST: vluxseg8ei64.v v8, (a0), v4
# CHECK-ENCODING: [0x07,0x74,0x45,0xe6]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 45 e6 <unknown>
+# CHECK-UNKNOWN: e6457407 <unknown>
vloxseg8ei8.v v8, (a0), v4, v0.t
# CHECK-INST: vloxseg8ei8.v v8, (a0), v4, v0.t
# CHECK-ENCODING: [0x07,0x04,0x45,0xec]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 45 ec <unknown>
+# CHECK-UNKNOWN: ec450407 <unknown>
vloxseg8ei8.v v8, (a0), v4
# CHECK-INST: vloxseg8ei8.v v8, (a0), v4
# CHECK-ENCODING: [0x07,0x04,0x45,0xee]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 45 ee <unknown>
+# CHECK-UNKNOWN: ee450407 <unknown>
vloxseg8ei16.v v8, (a0), v4, v0.t
# CHECK-INST: vloxseg8ei16.v v8, (a0), v4, v0.t
# CHECK-ENCODING: [0x07,0x54,0x45,0xec]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 45 ec <unknown>
+# CHECK-UNKNOWN: ec455407 <unknown>
vloxseg8ei16.v v8, (a0), v4
# CHECK-INST: vloxseg8ei16.v v8, (a0), v4
# CHECK-ENCODING: [0x07,0x54,0x45,0xee]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 45 ee <unknown>
+# CHECK-UNKNOWN: ee455407 <unknown>
vloxseg8ei32.v v8, (a0), v4, v0.t
# CHECK-INST: vloxseg8ei32.v v8, (a0), v4, v0.t
# CHECK-ENCODING: [0x07,0x64,0x45,0xec]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 45 ec <unknown>
+# CHECK-UNKNOWN: ec456407 <unknown>
vloxseg8ei32.v v8, (a0), v4
# CHECK-INST: vloxseg8ei32.v v8, (a0), v4
# CHECK-ENCODING: [0x07,0x64,0x45,0xee]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 64 45 ee <unknown>
+# CHECK-UNKNOWN: ee456407 <unknown>
vloxseg8ei64.v v8, (a0), v4, v0.t
# CHECK-INST: vloxseg8ei64.v v8, (a0), v4, v0.t
# CHECK-ENCODING: [0x07,0x74,0x45,0xec]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 45 ec <unknown>
+# CHECK-UNKNOWN: ec457407 <unknown>
vloxseg8ei64.v v8, (a0), v4
# CHECK-INST: vloxseg8ei64.v v8, (a0), v4
# CHECK-ENCODING: [0x07,0x74,0x45,0xee]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 45 ee <unknown>
+# CHECK-UNKNOWN: ee457407 <unknown>
vsseg2e8.v v24, (a0), v0.t
# CHECK-INST: vsseg2e8.v v24, (a0), v0.t
# CHECK-ENCODING: [0x27,0x0c,0x05,0x20]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 05 20 <unknown>
+# CHECK-UNKNOWN: 20050c27 <unknown>
vsseg2e8.v v24, (a0)
# CHECK-INST: vsseg2e8.v v24, (a0)
# CHECK-ENCODING: [0x27,0x0c,0x05,0x22]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 05 22 <unknown>
+# CHECK-UNKNOWN: 22050c27 <unknown>
vsseg2e16.v v24, (a0), v0.t
# CHECK-INST: vsseg2e16.v v24, (a0), v0.t
# CHECK-ENCODING: [0x27,0x5c,0x05,0x20]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 05 20 <unknown>
+# CHECK-UNKNOWN: 20055c27 <unknown>
vsseg2e16.v v24, (a0)
# CHECK-INST: vsseg2e16.v v24, (a0)
# CHECK-ENCODING: [0x27,0x5c,0x05,0x22]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 05 22 <unknown>
+# CHECK-UNKNOWN: 22055c27 <unknown>
vsseg2e32.v v24, (a0), v0.t
# CHECK-INST: vsseg2e32.v v24, (a0), v0.t
# CHECK-ENCODING: [0x27,0x6c,0x05,0x20]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 05 20 <unknown>
+# CHECK-UNKNOWN: 20056c27 <unknown>
vsseg2e32.v v24, (a0)
# CHECK-INST: vsseg2e32.v v24, (a0)
# CHECK-ENCODING: [0x27,0x6c,0x05,0x22]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 05 22 <unknown>
+# CHECK-UNKNOWN: 22056c27 <unknown>
vsseg2e64.v v24, (a0), v0.t
# CHECK-INST: vsseg2e64.v v24, (a0), v0.t
# CHECK-ENCODING: [0x27,0x7c,0x05,0x20]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 05 20 <unknown>
+# CHECK-UNKNOWN: 20057c27 <unknown>
vsseg2e64.v v24, (a0)
# CHECK-INST: vsseg2e64.v v24, (a0)
# CHECK-ENCODING: [0x27,0x7c,0x05,0x22]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 05 22 <unknown>
+# CHECK-UNKNOWN: 22057c27 <unknown>
vssseg2e8.v v24, (a0), a1, v0.t
# CHECK-INST: vssseg2e8.v v24, (a0), a1, v0.t
# CHECK-ENCODING: [0x27,0x0c,0xb5,0x28]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c b5 28 <unknown>
+# CHECK-UNKNOWN: 28b50c27 <unknown>
vssseg2e8.v v24, (a0), a1
# CHECK-INST: vssseg2e8.v v24, (a0), a1
# CHECK-ENCODING: [0x27,0x0c,0xb5,0x2a]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c b5 2a <unknown>
+# CHECK-UNKNOWN: 2ab50c27 <unknown>
vssseg2e16.v v24, (a0), a1, v0.t
# CHECK-INST: vssseg2e16.v v24, (a0), a1, v0.t
# CHECK-ENCODING: [0x27,0x5c,0xb5,0x28]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c b5 28 <unknown>
+# CHECK-UNKNOWN: 28b55c27 <unknown>
vssseg2e16.v v24, (a0), a1
# CHECK-INST: vssseg2e16.v v24, (a0), a1
# CHECK-ENCODING: [0x27,0x5c,0xb5,0x2a]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c b5 2a <unknown>
+# CHECK-UNKNOWN: 2ab55c27 <unknown>
vssseg2e32.v v24, (a0), a1, v0.t
# CHECK-INST: vssseg2e32.v v24, (a0), a1, v0.t
# CHECK-ENCODING: [0x27,0x6c,0xb5,0x28]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c b5 28 <unknown>
+# CHECK-UNKNOWN: 28b56c27 <unknown>
vssseg2e32.v v24, (a0), a1
# CHECK-INST: vssseg2e32.v v24, (a0), a1
# CHECK-ENCODING: [0x27,0x6c,0xb5,0x2a]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c b5 2a <unknown>
+# CHECK-UNKNOWN: 2ab56c27 <unknown>
vssseg2e64.v v24, (a0), a1, v0.t
# CHECK-INST: vssseg2e64.v v24, (a0), a1, v0.t
# CHECK-ENCODING: [0x27,0x7c,0xb5,0x28]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c b5 28 <unknown>
+# CHECK-UNKNOWN: 28b57c27 <unknown>
vssseg2e64.v v24, (a0), a1
# CHECK-INST: vssseg2e64.v v24, (a0), a1
# CHECK-ENCODING: [0x27,0x7c,0xb5,0x2a]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c b5 2a <unknown>
+# CHECK-UNKNOWN: 2ab57c27 <unknown>
vsuxseg2ei8.v v24, (a0), v4, v0.t
# CHECK-INST: vsuxseg2ei8.v v24, (a0), v4, v0.t
# CHECK-ENCODING: [0x27,0x0c,0x45,0x24]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 45 24 <unknown>
+# CHECK-UNKNOWN: 24450c27 <unknown>
vsuxseg2ei8.v v24, (a0), v4
# CHECK-INST: vsuxseg2ei8.v v24, (a0), v4
# CHECK-ENCODING: [0x27,0x0c,0x45,0x26]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 45 26 <unknown>
+# CHECK-UNKNOWN: 26450c27 <unknown>
vsuxseg2ei16.v v24, (a0), v4, v0.t
# CHECK-INST: vsuxseg2ei16.v v24, (a0), v4, v0.t
# CHECK-ENCODING: [0x27,0x5c,0x45,0x24]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 45 24 <unknown>
+# CHECK-UNKNOWN: 24455c27 <unknown>
vsuxseg2ei16.v v24, (a0), v4
# CHECK-INST: vsuxseg2ei16.v v24, (a0), v4
# CHECK-ENCODING: [0x27,0x5c,0x45,0x26]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 45 26 <unknown>
+# CHECK-UNKNOWN: 26455c27 <unknown>
vsuxseg2ei32.v v24, (a0), v4, v0.t
# CHECK-INST: vsuxseg2ei32.v v24, (a0), v4, v0.t
# CHECK-ENCODING: [0x27,0x6c,0x45,0x24]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 45 24 <unknown>
+# CHECK-UNKNOWN: 24456c27 <unknown>
vsuxseg2ei32.v v24, (a0), v4
# CHECK-INST: vsuxseg2ei32.v v24, (a0), v4
# CHECK-ENCODING: [0x27,0x6c,0x45,0x26]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 45 26 <unknown>
+# CHECK-UNKNOWN: 26456c27 <unknown>
vsuxseg2ei64.v v24, (a0), v4, v0.t
# CHECK-INST: vsuxseg2ei64.v v24, (a0), v4, v0.t
# CHECK-ENCODING: [0x27,0x7c,0x45,0x24]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 45 24 <unknown>
+# CHECK-UNKNOWN: 24457c27 <unknown>
vsuxseg2ei64.v v24, (a0), v4
# CHECK-INST: vsuxseg2ei64.v v24, (a0), v4
# CHECK-ENCODING: [0x27,0x7c,0x45,0x26]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 45 26 <unknown>
+# CHECK-UNKNOWN: 26457c27 <unknown>
vsoxseg2ei8.v v24, (a0), v4, v0.t
# CHECK-INST: vsoxseg2ei8.v v24, (a0), v4, v0.t
# CHECK-ENCODING: [0x27,0x0c,0x45,0x2c]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 45 2c <unknown>
+# CHECK-UNKNOWN: 2c450c27 <unknown>
vsoxseg2ei8.v v24, (a0), v4
# CHECK-INST: vsoxseg2ei8.v v24, (a0), v4
# CHECK-ENCODING: [0x27,0x0c,0x45,0x2e]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 45 2e <unknown>
+# CHECK-UNKNOWN: 2e450c27 <unknown>
vsoxseg2ei16.v v24, (a0), v4, v0.t
# CHECK-INST: vsoxseg2ei16.v v24, (a0), v4, v0.t
# CHECK-ENCODING: [0x27,0x5c,0x45,0x2c]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 45 2c <unknown>
+# CHECK-UNKNOWN: 2c455c27 <unknown>
vsoxseg2ei16.v v24, (a0), v4
# CHECK-INST: vsoxseg2ei16.v v24, (a0), v4
# CHECK-ENCODING: [0x27,0x5c,0x45,0x2e]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 45 2e <unknown>
+# CHECK-UNKNOWN: 2e455c27 <unknown>
vsoxseg2ei32.v v24, (a0), v4, v0.t
# CHECK-INST: vsoxseg2ei32.v v24, (a0), v4, v0.t
# CHECK-ENCODING: [0x27,0x6c,0x45,0x2c]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 45 2c <unknown>
+# CHECK-UNKNOWN: 2c456c27 <unknown>
vsoxseg2ei32.v v24, (a0), v4
# CHECK-INST: vsoxseg2ei32.v v24, (a0), v4
# CHECK-ENCODING: [0x27,0x6c,0x45,0x2e]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 45 2e <unknown>
+# CHECK-UNKNOWN: 2e456c27 <unknown>
vsoxseg2ei64.v v24, (a0), v4, v0.t
# CHECK-INST: vsoxseg2ei64.v v24, (a0), v4, v0.t
# CHECK-ENCODING: [0x27,0x7c,0x45,0x2c]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 45 2c <unknown>
+# CHECK-UNKNOWN: 2c457c27 <unknown>
vsoxseg2ei64.v v24, (a0), v4
# CHECK-INST: vsoxseg2ei64.v v24, (a0), v4
# CHECK-ENCODING: [0x27,0x7c,0x45,0x2e]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 45 2e <unknown>
+# CHECK-UNKNOWN: 2e457c27 <unknown>
vsseg3e8.v v24, (a0), v0.t
# CHECK-INST: vsseg3e8.v v24, (a0), v0.t
# CHECK-ENCODING: [0x27,0x0c,0x05,0x40]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 05 40 <unknown>
+# CHECK-UNKNOWN: 40050c27 <unknown>
vsseg3e8.v v24, (a0)
# CHECK-INST: vsseg3e8.v v24, (a0)
# CHECK-ENCODING: [0x27,0x0c,0x05,0x42]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 05 42 <unknown>
+# CHECK-UNKNOWN: 42050c27 <unknown>
vsseg3e16.v v24, (a0), v0.t
# CHECK-INST: vsseg3e16.v v24, (a0), v0.t
# CHECK-ENCODING: [0x27,0x5c,0x05,0x40]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 05 40 <unknown>
+# CHECK-UNKNOWN: 40055c27 <unknown>
vsseg3e16.v v24, (a0)
# CHECK-INST: vsseg3e16.v v24, (a0)
# CHECK-ENCODING: [0x27,0x5c,0x05,0x42]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 05 42 <unknown>
+# CHECK-UNKNOWN: 42055c27 <unknown>
vsseg3e32.v v24, (a0), v0.t
# CHECK-INST: vsseg3e32.v v24, (a0), v0.t
# CHECK-ENCODING: [0x27,0x6c,0x05,0x40]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 05 40 <unknown>
+# CHECK-UNKNOWN: 40056c27 <unknown>
vsseg3e32.v v24, (a0)
# CHECK-INST: vsseg3e32.v v24, (a0)
# CHECK-ENCODING: [0x27,0x6c,0x05,0x42]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 05 42 <unknown>
+# CHECK-UNKNOWN: 42056c27 <unknown>
vsseg3e64.v v24, (a0), v0.t
# CHECK-INST: vsseg3e64.v v24, (a0), v0.t
# CHECK-ENCODING: [0x27,0x7c,0x05,0x40]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 05 40 <unknown>
+# CHECK-UNKNOWN: 40057c27 <unknown>
vsseg3e64.v v24, (a0)
# CHECK-INST: vsseg3e64.v v24, (a0)
# CHECK-ENCODING: [0x27,0x7c,0x05,0x42]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 05 42 <unknown>
+# CHECK-UNKNOWN: 42057c27 <unknown>
vssseg3e8.v v24, (a0), a1, v0.t
# CHECK-INST: vssseg3e8.v v24, (a0), a1, v0.t
# CHECK-ENCODING: [0x27,0x0c,0xb5,0x48]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c b5 48 <unknown>
+# CHECK-UNKNOWN: 48b50c27 <unknown>
vssseg3e8.v v24, (a0), a1
# CHECK-INST: vssseg3e8.v v24, (a0), a1
# CHECK-ENCODING: [0x27,0x0c,0xb5,0x4a]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c b5 4a <unknown>
+# CHECK-UNKNOWN: 4ab50c27 <unknown>
vssseg3e16.v v24, (a0), a1, v0.t
# CHECK-INST: vssseg3e16.v v24, (a0), a1, v0.t
# CHECK-ENCODING: [0x27,0x5c,0xb5,0x48]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c b5 48 <unknown>
+# CHECK-UNKNOWN: 48b55c27 <unknown>
vssseg3e16.v v24, (a0), a1
# CHECK-INST: vssseg3e16.v v24, (a0), a1
# CHECK-ENCODING: [0x27,0x5c,0xb5,0x4a]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c b5 4a <unknown>
+# CHECK-UNKNOWN: 4ab55c27 <unknown>
vssseg3e32.v v24, (a0), a1, v0.t
# CHECK-INST: vssseg3e32.v v24, (a0), a1, v0.t
# CHECK-ENCODING: [0x27,0x6c,0xb5,0x48]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c b5 48 <unknown>
+# CHECK-UNKNOWN: 48b56c27 <unknown>
vssseg3e32.v v24, (a0), a1
# CHECK-INST: vssseg3e32.v v24, (a0), a1
# CHECK-ENCODING: [0x27,0x6c,0xb5,0x4a]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c b5 4a <unknown>
+# CHECK-UNKNOWN: 4ab56c27 <unknown>
vssseg3e64.v v24, (a0), a1, v0.t
# CHECK-INST: vssseg3e64.v v24, (a0), a1, v0.t
# CHECK-ENCODING: [0x27,0x7c,0xb5,0x48]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c b5 48 <unknown>
+# CHECK-UNKNOWN: 48b57c27 <unknown>
vssseg3e64.v v24, (a0), a1
# CHECK-INST: vssseg3e64.v v24, (a0), a1
# CHECK-ENCODING: [0x27,0x7c,0xb5,0x4a]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c b5 4a <unknown>
+# CHECK-UNKNOWN: 4ab57c27 <unknown>
vsuxseg3ei8.v v24, (a0), v4, v0.t
# CHECK-INST: vsuxseg3ei8.v v24, (a0), v4, v0.t
# CHECK-ENCODING: [0x27,0x0c,0x45,0x44]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 45 44 <unknown>
+# CHECK-UNKNOWN: 44450c27 <unknown>
vsuxseg3ei8.v v24, (a0), v4
# CHECK-INST: vsuxseg3ei8.v v24, (a0), v4
# CHECK-ENCODING: [0x27,0x0c,0x45,0x46]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 45 46 <unknown>
+# CHECK-UNKNOWN: 46450c27 <unknown>
vsuxseg3ei16.v v24, (a0), v4, v0.t
# CHECK-INST: vsuxseg3ei16.v v24, (a0), v4, v0.t
# CHECK-ENCODING: [0x27,0x5c,0x45,0x44]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 45 44 <unknown>
+# CHECK-UNKNOWN: 44455c27 <unknown>
vsuxseg3ei16.v v24, (a0), v4
# CHECK-INST: vsuxseg3ei16.v v24, (a0), v4
# CHECK-ENCODING: [0x27,0x5c,0x45,0x46]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 45 46 <unknown>
+# CHECK-UNKNOWN: 46455c27 <unknown>
vsuxseg3ei32.v v24, (a0), v4, v0.t
# CHECK-INST: vsuxseg3ei32.v v24, (a0), v4, v0.t
# CHECK-ENCODING: [0x27,0x6c,0x45,0x44]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 45 44 <unknown>
+# CHECK-UNKNOWN: 44456c27 <unknown>
vsuxseg3ei32.v v24, (a0), v4
# CHECK-INST: vsuxseg3ei32.v v24, (a0), v4
# CHECK-ENCODING: [0x27,0x6c,0x45,0x46]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 45 46 <unknown>
+# CHECK-UNKNOWN: 46456c27 <unknown>
vsuxseg3ei64.v v24, (a0), v4, v0.t
# CHECK-INST: vsuxseg3ei64.v v24, (a0), v4, v0.t
# CHECK-ENCODING: [0x27,0x7c,0x45,0x44]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 45 44 <unknown>
+# CHECK-UNKNOWN: 44457c27 <unknown>
vsuxseg3ei64.v v24, (a0), v4
# CHECK-INST: vsuxseg3ei64.v v24, (a0), v4
# CHECK-ENCODING: [0x27,0x7c,0x45,0x46]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 45 46 <unknown>
+# CHECK-UNKNOWN: 46457c27 <unknown>
vsoxseg3ei8.v v24, (a0), v4, v0.t
# CHECK-INST: vsoxseg3ei8.v v24, (a0), v4, v0.t
# CHECK-ENCODING: [0x27,0x0c,0x45,0x4c]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 45 4c <unknown>
+# CHECK-UNKNOWN: 4c450c27 <unknown>
vsoxseg3ei8.v v24, (a0), v4
# CHECK-INST: vsoxseg3ei8.v v24, (a0), v4
# CHECK-ENCODING: [0x27,0x0c,0x45,0x4e]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 45 4e <unknown>
+# CHECK-UNKNOWN: 4e450c27 <unknown>
vsoxseg3ei16.v v24, (a0), v4, v0.t
# CHECK-INST: vsoxseg3ei16.v v24, (a0), v4, v0.t
# CHECK-ENCODING: [0x27,0x5c,0x45,0x4c]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 45 4c <unknown>
+# CHECK-UNKNOWN: 4c455c27 <unknown>
vsoxseg3ei16.v v24, (a0), v4
# CHECK-INST: vsoxseg3ei16.v v24, (a0), v4
# CHECK-ENCODING: [0x27,0x5c,0x45,0x4e]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 45 4e <unknown>
+# CHECK-UNKNOWN: 4e455c27 <unknown>
vsoxseg3ei32.v v24, (a0), v4, v0.t
# CHECK-INST: vsoxseg3ei32.v v24, (a0), v4, v0.t
# CHECK-ENCODING: [0x27,0x6c,0x45,0x4c]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 45 4c <unknown>
+# CHECK-UNKNOWN: 4c456c27 <unknown>
vsoxseg3ei32.v v24, (a0), v4
# CHECK-INST: vsoxseg3ei32.v v24, (a0), v4
# CHECK-ENCODING: [0x27,0x6c,0x45,0x4e]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 45 4e <unknown>
+# CHECK-UNKNOWN: 4e456c27 <unknown>
vsoxseg3ei64.v v24, (a0), v4, v0.t
# CHECK-INST: vsoxseg3ei64.v v24, (a0), v4, v0.t
# CHECK-ENCODING: [0x27,0x7c,0x45,0x4c]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 45 4c <unknown>
+# CHECK-UNKNOWN: 4c457c27 <unknown>
vsoxseg3ei64.v v24, (a0), v4
# CHECK-INST: vsoxseg3ei64.v v24, (a0), v4
# CHECK-ENCODING: [0x27,0x7c,0x45,0x4e]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 45 4e <unknown>
+# CHECK-UNKNOWN: 4e457c27 <unknown>
vsseg4e8.v v24, (a0), v0.t
# CHECK-INST: vsseg4e8.v v24, (a0), v0.t
# CHECK-ENCODING: [0x27,0x0c,0x05,0x60]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 05 60 <unknown>
+# CHECK-UNKNOWN: 60050c27 <unknown>
vsseg4e8.v v24, (a0)
# CHECK-INST: vsseg4e8.v v24, (a0)
# CHECK-ENCODING: [0x27,0x0c,0x05,0x62]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 05 62 <unknown>
+# CHECK-UNKNOWN: 62050c27 <unknown>
vsseg4e16.v v24, (a0), v0.t
# CHECK-INST: vsseg4e16.v v24, (a0), v0.t
# CHECK-ENCODING: [0x27,0x5c,0x05,0x60]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 05 60 <unknown>
+# CHECK-UNKNOWN: 60055c27 <unknown>
vsseg4e16.v v24, (a0)
# CHECK-INST: vsseg4e16.v v24, (a0)
# CHECK-ENCODING: [0x27,0x5c,0x05,0x62]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 05 62 <unknown>
+# CHECK-UNKNOWN: 62055c27 <unknown>
vsseg4e32.v v24, (a0), v0.t
# CHECK-INST: vsseg4e32.v v24, (a0), v0.t
# CHECK-ENCODING: [0x27,0x6c,0x05,0x60]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 05 60 <unknown>
+# CHECK-UNKNOWN: 60056c27 <unknown>
vsseg4e32.v v24, (a0)
# CHECK-INST: vsseg4e32.v v24, (a0)
# CHECK-ENCODING: [0x27,0x6c,0x05,0x62]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 05 62 <unknown>
+# CHECK-UNKNOWN: 62056c27 <unknown>
vsseg4e64.v v24, (a0), v0.t
# CHECK-INST: vsseg4e64.v v24, (a0), v0.t
# CHECK-ENCODING: [0x27,0x7c,0x05,0x60]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 05 60 <unknown>
+# CHECK-UNKNOWN: 60057c27 <unknown>
vsseg4e64.v v24, (a0)
# CHECK-INST: vsseg4e64.v v24, (a0)
# CHECK-ENCODING: [0x27,0x7c,0x05,0x62]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 05 62 <unknown>
+# CHECK-UNKNOWN: 62057c27 <unknown>
vssseg4e8.v v24, (a0), a1, v0.t
# CHECK-INST: vssseg4e8.v v24, (a0), a1, v0.t
# CHECK-ENCODING: [0x27,0x0c,0xb5,0x68]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c b5 68 <unknown>
+# CHECK-UNKNOWN: 68b50c27 <unknown>
vssseg4e8.v v24, (a0), a1
# CHECK-INST: vssseg4e8.v v24, (a0), a1
# CHECK-ENCODING: [0x27,0x0c,0xb5,0x6a]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c b5 6a <unknown>
+# CHECK-UNKNOWN: 6ab50c27 <unknown>
vssseg4e16.v v24, (a0), a1, v0.t
# CHECK-INST: vssseg4e16.v v24, (a0), a1, v0.t
# CHECK-ENCODING: [0x27,0x5c,0xb5,0x68]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c b5 68 <unknown>
+# CHECK-UNKNOWN: 68b55c27 <unknown>
vssseg4e16.v v24, (a0), a1
# CHECK-INST: vssseg4e16.v v24, (a0), a1
# CHECK-ENCODING: [0x27,0x5c,0xb5,0x6a]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c b5 6a <unknown>
+# CHECK-UNKNOWN: 6ab55c27 <unknown>
vssseg4e32.v v24, (a0), a1, v0.t
# CHECK-INST: vssseg4e32.v v24, (a0), a1, v0.t
# CHECK-ENCODING: [0x27,0x6c,0xb5,0x68]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c b5 68 <unknown>
+# CHECK-UNKNOWN: 68b56c27 <unknown>
vssseg4e32.v v24, (a0), a1
# CHECK-INST: vssseg4e32.v v24, (a0), a1
# CHECK-ENCODING: [0x27,0x6c,0xb5,0x6a]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c b5 6a <unknown>
+# CHECK-UNKNOWN: 6ab56c27 <unknown>
vssseg4e64.v v24, (a0), a1, v0.t
# CHECK-INST: vssseg4e64.v v24, (a0), a1, v0.t
# CHECK-ENCODING: [0x27,0x7c,0xb5,0x68]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c b5 68 <unknown>
+# CHECK-UNKNOWN: 68b57c27 <unknown>
vssseg4e64.v v24, (a0), a1
# CHECK-INST: vssseg4e64.v v24, (a0), a1
# CHECK-ENCODING: [0x27,0x7c,0xb5,0x6a]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c b5 6a <unknown>
+# CHECK-UNKNOWN: 6ab57c27 <unknown>
vsuxseg4ei8.v v24, (a0), v4, v0.t
# CHECK-INST: vsuxseg4ei8.v v24, (a0), v4, v0.t
# CHECK-ENCODING: [0x27,0x0c,0x45,0x64]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 45 64 <unknown>
+# CHECK-UNKNOWN: 64450c27 <unknown>
vsuxseg4ei8.v v24, (a0), v4
# CHECK-INST: vsuxseg4ei8.v v24, (a0), v4
# CHECK-ENCODING: [0x27,0x0c,0x45,0x66]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 45 66 <unknown>
+# CHECK-UNKNOWN: 66450c27 <unknown>
vsuxseg4ei16.v v24, (a0), v4, v0.t
# CHECK-INST: vsuxseg4ei16.v v24, (a0), v4, v0.t
# CHECK-ENCODING: [0x27,0x5c,0x45,0x64]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 45 64 <unknown>
+# CHECK-UNKNOWN: 64455c27 <unknown>
vsuxseg4ei16.v v24, (a0), v4
# CHECK-INST: vsuxseg4ei16.v v24, (a0), v4
# CHECK-ENCODING: [0x27,0x5c,0x45,0x66]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 45 66 <unknown>
+# CHECK-UNKNOWN: 66455c27 <unknown>
vsuxseg4ei32.v v24, (a0), v4, v0.t
# CHECK-INST: vsuxseg4ei32.v v24, (a0), v4, v0.t
# CHECK-ENCODING: [0x27,0x6c,0x45,0x64]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 45 64 <unknown>
+# CHECK-UNKNOWN: 64456c27 <unknown>
vsuxseg4ei32.v v24, (a0), v4
# CHECK-INST: vsuxseg4ei32.v v24, (a0), v4
# CHECK-ENCODING: [0x27,0x6c,0x45,0x66]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 45 66 <unknown>
+# CHECK-UNKNOWN: 66456c27 <unknown>
vsuxseg4ei64.v v24, (a0), v4, v0.t
# CHECK-INST: vsuxseg4ei64.v v24, (a0), v4, v0.t
# CHECK-ENCODING: [0x27,0x7c,0x45,0x64]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 45 64 <unknown>
+# CHECK-UNKNOWN: 64457c27 <unknown>
vsuxseg4ei64.v v24, (a0), v4
# CHECK-INST: vsuxseg4ei64.v v24, (a0), v4
# CHECK-ENCODING: [0x27,0x7c,0x45,0x66]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 45 66 <unknown>
+# CHECK-UNKNOWN: 66457c27 <unknown>
vsoxseg4ei8.v v24, (a0), v4, v0.t
# CHECK-INST: vsoxseg4ei8.v v24, (a0), v4, v0.t
# CHECK-ENCODING: [0x27,0x0c,0x45,0x6c]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 45 6c <unknown>
+# CHECK-UNKNOWN: 6c450c27 <unknown>
vsoxseg4ei8.v v24, (a0), v4
# CHECK-INST: vsoxseg4ei8.v v24, (a0), v4
# CHECK-ENCODING: [0x27,0x0c,0x45,0x6e]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 45 6e <unknown>
+# CHECK-UNKNOWN: 6e450c27 <unknown>
vsoxseg4ei16.v v24, (a0), v4, v0.t
# CHECK-INST: vsoxseg4ei16.v v24, (a0), v4, v0.t
# CHECK-ENCODING: [0x27,0x5c,0x45,0x6c]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 45 6c <unknown>
+# CHECK-UNKNOWN: 6c455c27 <unknown>
vsoxseg4ei16.v v24, (a0), v4
# CHECK-INST: vsoxseg4ei16.v v24, (a0), v4
# CHECK-ENCODING: [0x27,0x5c,0x45,0x6e]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 45 6e <unknown>
+# CHECK-UNKNOWN: 6e455c27 <unknown>
vsoxseg4ei32.v v24, (a0), v4, v0.t
# CHECK-INST: vsoxseg4ei32.v v24, (a0), v4, v0.t
# CHECK-ENCODING: [0x27,0x6c,0x45,0x6c]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 45 6c <unknown>
+# CHECK-UNKNOWN: 6c456c27 <unknown>
vsoxseg4ei32.v v24, (a0), v4
# CHECK-INST: vsoxseg4ei32.v v24, (a0), v4
# CHECK-ENCODING: [0x27,0x6c,0x45,0x6e]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 45 6e <unknown>
+# CHECK-UNKNOWN: 6e456c27 <unknown>
vsoxseg4ei64.v v24, (a0), v4, v0.t
# CHECK-INST: vsoxseg4ei64.v v24, (a0), v4, v0.t
# CHECK-ENCODING: [0x27,0x7c,0x45,0x6c]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 45 6c <unknown>
+# CHECK-UNKNOWN: 6c457c27 <unknown>
vsoxseg4ei64.v v24, (a0), v4
# CHECK-INST: vsoxseg4ei64.v v24, (a0), v4
# CHECK-ENCODING: [0x27,0x7c,0x45,0x6e]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 45 6e <unknown>
+# CHECK-UNKNOWN: 6e457c27 <unknown>
vsseg5e8.v v24, (a0), v0.t
# CHECK-INST: vsseg5e8.v v24, (a0), v0.t
# CHECK-ENCODING: [0x27,0x0c,0x05,0x80]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 05 80 <unknown>
+# CHECK-UNKNOWN: 80050c27 <unknown>
vsseg5e8.v v24, (a0)
# CHECK-INST: vsseg5e8.v v24, (a0)
# CHECK-ENCODING: [0x27,0x0c,0x05,0x82]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 05 82 <unknown>
+# CHECK-UNKNOWN: 82050c27 <unknown>
vsseg5e16.v v24, (a0), v0.t
# CHECK-INST: vsseg5e16.v v24, (a0), v0.t
# CHECK-ENCODING: [0x27,0x5c,0x05,0x80]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 05 80 <unknown>
+# CHECK-UNKNOWN: 80055c27 <unknown>
vsseg5e16.v v24, (a0)
# CHECK-INST: vsseg5e16.v v24, (a0)
# CHECK-ENCODING: [0x27,0x5c,0x05,0x82]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 05 82 <unknown>
+# CHECK-UNKNOWN: 82055c27 <unknown>
vsseg5e32.v v24, (a0), v0.t
# CHECK-INST: vsseg5e32.v v24, (a0), v0.t
# CHECK-ENCODING: [0x27,0x6c,0x05,0x80]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 05 80 <unknown>
+# CHECK-UNKNOWN: 80056c27 <unknown>
vsseg5e32.v v24, (a0)
# CHECK-INST: vsseg5e32.v v24, (a0)
# CHECK-ENCODING: [0x27,0x6c,0x05,0x82]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 05 82 <unknown>
+# CHECK-UNKNOWN: 82056c27 <unknown>
vsseg5e64.v v24, (a0), v0.t
# CHECK-INST: vsseg5e64.v v24, (a0), v0.t
# CHECK-ENCODING: [0x27,0x7c,0x05,0x80]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 05 80 <unknown>
+# CHECK-UNKNOWN: 80057c27 <unknown>
vsseg5e64.v v24, (a0)
# CHECK-INST: vsseg5e64.v v24, (a0)
# CHECK-ENCODING: [0x27,0x7c,0x05,0x82]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 05 82 <unknown>
+# CHECK-UNKNOWN: 82057c27 <unknown>
vssseg5e8.v v24, (a0), a1, v0.t
# CHECK-INST: vssseg5e8.v v24, (a0), a1, v0.t
# CHECK-ENCODING: [0x27,0x0c,0xb5,0x88]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c b5 88 <unknown>
+# CHECK-UNKNOWN: 88b50c27 <unknown>
vssseg5e8.v v24, (a0), a1
# CHECK-INST: vssseg5e8.v v24, (a0), a1
# CHECK-ENCODING: [0x27,0x0c,0xb5,0x8a]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c b5 8a <unknown>
+# CHECK-UNKNOWN: 8ab50c27 <unknown>
vssseg5e16.v v24, (a0), a1, v0.t
# CHECK-INST: vssseg5e16.v v24, (a0), a1, v0.t
# CHECK-ENCODING: [0x27,0x5c,0xb5,0x88]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c b5 88 <unknown>
+# CHECK-UNKNOWN: 88b55c27 <unknown>
vssseg5e16.v v24, (a0), a1
# CHECK-INST: vssseg5e16.v v24, (a0), a1
# CHECK-ENCODING: [0x27,0x5c,0xb5,0x8a]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c b5 8a <unknown>
+# CHECK-UNKNOWN: 8ab55c27 <unknown>
vssseg5e32.v v24, (a0), a1, v0.t
# CHECK-INST: vssseg5e32.v v24, (a0), a1, v0.t
# CHECK-ENCODING: [0x27,0x6c,0xb5,0x88]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c b5 88 <unknown>
+# CHECK-UNKNOWN: 88b56c27 <unknown>
vssseg5e32.v v24, (a0), a1
# CHECK-INST: vssseg5e32.v v24, (a0), a1
# CHECK-ENCODING: [0x27,0x6c,0xb5,0x8a]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c b5 8a <unknown>
+# CHECK-UNKNOWN: 8ab56c27 <unknown>
vssseg5e64.v v24, (a0), a1, v0.t
# CHECK-INST: vssseg5e64.v v24, (a0), a1, v0.t
# CHECK-ENCODING: [0x27,0x7c,0xb5,0x88]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c b5 88 <unknown>
+# CHECK-UNKNOWN: 88b57c27 <unknown>
vssseg5e64.v v24, (a0), a1
# CHECK-INST: vssseg5e64.v v24, (a0), a1
# CHECK-ENCODING: [0x27,0x7c,0xb5,0x8a]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c b5 8a <unknown>
+# CHECK-UNKNOWN: 8ab57c27 <unknown>
vsuxseg5ei8.v v24, (a0), v4, v0.t
# CHECK-INST: vsuxseg5ei8.v v24, (a0), v4, v0.t
# CHECK-ENCODING: [0x27,0x0c,0x45,0x84]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 45 84 <unknown>
+# CHECK-UNKNOWN: 84450c27 <unknown>
vsuxseg5ei8.v v24, (a0), v4
# CHECK-INST: vsuxseg5ei8.v v24, (a0), v4
# CHECK-ENCODING: [0x27,0x0c,0x45,0x86]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 45 86 <unknown>
+# CHECK-UNKNOWN: 86450c27 <unknown>
vsuxseg5ei16.v v24, (a0), v4, v0.t
# CHECK-INST: vsuxseg5ei16.v v24, (a0), v4, v0.t
# CHECK-ENCODING: [0x27,0x5c,0x45,0x84]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 45 84 <unknown>
+# CHECK-UNKNOWN: 84455c27 <unknown>
vsuxseg5ei16.v v24, (a0), v4
# CHECK-INST: vsuxseg5ei16.v v24, (a0), v4
# CHECK-ENCODING: [0x27,0x5c,0x45,0x86]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 45 86 <unknown>
+# CHECK-UNKNOWN: 86455c27 <unknown>
vsuxseg5ei32.v v24, (a0), v4, v0.t
# CHECK-INST: vsuxseg5ei32.v v24, (a0), v4, v0.t
# CHECK-ENCODING: [0x27,0x6c,0x45,0x84]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 45 84 <unknown>
+# CHECK-UNKNOWN: 84456c27 <unknown>
vsuxseg5ei32.v v24, (a0), v4
# CHECK-INST: vsuxseg5ei32.v v24, (a0), v4
# CHECK-ENCODING: [0x27,0x6c,0x45,0x86]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 45 86 <unknown>
+# CHECK-UNKNOWN: 86456c27 <unknown>
vsuxseg5ei64.v v24, (a0), v4, v0.t
# CHECK-INST: vsuxseg5ei64.v v24, (a0), v4, v0.t
# CHECK-ENCODING: [0x27,0x7c,0x45,0x84]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 45 84 <unknown>
+# CHECK-UNKNOWN: 84457c27 <unknown>
vsuxseg5ei64.v v24, (a0), v4
# CHECK-INST: vsuxseg5ei64.v v24, (a0), v4
# CHECK-ENCODING: [0x27,0x7c,0x45,0x86]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 45 86 <unknown>
+# CHECK-UNKNOWN: 86457c27 <unknown>
vsoxseg5ei8.v v24, (a0), v4, v0.t
# CHECK-INST: vsoxseg5ei8.v v24, (a0), v4, v0.t
# CHECK-ENCODING: [0x27,0x0c,0x45,0x8c]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 45 8c <unknown>
+# CHECK-UNKNOWN: 8c450c27 <unknown>
vsoxseg5ei8.v v24, (a0), v4
# CHECK-INST: vsoxseg5ei8.v v24, (a0), v4
# CHECK-ENCODING: [0x27,0x0c,0x45,0x8e]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 45 8e <unknown>
+# CHECK-UNKNOWN: 8e450c27 <unknown>
vsoxseg5ei16.v v24, (a0), v4, v0.t
# CHECK-INST: vsoxseg5ei16.v v24, (a0), v4, v0.t
# CHECK-ENCODING: [0x27,0x5c,0x45,0x8c]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 45 8c <unknown>
+# CHECK-UNKNOWN: 8c455c27 <unknown>
vsoxseg5ei16.v v24, (a0), v4
# CHECK-INST: vsoxseg5ei16.v v24, (a0), v4
# CHECK-ENCODING: [0x27,0x5c,0x45,0x8e]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 45 8e <unknown>
+# CHECK-UNKNOWN: 8e455c27 <unknown>
vsoxseg5ei32.v v24, (a0), v4, v0.t
# CHECK-INST: vsoxseg5ei32.v v24, (a0), v4, v0.t
# CHECK-ENCODING: [0x27,0x6c,0x45,0x8c]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 45 8c <unknown>
+# CHECK-UNKNOWN: 8c456c27 <unknown>
vsoxseg5ei32.v v24, (a0), v4
# CHECK-INST: vsoxseg5ei32.v v24, (a0), v4
# CHECK-ENCODING: [0x27,0x6c,0x45,0x8e]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 45 8e <unknown>
+# CHECK-UNKNOWN: 8e456c27 <unknown>
vsoxseg5ei64.v v24, (a0), v4, v0.t
# CHECK-INST: vsoxseg5ei64.v v24, (a0), v4, v0.t
# CHECK-ENCODING: [0x27,0x7c,0x45,0x8c]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 45 8c <unknown>
+# CHECK-UNKNOWN: 8c457c27 <unknown>
vsoxseg5ei64.v v24, (a0), v4
# CHECK-INST: vsoxseg5ei64.v v24, (a0), v4
# CHECK-ENCODING: [0x27,0x7c,0x45,0x8e]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 45 8e <unknown>
+# CHECK-UNKNOWN: 8e457c27 <unknown>
vsseg6e8.v v24, (a0), v0.t
# CHECK-INST: vsseg6e8.v v24, (a0), v0.t
# CHECK-ENCODING: [0x27,0x0c,0x05,0xa0]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 05 a0 <unknown>
+# CHECK-UNKNOWN: a0050c27 <unknown>
vsseg6e8.v v24, (a0)
# CHECK-INST: vsseg6e8.v v24, (a0)
# CHECK-ENCODING: [0x27,0x0c,0x05,0xa2]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 05 a2 <unknown>
+# CHECK-UNKNOWN: a2050c27 <unknown>
vsseg6e16.v v24, (a0), v0.t
# CHECK-INST: vsseg6e16.v v24, (a0), v0.t
# CHECK-ENCODING: [0x27,0x5c,0x05,0xa0]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 05 a0 <unknown>
+# CHECK-UNKNOWN: a0055c27 <unknown>
vsseg6e16.v v24, (a0)
# CHECK-INST: vsseg6e16.v v24, (a0)
# CHECK-ENCODING: [0x27,0x5c,0x05,0xa2]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 05 a2 <unknown>
+# CHECK-UNKNOWN: a2055c27 <unknown>
vsseg6e32.v v24, (a0), v0.t
# CHECK-INST: vsseg6e32.v v24, (a0), v0.t
# CHECK-ENCODING: [0x27,0x6c,0x05,0xa0]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 05 a0 <unknown>
+# CHECK-UNKNOWN: a0056c27 <unknown>
vsseg6e32.v v24, (a0)
# CHECK-INST: vsseg6e32.v v24, (a0)
# CHECK-ENCODING: [0x27,0x6c,0x05,0xa2]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 05 a2 <unknown>
+# CHECK-UNKNOWN: a2056c27 <unknown>
vsseg6e64.v v24, (a0), v0.t
# CHECK-INST: vsseg6e64.v v24, (a0), v0.t
# CHECK-ENCODING: [0x27,0x7c,0x05,0xa0]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 05 a0 <unknown>
+# CHECK-UNKNOWN: a0057c27 <unknown>
vsseg6e64.v v24, (a0)
# CHECK-INST: vsseg6e64.v v24, (a0)
# CHECK-ENCODING: [0x27,0x7c,0x05,0xa2]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 05 a2 <unknown>
+# CHECK-UNKNOWN: a2057c27 <unknown>
vssseg6e8.v v24, (a0), a1, v0.t
# CHECK-INST: vssseg6e8.v v24, (a0), a1, v0.t
# CHECK-ENCODING: [0x27,0x0c,0xb5,0xa8]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c b5 a8 <unknown>
+# CHECK-UNKNOWN: a8b50c27 <unknown>
vssseg6e8.v v24, (a0), a1
# CHECK-INST: vssseg6e8.v v24, (a0), a1
# CHECK-ENCODING: [0x27,0x0c,0xb5,0xaa]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c b5 aa <unknown>
+# CHECK-UNKNOWN: aab50c27 <unknown>
vssseg6e16.v v24, (a0), a1, v0.t
# CHECK-INST: vssseg6e16.v v24, (a0), a1, v0.t
# CHECK-ENCODING: [0x27,0x5c,0xb5,0xa8]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c b5 a8 <unknown>
+# CHECK-UNKNOWN: a8b55c27 <unknown>
vssseg6e16.v v24, (a0), a1
# CHECK-INST: vssseg6e16.v v24, (a0), a1
# CHECK-ENCODING: [0x27,0x5c,0xb5,0xaa]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c b5 aa <unknown>
+# CHECK-UNKNOWN: aab55c27 <unknown>
vssseg6e32.v v24, (a0), a1, v0.t
# CHECK-INST: vssseg6e32.v v24, (a0), a1, v0.t
# CHECK-ENCODING: [0x27,0x6c,0xb5,0xa8]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c b5 a8 <unknown>
+# CHECK-UNKNOWN: a8b56c27 <unknown>
vssseg6e32.v v24, (a0), a1
# CHECK-INST: vssseg6e32.v v24, (a0), a1
# CHECK-ENCODING: [0x27,0x6c,0xb5,0xaa]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c b5 aa <unknown>
+# CHECK-UNKNOWN: aab56c27 <unknown>
vssseg6e64.v v24, (a0), a1, v0.t
# CHECK-INST: vssseg6e64.v v24, (a0), a1, v0.t
# CHECK-ENCODING: [0x27,0x7c,0xb5,0xa8]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c b5 a8 <unknown>
+# CHECK-UNKNOWN: a8b57c27 <unknown>
vssseg6e64.v v24, (a0), a1
# CHECK-INST: vssseg6e64.v v24, (a0), a1
# CHECK-ENCODING: [0x27,0x7c,0xb5,0xaa]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c b5 aa <unknown>
+# CHECK-UNKNOWN: aab57c27 <unknown>
vsuxseg6ei8.v v24, (a0), v4, v0.t
# CHECK-INST: vsuxseg6ei8.v v24, (a0), v4, v0.t
# CHECK-ENCODING: [0x27,0x0c,0x45,0xa4]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 45 a4 <unknown>
+# CHECK-UNKNOWN: a4450c27 <unknown>
vsuxseg6ei8.v v24, (a0), v4
# CHECK-INST: vsuxseg6ei8.v v24, (a0), v4
# CHECK-ENCODING: [0x27,0x0c,0x45,0xa6]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 45 a6 <unknown>
+# CHECK-UNKNOWN: a6450c27 <unknown>
vsuxseg6ei16.v v24, (a0), v4, v0.t
# CHECK-INST: vsuxseg6ei16.v v24, (a0), v4, v0.t
# CHECK-ENCODING: [0x27,0x5c,0x45,0xa4]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 45 a4 <unknown>
+# CHECK-UNKNOWN: a4455c27 <unknown>
vsuxseg6ei16.v v24, (a0), v4
# CHECK-INST: vsuxseg6ei16.v v24, (a0), v4
# CHECK-ENCODING: [0x27,0x5c,0x45,0xa6]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 45 a6 <unknown>
+# CHECK-UNKNOWN: a6455c27 <unknown>
vsuxseg6ei32.v v24, (a0), v4, v0.t
# CHECK-INST: vsuxseg6ei32.v v24, (a0), v4, v0.t
# CHECK-ENCODING: [0x27,0x6c,0x45,0xa4]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 45 a4 <unknown>
+# CHECK-UNKNOWN: a4456c27 <unknown>
vsuxseg6ei32.v v24, (a0), v4
# CHECK-INST: vsuxseg6ei32.v v24, (a0), v4
# CHECK-ENCODING: [0x27,0x6c,0x45,0xa6]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 45 a6 <unknown>
+# CHECK-UNKNOWN: a6456c27 <unknown>
vsuxseg6ei64.v v24, (a0), v4, v0.t
# CHECK-INST: vsuxseg6ei64.v v24, (a0), v4, v0.t
# CHECK-ENCODING: [0x27,0x7c,0x45,0xa4]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 45 a4 <unknown>
+# CHECK-UNKNOWN: a4457c27 <unknown>
vsuxseg6ei64.v v24, (a0), v4
# CHECK-INST: vsuxseg6ei64.v v24, (a0), v4
# CHECK-ENCODING: [0x27,0x7c,0x45,0xa6]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 45 a6 <unknown>
+# CHECK-UNKNOWN: a6457c27 <unknown>
vsoxseg6ei8.v v24, (a0), v4, v0.t
# CHECK-INST: vsoxseg6ei8.v v24, (a0), v4, v0.t
# CHECK-ENCODING: [0x27,0x0c,0x45,0xac]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 45 ac <unknown>
+# CHECK-UNKNOWN: ac450c27 <unknown>
vsoxseg6ei8.v v24, (a0), v4
# CHECK-INST: vsoxseg6ei8.v v24, (a0), v4
# CHECK-ENCODING: [0x27,0x0c,0x45,0xae]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 45 ae <unknown>
+# CHECK-UNKNOWN: ae450c27 <unknown>
vsoxseg6ei16.v v24, (a0), v4, v0.t
# CHECK-INST: vsoxseg6ei16.v v24, (a0), v4, v0.t
# CHECK-ENCODING: [0x27,0x5c,0x45,0xac]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 45 ac <unknown>
+# CHECK-UNKNOWN: ac455c27 <unknown>
vsoxseg6ei16.v v24, (a0), v4
# CHECK-INST: vsoxseg6ei16.v v24, (a0), v4
# CHECK-ENCODING: [0x27,0x5c,0x45,0xae]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 45 ae <unknown>
+# CHECK-UNKNOWN: ae455c27 <unknown>
vsoxseg6ei32.v v24, (a0), v4, v0.t
# CHECK-INST: vsoxseg6ei32.v v24, (a0), v4, v0.t
# CHECK-ENCODING: [0x27,0x6c,0x45,0xac]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 45 ac <unknown>
+# CHECK-UNKNOWN: ac456c27 <unknown>
vsoxseg6ei32.v v24, (a0), v4
# CHECK-INST: vsoxseg6ei32.v v24, (a0), v4
# CHECK-ENCODING: [0x27,0x6c,0x45,0xae]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 45 ae <unknown>
+# CHECK-UNKNOWN: ae456c27 <unknown>
vsoxseg6ei64.v v24, (a0), v4, v0.t
# CHECK-INST: vsoxseg6ei64.v v24, (a0), v4, v0.t
# CHECK-ENCODING: [0x27,0x7c,0x45,0xac]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 45 ac <unknown>
+# CHECK-UNKNOWN: ac457c27 <unknown>
vsoxseg6ei64.v v24, (a0), v4
# CHECK-INST: vsoxseg6ei64.v v24, (a0), v4
# CHECK-ENCODING: [0x27,0x7c,0x45,0xae]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 45 ae <unknown>
+# CHECK-UNKNOWN: ae457c27 <unknown>
vsseg7e8.v v24, (a0), v0.t
# CHECK-INST: vsseg7e8.v v24, (a0), v0.t
# CHECK-ENCODING: [0x27,0x0c,0x05,0xc0]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 05 c0 <unknown>
+# CHECK-UNKNOWN: c0050c27 <unknown>
vsseg7e8.v v24, (a0)
# CHECK-INST: vsseg7e8.v v24, (a0)
# CHECK-ENCODING: [0x27,0x0c,0x05,0xc2]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 05 c2 <unknown>
+# CHECK-UNKNOWN: c2050c27 <unknown>
vsseg7e16.v v24, (a0), v0.t
# CHECK-INST: vsseg7e16.v v24, (a0), v0.t
# CHECK-ENCODING: [0x27,0x5c,0x05,0xc0]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 05 c0 <unknown>
+# CHECK-UNKNOWN: c0055c27 <unknown>
vsseg7e16.v v24, (a0)
# CHECK-INST: vsseg7e16.v v24, (a0)
# CHECK-ENCODING: [0x27,0x5c,0x05,0xc2]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 05 c2 <unknown>
+# CHECK-UNKNOWN: c2055c27 <unknown>
vsseg7e32.v v24, (a0), v0.t
# CHECK-INST: vsseg7e32.v v24, (a0), v0.t
# CHECK-ENCODING: [0x27,0x6c,0x05,0xc0]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 05 c0 <unknown>
+# CHECK-UNKNOWN: c0056c27 <unknown>
vsseg7e32.v v24, (a0)
# CHECK-INST: vsseg7e32.v v24, (a0)
# CHECK-ENCODING: [0x27,0x6c,0x05,0xc2]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 05 c2 <unknown>
+# CHECK-UNKNOWN: c2056c27 <unknown>
vsseg7e64.v v24, (a0), v0.t
# CHECK-INST: vsseg7e64.v v24, (a0), v0.t
# CHECK-ENCODING: [0x27,0x7c,0x05,0xc0]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 05 c0 <unknown>
+# CHECK-UNKNOWN: c0057c27 <unknown>
vsseg7e64.v v24, (a0)
# CHECK-INST: vsseg7e64.v v24, (a0)
# CHECK-ENCODING: [0x27,0x7c,0x05,0xc2]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 05 c2 <unknown>
+# CHECK-UNKNOWN: c2057c27 <unknown>
vssseg7e8.v v24, (a0), a1, v0.t
# CHECK-INST: vssseg7e8.v v24, (a0), a1, v0.t
# CHECK-ENCODING: [0x27,0x0c,0xb5,0xc8]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c b5 c8 <unknown>
+# CHECK-UNKNOWN: c8b50c27 <unknown>
vssseg7e8.v v24, (a0), a1
# CHECK-INST: vssseg7e8.v v24, (a0), a1
# CHECK-ENCODING: [0x27,0x0c,0xb5,0xca]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c b5 ca <unknown>
+# CHECK-UNKNOWN: cab50c27 <unknown>
vssseg7e16.v v24, (a0), a1, v0.t
# CHECK-INST: vssseg7e16.v v24, (a0), a1, v0.t
# CHECK-ENCODING: [0x27,0x5c,0xb5,0xc8]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c b5 c8 <unknown>
+# CHECK-UNKNOWN: c8b55c27 <unknown>
vssseg7e16.v v24, (a0), a1
# CHECK-INST: vssseg7e16.v v24, (a0), a1
# CHECK-ENCODING: [0x27,0x5c,0xb5,0xca]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c b5 ca <unknown>
+# CHECK-UNKNOWN: cab55c27 <unknown>
vssseg7e32.v v24, (a0), a1, v0.t
# CHECK-INST: vssseg7e32.v v24, (a0), a1, v0.t
# CHECK-ENCODING: [0x27,0x6c,0xb5,0xc8]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c b5 c8 <unknown>
+# CHECK-UNKNOWN: c8b56c27 <unknown>
vssseg7e32.v v24, (a0), a1
# CHECK-INST: vssseg7e32.v v24, (a0), a1
# CHECK-ENCODING: [0x27,0x6c,0xb5,0xca]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c b5 ca <unknown>
+# CHECK-UNKNOWN: cab56c27 <unknown>
vssseg7e64.v v24, (a0), a1, v0.t
# CHECK-INST: vssseg7e64.v v24, (a0), a1, v0.t
# CHECK-ENCODING: [0x27,0x7c,0xb5,0xc8]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c b5 c8 <unknown>
+# CHECK-UNKNOWN: c8b57c27 <unknown>
vssseg7e64.v v24, (a0), a1
# CHECK-INST: vssseg7e64.v v24, (a0), a1
# CHECK-ENCODING: [0x27,0x7c,0xb5,0xca]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c b5 ca <unknown>
+# CHECK-UNKNOWN: cab57c27 <unknown>
vsuxseg7ei8.v v24, (a0), v4, v0.t
# CHECK-INST: vsuxseg7ei8.v v24, (a0), v4, v0.t
# CHECK-ENCODING: [0x27,0x0c,0x45,0xc4]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 45 c4 <unknown>
+# CHECK-UNKNOWN: c4450c27 <unknown>
vsuxseg7ei8.v v24, (a0), v4
# CHECK-INST: vsuxseg7ei8.v v24, (a0), v4
# CHECK-ENCODING: [0x27,0x0c,0x45,0xc6]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 45 c6 <unknown>
+# CHECK-UNKNOWN: c6450c27 <unknown>
vsuxseg7ei16.v v24, (a0), v4, v0.t
# CHECK-INST: vsuxseg7ei16.v v24, (a0), v4, v0.t
# CHECK-ENCODING: [0x27,0x5c,0x45,0xc4]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 45 c4 <unknown>
+# CHECK-UNKNOWN: c4455c27 <unknown>
vsuxseg7ei16.v v24, (a0), v4
# CHECK-INST: vsuxseg7ei16.v v24, (a0), v4
# CHECK-ENCODING: [0x27,0x5c,0x45,0xc6]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 45 c6 <unknown>
+# CHECK-UNKNOWN: c6455c27 <unknown>
vsuxseg7ei32.v v24, (a0), v4, v0.t
# CHECK-INST: vsuxseg7ei32.v v24, (a0), v4, v0.t
# CHECK-ENCODING: [0x27,0x6c,0x45,0xc4]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 45 c4 <unknown>
+# CHECK-UNKNOWN: c4456c27 <unknown>
vsuxseg7ei32.v v24, (a0), v4
# CHECK-INST: vsuxseg7ei32.v v24, (a0), v4
# CHECK-ENCODING: [0x27,0x6c,0x45,0xc6]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 45 c6 <unknown>
+# CHECK-UNKNOWN: c6456c27 <unknown>
vsuxseg7ei64.v v24, (a0), v4, v0.t
# CHECK-INST: vsuxseg7ei64.v v24, (a0), v4, v0.t
# CHECK-ENCODING: [0x27,0x7c,0x45,0xc4]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 45 c4 <unknown>
+# CHECK-UNKNOWN: c4457c27 <unknown>
vsuxseg7ei64.v v24, (a0), v4
# CHECK-INST: vsuxseg7ei64.v v24, (a0), v4
# CHECK-ENCODING: [0x27,0x7c,0x45,0xc6]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 45 c6 <unknown>
+# CHECK-UNKNOWN: c6457c27 <unknown>
vsoxseg7ei8.v v24, (a0), v4, v0.t
# CHECK-INST: vsoxseg7ei8.v v24, (a0), v4, v0.t
# CHECK-ENCODING: [0x27,0x0c,0x45,0xcc]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 45 cc <unknown>
+# CHECK-UNKNOWN: cc450c27 <unknown>
vsoxseg7ei8.v v24, (a0), v4
# CHECK-INST: vsoxseg7ei8.v v24, (a0), v4
# CHECK-ENCODING: [0x27,0x0c,0x45,0xce]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 45 ce <unknown>
+# CHECK-UNKNOWN: ce450c27 <unknown>
vsoxseg7ei16.v v24, (a0), v4, v0.t
# CHECK-INST: vsoxseg7ei16.v v24, (a0), v4, v0.t
# CHECK-ENCODING: [0x27,0x5c,0x45,0xcc]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 45 cc <unknown>
+# CHECK-UNKNOWN: cc455c27 <unknown>
vsoxseg7ei16.v v24, (a0), v4
# CHECK-INST: vsoxseg7ei16.v v24, (a0), v4
# CHECK-ENCODING: [0x27,0x5c,0x45,0xce]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 45 ce <unknown>
+# CHECK-UNKNOWN: ce455c27 <unknown>
vsoxseg7ei32.v v24, (a0), v4, v0.t
# CHECK-INST: vsoxseg7ei32.v v24, (a0), v4, v0.t
# CHECK-ENCODING: [0x27,0x6c,0x45,0xcc]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 45 cc <unknown>
+# CHECK-UNKNOWN: cc456c27 <unknown>
vsoxseg7ei32.v v24, (a0), v4
# CHECK-INST: vsoxseg7ei32.v v24, (a0), v4
# CHECK-ENCODING: [0x27,0x6c,0x45,0xce]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 45 ce <unknown>
+# CHECK-UNKNOWN: ce456c27 <unknown>
vsoxseg7ei64.v v24, (a0), v4, v0.t
# CHECK-INST: vsoxseg7ei64.v v24, (a0), v4, v0.t
# CHECK-ENCODING: [0x27,0x7c,0x45,0xcc]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 45 cc <unknown>
+# CHECK-UNKNOWN: cc457c27 <unknown>
vsoxseg7ei64.v v24, (a0), v4
# CHECK-INST: vsoxseg7ei64.v v24, (a0), v4
# CHECK-ENCODING: [0x27,0x7c,0x45,0xce]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 45 ce <unknown>
+# CHECK-UNKNOWN: ce457c27 <unknown>
vsseg8e8.v v24, (a0), v0.t
# CHECK-INST: vsseg8e8.v v24, (a0), v0.t
# CHECK-ENCODING: [0x27,0x0c,0x05,0xe0]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 05 e0 <unknown>
+# CHECK-UNKNOWN: e0050c27 <unknown>
vsseg8e8.v v24, (a0)
# CHECK-INST: vsseg8e8.v v24, (a0)
# CHECK-ENCODING: [0x27,0x0c,0x05,0xe2]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 05 e2 <unknown>
+# CHECK-UNKNOWN: e2050c27 <unknown>
vsseg8e16.v v24, (a0), v0.t
# CHECK-INST: vsseg8e16.v v24, (a0), v0.t
# CHECK-ENCODING: [0x27,0x5c,0x05,0xe0]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 05 e0 <unknown>
+# CHECK-UNKNOWN: e0055c27 <unknown>
vsseg8e16.v v24, (a0)
# CHECK-INST: vsseg8e16.v v24, (a0)
# CHECK-ENCODING: [0x27,0x5c,0x05,0xe2]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 05 e2 <unknown>
+# CHECK-UNKNOWN: e2055c27 <unknown>
vsseg8e32.v v24, (a0), v0.t
# CHECK-INST: vsseg8e32.v v24, (a0), v0.t
# CHECK-ENCODING: [0x27,0x6c,0x05,0xe0]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 05 e0 <unknown>
+# CHECK-UNKNOWN: e0056c27 <unknown>
vsseg8e32.v v24, (a0)
# CHECK-INST: vsseg8e32.v v24, (a0)
# CHECK-ENCODING: [0x27,0x6c,0x05,0xe2]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 05 e2 <unknown>
+# CHECK-UNKNOWN: e2056c27 <unknown>
vsseg8e64.v v24, (a0), v0.t
# CHECK-INST: vsseg8e64.v v24, (a0), v0.t
# CHECK-ENCODING: [0x27,0x7c,0x05,0xe0]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 05 e0 <unknown>
+# CHECK-UNKNOWN: e0057c27 <unknown>
vsseg8e64.v v24, (a0)
# CHECK-INST: vsseg8e64.v v24, (a0)
# CHECK-ENCODING: [0x27,0x7c,0x05,0xe2]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 05 e2 <unknown>
+# CHECK-UNKNOWN: e2057c27 <unknown>
vssseg8e8.v v24, (a0), a1, v0.t
# CHECK-INST: vssseg8e8.v v24, (a0), a1, v0.t
# CHECK-ENCODING: [0x27,0x0c,0xb5,0xe8]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c b5 e8 <unknown>
+# CHECK-UNKNOWN: e8b50c27 <unknown>
vssseg8e8.v v24, (a0), a1
# CHECK-INST: vssseg8e8.v v24, (a0), a1
# CHECK-ENCODING: [0x27,0x0c,0xb5,0xea]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c b5 ea <unknown>
+# CHECK-UNKNOWN: eab50c27 <unknown>
vssseg8e16.v v24, (a0), a1, v0.t
# CHECK-INST: vssseg8e16.v v24, (a0), a1, v0.t
# CHECK-ENCODING: [0x27,0x5c,0xb5,0xe8]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c b5 e8 <unknown>
+# CHECK-UNKNOWN: e8b55c27 <unknown>
vssseg8e16.v v24, (a0), a1
# CHECK-INST: vssseg8e16.v v24, (a0), a1
# CHECK-ENCODING: [0x27,0x5c,0xb5,0xea]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c b5 ea <unknown>
+# CHECK-UNKNOWN: eab55c27 <unknown>
vssseg8e32.v v24, (a0), a1, v0.t
# CHECK-INST: vssseg8e32.v v24, (a0), a1, v0.t
# CHECK-ENCODING: [0x27,0x6c,0xb5,0xe8]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c b5 e8 <unknown>
+# CHECK-UNKNOWN: e8b56c27 <unknown>
vssseg8e32.v v24, (a0), a1
# CHECK-INST: vssseg8e32.v v24, (a0), a1
# CHECK-ENCODING: [0x27,0x6c,0xb5,0xea]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c b5 ea <unknown>
+# CHECK-UNKNOWN: eab56c27 <unknown>
vssseg8e64.v v24, (a0), a1, v0.t
# CHECK-INST: vssseg8e64.v v24, (a0), a1, v0.t
# CHECK-ENCODING: [0x27,0x7c,0xb5,0xe8]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c b5 e8 <unknown>
+# CHECK-UNKNOWN: e8b57c27 <unknown>
vssseg8e64.v v24, (a0), a1
# CHECK-INST: vssseg8e64.v v24, (a0), a1
# CHECK-ENCODING: [0x27,0x7c,0xb5,0xea]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c b5 ea <unknown>
+# CHECK-UNKNOWN: eab57c27 <unknown>
vsuxseg8ei8.v v24, (a0), v4, v0.t
# CHECK-INST: vsuxseg8ei8.v v24, (a0), v4, v0.t
# CHECK-ENCODING: [0x27,0x0c,0x45,0xe4]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 45 e4 <unknown>
+# CHECK-UNKNOWN: e4450c27 <unknown>
vsuxseg8ei8.v v24, (a0), v4
# CHECK-INST: vsuxseg8ei8.v v24, (a0), v4
# CHECK-ENCODING: [0x27,0x0c,0x45,0xe6]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 45 e6 <unknown>
+# CHECK-UNKNOWN: e6450c27 <unknown>
vsuxseg8ei16.v v24, (a0), v4, v0.t
# CHECK-INST: vsuxseg8ei16.v v24, (a0), v4, v0.t
# CHECK-ENCODING: [0x27,0x5c,0x45,0xe4]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 45 e4 <unknown>
+# CHECK-UNKNOWN: e4455c27 <unknown>
vsuxseg8ei16.v v24, (a0), v4
# CHECK-INST: vsuxseg8ei16.v v24, (a0), v4
# CHECK-ENCODING: [0x27,0x5c,0x45,0xe6]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 45 e6 <unknown>
+# CHECK-UNKNOWN: e6455c27 <unknown>
vsuxseg8ei32.v v24, (a0), v4, v0.t
# CHECK-INST: vsuxseg8ei32.v v24, (a0), v4, v0.t
# CHECK-ENCODING: [0x27,0x6c,0x45,0xe4]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 45 e4 <unknown>
+# CHECK-UNKNOWN: e4456c27 <unknown>
vsuxseg8ei32.v v24, (a0), v4
# CHECK-INST: vsuxseg8ei32.v v24, (a0), v4
# CHECK-ENCODING: [0x27,0x6c,0x45,0xe6]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 45 e6 <unknown>
+# CHECK-UNKNOWN: e6456c27 <unknown>
vsuxseg8ei64.v v24, (a0), v4, v0.t
# CHECK-INST: vsuxseg8ei64.v v24, (a0), v4, v0.t
# CHECK-ENCODING: [0x27,0x7c,0x45,0xe4]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 45 e4 <unknown>
+# CHECK-UNKNOWN: e4457c27 <unknown>
vsuxseg8ei64.v v24, (a0), v4
# CHECK-INST: vsuxseg8ei64.v v24, (a0), v4
# CHECK-ENCODING: [0x27,0x7c,0x45,0xe6]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 45 e6 <unknown>
+# CHECK-UNKNOWN: e6457c27 <unknown>
vsoxseg8ei8.v v24, (a0), v4, v0.t
# CHECK-INST: vsoxseg8ei8.v v24, (a0), v4, v0.t
# CHECK-ENCODING: [0x27,0x0c,0x45,0xec]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 45 ec <unknown>
+# CHECK-UNKNOWN: ec450c27 <unknown>
vsoxseg8ei8.v v24, (a0), v4
# CHECK-INST: vsoxseg8ei8.v v24, (a0), v4
# CHECK-ENCODING: [0x27,0x0c,0x45,0xee]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c 45 ee <unknown>
+# CHECK-UNKNOWN: ee450c27 <unknown>
vsoxseg8ei16.v v24, (a0), v4, v0.t
# CHECK-INST: vsoxseg8ei16.v v24, (a0), v4, v0.t
# CHECK-ENCODING: [0x27,0x5c,0x45,0xec]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 45 ec <unknown>
+# CHECK-UNKNOWN: ec455c27 <unknown>
vsoxseg8ei16.v v24, (a0), v4
# CHECK-INST: vsoxseg8ei16.v v24, (a0), v4
# CHECK-ENCODING: [0x27,0x5c,0x45,0xee]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 45 ee <unknown>
+# CHECK-UNKNOWN: ee455c27 <unknown>
vsoxseg8ei32.v v24, (a0), v4, v0.t
# CHECK-INST: vsoxseg8ei32.v v24, (a0), v4, v0.t
# CHECK-ENCODING: [0x27,0x6c,0x45,0xec]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 45 ec <unknown>
+# CHECK-UNKNOWN: ec456c27 <unknown>
vsoxseg8ei32.v v24, (a0), v4
# CHECK-INST: vsoxseg8ei32.v v24, (a0), v4
# CHECK-ENCODING: [0x27,0x6c,0x45,0xee]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 45 ee <unknown>
+# CHECK-UNKNOWN: ee456c27 <unknown>
vsoxseg8ei64.v v24, (a0), v4, v0.t
# CHECK-INST: vsoxseg8ei64.v v24, (a0), v4, v0.t
# CHECK-ENCODING: [0x27,0x7c,0x45,0xec]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 45 ec <unknown>
+# CHECK-UNKNOWN: ec457c27 <unknown>
vsoxseg8ei64.v v24, (a0), v4
# CHECK-INST: vsoxseg8ei64.v v24, (a0), v4
# CHECK-ENCODING: [0x27,0x7c,0x45,0xee]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 7c 45 ee <unknown>
+# CHECK-UNKNOWN: ee457c27 <unknown>
vlseg2e8.v v8, 0(a0), v0.t
# CHECK-INST: vlseg2e8.v v8, (a0), v0.t
# CHECK-ENCODING: [0x07,0x04,0x05,0x20]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 05 20 <unknown>
+# CHECK-UNKNOWN: 20050407 <unknown>
vlseg2e16ff.v v8, 0(a0)
# CHECK-INST: vlseg2e16ff.v v8, (a0)
# CHECK-ENCODING: [0x07,0x54,0x05,0x23]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 05 23 <unknown>
+# CHECK-UNKNOWN: 23055407 <unknown>
vlsseg2e8.v v8, 0(a0), a1
# CHECK-INST: vlsseg2e8.v v8, (a0), a1
# CHECK-ENCODING: [0x07,0x04,0xb5,0x2a]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 04 b5 2a <unknown>
+# CHECK-UNKNOWN: 2ab50407 <unknown>
vluxseg3ei16.v v8, 0(a0), v4
# CHECK-INST: vluxseg3ei16.v v8, (a0), v4
# CHECK-ENCODING: [0x07,0x54,0x45,0x46]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 54 45 46 <unknown>
+# CHECK-UNKNOWN: 46455407 <unknown>
vloxseg4ei64.v v8, 0(a0), v4, v0.t
# CHECK-INST: vloxseg4ei64.v v8, (a0), v4, v0.t
# CHECK-ENCODING: [0x07,0x74,0x45,0x6c]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors) or 'Zve64x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 07 74 45 6c <unknown>
+# CHECK-UNKNOWN: 6c457407 <unknown>
vsseg5e32.v v24, 0(a0), v0.t
# CHECK-INST: vsseg5e32.v v24, (a0), v0.t
# CHECK-ENCODING: [0x27,0x6c,0x05,0x80]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 05 80 <unknown>
+# CHECK-UNKNOWN: 80056c27 <unknown>
vssseg2e8.v v24, 0(a0), a1, v0.t
# CHECK-INST: vssseg2e8.v v24, (a0), a1, v0.t
# CHECK-ENCODING: [0x27,0x0c,0xb5,0x28]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 0c b5 28 <unknown>
+# CHECK-UNKNOWN: 28b50c27 <unknown>
vsoxseg7ei16.v v24, 0(a0), v4
# CHECK-INST: vsoxseg7ei16.v v24, (a0), v4
# CHECK-ENCODING: [0x27,0x5c,0x45,0xce]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 5c 45 ce <unknown>
+# CHECK-UNKNOWN: ce455c27 <unknown>
vsuxseg6ei32.v v24, 0(a0), v4, v0.t
# CHECK-INST: vsuxseg6ei32.v v24, (a0), v4, v0.t
# CHECK-ENCODING: [0x27,0x6c,0x45,0xa4]
# CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
-# CHECK-UNKNOWN: 27 6c 45 a4 <unknown>
+# CHECK-UNKNOWN: a4456c27 <unknown>
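# The CHECK-UNKNOWN updates in this file swap a space-separated little-endian
# byte list for the 32-bit instruction word that the disassembler now prints.
# Both spellings carry the same encoding: reading [0x27,0x6c,0x45,0xa4] as a
# little-endian word gives 0xa4456c27, the value in the new check line.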
diff --git a/llvm/test/MC/WebAssembly/global-ctor-dtor.ll b/llvm/test/MC/WebAssembly/global-ctor-dtor.ll
index bc1be7931349..f1ec71da1ebb 100644
--- a/llvm/test/MC/WebAssembly/global-ctor-dtor.ll
+++ b/llvm/test/MC/WebAssembly/global-ctor-dtor.ll
@@ -80,29 +80,29 @@ declare void @func3()
; CHECK-NEXT: Offset: 0x1D
; CHECK-NEXT: - Type: R_WASM_FUNCTION_INDEX_LEB
; CHECK-NEXT: Index: 6
-; CHECK-NEXT: Offset: 0x2C
+; CHECK-NEXT: Offset: 0x2B
; CHECK-NEXT: - Type: R_WASM_TABLE_INDEX_SLEB
; CHECK-NEXT: Index: 5
-; CHECK-NEXT: Offset: 0x37
+; CHECK-NEXT: Offset: 0x36
; CHECK-NEXT: - Type: R_WASM_MEMORY_ADDR_SLEB
; CHECK-NEXT: Index: 3
-; CHECK-NEXT: Offset: 0x3F
+; CHECK-NEXT: Offset: 0x3E
; CHECK-NEXT: - Type: R_WASM_FUNCTION_INDEX_LEB
; CHECK-NEXT: Index: 4
-; CHECK-NEXT: Offset: 0x45
+; CHECK-NEXT: Offset: 0x44
; CHECK-NEXT: Functions:
; CHECK-NEXT: - Index: 5
; CHECK-NEXT: Locals:
; CHECK-NEXT: Body: 1080808080000B
; CHECK-NEXT: - Index: 6
; CHECK-NEXT: Locals:
-; CHECK-NEXT: Body: 02404181808080004100418080808000108180808000450D0000000B0B
+; CHECK-NEXT: Body: 02404181808080004100418080808000108180808000450D00000B0B
; CHECK-NEXT: - Index: 7
; CHECK-NEXT: Locals:
; CHECK-NEXT: Body: 1082808080000B
; CHECK-NEXT: - Index: 8
; CHECK-NEXT: Locals:
-; CHECK-NEXT: Body: 02404182808080004100418080808000108180808000450D0000000B0B
+; CHECK-NEXT: Body: 02404182808080004100418080808000108180808000450D00000B0B
; CHECK-NEXT: - Type: DATA
; CHECK-NEXT: Segments:
; CHECK-NEXT: - SectionOffset: 6
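; Every relocation offset above shrinks by one (0x2C -> 0x2B, 0x37 -> 0x36,
; 0x3F -> 0x3E, 0x45 -> 0x44), and the two rewritten function bodies lose a
; single 00 byte (...450D0000000B0B -> ...450D00000B0B). Reading the hex, this
; looks like a relocatable LEB128 operand emitted with one less padding byte,
; which would shift every later offset in the code section by one.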
diff --git a/llvm/test/Other/new-pm-defaults.ll b/llvm/test/Other/new-pm-defaults.ll
index 51fb93daa4df..ebfed7b687e2 100644
--- a/llvm/test/Other/new-pm-defaults.ll
+++ b/llvm/test/Other/new-pm-defaults.ll
@@ -224,12 +224,14 @@
; CHECK-O-NEXT: Running pass: PostOrderFunctionAttrsPass
; CHECK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}ShouldNotRunFunctionPassesAnalysis
; CHECK-O-NEXT: Running analysis: ShouldNotRunFunctionPassesAnalysis
-; CHECK-O-NEXT: Running pass: CoroSplitPass
+; CHECK-DEFAULT-NEXT: Running pass: CoroSplitPass
+; CHECK-LTO-NOT: Running pass: CoroSplitPass
; CHECK-O-NEXT: Running pass: InvalidateAnalysisPass<{{.*}}ShouldNotRunFunctionPassesAnalysis
; CHECK-O-NEXT: Invalidating analysis: ShouldNotRunFunctionPassesAnalysis
; CHECK-O-NEXT: Invalidating analysis: InlineAdvisorAnalysis
; CHECK-O-NEXT: Running pass: DeadArgumentEliminationPass
-; CHECK-O-NEXT: Running pass: CoroCleanupPass
+; CHECK-DEFAULT-NEXT: Running pass: CoroCleanupPass
+; CHECK-LTO-NOT: Running pass: CoroCleanupPass
; CHECK-O-NEXT: Running pass: GlobalOptPass
; CHECK-O-NEXT: Running pass: GlobalDCEPass
; CHECK-DEFAULT-NEXT: Running pass: EliminateAvailableExternallyPass
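; The coroutine checks are now split by configuration: the default pipeline
; still runs CoroSplitPass and CoroCleanupPass (CHECK-DEFAULT-NEXT), while the
; LTO pre-link runs assert their absence (CHECK-LTO-NOT). The thinlto-prelink
; test files below drop the two checks outright for the same reason.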
diff --git a/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll b/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll
index 6486639e07b4..e2fd74306f80 100644
--- a/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll
@@ -183,12 +183,10 @@
; CHECK-O-NEXT: Running pass: PostOrderFunctionAttrsPass
; CHECK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}ShouldNotRunFunctionPassesAnalysis
; CHECK-O-NEXT: Running analysis: ShouldNotRunFunctionPassesAnalysis
-; CHECK-O-NEXT: Running pass: CoroSplitPass
; CHECK-O-NEXT: Running pass: InvalidateAnalysisPass<{{.*}}ShouldNotRunFunctionPassesAnalysis
; CHECK-O-NEXT: Invalidating analysis: ShouldNotRunFunctionPassesAnalysis
; CHECK-O-NEXT: Invalidating analysis: InlineAdvisorAnalysis
; CHECK-O-NEXT: Running pass: DeadArgumentEliminationPass
-; CHECK-O-NEXT: Running pass: CoroCleanupPass
; CHECK-O-NEXT: Running pass: GlobalOptPass
; CHECK-O-NEXT: Running pass: GlobalDCEPass
; CHECK-EXT: Running pass: {{.*}}::Bye
diff --git a/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll
index 09f9f0f48bad..13a63bbe4d9c 100644
--- a/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll
@@ -182,12 +182,10 @@
; CHECK-O-NEXT: Running pass: PostOrderFunctionAttrsPass
; CHECK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}ShouldNotRunFunctionPassesAnalysis
; CHECK-O-NEXT: Running analysis: ShouldNotRunFunctionPassesAnalysis
-; CHECK-O-NEXT: Running pass: CoroSplitPass
; CHECK-O-NEXT: Running pass: InvalidateAnalysisPass<{{.*}}ShouldNotRunFunctionPassesAnalysis
; CHECK-O-NEXT: Invalidating analysis: ShouldNotRunFunctionPassesAnalysis
; CHECK-O-NEXT: Invalidating analysis: InlineAdvisorAnalysis
; CHECK-O-NEXT: Running pass: DeadArgumentEliminationPass
-; CHECK-O-NEXT: Running pass: CoroCleanupPass
; CHECK-O-NEXT: Running pass: GlobalOptPass
; CHECK-O-NEXT: Running analysis: TargetLibraryAnalysis on bar
; CHECK-O-NEXT: Running pass: GlobalDCEPass
diff --git a/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll
index 47bdbfd2d357..3130da86fa99 100644
--- a/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll
@@ -147,12 +147,10 @@
; CHECK-O-NEXT: Running pass: PostOrderFunctionAttrsPass
; CHECK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}ShouldNotRunFunctionPassesAnalysis
; CHECK-O-NEXT: Running analysis: ShouldNotRunFunctionPassesAnalysis
-; CHECK-O-NEXT: Running pass: CoroSplitPass
; CHECK-O-NEXT: Running pass: InvalidateAnalysisPass<{{.*}}ShouldNotRunFunctionPassesAnalysis
; CHECK-O-NEXT: Invalidating analysis: ShouldNotRunFunctionPassesAnalysis
; CHECK-O-NEXT: Invalidating analysis: InlineAdvisorAnalysis
; CHECK-O-NEXT: Running pass: DeadArgumentEliminationPass
-; CHECK-O-NEXT: Running pass: CoroCleanupPass
; CHECK-O-NEXT: Running pass: GlobalOptPass
; CHECK-O-NEXT: Running pass: GlobalDCEPass
; CHECK-O-NEXT: Running pass: AnnotationRemarksPass on foo
diff --git a/llvm/test/TableGen/GlobalISelEmitter-frameindex.td b/llvm/test/TableGen/GlobalISelEmitter-frameindex.td
new file mode 100644
index 000000000000..232691465bb3
--- /dev/null
+++ b/llvm/test/TableGen/GlobalISelEmitter-frameindex.td
@@ -0,0 +1,29 @@
+// RUN: llvm-tblgen -gen-global-isel -optimize-match-table=false -I %p/../../include -I %p/Common %s -o - < %s | FileCheck %s
+
+include "llvm/Target/Target.td"
+include "GlobalISelEmitterCommon.td"
+
+def ADD : I<(outs GPR32:$dst), (ins GPR32:$src1, GPR32:$src2), []>;
+
+//===- Test a simple pattern with frame index operands. ----------------------===//
+//
+// CHECK: GIM_Try, /*On fail goto*//*Label [[LABEL_NUM:[0-9]+]]*/ GIMT_Encode4([[LABEL:[0-9]+]]),
+// CHECK-NEXT: GIM_CheckNumOperands, /*MI*/0, /*Expected*/2,
+// CHECK-NEXT: GIM_CheckOpcode, /*MI*/0, GIMT_Encode2(TargetOpcode::G_FRAME_INDEX),
+// CHECK-NEXT: // MIs[0] DstI[dst]
+// CHECK-NEXT: GIM_RootCheckType, /*Op*/0, /*Type*/GILLT_p0s32,
+// CHECK-NEXT: GIM_RootCheckRegBankForClass, /*Op*/0, /*RC*/GIMT_Encode2(MyTarget::GPR32RegClassID),
+// CHECK-NEXT: // MIs[0] fi
+// CHECK-NEXT: // No operand predicates
+// CHECK-NEXT: // (frameindex:{ *:[i32] }):$fi => (ADD:{ *:[i32] } (tframeindex:{ *:[i32] }):$fi, 0:{ *:[i32] })
+// CHECK-NEXT: GIR_BuildRootMI, /*Opcode*/GIMT_Encode2(MyTarget::ADD),
+// CHECK-NEXT: GIR_RootToRootCopy, /*OpIdx*/0, // DstI[dst]
+// CHECK-NEXT: GIR_RootToRootCopy, /*OpIdx*/1, // fi
+// CHECK-NEXT: GIR_AddImm8, /*InsnID*/0, /*Imm*/0,
+// CHECK-NEXT: GIR_RootConstrainSelectedInstOperands,
+// CHECK-NEXT: // GIR_Coverage, 0,
+// CHECK-NEXT: GIR_EraseRootFromParent_Done,
+// CHECK-NEXT: // Label [[LABEL_NUM]]: @[[LABEL]]
+// CHECK-NEXT: GIM_Reject,
+
+def : Pat<(p0 frameindex:$fi), (ADD tframeindex:$fi, 0)>;
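// The CHECK lines above trace the emitted matcher for this pattern: match a
// two-operand G_FRAME_INDEX whose p0 result sits in GPR32, then build an ADD
// that copies the frame-index operand across and appends the immediate 0 via
// GIR_AddImm8 before erasing the matched instruction.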
diff --git a/llvm/test/TableGen/riscv-target-def.td b/llvm/test/TableGen/riscv-target-def.td
index b23c7e4d4019..7f3d9bdb278c 100644
--- a/llvm/test/TableGen/riscv-target-def.td
+++ b/llvm/test/TableGen/riscv-target-def.td
@@ -51,6 +51,15 @@ def Feature64Bit
def FeatureDummy
: SubtargetFeature<"dummy", "Dummy", "true", "Dummy">;
+class RISCVProfile<string name, list<SubtargetFeature> features>
+ : SubtargetFeature<name, "Is" # NAME, "true",
+ "RISC-V " # name # " profile", features>;
+
+def RVI20U32 : RISCVProfile<"rvi20u32", [Feature32Bit, FeatureStdExtI]>;
+def RVI20U64 : RISCVProfile<"rvi20u64", [Feature64Bit, FeatureStdExtI]>;
+def ProfileDummy : RISCVProfile<"dummy", [Feature64Bit, FeatureStdExtI,
+ FeatureStdExtF, FeatureStdExtZidummy]>;
+
class RISCVProcessorModel<string n,
SchedMachineModel m,
list<SubtargetFeature> f,
@@ -83,6 +92,7 @@ def ROCKET_RV32 : RISCVProcessorModel<"rocket-rv32",
FeatureStdExtI,
FeatureStdExtZifencei,
FeatureStdExtZicsr,
+ FeatureStdExtZidummy,
FeatureDummy]>;
def ROCKET_RV64 : RISCVProcessorModel<"rocket-rv64",
NoSchedModel,
@@ -90,6 +100,7 @@ def ROCKET_RV64 : RISCVProcessorModel<"rocket-rv64",
FeatureStdExtI,
FeatureStdExtZifencei,
FeatureStdExtZicsr,
+ FeatureStdExtZidummy,
FeatureDummy]>;
def ROCKET : RISCVTuneProcessorModel<"rocket",
NoSchedModel>;
@@ -113,22 +124,31 @@ def ROCKET : RISCVTuneProcessorModel<"rocket",
// CHECK: #ifdef GET_IMPLIED_EXTENSIONS
// CHECK-NEXT: #undef GET_IMPLIED_EXTENSIONS
-// CHECK: static const char *ImpliedExtsF[] = {"zicsr"};
-
// CHECK: static constexpr ImpliedExtsEntry ImpliedExts[] = {
-// CHECK-NEXT: { {"f"}, {ImpliedExtsF} },
+// CHECK-NEXT: { {"f"}, "zicsr"},
// CHECK-NEXT: };
// CHECK: #endif // GET_IMPLIED_EXTENSIONS
+// CHECK: #ifdef GET_SUPPORTED_PROFILES
+// CHECK-NEXT: #undef GET_SUPPORTED_PROFILES
+
+// CHECK: static constexpr RISCVProfile SupportedProfiles[] = {
+// CHECK-NEXT: {"dummy","rv64i2p1_f2p2_zidummy0p1"},
+// CHECK-NEXT: {"rvi20u32","rv32i2p1"},
+// CHECK-NEXT: {"rvi20u64","rv64i2p1"},
+// CHECK-NEXT: };
+
+// CHECK: #endif // GET_SUPPORTED_PROFILES
+
// CHECK: #ifndef PROC
// CHECK-NEXT: #define PROC(ENUM, NAME, DEFAULT_MARCH, FAST_UNALIGNED_ACCESS)
// CHECK-NEXT: #endif
// CHECK: PROC(GENERIC_RV32, {"generic-rv32"}, {"rv32i2p1"}, 0)
// CHECK-NEXT: PROC(GENERIC_RV64, {"generic-rv64"}, {"rv64i2p1"}, 0)
-// CHECK-NEXT: PROC(ROCKET_RV32, {"rocket-rv32"}, {"rv32i2p1_zicsr2p0_zifencei2p0"}, 0)
-// CHECK-NEXT: PROC(ROCKET_RV64, {"rocket-rv64"}, {"rv64i2p1_zicsr2p0_zifencei2p0"}, 0)
+// CHECK-NEXT: PROC(ROCKET_RV32, {"rocket-rv32"}, {"rv32i2p1_zicsr2p0_zidummy0p1_zifencei2p0"}, 0)
+// CHECK-NEXT: PROC(ROCKET_RV64, {"rocket-rv64"}, {"rv64i2p1_zicsr2p0_zidummy0p1_zifencei2p0"}, 0)
// CHECK: #undef PROC
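// Two emitter changes are exercised here: ImpliedExts entries now carry the
// implied-extension string inline instead of referencing a per-extension
// array, and the new GET_SUPPORTED_PROFILES block lists each RISCVProfile
// with its canonical march string ("dummy" comes first in the emitted table,
// ahead of the rvi20 profiles).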
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/no-expand-atomic-load.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/no-expand-atomic-load.ll
index fd5a2044db48..b1497aefe9b9 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/no-expand-atomic-load.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/no-expand-atomic-load.ll
@@ -6,8 +6,7 @@
define float @load_atomic_f32_global_system(ptr addrspace(1) %ptr) {
; CHECK-LABEL: define float @load_atomic_f32_global_system(
; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) {
-; CHECK-NEXT: [[TMP1:%.*]] = load atomic i32, ptr addrspace(1) [[PTR]] seq_cst, align 4
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 [[TMP1]] to float
+; CHECK-NEXT: [[TMP2:%.*]] = load atomic float, ptr addrspace(1) [[PTR]] seq_cst, align 4, !some.unknown.md [[META0:![0-9]+]]
; CHECK-NEXT: ret float [[TMP2]]
;
%ld = load atomic float, ptr addrspace(1) %ptr seq_cst, align 4, !some.unknown.md !0
@@ -17,8 +16,7 @@ define float @load_atomic_f32_global_system(ptr addrspace(1) %ptr) {
define float @load_atomic_f32_global_agent(ptr addrspace(1) %ptr) {
; CHECK-LABEL: define float @load_atomic_f32_global_agent(
; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) {
-; CHECK-NEXT: [[TMP1:%.*]] = load atomic i32, ptr addrspace(1) [[PTR]] syncscope("agent") seq_cst, align 4
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 [[TMP1]] to float
+; CHECK-NEXT: [[TMP2:%.*]] = load atomic float, ptr addrspace(1) [[PTR]] syncscope("agent") seq_cst, align 4, !some.unknown.md [[META0]]
; CHECK-NEXT: ret float [[TMP2]]
;
%ld = load atomic float, ptr addrspace(1) %ptr syncscope("agent") seq_cst, align 4, !some.unknown.md !0
@@ -28,8 +26,7 @@ define float @load_atomic_f32_global_agent(ptr addrspace(1) %ptr) {
define float @load_atomic_f32_local(ptr addrspace(3) %ptr) {
; CHECK-LABEL: define float @load_atomic_f32_local(
; CHECK-SAME: ptr addrspace(3) [[PTR:%.*]]) {
-; CHECK-NEXT: [[TMP1:%.*]] = load atomic i32, ptr addrspace(3) [[PTR]] seq_cst, align 4
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 [[TMP1]] to float
+; CHECK-NEXT: [[TMP2:%.*]] = load atomic float, ptr addrspace(3) [[PTR]] seq_cst, align 4, !some.unknown.md [[META0]]
; CHECK-NEXT: ret float [[TMP2]]
;
%ld = load atomic float, ptr addrspace(3) %ptr seq_cst, align 4, !some.unknown.md !0
@@ -39,8 +36,7 @@ define float @load_atomic_f32_local(ptr addrspace(3) %ptr) {
define float @load_atomic_f32_flat_system(ptr %ptr) {
; CHECK-LABEL: define float @load_atomic_f32_flat_system(
; CHECK-SAME: ptr [[PTR:%.*]]) {
-; CHECK-NEXT: [[TMP1:%.*]] = load atomic i32, ptr [[PTR]] seq_cst, align 4
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 [[TMP1]] to float
+; CHECK-NEXT: [[TMP2:%.*]] = load atomic float, ptr [[PTR]] seq_cst, align 4, !some.unknown.md [[META0]]
; CHECK-NEXT: ret float [[TMP2]]
;
%ld = load atomic float, ptr %ptr seq_cst, align 4, !some.unknown.md !0
@@ -50,8 +46,7 @@ define float @load_atomic_f32_flat_system(ptr %ptr) {
define float @load_atomic_f32_flat_agent(ptr %ptr) {
; CHECK-LABEL: define float @load_atomic_f32_flat_agent(
; CHECK-SAME: ptr [[PTR:%.*]]) {
-; CHECK-NEXT: [[TMP1:%.*]] = load atomic i32, ptr [[PTR]] syncscope("agent") seq_cst, align 4
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 [[TMP1]] to float
+; CHECK-NEXT: [[TMP2:%.*]] = load atomic float, ptr [[PTR]] syncscope("agent") seq_cst, align 4, !some.unknown.md [[META0]]
; CHECK-NEXT: ret float [[TMP2]]
;
%ld = load atomic float, ptr %ptr syncscope("agent") seq_cst, align 4, !some.unknown.md !0
@@ -61,8 +56,7 @@ define float @load_atomic_f32_flat_agent(ptr %ptr) {
define half @load_atomic_f16_global_system(ptr addrspace(1) %ptr) {
; CHECK-LABEL: define half @load_atomic_f16_global_system(
; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) {
-; CHECK-NEXT: [[TMP1:%.*]] = load atomic i16, ptr addrspace(1) [[PTR]] seq_cst, align 4
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[TMP1]] to half
+; CHECK-NEXT: [[TMP2:%.*]] = load atomic half, ptr addrspace(1) [[PTR]] seq_cst, align 4, !some.unknown.md [[META0]]
; CHECK-NEXT: ret half [[TMP2]]
;
%ld = load atomic half, ptr addrspace(1) %ptr seq_cst, align 4, !some.unknown.md !0
@@ -72,8 +66,7 @@ define half @load_atomic_f16_global_system(ptr addrspace(1) %ptr) {
define half @load_atomic_f16_global_agent(ptr addrspace(1) %ptr) {
; CHECK-LABEL: define half @load_atomic_f16_global_agent(
; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) {
-; CHECK-NEXT: [[TMP1:%.*]] = load atomic i16, ptr addrspace(1) [[PTR]] syncscope("agent") seq_cst, align 4
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[TMP1]] to half
+; CHECK-NEXT: [[TMP2:%.*]] = load atomic half, ptr addrspace(1) [[PTR]] syncscope("agent") seq_cst, align 4, !some.unknown.md [[META0]]
; CHECK-NEXT: ret half [[TMP2]]
;
%ld = load atomic half, ptr addrspace(1) %ptr syncscope("agent") seq_cst, align 4, !some.unknown.md !0
@@ -83,8 +76,7 @@ define half @load_atomic_f16_global_agent(ptr addrspace(1) %ptr) {
define half @load_atomic_f16_local(ptr addrspace(3) %ptr) {
; CHECK-LABEL: define half @load_atomic_f16_local(
; CHECK-SAME: ptr addrspace(3) [[PTR:%.*]]) {
-; CHECK-NEXT: [[TMP1:%.*]] = load atomic i16, ptr addrspace(3) [[PTR]] seq_cst, align 2
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[TMP1]] to half
+; CHECK-NEXT: [[TMP2:%.*]] = load atomic half, ptr addrspace(3) [[PTR]] seq_cst, align 2, !some.unknown.md [[META0]]
; CHECK-NEXT: ret half [[TMP2]]
;
%ld = load atomic half, ptr addrspace(3) %ptr seq_cst, align 2, !some.unknown.md !0
@@ -94,8 +86,7 @@ define half @load_atomic_f16_local(ptr addrspace(3) %ptr) {
define bfloat @load_atomic_bf16_global_system(ptr addrspace(1) %ptr) {
; CHECK-LABEL: define bfloat @load_atomic_bf16_global_system(
; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) {
-; CHECK-NEXT: [[TMP1:%.*]] = load atomic i16, ptr addrspace(1) [[PTR]] seq_cst, align 2
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[TMP1]] to bfloat
+; CHECK-NEXT: [[TMP2:%.*]] = load atomic bfloat, ptr addrspace(1) [[PTR]] seq_cst, align 2, !some.unknown.md [[META0]]
; CHECK-NEXT: ret bfloat [[TMP2]]
;
%ld = load atomic bfloat, ptr addrspace(1) %ptr seq_cst, align 2, !some.unknown.md !0
@@ -105,8 +96,7 @@ define bfloat @load_atomic_bf16_global_system(ptr addrspace(1) %ptr) {
define bfloat @load_atomic_bf16_global_agent(ptr addrspace(1) %ptr) {
; CHECK-LABEL: define bfloat @load_atomic_bf16_global_agent(
; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) {
-; CHECK-NEXT: [[TMP1:%.*]] = load atomic i16, ptr addrspace(1) [[PTR]] syncscope("agent") seq_cst, align 2
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[TMP1]] to bfloat
+; CHECK-NEXT: [[TMP2:%.*]] = load atomic bfloat, ptr addrspace(1) [[PTR]] syncscope("agent") seq_cst, align 2, !some.unknown.md [[META0]]
; CHECK-NEXT: ret bfloat [[TMP2]]
;
%ld = load atomic bfloat, ptr addrspace(1) %ptr syncscope("agent") seq_cst, align 2, !some.unknown.md !0
@@ -116,8 +106,7 @@ define bfloat @load_atomic_bf16_global_agent(ptr addrspace(1) %ptr) {
define bfloat @load_atomic_bf16_local(ptr addrspace(3) %ptr) {
; CHECK-LABEL: define bfloat @load_atomic_bf16_local(
; CHECK-SAME: ptr addrspace(3) [[PTR:%.*]]) {
-; CHECK-NEXT: [[TMP1:%.*]] = load atomic i16, ptr addrspace(3) [[PTR]] seq_cst, align 2
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[TMP1]] to bfloat
+; CHECK-NEXT: [[TMP2:%.*]] = load atomic bfloat, ptr addrspace(3) [[PTR]] seq_cst, align 2, !some.unknown.md [[META0]]
; CHECK-NEXT: ret bfloat [[TMP2]]
;
%ld = load atomic bfloat, ptr addrspace(3) %ptr seq_cst, align 2, !some.unknown.md !0
@@ -127,8 +116,7 @@ define bfloat @load_atomic_bf16_local(ptr addrspace(3) %ptr) {
define bfloat @load_atomic_bf16_flat(ptr %ptr) {
; CHECK-LABEL: define bfloat @load_atomic_bf16_flat(
; CHECK-SAME: ptr [[PTR:%.*]]) {
-; CHECK-NEXT: [[TMP1:%.*]] = load atomic i16, ptr [[PTR]] seq_cst, align 2
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[TMP1]] to bfloat
+; CHECK-NEXT: [[TMP2:%.*]] = load atomic bfloat, ptr [[PTR]] seq_cst, align 2, !some.unknown.md [[META0]]
; CHECK-NEXT: ret bfloat [[TMP2]]
;
%ld = load atomic bfloat, ptr %ptr seq_cst, align 2, !some.unknown.md !0
@@ -138,8 +126,7 @@ define bfloat @load_atomic_bf16_flat(ptr %ptr) {
define double @load_atomic_f64_global_system(ptr addrspace(1) %ptr) {
; CHECK-LABEL: define double @load_atomic_f64_global_system(
; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) {
-; CHECK-NEXT: [[TMP1:%.*]] = load atomic i64, ptr addrspace(1) [[PTR]] seq_cst, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[TMP1]] to double
+; CHECK-NEXT: [[TMP2:%.*]] = load atomic double, ptr addrspace(1) [[PTR]] seq_cst, align 8, !some.unknown.md [[META0]]
; CHECK-NEXT: ret double [[TMP2]]
;
%ld = load atomic double, ptr addrspace(1) %ptr seq_cst, align 8, !some.unknown.md !0
@@ -149,8 +136,7 @@ define double @load_atomic_f64_global_system(ptr addrspace(1) %ptr) {
define double @load_atomic_f64_global_agent(ptr addrspace(1) %ptr) {
; CHECK-LABEL: define double @load_atomic_f64_global_agent(
; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) {
-; CHECK-NEXT: [[TMP1:%.*]] = load atomic i64, ptr addrspace(1) [[PTR]] syncscope("agent") seq_cst, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[TMP1]] to double
+; CHECK-NEXT: [[TMP2:%.*]] = load atomic double, ptr addrspace(1) [[PTR]] syncscope("agent") seq_cst, align 8, !some.unknown.md [[META0]]
; CHECK-NEXT: ret double [[TMP2]]
;
%ld = load atomic double, ptr addrspace(1) %ptr syncscope("agent") seq_cst, align 8, !some.unknown.md !0
@@ -160,8 +146,7 @@ define double @load_atomic_f64_global_agent(ptr addrspace(1) %ptr) {
define double @load_atomic_f64_local(ptr addrspace(3) %ptr) {
; CHECK-LABEL: define double @load_atomic_f64_local(
; CHECK-SAME: ptr addrspace(3) [[PTR:%.*]]) {
-; CHECK-NEXT: [[TMP1:%.*]] = load atomic i64, ptr addrspace(3) [[PTR]] seq_cst, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[TMP1]] to double
+; CHECK-NEXT: [[TMP2:%.*]] = load atomic double, ptr addrspace(3) [[PTR]] seq_cst, align 8, !some.unknown.md [[META0]]
; CHECK-NEXT: ret double [[TMP2]]
;
%ld = load atomic double, ptr addrspace(3) %ptr seq_cst, align 8, !some.unknown.md !0
@@ -171,8 +156,7 @@ define double @load_atomic_f64_local(ptr addrspace(3) %ptr) {
define double @load_atomic_f64_flat_system(ptr %ptr) {
; CHECK-LABEL: define double @load_atomic_f64_flat_system(
; CHECK-SAME: ptr [[PTR:%.*]]) {
-; CHECK-NEXT: [[TMP1:%.*]] = load atomic i64, ptr [[PTR]] seq_cst, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[TMP1]] to double
+; CHECK-NEXT: [[TMP2:%.*]] = load atomic double, ptr [[PTR]] seq_cst, align 8, !some.unknown.md [[META0]]
; CHECK-NEXT: ret double [[TMP2]]
;
%ld = load atomic double, ptr %ptr seq_cst, align 8, !some.unknown.md !0
@@ -182,8 +166,7 @@ define double @load_atomic_f64_flat_system(ptr %ptr) {
define double @load_atomic_f64_flat_agent(ptr %ptr) {
; CHECK-LABEL: define double @load_atomic_f64_flat_agent(
; CHECK-SAME: ptr [[PTR:%.*]]) {
-; CHECK-NEXT: [[TMP1:%.*]] = load atomic i64, ptr [[PTR]] syncscope("agent") seq_cst, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[TMP1]] to double
+; CHECK-NEXT: [[TMP2:%.*]] = load atomic double, ptr [[PTR]] syncscope("agent") seq_cst, align 8, !some.unknown.md [[META0]]
; CHECK-NEXT: ret double [[TMP2]]
;
%ld = load atomic double, ptr %ptr syncscope("agent") seq_cst, align 8, !some.unknown.md !0
@@ -193,3 +176,6 @@ define double @load_atomic_f64_flat_agent(ptr %ptr) {
!0 = !{}
+;.
+; CHECK: [[META0]] = !{}
+;.
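; The regenerated checks show AtomicExpand keeping these loads in their
; original floating-point types instead of lowering them to integer atomic
; loads plus a bitcast; the unknown metadata is preserved too, which is why
; the shared [[META0]] = !{} capture now appears at the end of the file.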
diff --git a/llvm/test/Transforms/CallSiteSplitting/callsite-split-debug.ll b/llvm/test/Transforms/CallSiteSplitting/callsite-split-debug.ll
index 8f10dcb30d7b..68c906d616c9 100644
--- a/llvm/test/Transforms/CallSiteSplitting/callsite-split-debug.ll
+++ b/llvm/test/Transforms/CallSiteSplitting/callsite-split-debug.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -passes=callsite-splitting -o - < %s | FileCheck %s
+; RUN: opt -S -passes=callsite-splitting -o - < %s | FileCheck %s --check-prefixes=CHECK,CHECK-DEBUG
; RUN: opt -S -strip-debug -passes=callsite-splitting -o - < %s | FileCheck %s
define internal i16 @bar(i16 %p1, i16 %p2) {
@@ -8,6 +8,9 @@ define internal i16 @bar(i16 %p1, i16 %p2) {
define i16 @foo(i16 %in) {
bb0:
+ %a = alloca i16, align 4, !DIAssignID !12
+ call void @llvm.dbg.assign(metadata i1 undef, metadata !11, metadata !DIExpression(), metadata !12, metadata ptr %a, metadata !DIExpression()), !dbg !8
+ store i16 7, ptr %a, align 4, !DIAssignID !13
br label %bb1
bb1:
@@ -20,13 +23,21 @@ bb2:
CallsiteBB:
%1 = phi i16 [ 0, %bb1 ], [ 1, %bb2 ]
%c = phi i16 [ 2, %bb1 ], [ 3, %bb2 ]
+ %p = phi ptr [ %a, %bb1 ], [ %a, %bb2 ]
+ call void @llvm.dbg.value(metadata i16 %1, metadata !7, metadata !DIExpression()), !dbg !8
call void @llvm.dbg.value(metadata i16 %c, metadata !7, metadata !DIExpression()), !dbg !8
+ call void @llvm.dbg.value(metadata !DIArgList(i16 %1, i16 %c), metadata !7, metadata !DIExpression()), !dbg !8
+ call void @llvm.dbg.value(metadata !DIArgList(i16 %c, i16 %c), metadata !7, metadata !DIExpression()), !dbg !8
+ call void @llvm.dbg.assign(metadata i16 %1, metadata !11, metadata !DIExpression(), metadata !13, metadata ptr %a, metadata !DIExpression()), !dbg !8
+ call void @llvm.dbg.assign(metadata i16 %c, metadata !11, metadata !DIExpression(), metadata !13, metadata ptr %a, metadata !DIExpression()), !dbg !8
+ call void @llvm.dbg.assign(metadata i16 %1, metadata !11, metadata !DIExpression(), metadata !13, metadata ptr %p, metadata !DIExpression()), !dbg !8
%2 = call i16 @bar(i16 %1, i16 5)
ret i16 %2
}
; Function Attrs: nounwind readnone speculatable
declare void @llvm.dbg.value(metadata, metadata, metadata) #0
+declare void @llvm.dbg.assign(metadata, metadata, metadata, metadata, metadata, metadata)
attributes #0 = { nounwind readnone speculatable }
@@ -43,14 +54,37 @@ attributes #0 = { nounwind readnone speculatable }
!6 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 4, unit: !0)
!7 = !DILocalVariable(name: "c", scope: !6, line: 5, type: !5)
!8 = !DILocation(line: 5, column: 7, scope: !6)
+!11 = !DILocalVariable(name: "a", scope: !6, line: 6, type: !5)
+!12 = distinct !DIAssignID()
+!13 = distinct !DIAssignID()
; The optimization should trigger even in the presence of the dbg.value in
; CallSiteBB.
; CHECK-LABEL: @foo
; CHECK-LABEL: bb1.split:
+; CHECK-DEBUG: call void @llvm.dbg.value(metadata i16 0, metadata ![[DBG_1:[0-9]+]], {{.*}}
+; CHECK-DEBUG: call void @llvm.dbg.value(metadata i16 2, metadata ![[DBG_1]], {{.*}}
+; CHECK-DEBUG: call void @llvm.dbg.value(metadata !DIArgList(i16 0, i16 2), {{.*}}
+; CHECK-DEBUG: call void @llvm.dbg.value(metadata !DIArgList(i16 2, i16 2), {{.*}}
+; CHECK-DEBUG: call void @llvm.dbg.assign(metadata i16 0, metadata ![[DBG_2:[0-9]+]], {{.*}}
+; CHECK-DEBUG: call void @llvm.dbg.assign(metadata i16 2, metadata ![[DBG_2]], {{.*}}
+; CHECK-DEBUG: call void @llvm.dbg.assign(metadata i16 0, metadata ![[DBG_2]], metadata !DIExpression(), metadata ![[ID_1:[0-9]+]], metadata ptr %a, {{.*}}
; CHECK: [[TMP1:%[0-9]+]] = call i16 @bar(i16 0, i16 5)
+
; CHECK-LABEL: bb2.split:
+; CHECK-DEBUG: call void @llvm.dbg.value(metadata i16 1, metadata ![[DBG_1]], {{.*}}
+; CHECK-DEBUG: call void @llvm.dbg.value(metadata i16 3, metadata ![[DBG_1]], {{.*}}
+; CHECK-DEBUG: call void @llvm.dbg.value(metadata !DIArgList(i16 1, i16 3), {{.*}}
+; CHECK-DEBUG: call void @llvm.dbg.value(metadata !DIArgList(i16 3, i16 3), {{.*}}
+; CHECK-DEBUG: call void @llvm.dbg.assign(metadata i16 1, metadata ![[DBG_2]], {{.*}}
+; CHECK-DEBUG: call void @llvm.dbg.assign(metadata i16 3, metadata ![[DBG_2]], {{.*}}
+; CHECK-DEBUG: call void @llvm.dbg.assign(metadata i16 1, metadata ![[DBG_2]], metadata !DIExpression(), metadata ![[ID_1:[0-9]+]], metadata ptr %a, {{.*}}
; CHECK: [[TMP2:%[0-9]+]] = call i16 @bar(i16 1, i16 5)
+
; CHECK-LABEL: CallsiteBB
; CHECK: %phi.call = phi i16 [ [[TMP2]], %bb2.split ], [ [[TMP1]], %bb1.split
+
+; CHECK-DEBUG-DAG: ![[DBG_1]] = !DILocalVariable(name: "c"{{.*}})
+; CHECK-DEBUG-DAG: ![[DBG_2]] = !DILocalVariable(name: "a"{{.*}})
+; CHECK-DEBUG-DAG: ![[ID_1]] = distinct !DIAssignID()
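(Editor's note: an illustrative sketch, not part of the patch.) Callsite splitting duplicates a call whose argument is a phi of constants into each split predecessor, so the constants become direct arguments; the CHECK lines above verify exactly this shape, with the debug intrinsics carried along. A minimal before/after, using the @bar callee from this test:

; Before: one call fed by a phi of constants.
;   CallsiteBB:
;     %v = phi i16 [ 0, %bb1 ], [ 1, %bb2 ]
;     %r = call i16 @bar(i16 %v, i16 5)
; After: the call is duplicated into split predecessors with literal
; arguments, and the results are merged back with a phi.
;   bb1.split:
;     %r1 = call i16 @bar(i16 0, i16 5)
;     br label %CallsiteBB
;   bb2.split:
;     %r2 = call i16 @bar(i16 1, i16 5)
;     br label %CallsiteBB
;   CallsiteBB:
;     %phi.call = phi i16 [ %r1, %bb1.split ], [ %r2, %bb2.split ]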
diff --git a/llvm/test/Transforms/CodeGenPrepare/ARM/branch-on-zero.ll b/llvm/test/Transforms/CodeGenPrepare/ARM/branch-on-zero.ll
index ff5cef7e781f..25dfb3c53a07 100644
--- a/llvm/test/Transforms/CodeGenPrepare/ARM/branch-on-zero.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/ARM/branch-on-zero.ll
@@ -211,6 +211,29 @@ else:
ret i32 %l
}
+define i32 @sub10_else_drop_nuw(i32 %a) {
+; CHECK-LABEL: @sub10_else_drop_nuw(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[L:%.*]] = sub i32 [[A:%.*]], 10
+; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i32 [[L]], 0
+; CHECK-NEXT: br i1 [[TMP0]], label [[THEN:%.*]], label [[ELSE:%.*]]
+; CHECK: then:
+; CHECK-NEXT: ret i32 0
+; CHECK: else:
+; CHECK-NEXT: ret i32 [[L]]
+;
+entry:
+ %c = icmp eq i32 %a, 10
+ br i1 %c, label %then, label %else
+
+then:
+ ret i32 0
+
+else:
+ %l = sub nuw i32 %a, 10
+ ret i32 %l
+}
+
define i32 @subm10_then(i32 %a) {
; CHECK-LABEL: @subm10_then(
; CHECK-NEXT: entry:
diff --git a/llvm/test/Transforms/CodeGenPrepare/RISCV/convert-to-eqz.ll b/llvm/test/Transforms/CodeGenPrepare/RISCV/convert-to-eqz.ll
new file mode 100644
index 000000000000..a6909d149134
--- /dev/null
+++ b/llvm/test/Transforms/CodeGenPrepare/RISCV/convert-to-eqz.ll
@@ -0,0 +1,80 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -codegenprepare -S -mtriple=riscv64 < %s | FileCheck %s
+
+define i8 @hoist_add(i8 %x) {
+; CHECK-LABEL: define i8 @hoist_add(
+; CHECK-SAME: i8 [[X:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[INC:%.*]] = add i8 [[X]], 1
+; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i8 [[INC]], 0
+; CHECK-NEXT: br i1 [[TMP0]], label [[EXIT:%.*]], label [[IF_THEN:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: br label [[EXIT]]
+; CHECK: exit:
+; CHECK-NEXT: [[RETVAL:%.*]] = phi i8 [ [[INC]], [[IF_THEN]] ], [ -1, [[ENTRY:%.*]] ]
+; CHECK-NEXT: ret i8 [[RETVAL]]
+;
+entry:
+ %cmp = icmp eq i8 %x, -1
+ br i1 %cmp, label %exit, label %if.then
+
+if.then:
+ %inc = add nuw nsw i8 %x, 1
+ br label %exit
+
+exit:
+ %retval = phi i8 [ %inc, %if.then ], [ -1, %entry ]
+ ret i8 %retval
+}
+
+define i8 @hoist_lshr(i8 %x) {
+; CHECK-LABEL: define i8 @hoist_lshr(
+; CHECK-SAME: i8 [[X:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[INC:%.*]] = lshr i8 [[X]], 3
+; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i8 [[INC]], 0
+; CHECK-NEXT: br i1 [[TMP0]], label [[EXIT:%.*]], label [[IF_THEN:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: br label [[EXIT]]
+; CHECK: exit:
+; CHECK-NEXT: [[RETVAL:%.*]] = phi i8 [ [[INC]], [[IF_THEN]] ], [ -1, [[ENTRY:%.*]] ]
+; CHECK-NEXT: ret i8 [[RETVAL]]
+;
+entry:
+ %cmp = icmp ult i8 %x, 8
+ br i1 %cmp, label %exit, label %if.then
+
+if.then:
+ %inc = lshr exact i8 %x, 3
+ br label %exit
+
+exit:
+ %retval = phi i8 [ %inc, %if.then ], [ -1, %entry ]
+ ret i8 %retval
+}
+
+define i8 @nomove_add(i8 %x) {
+; CHECK-LABEL: define i8 @nomove_add(
+; CHECK-SAME: i8 [[X:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[INC:%.*]] = add i8 [[X]], 1
+; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i8 [[INC]], 0
+; CHECK-NEXT: br i1 [[TMP0]], label [[EXIT:%.*]], label [[IF_THEN:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: br label [[EXIT]]
+; CHECK: exit:
+; CHECK-NEXT: [[RETVAL:%.*]] = phi i8 [ [[INC]], [[IF_THEN]] ], [ -1, [[ENTRY:%.*]] ]
+; CHECK-NEXT: ret i8 [[RETVAL]]
+;
+entry:
+ %inc = add nuw nsw i8 %x, 1
+ %cmp = icmp eq i8 %x, -1
+ br i1 %cmp, label %exit, label %if.then
+
+if.then:
+ br label %exit
+
+exit:
+ %retval = phi i8 [ %inc, %if.then ], [ -1, %entry ]
+ ret i8 %retval
+}
diff --git a/llvm/test/Transforms/FunctionSpecialization/discover-transitive-phis.ll b/llvm/test/Transforms/FunctionSpecialization/discover-transitive-phis.ll
index b4c24715037b..d0095231a30f 100644
--- a/llvm/test/Transforms/FunctionSpecialization/discover-transitive-phis.ll
+++ b/llvm/test/Transforms/FunctionSpecialization/discover-transitive-phis.ll
@@ -1,22 +1,22 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
;
; RUN: opt -passes="ipsccp<func-spec>" -funcspec-min-function-size=20 -funcspec-for-literal-constant -S < %s | FileCheck %s --check-prefix=FUNCSPEC
; RUN: opt -passes="ipsccp<func-spec>" -funcspec-min-function-size=20 -funcspec-for-literal-constant -funcspec-max-discovery-iterations=16 -S < %s | FileCheck %s --check-prefix=NOFUNCSPEC
define i64 @bar(i1 %c1, i1 %c2, i1 %c3, i1 %c4, i1 %c5, i1 %c6, i1 %c7, i1 %c8, i1 %c9, i1 %c10) {
-; FUNCSPEC-LABEL: define i64 @bar(
+; FUNCSPEC-LABEL: define range(i64 4, 13) i64 @bar(
; FUNCSPEC-SAME: i1 [[C1:%.*]], i1 [[C2:%.*]], i1 [[C3:%.*]], i1 [[C4:%.*]], i1 [[C5:%.*]], i1 [[C6:%.*]], i1 [[C7:%.*]], i1 [[C8:%.*]], i1 [[C9:%.*]], i1 [[C10:%.*]]) {
; FUNCSPEC-NEXT: entry:
-; FUNCSPEC-NEXT: [[F1:%.*]] = call i64 @foo.specialized.1(i64 3, i1 [[C1]], i1 [[C2]], i1 [[C3]], i1 [[C4]], i1 [[C5]], i1 [[C6]], i1 [[C7]], i1 [[C8]], i1 [[C9]], i1 [[C10]]), !range [[RNG0:![0-9]+]]
-; FUNCSPEC-NEXT: [[F2:%.*]] = call i64 @foo.specialized.2(i64 4, i1 [[C1]], i1 [[C2]], i1 [[C3]], i1 [[C4]], i1 [[C5]], i1 [[C6]], i1 [[C7]], i1 [[C8]], i1 [[C9]], i1 [[C10]]), !range [[RNG1:![0-9]+]]
+; FUNCSPEC-NEXT: [[F1:%.*]] = call i64 @foo.specialized.1(i64 3, i1 [[C1]], i1 [[C2]], i1 [[C3]], i1 [[C4]], i1 [[C5]], i1 [[C6]], i1 [[C7]], i1 [[C8]], i1 [[C9]], i1 [[C10]])
+; FUNCSPEC-NEXT: [[F2:%.*]] = call i64 @foo.specialized.2(i64 4, i1 [[C1]], i1 [[C2]], i1 [[C3]], i1 [[C4]], i1 [[C5]], i1 [[C6]], i1 [[C7]], i1 [[C8]], i1 [[C9]], i1 [[C10]])
; FUNCSPEC-NEXT: [[ADD:%.*]] = add nuw nsw i64 [[F1]], [[F2]]
; FUNCSPEC-NEXT: ret i64 [[ADD]]
;
-; NOFUNCSPEC-LABEL: define i64 @bar(
+; NOFUNCSPEC-LABEL: define range(i64 4, 13) i64 @bar(
; NOFUNCSPEC-SAME: i1 [[C1:%.*]], i1 [[C2:%.*]], i1 [[C3:%.*]], i1 [[C4:%.*]], i1 [[C5:%.*]], i1 [[C6:%.*]], i1 [[C7:%.*]], i1 [[C8:%.*]], i1 [[C9:%.*]], i1 [[C10:%.*]]) {
; NOFUNCSPEC-NEXT: entry:
-; NOFUNCSPEC-NEXT: [[F1:%.*]] = call i64 @foo(i64 3, i1 [[C1]], i1 [[C2]], i1 [[C3]], i1 [[C4]], i1 [[C5]], i1 [[C6]], i1 [[C7]], i1 [[C8]], i1 [[C9]], i1 [[C10]]), !range [[RNG0:![0-9]+]]
-; NOFUNCSPEC-NEXT: [[F2:%.*]] = call i64 @foo(i64 4, i1 [[C1]], i1 [[C2]], i1 [[C3]], i1 [[C4]], i1 [[C5]], i1 [[C6]], i1 [[C7]], i1 [[C8]], i1 [[C9]], i1 [[C10]]), !range [[RNG0]]
+; NOFUNCSPEC-NEXT: [[F1:%.*]] = call i64 @foo(i64 3, i1 [[C1]], i1 [[C2]], i1 [[C3]], i1 [[C4]], i1 [[C5]], i1 [[C6]], i1 [[C7]], i1 [[C8]], i1 [[C9]], i1 [[C10]])
+; NOFUNCSPEC-NEXT: [[F2:%.*]] = call i64 @foo(i64 4, i1 [[C1]], i1 [[C2]], i1 [[C3]], i1 [[C4]], i1 [[C5]], i1 [[C6]], i1 [[C7]], i1 [[C8]], i1 [[C9]], i1 [[C10]])
; NOFUNCSPEC-NEXT: [[ADD:%.*]] = add nuw nsw i64 [[F1]], [[F2]]
; NOFUNCSPEC-NEXT: ret i64 [[ADD]]
;
@@ -28,6 +28,50 @@ entry:
}
define internal i64 @foo(i64 %n, i1 %c1, i1 %c2, i1 %c3, i1 %c4, i1 %c5, i1 %c6, i1 %c7, i1 %c8, i1 %c9, i1 %c10) {
+; NOFUNCSPEC-LABEL: define internal range(i64 2, 7) i64 @foo(
+; NOFUNCSPEC-SAME: i64 [[N:%.*]], i1 [[C1:%.*]], i1 [[C2:%.*]], i1 [[C3:%.*]], i1 [[C4:%.*]], i1 [[C5:%.*]], i1 [[C6:%.*]], i1 [[C7:%.*]], i1 [[C8:%.*]], i1 [[C9:%.*]], i1 [[C10:%.*]]) {
+; NOFUNCSPEC-NEXT: entry:
+; NOFUNCSPEC-NEXT: br i1 [[C1]], label [[L1:%.*]], label [[L9:%.*]]
+; NOFUNCSPEC: l1:
+; NOFUNCSPEC-NEXT: [[PHI1:%.*]] = phi i64 [ [[N]], [[ENTRY:%.*]] ], [ [[PHI2:%.*]], [[L2:%.*]] ]
+; NOFUNCSPEC-NEXT: [[ADD:%.*]] = add nuw nsw i64 [[PHI1]], 1
+; NOFUNCSPEC-NEXT: br i1 [[C2]], label [[L1_5:%.*]], label [[EXIT:%.*]]
+; NOFUNCSPEC: l1_5:
+; NOFUNCSPEC-NEXT: br i1 [[C3]], label [[L1_75:%.*]], label [[L6:%.*]]
+; NOFUNCSPEC: l1_75:
+; NOFUNCSPEC-NEXT: br i1 [[C4]], label [[L2]], label [[L3:%.*]]
+; NOFUNCSPEC: l2:
+; NOFUNCSPEC-NEXT: [[PHI2]] = phi i64 [ [[PHI1]], [[L1_75]] ], [ [[PHI3:%.*]], [[L3]] ]
+; NOFUNCSPEC-NEXT: br label [[L1]]
+; NOFUNCSPEC: l3:
+; NOFUNCSPEC-NEXT: [[PHI3]] = phi i64 [ [[PHI1]], [[L1_75]] ], [ [[PHI4:%.*]], [[L4:%.*]] ]
+; NOFUNCSPEC-NEXT: br label [[L2]]
+; NOFUNCSPEC: l4:
+; NOFUNCSPEC-NEXT: [[PHI4]] = phi i64 [ [[PHI5:%.*]], [[L5:%.*]] ], [ [[PHI6:%.*]], [[L6]] ]
+; NOFUNCSPEC-NEXT: br i1 [[C5]], label [[L3]], label [[L6]]
+; NOFUNCSPEC: l5:
+; NOFUNCSPEC-NEXT: [[PHI5]] = phi i64 [ [[PHI6]], [[L6_5:%.*]] ], [ [[PHI7:%.*]], [[L7:%.*]] ]
+; NOFUNCSPEC-NEXT: br label [[L4]]
+; NOFUNCSPEC: l6:
+; NOFUNCSPEC-NEXT: [[PHI6]] = phi i64 [ [[PHI4]], [[L4]] ], [ [[PHI1]], [[L1_5]] ]
+; NOFUNCSPEC-NEXT: br i1 [[C6]], label [[L4]], label [[L6_5]]
+; NOFUNCSPEC: l6_5:
+; NOFUNCSPEC-NEXT: br i1 [[C7]], label [[L5]], label [[L8:%.*]]
+; NOFUNCSPEC: l7:
+; NOFUNCSPEC-NEXT: [[PHI7]] = phi i64 [ [[PHI9:%.*]], [[L9]] ], [ [[PHI8:%.*]], [[L8]] ]
+; NOFUNCSPEC-NEXT: br i1 [[C8]], label [[L5]], label [[L8]]
+; NOFUNCSPEC: l8:
+; NOFUNCSPEC-NEXT: [[PHI8]] = phi i64 [ [[PHI6]], [[L6_5]] ], [ [[PHI7]], [[L7]] ]
+; NOFUNCSPEC-NEXT: br i1 [[C9]], label [[L7]], label [[L9]]
+; NOFUNCSPEC: l9:
+; NOFUNCSPEC-NEXT: [[PHI9]] = phi i64 [ [[N]], [[ENTRY]] ], [ [[PHI8]], [[L8]] ]
+; NOFUNCSPEC-NEXT: [[SUB:%.*]] = sub nuw nsw i64 [[PHI9]], 1
+; NOFUNCSPEC-NEXT: [[MUL:%.*]] = mul nuw nsw i64 [[SUB]], 2
+; NOFUNCSPEC-NEXT: br i1 [[C10]], label [[L7]], label [[EXIT]]
+; NOFUNCSPEC: exit:
+; NOFUNCSPEC-NEXT: [[RES:%.*]] = phi i64 [ 2, [[L1]] ], [ [[MUL]], [[L9]] ]
+; NOFUNCSPEC-NEXT: ret i64 [[RES]]
+;
entry:
br i1 %c1, label %l1, label %l9
diff --git a/llvm/test/Transforms/FunctionSpecialization/global-var-constants.ll b/llvm/test/Transforms/FunctionSpecialization/global-var-constants.ll
index b9481baae60b..a576d9aa32e1 100644
--- a/llvm/test/Transforms/FunctionSpecialization/global-var-constants.ll
+++ b/llvm/test/Transforms/FunctionSpecialization/global-var-constants.ll
@@ -49,10 +49,10 @@ entry:
; Check that if specialisation on the address of a non-const global variable
; is not allowed, it is not performed.
-; NO-GLOBALS-LABEL: define internal i32 @g()
+; NO-GLOBALS-LABEL: define internal range(i32 -2147483646, -2147483648) i32 @g()
; NO-GLOBALS: call i32 @f(ptr @G)
-; NO-GLOBALS-LABEL: define i32 @h0(ptr %p)
+; NO-GLOBALS-LABEL: define range(i32 -2147483646, -2147483648) i32 @h0(ptr %p)
; NO-GLOBALS: call i32 @g()
; NO-GLOBALS-LABEL: define i32 @h1()
@@ -64,10 +64,10 @@ entry:
; Check that if specialisation on the address of a non-const global variable
; is allowed, it is performed where possible.
-; GLOBALS-LABEL: define internal i32 @g()
+; GLOBALS-LABEL: define internal range(i32 -2147483646, -2147483648) i32 @g()
; GLOBALS: call i32 @f.specialized.2()
-; GLOBALS-LABEL: define i32 @h0(ptr %p)
+; GLOBALS-LABEL: define range(i32 -2147483646, -2147483648) i32 @h0(ptr %p)
; GLOBALS: call i32 @g()
; GLOBALS-LABEL: define i32 @h1()
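(Editor's note: an illustrative sketch, not part of the patch.) The two prefixes differ only in whether the specialiser may treat the address of the non-const global @G as a constant argument. Under GLOBALS the call through @f is redirected to a clone with the argument folded away, roughly:

; NO-GLOBALS keeps the generic call:
;   %r = call i32 @f(ptr @G)
; GLOBALS replaces it with a specialised clone (numbering as in the CHECK
; lines above) whose body has the pointer parameter replaced by @G:
;   %r = call i32 @f.specialized.2()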
diff --git a/llvm/test/Transforms/FunctionSpecialization/literal-const.ll b/llvm/test/Transforms/FunctionSpecialization/literal-const.ll
index f107ffe0ec7e..3eae3dc261fb 100644
--- a/llvm/test/Transforms/FunctionSpecialization/literal-const.ll
+++ b/llvm/test/Transforms/FunctionSpecialization/literal-const.ll
@@ -71,10 +71,10 @@ entry:
; CHECK-LIT-LABEL: define i32 @f1
; CHECK-LIT: call i32 @neg.specialized.[[#B:]]
-; CHECK-LIT-LABEL: define i32 @g0
+; CHECK-LIT-LABEL: define range(i32 -2147483647, -2147483648) i32 @g0
; CHECK-LIT: call i32 @add.specialized.[[#C:]]
-; CHECK-LIT-LABEL: define i32 @g1
+; CHECK-LIT-LABEL: define range(i32 -2147483647, -2147483648) i32 @g1
; CHECK-LIT: call i32 @add.specialized.[[#D:]]
; CHECK-LIT-LABEL: define float @h0
diff --git a/llvm/test/Transforms/GlobalOpt/basictest.ll b/llvm/test/Transforms/GlobalOpt/basictest.ll
index 6d7fcdd96dfd..72d38a1e8845 100644
--- a/llvm/test/Transforms/GlobalOpt/basictest.ll
+++ b/llvm/test/Transforms/GlobalOpt/basictest.ll
@@ -1,9 +1,23 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
; RUN: opt < %s -passes=globalopt -S | FileCheck %s
-; CHECK-NOT: global
@X = internal global i32 4 ; <ptr> [#uses=1]
define i32 @foo() {
- %V = load i32, ptr @X ; <i32> [#uses=1]
- ret i32 %V
+; CHECK-LABEL: define i32 @foo() local_unnamed_addr {
+; CHECK-NEXT: ret i32 4
+;
+ %V = load i32, ptr @X ; <i32> [#uses=1]
+ ret i32 %V
+}
+
+@X_tls = internal thread_local global i32 13
+
+define i32 @bar() {
+; CHECK-LABEL: define i32 @bar() local_unnamed_addr {
+; CHECK-NEXT: ret i32 13
+;
+ %p = call ptr @llvm.threadlocal.address(ptr @X_tls)
+ %v = load i32, ptr %p
+ ret i32 %v
}
diff --git a/llvm/test/Transforms/GlobalOpt/constantfold-initializers.ll b/llvm/test/Transforms/GlobalOpt/constantfold-initializers.ll
index ca844f63937c..f82942e73d92 100644
--- a/llvm/test/Transforms/GlobalOpt/constantfold-initializers.ll
+++ b/llvm/test/Transforms/GlobalOpt/constantfold-initializers.ll
@@ -72,11 +72,12 @@ entry:
}
@threadlocalptr = global ptr null, align 4
-; CHECK: @threadlocalptr = global ptr null, align 4
+; CHECK: @threadlocalptr = local_unnamed_addr global ptr null, align 4
@threadlocalvar = external thread_local global i32
define internal void @test5() {
entry:
- store ptr @threadlocalvar, ptr @threadlocalptr, align 4
+ %p = call ptr @llvm.threadlocal.address(ptr @threadlocalvar)
+ store ptr %p, ptr @threadlocalptr, align 4
ret void
}
diff --git a/llvm/test/Transforms/GlobalOpt/stored-once-forward-value.ll b/llvm/test/Transforms/GlobalOpt/stored-once-forward-value.ll
index 7b845070bbd0..2b7ceb4169f3 100644
--- a/llvm/test/Transforms/GlobalOpt/stored-once-forward-value.ll
+++ b/llvm/test/Transforms/GlobalOpt/stored-once-forward-value.ll
@@ -39,12 +39,14 @@ define i32 @dom_arg(i32 %a) {
define ptr @dom_thread_local_global() {
; CHECK-LABEL: @dom_thread_local_global(
-; CHECK-NEXT: store ptr @tl, ptr @g3, align 8
+; CHECK-NEXT: [[P:%.*]] = call ptr @llvm.threadlocal.address.p0(ptr @tl)
+; CHECK-NEXT: store ptr [[P]], ptr @g3, align 8
; CHECK-NEXT: call void @b()
; CHECK-NEXT: [[R:%.*]] = load ptr, ptr @g3, align 8
; CHECK-NEXT: ret ptr [[R]]
;
- store ptr @tl, ptr @g3
+ %p = call ptr @llvm.threadlocal.address(ptr @tl)
+ store ptr %p, ptr @g3
call void @b()
%r = load ptr, ptr @g3
ret ptr %r
diff --git a/llvm/test/Transforms/GlobalOpt/tls.ll b/llvm/test/Transforms/GlobalOpt/tls.ll
index 6ba003ff30b2..2cc2ea4e366e 100644
--- a/llvm/test/Transforms/GlobalOpt/tls.ll
+++ b/llvm/test/Transforms/GlobalOpt/tls.ll
@@ -15,14 +15,16 @@ declare void @start_thread(ptr)
define i32 @f() {
entry:
; Set @ip to point to x[1] for thread 1.
- store ptr getelementptr inbounds ([100 x i32], ptr @x, i64 0, i64 1), ptr @ip, align 8
+ %p = call ptr @llvm.threadlocal.address(ptr @x)
+ %addr = getelementptr inbounds [100 x i32], ptr %p, i64 0, i64 1
+ store ptr %addr, ptr @ip, align 8
; Run g on a new thread.
tail call void @start_thread(ptr @g) nounwind
tail call void @wait() nounwind
; Reset x[1] for thread 1.
- store i32 0, ptr getelementptr inbounds ([100 x i32], ptr @x, i64 0, i64 1), align 4
+ store i32 0, ptr %addr, align 4
; Read the value of @ip, which now points at x[1] for thread 2.
%0 = load ptr, ptr @ip, align 8
@@ -39,10 +41,12 @@ entry:
define internal void @g() nounwind uwtable {
entry:
; Set @ip to point to x[1] for thread 2.
- store ptr getelementptr inbounds ([100 x i32], ptr @x, i64 0, i64 1), ptr @ip, align 8
+ %p = call ptr @llvm.threadlocal.address(ptr @x)
+ %addr = getelementptr inbounds [100 x i32], ptr %p, i64 0, i64 1
+ store ptr %addr, ptr @ip, align 8
; Store 50 in x[1] for thread 2.
- store i32 50, ptr getelementptr inbounds ([100 x i32], ptr @x, i64 0, i64 1), align 4
+ store i32 50, ptr %addr, align 4
tail call void @signal() nounwind
ret void
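(Editor's note: an illustrative sketch, not part of the patch.) The GlobalOpt test updates above all follow the same pattern: direct address computations on a thread_local global are now routed through the llvm.threadlocal.address intrinsic, which returns the current thread's copy of the variable. The canonical shape, using @x from this test:

;   %p    = call ptr @llvm.threadlocal.address(ptr @x)
;   %addr = getelementptr inbounds [100 x i32], ptr %p, i64 0, i64 1
;   store i32 0, ptr %addr, align 4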
diff --git a/llvm/test/Transforms/IRCE/pr89959.ll b/llvm/test/Transforms/IRCE/pr89959.ll
new file mode 100644
index 000000000000..dc7c0dfbc57a
--- /dev/null
+++ b/llvm/test/Transforms/IRCE/pr89959.ll
@@ -0,0 +1,33 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -passes=irce -S < %s 2>&1 | FileCheck %s
+
+; Make sure we don't crash.
+define void @pr89959() {
+; CHECK-LABEL: define void @pr89959() {
+; CHECK-NEXT: top:
+; CHECK-NEXT: br label [[L3:%.*]]
+; CHECK: L3:
+; CHECK-NEXT: [[VALUE_PHI:%.*]] = phi ptr [ null, [[TOP:%.*]] ], [ [[TMP0:%.*]], [[L13:%.*]] ]
+; CHECK-NEXT: [[TMP0]] = getelementptr i8, ptr [[VALUE_PHI]], i64 8
+; CHECK-NEXT: [[DOTNOT:%.*]] = icmp ule ptr [[VALUE_PHI]], null
+; CHECK-NEXT: br i1 [[DOTNOT]], label [[L13]], label [[L15:%.*]]
+; CHECK: L13:
+; CHECK-NEXT: br label [[L3]]
+; CHECK: L15:
+; CHECK-NEXT: ret void
+;
+top:
+ br label %L3
+
+L3:
+ %value_phi = phi ptr [ null, %top ], [ %0, %L13 ]
+ %0 = getelementptr i8, ptr %value_phi, i64 8
+ %.not = icmp ule ptr %value_phi, null
+ br i1 %.not, label %L13, label %L15
+
+L13:
+ br label %L3
+
+L15:
+ ret void
+}
diff --git a/llvm/test/Transforms/InstCombine/array.ll b/llvm/test/Transforms/InstCombine/array.ll
index 236821d8ba4c..4f4ae17bebc5 100644
--- a/llvm/test/Transforms/InstCombine/array.ll
+++ b/llvm/test/Transforms/InstCombine/array.ll
@@ -108,3 +108,163 @@ entry:
store i32 %b, ptr %gep
ret void
}
+
+define ptr @gep_inbounds_add_nsw_nonneg(ptr %ptr, i64 %a, i64 %b) {
+; CHECK-LABEL: define ptr @gep_inbounds_add_nsw_nonneg(
+; CHECK-SAME: ptr [[PTR:%.*]], i64 [[A:%.*]], i64 [[B:%.*]]) {
+; CHECK-NEXT: [[A_NNEG:%.*]] = icmp sgt i64 [[A]], -1
+; CHECK-NEXT: call void @llvm.assume(i1 [[A_NNEG]])
+; CHECK-NEXT: [[B_NNEG:%.*]] = icmp sgt i64 [[B]], -1
+; CHECK-NEXT: call void @llvm.assume(i1 [[B_NNEG]])
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[PTR]], i64 [[A]]
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 [[B]]
+; CHECK-NEXT: ret ptr [[GEP]]
+;
+ %a.nneg = icmp sgt i64 %a, -1
+ call void @llvm.assume(i1 %a.nneg)
+ %b.nneg = icmp sgt i64 %b, -1
+ call void @llvm.assume(i1 %b.nneg)
+ %add = add nsw i64 %a, %b
+ %gep = getelementptr inbounds i32, ptr %ptr, i64 %add
+ ret ptr %gep
+}
+
+define ptr @gep_inbounds_add_nsw_not_nonneg1(ptr %ptr, i64 %a, i64 %b) {
+; CHECK-LABEL: define ptr @gep_inbounds_add_nsw_not_nonneg1(
+; CHECK-SAME: ptr [[PTR:%.*]], i64 [[A:%.*]], i64 [[B:%.*]]) {
+; CHECK-NEXT: [[A_NNEG:%.*]] = icmp sgt i64 [[A]], -1
+; CHECK-NEXT: call void @llvm.assume(i1 [[A_NNEG]])
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[A]]
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[TMP1]], i64 [[B]]
+; CHECK-NEXT: ret ptr [[GEP]]
+;
+ %a.nneg = icmp sgt i64 %a, -1
+ call void @llvm.assume(i1 %a.nneg)
+ %add = add nsw i64 %a, %b
+ %gep = getelementptr inbounds i32, ptr %ptr, i64 %add
+ ret ptr %gep
+}
+
+define ptr @gep_inbounds_add_nsw_not_nonneg2(ptr %ptr, i64 %a, i64 %b) {
+; CHECK-LABEL: define ptr @gep_inbounds_add_nsw_not_nonneg2(
+; CHECK-SAME: ptr [[PTR:%.*]], i64 [[A:%.*]], i64 [[B:%.*]]) {
+; CHECK-NEXT: [[B_NNEG:%.*]] = icmp sgt i64 [[B]], -1
+; CHECK-NEXT: call void @llvm.assume(i1 [[B_NNEG]])
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[A]]
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[TMP1]], i64 [[B]]
+; CHECK-NEXT: ret ptr [[GEP]]
+;
+ %b.nneg = icmp sgt i64 %b, -1
+ call void @llvm.assume(i1 %b.nneg)
+ %add = add nsw i64 %a, %b
+ %gep = getelementptr inbounds i32, ptr %ptr, i64 %add
+ ret ptr %gep
+}
+
+define ptr @gep_not_inbounds_add_nsw_nonneg(ptr %ptr, i64 %a, i64 %b) {
+; CHECK-LABEL: define ptr @gep_not_inbounds_add_nsw_nonneg(
+; CHECK-SAME: ptr [[PTR:%.*]], i64 [[A:%.*]], i64 [[B:%.*]]) {
+; CHECK-NEXT: [[A_NNEG:%.*]] = icmp sgt i64 [[A]], -1
+; CHECK-NEXT: call void @llvm.assume(i1 [[A_NNEG]])
+; CHECK-NEXT: [[B_NNEG:%.*]] = icmp sgt i64 [[B]], -1
+; CHECK-NEXT: call void @llvm.assume(i1 [[B_NNEG]])
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[A]]
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[TMP1]], i64 [[B]]
+; CHECK-NEXT: ret ptr [[GEP]]
+;
+ %a.nneg = icmp sgt i64 %a, -1
+ call void @llvm.assume(i1 %a.nneg)
+ %b.nneg = icmp sgt i64 %b, -1
+ call void @llvm.assume(i1 %b.nneg)
+ %add = add nsw i64 %a, %b
+ %gep = getelementptr i32, ptr %ptr, i64 %add
+ ret ptr %gep
+}
+
+define ptr @gep_inbounds_add_not_nsw_nonneg(ptr %ptr, i64 %a, i64 %b) {
+; CHECK-LABEL: define ptr @gep_inbounds_add_not_nsw_nonneg(
+; CHECK-SAME: ptr [[PTR:%.*]], i64 [[A:%.*]], i64 [[B:%.*]]) {
+; CHECK-NEXT: [[A_NNEG:%.*]] = icmp sgt i64 [[A]], -1
+; CHECK-NEXT: call void @llvm.assume(i1 [[A_NNEG]])
+; CHECK-NEXT: [[B_NNEG:%.*]] = icmp sgt i64 [[B]], -1
+; CHECK-NEXT: call void @llvm.assume(i1 [[B_NNEG]])
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[A]]
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[TMP1]], i64 [[B]]
+; CHECK-NEXT: ret ptr [[GEP]]
+;
+ %a.nneg = icmp sgt i64 %a, -1
+ call void @llvm.assume(i1 %a.nneg)
+ %b.nneg = icmp sgt i64 %b, -1
+ call void @llvm.assume(i1 %b.nneg)
+ %add = add i64 %a, %b
+ %gep = getelementptr inbounds i32, ptr %ptr, i64 %add
+ ret ptr %gep
+}
+
+define ptr @gep_inbounds_sext_add_nonneg(ptr %ptr, i32 %a) {
+; CHECK-LABEL: define ptr @gep_inbounds_sext_add_nonneg(
+; CHECK-SAME: ptr [[PTR:%.*]], i32 [[A:%.*]]) {
+; CHECK-NEXT: [[A_NNEG:%.*]] = icmp sgt i32 [[A]], -1
+; CHECK-NEXT: call void @llvm.assume(i1 [[A_NNEG]])
+; CHECK-NEXT: [[TMP1:%.*]] = zext nneg i32 [[A]] to i64
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[PTR]], i64 [[TMP1]]
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 40
+; CHECK-NEXT: ret ptr [[GEP]]
+;
+ %a.nneg = icmp sgt i32 %a, -1
+ call void @llvm.assume(i1 %a.nneg)
+ %add = add nsw i32 %a, 10
+ %idx = sext i32 %add to i64
+ %gep = getelementptr inbounds i32, ptr %ptr, i64 %idx
+ ret ptr %gep
+}
+
+define ptr @gep_inbounds_sext_add_not_nonneg_1(ptr %ptr, i32 %a) {
+; CHECK-LABEL: define ptr @gep_inbounds_sext_add_not_nonneg_1(
+; CHECK-SAME: ptr [[PTR:%.*]], i32 [[A:%.*]]) {
+; CHECK-NEXT: [[A_NNEG:%.*]] = icmp sgt i32 [[A]], -1
+; CHECK-NEXT: call void @llvm.assume(i1 [[A_NNEG]])
+; CHECK-NEXT: [[TMP1:%.*]] = zext nneg i32 [[A]] to i64
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[TMP1]]
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[TMP2]], i64 -40
+; CHECK-NEXT: ret ptr [[GEP]]
+;
+ %a.nneg = icmp sgt i32 %a, -1
+ call void @llvm.assume(i1 %a.nneg)
+ %add = add nsw i32 %a, -10
+ %idx = sext i32 %add to i64
+ %gep = getelementptr inbounds i32, ptr %ptr, i64 %idx
+ ret ptr %gep
+}
+
+define ptr @gep_inbounds_sext_add_not_nonneg_2(ptr %ptr, i32 %a) {
+; CHECK-LABEL: define ptr @gep_inbounds_sext_add_not_nonneg_2(
+; CHECK-SAME: ptr [[PTR:%.*]], i32 [[A:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[A]] to i64
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[TMP1]]
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[TMP2]], i64 40
+; CHECK-NEXT: ret ptr [[GEP]]
+;
+ %add = add nsw i32 %a, 10
+ %idx = sext i32 %add to i64
+ %gep = getelementptr inbounds i32, ptr %ptr, i64 %idx
+ ret ptr %gep
+}
+
+define ptr @gep_not_inbounds_sext_add_nonneg(ptr %ptr, i32 %a) {
+; CHECK-LABEL: define ptr @gep_not_inbounds_sext_add_nonneg(
+; CHECK-SAME: ptr [[PTR:%.*]], i32 [[A:%.*]]) {
+; CHECK-NEXT: [[A_NNEG:%.*]] = icmp sgt i32 [[A]], -1
+; CHECK-NEXT: call void @llvm.assume(i1 [[A_NNEG]])
+; CHECK-NEXT: [[TMP1:%.*]] = zext nneg i32 [[A]] to i64
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[TMP1]]
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[TMP2]], i64 40
+; CHECK-NEXT: ret ptr [[GEP]]
+;
+ %a.nneg = icmp sgt i32 %a, -1
+ call void @llvm.assume(i1 %a.nneg)
+ %add = add nsw i32 %a, 10
+ %idx = sext i32 %add to i64
+ %gep = getelementptr i32, ptr %ptr, i64 %idx
+ ret ptr %gep
+}
diff --git a/llvm/test/Transforms/InstCombine/icmp-of-trunc-ext.ll b/llvm/test/Transforms/InstCombine/icmp-of-trunc-ext.ll
index 7f616bbb2a83..a61694919ab0 100644
--- a/llvm/test/Transforms/InstCombine/icmp-of-trunc-ext.ll
+++ b/llvm/test/Transforms/InstCombine/icmp-of-trunc-ext.ll
@@ -268,3 +268,328 @@ define i1 @icmp_trunc_x_zext_y_fail_multiuse(i32 %x, i8 %y) {
%r = icmp ule i16 %x16, %y16
ret i1 %r
}
+
+define i1 @trunc_unsigned_nuw(i16 %x, i16 %y) {
+; CHECK-LABEL: @trunc_unsigned_nuw(
+; CHECK-NEXT: [[XT:%.*]] = trunc nuw i16 [[X:%.*]] to i8
+; CHECK-NEXT: [[YT:%.*]] = trunc nuw i16 [[Y:%.*]] to i8
+; CHECK-NEXT: [[C:%.*]] = icmp ult i8 [[XT]], [[YT]]
+; CHECK-NEXT: ret i1 [[C]]
+;
+ %xt = trunc nuw i16 %x to i8
+ %yt = trunc nuw i16 %y to i8
+ %c = icmp ult i8 %xt, %yt
+ ret i1 %c
+}
+
+define i1 @trunc_unsigned_nsw(i16 %x, i16 %y) {
+; CHECK-LABEL: @trunc_unsigned_nsw(
+; CHECK-NEXT: [[XT:%.*]] = trunc nsw i16 [[X:%.*]] to i8
+; CHECK-NEXT: [[YT:%.*]] = trunc nsw i16 [[Y:%.*]] to i8
+; CHECK-NEXT: [[C:%.*]] = icmp ult i8 [[XT]], [[YT]]
+; CHECK-NEXT: ret i1 [[C]]
+;
+ %xt = trunc nsw i16 %x to i8
+ %yt = trunc nsw i16 %y to i8
+ %c = icmp ult i8 %xt, %yt
+ ret i1 %c
+}
+
+define i1 @trunc_unsigned_both(i16 %x, i16 %y) {
+; CHECK-LABEL: @trunc_unsigned_both(
+; CHECK-NEXT: [[XT:%.*]] = trunc nuw nsw i16 [[X:%.*]] to i8
+; CHECK-NEXT: [[YT:%.*]] = trunc nuw nsw i16 [[Y:%.*]] to i8
+; CHECK-NEXT: [[C:%.*]] = icmp ult i8 [[XT]], [[YT]]
+; CHECK-NEXT: ret i1 [[C]]
+;
+ %xt = trunc nuw nsw i16 %x to i8
+ %yt = trunc nuw nsw i16 %y to i8
+ %c = icmp ult i8 %xt, %yt
+ ret i1 %c
+}
+
+define i1 @trunc_unsigned_either(i16 %x, i16 %y) {
+; CHECK-LABEL: @trunc_unsigned_either(
+; CHECK-NEXT: [[XT:%.*]] = trunc nuw i16 [[X:%.*]] to i8
+; CHECK-NEXT: [[YT:%.*]] = trunc nsw i16 [[Y:%.*]] to i8
+; CHECK-NEXT: [[C:%.*]] = icmp ult i8 [[XT]], [[YT]]
+; CHECK-NEXT: ret i1 [[C]]
+;
+ %xt = trunc nuw i16 %x to i8
+ %yt = trunc nsw i16 %y to i8
+ %c = icmp ult i8 %xt, %yt
+ ret i1 %c
+}
+
+define i1 @trunc_signed_nuw(i16 %x, i16 %y) {
+; CHECK-LABEL: @trunc_signed_nuw(
+; CHECK-NEXT: [[XT:%.*]] = trunc nuw i16 [[X:%.*]] to i8
+; CHECK-NEXT: [[YT:%.*]] = trunc nuw i16 [[Y:%.*]] to i8
+; CHECK-NEXT: [[C:%.*]] = icmp slt i8 [[XT]], [[YT]]
+; CHECK-NEXT: ret i1 [[C]]
+;
+ %xt = trunc nuw i16 %x to i8
+ %yt = trunc nuw i16 %y to i8
+ %c = icmp slt i8 %xt, %yt
+ ret i1 %c
+}
+
+define i1 @trunc_signed_nsw(i16 %x, i16 %y) {
+; CHECK-LABEL: @trunc_signed_nsw(
+; CHECK-NEXT: [[XT:%.*]] = trunc nsw i16 [[X:%.*]] to i8
+; CHECK-NEXT: [[YT:%.*]] = trunc nsw i16 [[Y:%.*]] to i8
+; CHECK-NEXT: [[C:%.*]] = icmp slt i8 [[XT]], [[YT]]
+; CHECK-NEXT: ret i1 [[C]]
+;
+ %xt = trunc nsw i16 %x to i8
+ %yt = trunc nsw i16 %y to i8
+ %c = icmp slt i8 %xt, %yt
+ ret i1 %c
+}
+
+define i1 @trunc_signed_both(i16 %x, i16 %y) {
+; CHECK-LABEL: @trunc_signed_both(
+; CHECK-NEXT: [[XT:%.*]] = trunc nuw nsw i16 [[X:%.*]] to i8
+; CHECK-NEXT: [[YT:%.*]] = trunc nuw nsw i16 [[Y:%.*]] to i8
+; CHECK-NEXT: [[C:%.*]] = icmp slt i8 [[XT]], [[YT]]
+; CHECK-NEXT: ret i1 [[C]]
+;
+ %xt = trunc nuw nsw i16 %x to i8
+ %yt = trunc nuw nsw i16 %y to i8
+ %c = icmp slt i8 %xt, %yt
+ ret i1 %c
+}
+
+define i1 @trunc_signed_either(i16 %x, i16 %y) {
+; CHECK-LABEL: @trunc_signed_either(
+; CHECK-NEXT: [[XT:%.*]] = trunc nuw i16 [[X:%.*]] to i8
+; CHECK-NEXT: [[YT:%.*]] = trunc nsw i16 [[Y:%.*]] to i8
+; CHECK-NEXT: [[C:%.*]] = icmp slt i8 [[XT]], [[YT]]
+; CHECK-NEXT: ret i1 [[C]]
+;
+ %xt = trunc nuw i16 %x to i8
+ %yt = trunc nsw i16 %y to i8
+ %c = icmp slt i8 %xt, %yt
+ ret i1 %c
+}
+
+define i1 @trunc_equality_nuw(i16 %x, i16 %y) {
+; CHECK-LABEL: @trunc_equality_nuw(
+; CHECK-NEXT: [[XT:%.*]] = trunc nuw i16 [[X:%.*]] to i8
+; CHECK-NEXT: [[YT:%.*]] = trunc nuw i16 [[Y:%.*]] to i8
+; CHECK-NEXT: [[C:%.*]] = icmp eq i8 [[XT]], [[YT]]
+; CHECK-NEXT: ret i1 [[C]]
+;
+ %xt = trunc nuw i16 %x to i8
+ %yt = trunc nuw i16 %y to i8
+ %c = icmp eq i8 %xt, %yt
+ ret i1 %c
+}
+
+define i1 @trunc_equality_nsw(i16 %x, i16 %y) {
+; CHECK-LABEL: @trunc_equality_nsw(
+; CHECK-NEXT: [[XT:%.*]] = trunc nsw i16 [[X:%.*]] to i8
+; CHECK-NEXT: [[YT:%.*]] = trunc nsw i16 [[Y:%.*]] to i8
+; CHECK-NEXT: [[C:%.*]] = icmp eq i8 [[XT]], [[YT]]
+; CHECK-NEXT: ret i1 [[C]]
+;
+ %xt = trunc nsw i16 %x to i8
+ %yt = trunc nsw i16 %y to i8
+ %c = icmp eq i8 %xt, %yt
+ ret i1 %c
+}
+
+define i1 @trunc_equality_both(i16 %x, i16 %y) {
+; CHECK-LABEL: @trunc_equality_both(
+; CHECK-NEXT: [[XT:%.*]] = trunc nuw nsw i16 [[X:%.*]] to i8
+; CHECK-NEXT: [[YT:%.*]] = trunc nuw nsw i16 [[Y:%.*]] to i8
+; CHECK-NEXT: [[C:%.*]] = icmp eq i8 [[XT]], [[YT]]
+; CHECK-NEXT: ret i1 [[C]]
+;
+ %xt = trunc nuw nsw i16 %x to i8
+ %yt = trunc nuw nsw i16 %y to i8
+ %c = icmp eq i8 %xt, %yt
+ ret i1 %c
+}
+
+define i1 @trunc_equality_either(i16 %x, i16 %y) {
+; CHECK-LABEL: @trunc_equality_either(
+; CHECK-NEXT: [[XT:%.*]] = trunc nuw i16 [[X:%.*]] to i8
+; CHECK-NEXT: [[YT:%.*]] = trunc nsw i16 [[Y:%.*]] to i8
+; CHECK-NEXT: [[C:%.*]] = icmp eq i8 [[XT]], [[YT]]
+; CHECK-NEXT: ret i1 [[C]]
+;
+ %xt = trunc nuw i16 %x to i8
+ %yt = trunc nsw i16 %y to i8
+ %c = icmp eq i8 %xt, %yt
+ ret i1 %c
+}
+
+define i1 @trunc_unsigned_nuw_zext(i32 %x, i8 %y) {
+; CHECK-LABEL: @trunc_unsigned_nuw_zext(
+; CHECK-NEXT: [[XT:%.*]] = trunc nuw i32 [[X:%.*]] to i16
+; CHECK-NEXT: [[YE:%.*]] = zext i8 [[Y:%.*]] to i16
+; CHECK-NEXT: [[C:%.*]] = icmp ult i16 [[XT]], [[YE]]
+; CHECK-NEXT: ret i1 [[C]]
+;
+ %xt = trunc nuw i32 %x to i16
+ %ye = zext i8 %y to i16
+ %c = icmp ult i16 %xt, %ye
+ ret i1 %c
+}
+
+define i1 @trunc_unsigned_nuw_sext(i32 %x, i8 %y) {
+; CHECK-LABEL: @trunc_unsigned_nuw_sext(
+; CHECK-NEXT: [[XT:%.*]] = trunc nuw i32 [[X:%.*]] to i16
+; CHECK-NEXT: [[YE:%.*]] = sext i8 [[Y:%.*]] to i16
+; CHECK-NEXT: [[C:%.*]] = icmp ult i16 [[XT]], [[YE]]
+; CHECK-NEXT: ret i1 [[C]]
+;
+ %xt = trunc nuw i32 %x to i16
+ %ye = sext i8 %y to i16
+ %c = icmp ult i16 %xt, %ye
+ ret i1 %c
+}
+
+define i1 @trunc_unsigned_nsw_zext(i32 %x, i8 %y) {
+; CHECK-LABEL: @trunc_unsigned_nsw_zext(
+; CHECK-NEXT: [[XT:%.*]] = trunc nsw i32 [[X:%.*]] to i16
+; CHECK-NEXT: [[YE:%.*]] = zext i8 [[Y:%.*]] to i16
+; CHECK-NEXT: [[C:%.*]] = icmp ult i16 [[XT]], [[YE]]
+; CHECK-NEXT: ret i1 [[C]]
+;
+ %xt = trunc nsw i32 %x to i16
+ %ye = zext i8 %y to i16
+ %c = icmp ult i16 %xt, %ye
+ ret i1 %c
+}
+
+define i1 @trunc_unsigned_nsw_sext(i32 %x, i8 %y) {
+; CHECK-LABEL: @trunc_unsigned_nsw_sext(
+; CHECK-NEXT: [[XT:%.*]] = trunc nsw i32 [[X:%.*]] to i16
+; CHECK-NEXT: [[YE:%.*]] = sext i8 [[Y:%.*]] to i16
+; CHECK-NEXT: [[C:%.*]] = icmp ult i16 [[XT]], [[YE]]
+; CHECK-NEXT: ret i1 [[C]]
+;
+ %xt = trunc nsw i32 %x to i16
+ %ye = sext i8 %y to i16
+ %c = icmp ult i16 %xt, %ye
+ ret i1 %c
+}
+
+define i1 @trunc_signed_nsw_sext(i32 %x, i8 %y) {
+; CHECK-LABEL: @trunc_signed_nsw_sext(
+; CHECK-NEXT: [[XT:%.*]] = trunc nsw i32 [[X:%.*]] to i16
+; CHECK-NEXT: [[YE:%.*]] = sext i8 [[Y:%.*]] to i16
+; CHECK-NEXT: [[C:%.*]] = icmp slt i16 [[XT]], [[YE]]
+; CHECK-NEXT: ret i1 [[C]]
+;
+ %xt = trunc nsw i32 %x to i16
+ %ye = sext i8 %y to i16
+ %c = icmp slt i16 %xt, %ye
+ ret i1 %c
+}
+
+define i1 @trunc_signed_nsw_zext(i32 %x, i8 %y) {
+; CHECK-LABEL: @trunc_signed_nsw_zext(
+; CHECK-NEXT: [[XT:%.*]] = trunc nsw i32 [[X:%.*]] to i16
+; CHECK-NEXT: [[YE:%.*]] = zext i8 [[Y:%.*]] to i16
+; CHECK-NEXT: [[C:%.*]] = icmp slt i16 [[XT]], [[YE]]
+; CHECK-NEXT: ret i1 [[C]]
+;
+ %xt = trunc nsw i32 %x to i16
+ %ye = zext i8 %y to i16
+ %c = icmp slt i16 %xt, %ye
+ ret i1 %c
+}
+
+define i1 @trunc_signed_nuw_sext(i32 %x, i8 %y) {
+; CHECK-LABEL: @trunc_signed_nuw_sext(
+; CHECK-NEXT: [[XT:%.*]] = trunc nuw i32 [[X:%.*]] to i16
+; CHECK-NEXT: [[YE:%.*]] = sext i8 [[Y:%.*]] to i16
+; CHECK-NEXT: [[C:%.*]] = icmp slt i16 [[XT]], [[YE]]
+; CHECK-NEXT: ret i1 [[C]]
+;
+ %xt = trunc nuw i32 %x to i16
+ %ye = sext i8 %y to i16
+ %c = icmp slt i16 %xt, %ye
+ ret i1 %c
+}
+
+define i1 @trunc_signed_nuw_zext(i32 %x, i8 %y) {
+; CHECK-LABEL: @trunc_signed_nuw_zext(
+; CHECK-NEXT: [[XT:%.*]] = trunc nuw i32 [[X:%.*]] to i16
+; CHECK-NEXT: [[YE:%.*]] = zext i8 [[Y:%.*]] to i16
+; CHECK-NEXT: [[C:%.*]] = icmp slt i16 [[XT]], [[YE]]
+; CHECK-NEXT: ret i1 [[C]]
+;
+ %xt = trunc nuw i32 %x to i16
+ %ye = zext i8 %y to i16
+ %c = icmp slt i16 %xt, %ye
+ ret i1 %c
+}
+
+define i1 @trunc_equality_nuw_zext(i32 %x, i8 %y) {
+; CHECK-LABEL: @trunc_equality_nuw_zext(
+; CHECK-NEXT: [[XT:%.*]] = trunc nuw i32 [[X:%.*]] to i16
+; CHECK-NEXT: [[YE:%.*]] = zext i8 [[Y:%.*]] to i16
+; CHECK-NEXT: [[C:%.*]] = icmp ne i16 [[XT]], [[YE]]
+; CHECK-NEXT: ret i1 [[C]]
+;
+ %xt = trunc nuw i32 %x to i16
+ %ye = zext i8 %y to i16
+ %c = icmp ne i16 %xt, %ye
+ ret i1 %c
+}
+
+define i1 @trunc_equality_nuw_sext(i32 %x, i8 %y) {
+; CHECK-LABEL: @trunc_equality_nuw_sext(
+; CHECK-NEXT: [[XT:%.*]] = trunc nuw i32 [[X:%.*]] to i16
+; CHECK-NEXT: [[YE:%.*]] = sext i8 [[Y:%.*]] to i16
+; CHECK-NEXT: [[C:%.*]] = icmp ne i16 [[XT]], [[YE]]
+; CHECK-NEXT: ret i1 [[C]]
+;
+ %xt = trunc nuw i32 %x to i16
+ %ye = sext i8 %y to i16
+ %c = icmp ne i16 %xt, %ye
+ ret i1 %c
+}
+
+define i1 @trunc_equality_nsw_zext(i32 %x, i8 %y) {
+; CHECK-LABEL: @trunc_equality_nsw_zext(
+; CHECK-NEXT: [[XT:%.*]] = trunc nsw i32 [[X:%.*]] to i16
+; CHECK-NEXT: [[YE:%.*]] = zext i8 [[Y:%.*]] to i16
+; CHECK-NEXT: [[C:%.*]] = icmp ne i16 [[XT]], [[YE]]
+; CHECK-NEXT: ret i1 [[C]]
+;
+ %xt = trunc nsw i32 %x to i16
+ %ye = zext i8 %y to i16
+ %c = icmp ne i16 %xt, %ye
+ ret i1 %c
+}
+
+define i1 @trunc_equality_nsw_sext(i32 %x, i8 %y) {
+; CHECK-LABEL: @trunc_equality_nsw_sext(
+; CHECK-NEXT: [[XT:%.*]] = trunc nsw i32 [[X:%.*]] to i16
+; CHECK-NEXT: [[YE:%.*]] = sext i8 [[Y:%.*]] to i16
+; CHECK-NEXT: [[C:%.*]] = icmp ne i16 [[XT]], [[YE]]
+; CHECK-NEXT: ret i1 [[C]]
+;
+ %xt = trunc nsw i32 %x to i16
+ %ye = sext i8 %y to i16
+ %c = icmp ne i16 %xt, %ye
+ ret i1 %c
+}
+
+define i1 @trunc_equality_both_sext(i32 %x, i8 %y) {
+; CHECK-LABEL: @trunc_equality_both_sext(
+; CHECK-NEXT: [[XT:%.*]] = trunc nuw nsw i32 [[X:%.*]] to i16
+; CHECK-NEXT: [[YE:%.*]] = sext i8 [[Y:%.*]] to i16
+; CHECK-NEXT: [[C:%.*]] = icmp ne i16 [[XT]], [[YE]]
+; CHECK-NEXT: ret i1 [[C]]
+;
+ %xt = trunc nuw nsw i32 %x to i16
+ %ye = sext i8 %y to i16
+ %c = icmp ne i16 %xt, %ye
+ ret i1 %c
+}
diff --git a/llvm/test/Transforms/InstCombine/mul.ll b/llvm/test/Transforms/InstCombine/mul.ll
index 227ca4a6d5cf..4fb3c0b1ad49 100644
--- a/llvm/test/Transforms/InstCombine/mul.ll
+++ b/llvm/test/Transforms/InstCombine/mul.ll
@@ -2061,8 +2061,8 @@ define i32 @mul_sext_icmp_with_zero(i32 %x) {
define i32 @test_mul_sext_bool(i1 %x, i32 %y) {
; CHECK-LABEL: @test_mul_sext_bool(
-; CHECK-NEXT: [[Y_NEG:%.*]] = sub i32 0, [[Y:%.*]]
-; CHECK-NEXT: [[MUL:%.*]] = select i1 [[X:%.*]], i32 [[Y_NEG]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = sub i32 0, [[Y:%.*]]
+; CHECK-NEXT: [[MUL:%.*]] = select i1 [[X:%.*]], i32 [[TMP1]], i32 0
; CHECK-NEXT: ret i32 [[MUL]]
;
%sext = sext i1 %x to i32
@@ -2072,8 +2072,8 @@ define i32 @test_mul_sext_bool(i1 %x, i32 %y) {
define i32 @test_mul_sext_bool_nuw(i1 %x, i32 %y) {
; CHECK-LABEL: @test_mul_sext_bool_nuw(
-; CHECK-NEXT: [[Y_NEG:%.*]] = sub i32 0, [[Y:%.*]]
-; CHECK-NEXT: [[MUL:%.*]] = select i1 [[X:%.*]], i32 [[Y_NEG]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = sub i32 0, [[Y:%.*]]
+; CHECK-NEXT: [[MUL:%.*]] = select i1 [[X:%.*]], i32 [[TMP1]], i32 0
; CHECK-NEXT: ret i32 [[MUL]]
;
%sext = sext i1 %x to i32
@@ -2083,8 +2083,8 @@ define i32 @test_mul_sext_bool_nuw(i1 %x, i32 %y) {
define i32 @test_mul_sext_bool_nsw(i1 %x, i32 %y) {
; CHECK-LABEL: @test_mul_sext_bool_nsw(
-; CHECK-NEXT: [[Y_NEG:%.*]] = sub nsw i32 0, [[Y:%.*]]
-; CHECK-NEXT: [[MUL:%.*]] = select i1 [[X:%.*]], i32 [[Y_NEG]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = sub nsw i32 0, [[Y:%.*]]
+; CHECK-NEXT: [[MUL:%.*]] = select i1 [[X:%.*]], i32 [[TMP1]], i32 0
; CHECK-NEXT: ret i32 [[MUL]]
;
%sext = sext i1 %x to i32
@@ -2094,8 +2094,8 @@ define i32 @test_mul_sext_bool_nsw(i1 %x, i32 %y) {
define i32 @test_mul_sext_bool_nuw_nsw(i1 %x, i32 %y) {
; CHECK-LABEL: @test_mul_sext_bool_nuw_nsw(
-; CHECK-NEXT: [[Y_NEG:%.*]] = sub nsw i32 0, [[Y:%.*]]
-; CHECK-NEXT: [[MUL:%.*]] = select i1 [[X:%.*]], i32 [[Y_NEG]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = sub nsw i32 0, [[Y:%.*]]
+; CHECK-NEXT: [[MUL:%.*]] = select i1 [[X:%.*]], i32 [[TMP1]], i32 0
; CHECK-NEXT: ret i32 [[MUL]]
;
%sext = sext i1 %x to i32
@@ -2106,8 +2106,8 @@ define i32 @test_mul_sext_bool_nuw_nsw(i1 %x, i32 %y) {
define i32 @test_mul_sext_bool_commuted(i1 %x, i32 %y) {
; CHECK-LABEL: @test_mul_sext_bool_commuted(
; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[Y:%.*]], -2
-; CHECK-NEXT: [[YY_NEG1:%.*]] = add i32 [[TMP1]], 1
-; CHECK-NEXT: [[MUL:%.*]] = select i1 [[X:%.*]], i32 [[YY_NEG1]], i32 0
+; CHECK-NEXT: [[YY_NEG:%.*]] = add i32 [[TMP1]], 1
+; CHECK-NEXT: [[MUL:%.*]] = select i1 [[X:%.*]], i32 [[YY_NEG]], i32 0
; CHECK-NEXT: ret i32 [[MUL]]
;
%yy = xor i32 %y, 1
@@ -2139,3 +2139,63 @@ define i32 @test_mul_sext_multiuse(i1 %x, i32 %y) {
%mul = mul i32 %sext, %y
ret i32 %mul
}
+
+define i8 @mul_nsw_nonneg(i8 %x, i8 %y) {
+; CHECK-LABEL: @mul_nsw_nonneg(
+; CHECK-NEXT: [[X_NNEG:%.*]] = icmp sgt i8 [[X:%.*]], -1
+; CHECK-NEXT: call void @llvm.assume(i1 [[X_NNEG]])
+; CHECK-NEXT: [[Y_NNEG:%.*]] = icmp sgt i8 [[Y:%.*]], -1
+; CHECK-NEXT: call void @llvm.assume(i1 [[Y_NNEG]])
+; CHECK-NEXT: [[MUL:%.*]] = mul nuw nsw i8 [[X]], [[Y]]
+; CHECK-NEXT: ret i8 [[MUL]]
+;
+ %x.nneg = icmp sge i8 %x, 0
+ call void @llvm.assume(i1 %x.nneg)
+ %y.nneg = icmp sge i8 %y, 0
+ call void @llvm.assume(i1 %y.nneg)
+ %mul = mul nsw i8 %x, %y
+ ret i8 %mul
+}
+
+define i8 @mul_nsw_not_nonneg1(i8 %x, i8 %y) {
+; CHECK-LABEL: @mul_nsw_not_nonneg1(
+; CHECK-NEXT: [[Y_NNEG:%.*]] = icmp sgt i8 [[Y:%.*]], -1
+; CHECK-NEXT: call void @llvm.assume(i1 [[Y_NNEG]])
+; CHECK-NEXT: [[MUL:%.*]] = mul nsw i8 [[X:%.*]], [[Y]]
+; CHECK-NEXT: ret i8 [[MUL]]
+;
+ %y.nneg = icmp sge i8 %y, 0
+ call void @llvm.assume(i1 %y.nneg)
+ %mul = mul nsw i8 %x, %y
+ ret i8 %mul
+}
+
+define i8 @mul_nsw_not_nonneg2(i8 %x, i8 %y) {
+; CHECK-LABEL: @mul_nsw_not_nonneg2(
+; CHECK-NEXT: [[X_NNEG:%.*]] = icmp sgt i8 [[X:%.*]], -1
+; CHECK-NEXT: call void @llvm.assume(i1 [[X_NNEG]])
+; CHECK-NEXT: [[MUL:%.*]] = mul nsw i8 [[X]], [[Y:%.*]]
+; CHECK-NEXT: ret i8 [[MUL]]
+;
+ %x.nneg = icmp sge i8 %x, 0
+ call void @llvm.assume(i1 %x.nneg)
+ %mul = mul nsw i8 %x, %y
+ ret i8 %mul
+}
+
+define i8 @mul_not_nsw_nonneg(i8 %x, i8 %y) {
+; CHECK-LABEL: @mul_not_nsw_nonneg(
+; CHECK-NEXT: [[X_NNEG:%.*]] = icmp sgt i8 [[X:%.*]], -1
+; CHECK-NEXT: call void @llvm.assume(i1 [[X_NNEG]])
+; CHECK-NEXT: [[Y_NNEG:%.*]] = icmp sgt i8 [[Y:%.*]], -1
+; CHECK-NEXT: call void @llvm.assume(i1 [[Y_NNEG]])
+; CHECK-NEXT: [[MUL:%.*]] = mul i8 [[X]], [[Y]]
+; CHECK-NEXT: ret i8 [[MUL]]
+;
+ %x.nneg = icmp sge i8 %x, 0
+ call void @llvm.assume(i1 %x.nneg)
+ %y.nneg = icmp sge i8 %y, 0
+ call void @llvm.assume(i1 %y.nneg)
+ %mul = mul i8 %x, %y
+ ret i8 %mul
+}
diff --git a/llvm/test/Transforms/InstCombine/sub.ll b/llvm/test/Transforms/InstCombine/sub.ll
index 56dd9c6a8799..32ed4a787e92 100644
--- a/llvm/test/Transforms/InstCombine/sub.ll
+++ b/llvm/test/Transforms/InstCombine/sub.ll
@@ -1123,7 +1123,8 @@ define i64 @test58(ptr %foo, i64 %i, i64 %j) {
define i64 @test59(ptr %foo, i64 %i) {
; CHECK-LABEL: @test59(
-; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds [100 x [100 x i8]], ptr [[FOO:%.*]], i64 0, i64 42, i64 [[I:%.*]]
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[FOO:%.*]], i64 [[I:%.*]]
+; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i8, ptr [[TMP1]], i64 4200
; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[FOO]], i64 4200
; CHECK-NEXT: store ptr [[GEP1]], ptr @dummy_global1, align 8
; CHECK-NEXT: store ptr [[GEP2]], ptr @dummy_global2, align 8
@@ -1142,13 +1143,12 @@ define i64 @test59(ptr %foo, i64 %i) {
define i64 @test60(ptr %foo, i64 %i, i64 %j) {
; CHECK-LABEL: @test60(
-; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds [100 x [100 x i8]], ptr [[FOO:%.*]], i64 0, i64 [[J:%.*]], i64 [[I:%.*]]
-; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[FOO]], i64 4200
-; CHECK-NEXT: [[CAST1:%.*]] = ptrtoint ptr [[GEP1]] to i64
-; CHECK-NEXT: [[CAST2:%.*]] = ptrtoint ptr [[GEP2]] to i64
-; CHECK-NEXT: [[SUB:%.*]] = sub i64 [[CAST1]], [[CAST2]]
+; CHECK-NEXT: [[GEP1_IDX:%.*]] = mul nsw i64 [[J:%.*]], 100
+; CHECK-NEXT: [[GEP1_OFFS:%.*]] = add nsw i64 [[GEP1_IDX]], [[I:%.*]]
+; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i8, ptr [[FOO:%.*]], i64 [[GEP1_OFFS]]
+; CHECK-NEXT: [[GEPDIFF:%.*]] = add nsw i64 [[GEP1_OFFS]], -4200
; CHECK-NEXT: store ptr [[GEP1]], ptr @dummy_global1, align 8
-; CHECK-NEXT: ret i64 [[SUB]]
+; CHECK-NEXT: ret i64 [[GEPDIFF]]
;
; gep1 has a non-constant index and more than one use. Shouldn't duplicate the arithmetic.
%gep1 = getelementptr inbounds [100 x [100 x i8]], ptr %foo, i64 0, i64 %j, i64 %i
@@ -1162,13 +1162,12 @@ define i64 @test60(ptr %foo, i64 %i, i64 %j) {
define i64 @test61(ptr %foo, i64 %i, i64 %j) {
; CHECK-LABEL: @test61(
-; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i8, ptr [[FOO:%.*]], i64 4200
-; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds [100 x [100 x i8]], ptr [[FOO]], i64 0, i64 [[J:%.*]], i64 [[I:%.*]]
-; CHECK-NEXT: [[CAST1:%.*]] = ptrtoint ptr [[GEP1]] to i64
-; CHECK-NEXT: [[CAST2:%.*]] = ptrtoint ptr [[GEP2]] to i64
-; CHECK-NEXT: [[SUB:%.*]] = sub i64 [[CAST1]], [[CAST2]]
+; CHECK-NEXT: [[GEP2_IDX:%.*]] = mul nsw i64 [[J:%.*]], 100
+; CHECK-NEXT: [[GEP2_OFFS:%.*]] = add nsw i64 [[GEP2_IDX]], [[I:%.*]]
+; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[FOO:%.*]], i64 [[GEP2_OFFS]]
+; CHECK-NEXT: [[GEPDIFF:%.*]] = sub nsw i64 4200, [[GEP2_OFFS]]
; CHECK-NEXT: store ptr [[GEP2]], ptr @dummy_global2, align 8
-; CHECK-NEXT: ret i64 [[SUB]]
+; CHECK-NEXT: ret i64 [[GEPDIFF]]
;
; gep2 has a non-constant index and more than one use. Shouldn't duplicate the arithmetic.
%gep1 = getelementptr inbounds [100 x [100 x i8]], ptr %foo, i64 0, i64 42, i64 0
@@ -1186,11 +1185,8 @@ define i64 @test_sub_ptradd_multiuse(ptr %p, i64 %idx1, i64 %idx2) {
; CHECK-LABEL: @test_sub_ptradd_multiuse(
; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 [[IDX1:%.*]]
; CHECK-NEXT: call void @use.ptr(ptr [[P1]])
-; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[IDX2:%.*]]
-; CHECK-NEXT: [[P1_INT:%.*]] = ptrtoint ptr [[P1]] to i64
-; CHECK-NEXT: [[P2_INT:%.*]] = ptrtoint ptr [[P2]] to i64
-; CHECK-NEXT: [[SUB:%.*]] = sub i64 [[P1_INT]], [[P2_INT]]
-; CHECK-NEXT: ret i64 [[SUB]]
+; CHECK-NEXT: [[GEPDIFF:%.*]] = sub nsw i64 [[IDX1]], [[IDX2:%.*]]
+; CHECK-NEXT: ret i64 [[GEPDIFF]]
;
%p1 = getelementptr inbounds i8, ptr %p, i64 %idx1
call void @use.ptr(ptr %p1)
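(Editor's note: an illustrative sketch, not part of the patch.) The updated CHECK lines in this file reflect InstCombine rewriting the difference of two inbounds geps off the same base pointer as arithmetic on their indices, so the ptrtoint/sub sequence disappears. A minimal instance, matching @test_sub_ptradd_multiuse above:

;   %p1 = getelementptr inbounds i8, ptr %p, i64 %idx1
;   %p2 = getelementptr inbounds i8, ptr %p, i64 %idx2
;   %i1 = ptrtoint ptr %p1 to i64
;   %i2 = ptrtoint ptr %p2 to i64
;   %d  = sub i64 %i1, %i2
; folds to:
;   %d  = sub nsw i64 %idx1, %idx2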
diff --git a/llvm/test/Transforms/InstCombine/vector-reverse.ll b/llvm/test/Transforms/InstCombine/vector-reverse.ll
index 5e6672658f9a..a1a6ee949a13 100644
--- a/llvm/test/Transforms/InstCombine/vector-reverse.ll
+++ b/llvm/test/Transforms/InstCombine/vector-reverse.ll
@@ -8,11 +8,11 @@
define <vscale x 4 x i32> @binop_reverse(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
; CHECK-LABEL: @binop_reverse(
; CHECK-NEXT: [[ADD1:%.*]] = add nsw <vscale x 4 x i32> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT: [[ADD:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[ADD1]])
+; CHECK-NEXT: [[ADD:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[ADD1]])
; CHECK-NEXT: ret <vscale x 4 x i32> [[ADD]]
;
- %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
- %b.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
+ %a.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
+ %b.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
%add = add nsw <vscale x 4 x i32> %a.rev, %b.rev
ret <vscale x 4 x i32> %add
}
@@ -20,14 +20,14 @@ define <vscale x 4 x i32> @binop_reverse(<vscale x 4 x i32> %a, <vscale x 4 x i3
; %a.rev has multiple uses
define <vscale x 4 x i32> @binop_reverse_1(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
; CHECK-LABEL: @binop_reverse_1(
-; CHECK-NEXT: [[A_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[A:%.*]])
+; CHECK-NEXT: [[A_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[A:%.*]])
; CHECK-NEXT: call void @use_nxv4i32(<vscale x 4 x i32> [[A_REV]])
; CHECK-NEXT: [[ADD1:%.*]] = add <vscale x 4 x i32> [[A]], [[B:%.*]]
-; CHECK-NEXT: [[ADD:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[ADD1]])
+; CHECK-NEXT: [[ADD:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[ADD1]])
; CHECK-NEXT: ret <vscale x 4 x i32> [[ADD]]
;
- %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
- %b.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
+ %a.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
+ %b.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
call void @use_nxv4i32(<vscale x 4 x i32> %a.rev)
%add = add <vscale x 4 x i32> %a.rev, %b.rev
ret <vscale x 4 x i32> %add
@@ -36,14 +36,14 @@ define <vscale x 4 x i32> @binop_reverse_1(<vscale x 4 x i32> %a, <vscale x 4 x
; %b.rev has multiple uses
define <vscale x 4 x i32> @binop_reverse_2(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
; CHECK-LABEL: @binop_reverse_2(
-; CHECK-NEXT: [[B_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[B:%.*]])
+; CHECK-NEXT: [[B_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[B:%.*]])
; CHECK-NEXT: call void @use_nxv4i32(<vscale x 4 x i32> [[B_REV]])
; CHECK-NEXT: [[ADD1:%.*]] = add <vscale x 4 x i32> [[A:%.*]], [[B]]
-; CHECK-NEXT: [[ADD:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[ADD1]])
+; CHECK-NEXT: [[ADD:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[ADD1]])
; CHECK-NEXT: ret <vscale x 4 x i32> [[ADD]]
;
- %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
- %b.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
+ %a.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
+ %b.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
call void @use_nxv4i32(<vscale x 4 x i32> %b.rev)
%add = add <vscale x 4 x i32> %a.rev, %b.rev
ret <vscale x 4 x i32> %add
@@ -52,15 +52,15 @@ define <vscale x 4 x i32> @binop_reverse_2(<vscale x 4 x i32> %a, <vscale x 4 x
; %a.rev and %b.rev have multiple uses
define <vscale x 4 x i32> @binop_reverse_3(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
; CHECK-LABEL: @binop_reverse_3(
-; CHECK-NEXT: [[A_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[A:%.*]])
-; CHECK-NEXT: [[B_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[B:%.*]])
+; CHECK-NEXT: [[A_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[A:%.*]])
+; CHECK-NEXT: [[B_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[B:%.*]])
; CHECK-NEXT: call void @use_nxv4i32(<vscale x 4 x i32> [[A_REV]])
; CHECK-NEXT: call void @use_nxv4i32(<vscale x 4 x i32> [[B_REV]])
; CHECK-NEXT: [[ADD:%.*]] = add <vscale x 4 x i32> [[A_REV]], [[B_REV]]
; CHECK-NEXT: ret <vscale x 4 x i32> [[ADD]]
;
- %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
- %b.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
+ %a.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
+ %b.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
call void @use_nxv4i32(<vscale x 4 x i32> %a.rev)
call void @use_nxv4i32(<vscale x 4 x i32> %b.rev)
%add = add <vscale x 4 x i32> %a.rev, %b.rev
@@ -71,10 +71,10 @@ define <vscale x 4 x i32> @binop_reverse_3(<vscale x 4 x i32> %a, <vscale x 4 x
define <vscale x 4 x i32> @binop_reverse_4(<vscale x 4 x i32> %a) {
; CHECK-LABEL: @binop_reverse_4(
; CHECK-NEXT: [[MUL1:%.*]] = mul <vscale x 4 x i32> [[A:%.*]], [[A]]
-; CHECK-NEXT: [[MUL:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[MUL1]])
+; CHECK-NEXT: [[MUL:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[MUL1]])
; CHECK-NEXT: ret <vscale x 4 x i32> [[MUL]]
;
- %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
+ %a.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
%mul = mul <vscale x 4 x i32> %a.rev, %a.rev
ret <vscale x 4 x i32> %mul
}
@@ -82,12 +82,12 @@ define <vscale x 4 x i32> @binop_reverse_4(<vscale x 4 x i32> %a) {
; %a.rev used as both operands along with a third use
define <vscale x 4 x i32> @binop_reverse_5(<vscale x 4 x i32> %a) {
; CHECK-LABEL: @binop_reverse_5(
-; CHECK-NEXT: [[A_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[A:%.*]])
+; CHECK-NEXT: [[A_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[A:%.*]])
; CHECK-NEXT: call void @use_nxv4i32(<vscale x 4 x i32> [[A_REV]])
; CHECK-NEXT: [[MUL:%.*]] = mul <vscale x 4 x i32> [[A_REV]], [[A_REV]]
; CHECK-NEXT: ret <vscale x 4 x i32> [[MUL]]
;
- %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
+ %a.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
call void @use_nxv4i32(<vscale x 4 x i32> %a.rev)
%mul = mul <vscale x 4 x i32> %a.rev, %a.rev
ret <vscale x 4 x i32> %mul
@@ -98,10 +98,10 @@ define <vscale x 4 x i32> @binop_reverse_splat_RHS(<vscale x 4 x i32> %a, i32 %b
; CHECK-NEXT: [[B_INSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[B:%.*]], i64 0
; CHECK-NEXT: [[B_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[B_INSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: [[DIV1:%.*]] = udiv <vscale x 4 x i32> [[A:%.*]], [[B_SPLAT]]
-; CHECK-NEXT: [[DIV:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[DIV1]])
+; CHECK-NEXT: [[DIV:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[DIV1]])
; CHECK-NEXT: ret <vscale x 4 x i32> [[DIV]]
;
- %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
+ %a.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
%b.insert = insertelement <vscale x 4 x i32> poison, i32 %b, i32 0
%b.splat = shufflevector <vscale x 4 x i32> %b.insert, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
%div = udiv <vscale x 4 x i32> %a.rev, %b.splat
@@ -111,14 +111,14 @@ define <vscale x 4 x i32> @binop_reverse_splat_RHS(<vscale x 4 x i32> %a, i32 %b
; %a.rev has multiple uses
define <vscale x 4 x i32> @binop_reverse_splat_RHS_1(<vscale x 4 x i32> %a, i32 %b) {
; CHECK-LABEL: @binop_reverse_splat_RHS_1(
-; CHECK-NEXT: [[A_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[A:%.*]])
+; CHECK-NEXT: [[A_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[A:%.*]])
; CHECK-NEXT: [[B_INSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[B:%.*]], i64 0
; CHECK-NEXT: [[B_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[B_INSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: call void @use_nxv4i32(<vscale x 4 x i32> [[A_REV]])
; CHECK-NEXT: [[DIV:%.*]] = udiv <vscale x 4 x i32> [[A_REV]], [[B_SPLAT]]
; CHECK-NEXT: ret <vscale x 4 x i32> [[DIV]]
;
- %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
+ %a.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
%b.insert = insertelement <vscale x 4 x i32> poison, i32 %b, i32 0
%b.splat = shufflevector <vscale x 4 x i32> %b.insert, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
call void @use_nxv4i32(<vscale x 4 x i32> %a.rev)
@@ -131,10 +131,10 @@ define <vscale x 4 x i32> @binop_reverse_splat_LHS(<vscale x 4 x i32> %a, i32 %b
; CHECK-NEXT: [[B_INSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[B:%.*]], i64 0
; CHECK-NEXT: [[B_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[B_INSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: [[DIV1:%.*]] = udiv <vscale x 4 x i32> [[B_SPLAT]], [[A:%.*]]
-; CHECK-NEXT: [[DIV:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[DIV1]])
+; CHECK-NEXT: [[DIV:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[DIV1]])
; CHECK-NEXT: ret <vscale x 4 x i32> [[DIV]]
;
- %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
+ %a.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
%b.insert = insertelement <vscale x 4 x i32> poison, i32 %b, i32 0
%b.splat = shufflevector <vscale x 4 x i32> %b.insert, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
%div = udiv <vscale x 4 x i32> %b.splat, %a.rev
@@ -144,14 +144,14 @@ define <vscale x 4 x i32> @binop_reverse_splat_LHS(<vscale x 4 x i32> %a, i32 %b
; %a.rev has multiple uses
define <vscale x 4 x i32> @binop_reverse_splat_LHS_1(<vscale x 4 x i32> %a, i32 %b) {
; CHECK-LABEL: @binop_reverse_splat_LHS_1(
-; CHECK-NEXT: [[A_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[A:%.*]])
+; CHECK-NEXT: [[A_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[A:%.*]])
; CHECK-NEXT: [[B_INSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[B:%.*]], i64 0
; CHECK-NEXT: [[B_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[B_INSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: call void @use_nxv4i32(<vscale x 4 x i32> [[A_REV]])
; CHECK-NEXT: [[DIV:%.*]] = udiv <vscale x 4 x i32> [[B_SPLAT]], [[A_REV]]
; CHECK-NEXT: ret <vscale x 4 x i32> [[DIV]]
;
- %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
+ %a.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
%b.insert = insertelement <vscale x 4 x i32> poison, i32 %b, i32 0
%b.splat = shufflevector <vscale x 4 x i32> %b.insert, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
call void @use_nxv4i32(<vscale x 4 x i32> %a.rev)
@@ -161,11 +161,11 @@ define <vscale x 4 x i32> @binop_reverse_splat_LHS_1(<vscale x 4 x i32> %a, i32
define <vscale x 4 x float> @unop_reverse(<vscale x 4 x float> %a) {
; CHECK-LABEL: @unop_reverse(
-; CHECK-NEXT: [[A_REV:%.*]] = tail call <vscale x 4 x float> @llvm.experimental.vector.reverse.nxv4f32(<vscale x 4 x float> [[A:%.*]])
+; CHECK-NEXT: [[A_REV:%.*]] = tail call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[A:%.*]])
; CHECK-NEXT: [[NEG:%.*]] = fneg fast <vscale x 4 x float> [[A_REV]]
; CHECK-NEXT: ret <vscale x 4 x float> [[NEG]]
;
- %a.rev = tail call <vscale x 4 x float> @llvm.experimental.vector.reverse.nxv4f32(<vscale x 4 x float> %a)
+ %a.rev = tail call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %a)
%neg = fneg fast <vscale x 4 x float> %a.rev
ret <vscale x 4 x float> %neg
}
@@ -173,12 +173,12 @@ define <vscale x 4 x float> @unop_reverse(<vscale x 4 x float> %a) {
; %a.rev has multiple uses
define <vscale x 4 x float> @unop_reverse_1(<vscale x 4 x float> %a) {
; CHECK-LABEL: @unop_reverse_1(
-; CHECK-NEXT: [[A_REV:%.*]] = tail call <vscale x 4 x float> @llvm.experimental.vector.reverse.nxv4f32(<vscale x 4 x float> [[A:%.*]])
+; CHECK-NEXT: [[A_REV:%.*]] = tail call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[A:%.*]])
; CHECK-NEXT: call void @use_nxv4f32(<vscale x 4 x float> [[A_REV]])
; CHECK-NEXT: [[NEG:%.*]] = fneg fast <vscale x 4 x float> [[A_REV]]
; CHECK-NEXT: ret <vscale x 4 x float> [[NEG]]
;
- %a.rev = tail call <vscale x 4 x float> @llvm.experimental.vector.reverse.nxv4f32(<vscale x 4 x float> %a)
+ %a.rev = tail call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %a)
call void @use_nxv4f32(<vscale x 4 x float> %a.rev)
%neg = fneg fast <vscale x 4 x float> %a.rev
ret <vscale x 4 x float> %neg
@@ -187,11 +187,11 @@ define <vscale x 4 x float> @unop_reverse_1(<vscale x 4 x float> %a) {
define <vscale x 4 x i1> @icmp_reverse(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
; CHECK-LABEL: @icmp_reverse(
; CHECK-NEXT: [[CMP1:%.*]] = icmp eq <vscale x 4 x i32> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT: [[CMP:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> [[CMP1]])
+; CHECK-NEXT: [[CMP:%.*]] = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> [[CMP1]])
; CHECK-NEXT: ret <vscale x 4 x i1> [[CMP]]
;
- %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
- %b.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
+ %a.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
+ %b.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
%cmp = icmp eq <vscale x 4 x i32> %a.rev, %b.rev
ret <vscale x 4 x i1> %cmp
}
@@ -199,14 +199,14 @@ define <vscale x 4 x i1> @icmp_reverse(<vscale x 4 x i32> %a, <vscale x 4 x i32>
; %a.rev has multiple uses
define <vscale x 4 x i1> @icmp_reverse_1(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
; CHECK-LABEL: @icmp_reverse_1(
-; CHECK-NEXT: [[A_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[A:%.*]])
+; CHECK-NEXT: [[A_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[A:%.*]])
; CHECK-NEXT: call void @use_nxv4i32(<vscale x 4 x i32> [[A_REV]])
; CHECK-NEXT: [[CMP1:%.*]] = icmp eq <vscale x 4 x i32> [[A]], [[B:%.*]]
-; CHECK-NEXT: [[CMP:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> [[CMP1]])
+; CHECK-NEXT: [[CMP:%.*]] = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> [[CMP1]])
; CHECK-NEXT: ret <vscale x 4 x i1> [[CMP]]
;
- %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
- %b.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
+ %a.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
+ %b.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
call void @use_nxv4i32(<vscale x 4 x i32> %a.rev)
%cmp = icmp eq <vscale x 4 x i32> %a.rev, %b.rev
ret <vscale x 4 x i1> %cmp
@@ -215,14 +215,14 @@ define <vscale x 4 x i1> @icmp_reverse_1(<vscale x 4 x i32> %a, <vscale x 4 x i3
; %b.rev has multiple uses
define <vscale x 4 x i1> @icmp_reverse_2(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
; CHECK-LABEL: @icmp_reverse_2(
-; CHECK-NEXT: [[B_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[B:%.*]])
+; CHECK-NEXT: [[B_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[B:%.*]])
; CHECK-NEXT: call void @use_nxv4i32(<vscale x 4 x i32> [[B_REV]])
; CHECK-NEXT: [[CMP1:%.*]] = icmp eq <vscale x 4 x i32> [[A:%.*]], [[B]]
-; CHECK-NEXT: [[CMP:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> [[CMP1]])
+; CHECK-NEXT: [[CMP:%.*]] = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> [[CMP1]])
; CHECK-NEXT: ret <vscale x 4 x i1> [[CMP]]
;
- %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
- %b.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
+ %a.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
+ %b.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
call void @use_nxv4i32(<vscale x 4 x i32> %b.rev)
%cmp = icmp eq <vscale x 4 x i32> %a.rev, %b.rev
ret <vscale x 4 x i1> %cmp
@@ -231,15 +231,15 @@ define <vscale x 4 x i1> @icmp_reverse_2(<vscale x 4 x i32> %a, <vscale x 4 x i3
; %a.rev and %b.rev have multiple uses
define <vscale x 4 x i1> @icmp_reverse_3(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
; CHECK-LABEL: @icmp_reverse_3(
-; CHECK-NEXT: [[A_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[A:%.*]])
-; CHECK-NEXT: [[B_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[B:%.*]])
+; CHECK-NEXT: [[A_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[A:%.*]])
+; CHECK-NEXT: [[B_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[B:%.*]])
; CHECK-NEXT: call void @use_nxv4i32(<vscale x 4 x i32> [[A_REV]])
; CHECK-NEXT: call void @use_nxv4i32(<vscale x 4 x i32> [[B_REV]])
; CHECK-NEXT: [[CMP:%.*]] = icmp eq <vscale x 4 x i32> [[A_REV]], [[B_REV]]
; CHECK-NEXT: ret <vscale x 4 x i1> [[CMP]]
;
- %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
- %b.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
+ %a.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
+ %b.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
call void @use_nxv4i32(<vscale x 4 x i32> %a.rev)
call void @use_nxv4i32(<vscale x 4 x i32> %b.rev)
%cmp = icmp eq <vscale x 4 x i32> %a.rev, %b.rev
@@ -251,10 +251,10 @@ define <vscale x 4 x i1> @icmp_reverse_splat_RHS(<vscale x 4 x i32> %a, i32 %b)
; CHECK-NEXT: [[B_INSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[B:%.*]], i64 0
; CHECK-NEXT: [[B_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[B_INSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: [[CMP1:%.*]] = icmp slt <vscale x 4 x i32> [[B_SPLAT]], [[A:%.*]]
-; CHECK-NEXT: [[CMP:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> [[CMP1]])
+; CHECK-NEXT: [[CMP:%.*]] = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> [[CMP1]])
; CHECK-NEXT: ret <vscale x 4 x i1> [[CMP]]
;
- %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
+ %a.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
%b.insert = insertelement <vscale x 4 x i32> poison, i32 %b, i32 0
%b.splat = shufflevector <vscale x 4 x i32> %b.insert, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
%cmp = icmp sgt <vscale x 4 x i32> %a.rev, %b.splat
@@ -264,14 +264,14 @@ define <vscale x 4 x i1> @icmp_reverse_splat_RHS(<vscale x 4 x i32> %a, i32 %b)
; %a.rev has multiple uses
define <vscale x 4 x i1> @icmp_reverse_splat_RHS_1(<vscale x 4 x i32> %a, i32 %b) {
; CHECK-LABEL: @icmp_reverse_splat_RHS_1(
-; CHECK-NEXT: [[A_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[A:%.*]])
+; CHECK-NEXT: [[A_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[A:%.*]])
; CHECK-NEXT: [[B_INSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[B:%.*]], i64 0
; CHECK-NEXT: [[B_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[B_INSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: call void @use_nxv4i32(<vscale x 4 x i32> [[A_REV]])
; CHECK-NEXT: [[CMP:%.*]] = icmp sgt <vscale x 4 x i32> [[A_REV]], [[B_SPLAT]]
; CHECK-NEXT: ret <vscale x 4 x i1> [[CMP]]
;
- %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
+ %a.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
%b.insert = insertelement <vscale x 4 x i32> poison, i32 %b, i32 0
%b.splat = shufflevector <vscale x 4 x i32> %b.insert, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
call void @use_nxv4i32(<vscale x 4 x i32> %a.rev)
@@ -284,10 +284,10 @@ define <vscale x 4 x i1> @icmp_reverse_splat_LHS(<vscale x 4 x i32> %a, i32 %b)
; CHECK-NEXT: [[B_INSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[B:%.*]], i64 0
; CHECK-NEXT: [[B_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[B_INSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: [[CMP1:%.*]] = icmp ult <vscale x 4 x i32> [[B_SPLAT]], [[A:%.*]]
-; CHECK-NEXT: [[CMP:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> [[CMP1]])
+; CHECK-NEXT: [[CMP:%.*]] = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> [[CMP1]])
; CHECK-NEXT: ret <vscale x 4 x i1> [[CMP]]
;
- %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
+ %a.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
%b.insert = insertelement <vscale x 4 x i32> poison, i32 %b, i32 0
%b.splat = shufflevector <vscale x 4 x i32> %b.insert, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
%cmp = icmp ult <vscale x 4 x i32> %b.splat, %a.rev
@@ -297,14 +297,14 @@ define <vscale x 4 x i1> @icmp_reverse_splat_LHS(<vscale x 4 x i32> %a, i32 %b)
; %a.rev has multiple uses
define <vscale x 4 x i1> @icmp_reverse_splat_LHS_1(<vscale x 4 x i32> %a, i32 %b) {
; CHECK-LABEL: @icmp_reverse_splat_LHS_1(
-; CHECK-NEXT: [[A_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[A:%.*]])
+; CHECK-NEXT: [[A_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[A:%.*]])
; CHECK-NEXT: [[B_INSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[B:%.*]], i64 0
; CHECK-NEXT: [[B_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[B_INSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: call void @use_nxv4i32(<vscale x 4 x i32> [[A_REV]])
; CHECK-NEXT: [[CMP:%.*]] = icmp ult <vscale x 4 x i32> [[B_SPLAT]], [[A_REV]]
; CHECK-NEXT: ret <vscale x 4 x i1> [[CMP]]
;
- %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
+ %a.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
%b.insert = insertelement <vscale x 4 x i32> poison, i32 %b, i32 0
%b.splat = shufflevector <vscale x 4 x i32> %b.insert, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
call void @use_nxv4i32(<vscale x 4 x i32> %a.rev)
@@ -315,12 +315,12 @@ define <vscale x 4 x i1> @icmp_reverse_splat_LHS_1(<vscale x 4 x i32> %a, i32 %b
define <vscale x 4 x i32> @select_reverse(<vscale x 4 x i1> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c) {
; CHECK-LABEL: @select_reverse(
; CHECK-NEXT: [[SELECT1:%.*]] = select <vscale x 4 x i1> [[A:%.*]], <vscale x 4 x i32> [[B:%.*]], <vscale x 4 x i32> [[C:%.*]]
-; CHECK-NEXT: [[SELECT:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[SELECT1]])
+; CHECK-NEXT: [[SELECT:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[SELECT1]])
; CHECK-NEXT: ret <vscale x 4 x i32> [[SELECT]]
;
- %a.rev = tail call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
- %b.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
- %c.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %c)
+ %a.rev = tail call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
+ %b.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
+ %c.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %c)
%select = select <vscale x 4 x i1> %a.rev, <vscale x 4 x i32> %b.rev, <vscale x 4 x i32> %c.rev
ret <vscale x 4 x i32> %select
}
@@ -328,15 +328,15 @@ define <vscale x 4 x i32> @select_reverse(<vscale x 4 x i1> %a, <vscale x 4 x i3
; %a.rev has multiple uses
define <vscale x 4 x i32> @select_reverse_1(<vscale x 4 x i1> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c) {
; CHECK-LABEL: @select_reverse_1(
-; CHECK-NEXT: [[A_REV:%.*]] = tail call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> [[A:%.*]])
+; CHECK-NEXT: [[A_REV:%.*]] = tail call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> [[A:%.*]])
; CHECK-NEXT: call void @use_nxv4i1(<vscale x 4 x i1> [[A_REV]])
; CHECK-NEXT: [[SELECT1:%.*]] = select <vscale x 4 x i1> [[A]], <vscale x 4 x i32> [[B:%.*]], <vscale x 4 x i32> [[C:%.*]]
-; CHECK-NEXT: [[SELECT:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[SELECT1]])
+; CHECK-NEXT: [[SELECT:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[SELECT1]])
; CHECK-NEXT: ret <vscale x 4 x i32> [[SELECT]]
;
- %a.rev = tail call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
- %b.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
- %c.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %c)
+ %a.rev = tail call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
+ %b.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
+ %c.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %c)
call void @use_nxv4i1(<vscale x 4 x i1> %a.rev)
%select = select <vscale x 4 x i1> %a.rev, <vscale x 4 x i32> %b.rev, <vscale x 4 x i32> %c.rev
ret <vscale x 4 x i32> %select
@@ -345,15 +345,15 @@ define <vscale x 4 x i32> @select_reverse_1(<vscale x 4 x i1> %a, <vscale x 4 x
; %b.rev has multiple uses
define <vscale x 4 x i32> @select_reverse_2(<vscale x 4 x i1> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c) {
; CHECK-LABEL: @select_reverse_2(
-; CHECK-NEXT: [[B_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[B:%.*]])
+; CHECK-NEXT: [[B_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[B:%.*]])
; CHECK-NEXT: call void @use_nxv4i32(<vscale x 4 x i32> [[B_REV]])
; CHECK-NEXT: [[SELECT1:%.*]] = select <vscale x 4 x i1> [[A:%.*]], <vscale x 4 x i32> [[B]], <vscale x 4 x i32> [[C:%.*]]
-; CHECK-NEXT: [[SELECT:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[SELECT1]])
+; CHECK-NEXT: [[SELECT:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[SELECT1]])
; CHECK-NEXT: ret <vscale x 4 x i32> [[SELECT]]
;
- %a.rev = tail call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
- %b.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
- %c.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %c)
+ %a.rev = tail call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
+ %b.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
+ %c.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %c)
call void @use_nxv4i32(<vscale x 4 x i32> %b.rev)
%select = select <vscale x 4 x i1> %a.rev, <vscale x 4 x i32> %b.rev, <vscale x 4 x i32> %c.rev
ret <vscale x 4 x i32> %select
@@ -362,15 +362,15 @@ define <vscale x 4 x i32> @select_reverse_2(<vscale x 4 x i1> %a, <vscale x 4 x
; %c.rev has multiple uses
define <vscale x 4 x i32> @select_reverse_3(<vscale x 4 x i1> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c) {
; CHECK-LABEL: @select_reverse_3(
-; CHECK-NEXT: [[C_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[C:%.*]])
+; CHECK-NEXT: [[C_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[C:%.*]])
; CHECK-NEXT: call void @use_nxv4i32(<vscale x 4 x i32> [[C_REV]])
; CHECK-NEXT: [[SELECT1:%.*]] = select <vscale x 4 x i1> [[A:%.*]], <vscale x 4 x i32> [[B:%.*]], <vscale x 4 x i32> [[C]]
-; CHECK-NEXT: [[SELECT:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[SELECT1]])
+; CHECK-NEXT: [[SELECT:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[SELECT1]])
; CHECK-NEXT: ret <vscale x 4 x i32> [[SELECT]]
;
- %a.rev = tail call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
- %b.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
- %c.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %c)
+ %a.rev = tail call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
+ %b.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
+ %c.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %c)
call void @use_nxv4i32(<vscale x 4 x i32> %c.rev)
%select = select <vscale x 4 x i1> %a.rev, <vscale x 4 x i32> %b.rev, <vscale x 4 x i32> %c.rev
ret <vscale x 4 x i32> %select
@@ -379,17 +379,17 @@ define <vscale x 4 x i32> @select_reverse_3(<vscale x 4 x i1> %a, <vscale x 4 x
; %a.rev and %b.rev have multiple uses
define <vscale x 4 x i32> @select_reverse_4(<vscale x 4 x i1> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c) {
; CHECK-LABEL: @select_reverse_4(
-; CHECK-NEXT: [[A_REV:%.*]] = tail call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> [[A:%.*]])
-; CHECK-NEXT: [[B_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[B:%.*]])
+; CHECK-NEXT: [[A_REV:%.*]] = tail call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> [[A:%.*]])
+; CHECK-NEXT: [[B_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[B:%.*]])
; CHECK-NEXT: call void @use_nxv4i1(<vscale x 4 x i1> [[A_REV]])
; CHECK-NEXT: call void @use_nxv4i32(<vscale x 4 x i32> [[B_REV]])
; CHECK-NEXT: [[SELECT1:%.*]] = select <vscale x 4 x i1> [[A]], <vscale x 4 x i32> [[B]], <vscale x 4 x i32> [[C:%.*]]
-; CHECK-NEXT: [[SELECT:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[SELECT1]])
+; CHECK-NEXT: [[SELECT:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[SELECT1]])
; CHECK-NEXT: ret <vscale x 4 x i32> [[SELECT]]
;
- %a.rev = tail call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
- %b.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
- %c.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %c)
+ %a.rev = tail call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
+ %b.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
+ %c.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %c)
call void @use_nxv4i1(<vscale x 4 x i1> %a.rev)
call void @use_nxv4i32(<vscale x 4 x i32> %b.rev)
%select = select <vscale x 4 x i1> %a.rev, <vscale x 4 x i32> %b.rev, <vscale x 4 x i32> %c.rev
@@ -399,17 +399,17 @@ define <vscale x 4 x i32> @select_reverse_4(<vscale x 4 x i1> %a, <vscale x 4 x
; %a.rev and %c.rev have multiple uses
define <vscale x 4 x i32> @select_reverse_5(<vscale x 4 x i1> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c) {
; CHECK-LABEL: @select_reverse_5(
-; CHECK-NEXT: [[A_REV:%.*]] = tail call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> [[A:%.*]])
-; CHECK-NEXT: [[C_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[C:%.*]])
+; CHECK-NEXT: [[A_REV:%.*]] = tail call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> [[A:%.*]])
+; CHECK-NEXT: [[C_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[C:%.*]])
; CHECK-NEXT: call void @use_nxv4i1(<vscale x 4 x i1> [[A_REV]])
; CHECK-NEXT: call void @use_nxv4i32(<vscale x 4 x i32> [[C_REV]])
; CHECK-NEXT: [[SELECT1:%.*]] = select <vscale x 4 x i1> [[A]], <vscale x 4 x i32> [[B:%.*]], <vscale x 4 x i32> [[C]]
-; CHECK-NEXT: [[SELECT:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[SELECT1]])
+; CHECK-NEXT: [[SELECT:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[SELECT1]])
; CHECK-NEXT: ret <vscale x 4 x i32> [[SELECT]]
;
- %a.rev = tail call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
- %b.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
- %c.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %c)
+ %a.rev = tail call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
+ %b.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
+ %c.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %c)
call void @use_nxv4i1(<vscale x 4 x i1> %a.rev)
call void @use_nxv4i32(<vscale x 4 x i32> %c.rev)
%select = select <vscale x 4 x i1> %a.rev, <vscale x 4 x i32> %b.rev, <vscale x 4 x i32> %c.rev
@@ -419,17 +419,17 @@ define <vscale x 4 x i32> @select_reverse_5(<vscale x 4 x i1> %a, <vscale x 4 x
; %b.rev and %c.rev have multiple uses
define <vscale x 4 x i32> @select_reverse_6(<vscale x 4 x i1> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c) {
; CHECK-LABEL: @select_reverse_6(
-; CHECK-NEXT: [[B_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[B:%.*]])
-; CHECK-NEXT: [[C_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[C:%.*]])
+; CHECK-NEXT: [[B_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[B:%.*]])
+; CHECK-NEXT: [[C_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[C:%.*]])
; CHECK-NEXT: call void @use_nxv4i32(<vscale x 4 x i32> [[B_REV]])
; CHECK-NEXT: call void @use_nxv4i32(<vscale x 4 x i32> [[C_REV]])
; CHECK-NEXT: [[SELECT1:%.*]] = select <vscale x 4 x i1> [[A:%.*]], <vscale x 4 x i32> [[B]], <vscale x 4 x i32> [[C]]
-; CHECK-NEXT: [[SELECT:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[SELECT1]])
+; CHECK-NEXT: [[SELECT:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[SELECT1]])
; CHECK-NEXT: ret <vscale x 4 x i32> [[SELECT]]
;
- %a.rev = tail call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
- %b.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
- %c.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %c)
+ %a.rev = tail call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
+ %b.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
+ %c.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %c)
call void @use_nxv4i32(<vscale x 4 x i32> %b.rev)
call void @use_nxv4i32(<vscale x 4 x i32> %c.rev)
%select = select <vscale x 4 x i1> %a.rev, <vscale x 4 x i32> %b.rev, <vscale x 4 x i32> %c.rev
@@ -439,18 +439,18 @@ define <vscale x 4 x i32> @select_reverse_6(<vscale x 4 x i1> %a, <vscale x 4 x
; %a.rev, %b.rev and %c.rev have multiple uses
define <vscale x 4 x i32> @select_reverse_7(<vscale x 4 x i1> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c) {
; CHECK-LABEL: @select_reverse_7(
-; CHECK-NEXT: [[A_REV:%.*]] = tail call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> [[A:%.*]])
-; CHECK-NEXT: [[B_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[B:%.*]])
-; CHECK-NEXT: [[C_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[C:%.*]])
+; CHECK-NEXT: [[A_REV:%.*]] = tail call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> [[A:%.*]])
+; CHECK-NEXT: [[B_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[B:%.*]])
+; CHECK-NEXT: [[C_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[C:%.*]])
; CHECK-NEXT: call void @use_nxv4i1(<vscale x 4 x i1> [[A_REV]])
; CHECK-NEXT: call void @use_nxv4i32(<vscale x 4 x i32> [[B_REV]])
; CHECK-NEXT: call void @use_nxv4i32(<vscale x 4 x i32> [[C_REV]])
; CHECK-NEXT: [[SELECT:%.*]] = select <vscale x 4 x i1> [[A_REV]], <vscale x 4 x i32> [[B_REV]], <vscale x 4 x i32> [[C_REV]]
; CHECK-NEXT: ret <vscale x 4 x i32> [[SELECT]]
;
- %a.rev = tail call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
- %b.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
- %c.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %c)
+ %a.rev = tail call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
+ %b.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
+ %c.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %c)
call void @use_nxv4i1(<vscale x 4 x i1> %a.rev)
call void @use_nxv4i32(<vscale x 4 x i32> %b.rev)
call void @use_nxv4i32(<vscale x 4 x i32> %c.rev)
@@ -463,11 +463,11 @@ define <vscale x 4 x i32> @select_reverse_splat_false(<vscale x 4 x i1> %a, <vsc
; CHECK-NEXT: [[C_INSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[C:%.*]], i64 0
; CHECK-NEXT: [[C_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[C_INSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: [[SELECT1:%.*]] = select <vscale x 4 x i1> [[A:%.*]], <vscale x 4 x i32> [[B:%.*]], <vscale x 4 x i32> [[C_SPLAT]]
-; CHECK-NEXT: [[SELECT:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[SELECT1]])
+; CHECK-NEXT: [[SELECT:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[SELECT1]])
; CHECK-NEXT: ret <vscale x 4 x i32> [[SELECT]]
;
- %a.rev = tail call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
- %b.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
+ %a.rev = tail call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
+ %b.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
%c.insert = insertelement <vscale x 4 x i32> poison, i32 %c, i32 0
%c.splat = shufflevector <vscale x 4 x i32> %c.insert, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
%select = select <vscale x 4 x i1> %a.rev, <vscale x 4 x i32> %b.rev, <vscale x 4 x i32> %c.splat
@@ -477,16 +477,16 @@ define <vscale x 4 x i32> @select_reverse_splat_false(<vscale x 4 x i1> %a, <vsc
; %a.rev has multiple uses
define <vscale x 4 x i32> @select_reverse_splat_false_1(<vscale x 4 x i1> %a, <vscale x 4 x i32> %b, i32 %c) {
; CHECK-LABEL: @select_reverse_splat_false_1(
-; CHECK-NEXT: [[A_REV:%.*]] = tail call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> [[A:%.*]])
+; CHECK-NEXT: [[A_REV:%.*]] = tail call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> [[A:%.*]])
; CHECK-NEXT: [[C_INSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[C:%.*]], i64 0
; CHECK-NEXT: [[C_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[C_INSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: call void @use_nxv4i1(<vscale x 4 x i1> [[A_REV]])
; CHECK-NEXT: [[SELECT1:%.*]] = select <vscale x 4 x i1> [[A]], <vscale x 4 x i32> [[B:%.*]], <vscale x 4 x i32> [[C_SPLAT]]
-; CHECK-NEXT: [[SELECT:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[SELECT1]])
+; CHECK-NEXT: [[SELECT:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[SELECT1]])
; CHECK-NEXT: ret <vscale x 4 x i32> [[SELECT]]
;
- %a.rev = tail call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
- %b.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
+ %a.rev = tail call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
+ %b.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
%c.insert = insertelement <vscale x 4 x i32> poison, i32 %c, i32 0
%c.splat = shufflevector <vscale x 4 x i32> %c.insert, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
call void @use_nxv4i1(<vscale x 4 x i1> %a.rev)
@@ -497,16 +497,16 @@ define <vscale x 4 x i32> @select_reverse_splat_false_1(<vscale x 4 x i1> %a, <v
; %b.rev has multiple uses
define <vscale x 4 x i32> @select_reverse_splat_false_2(<vscale x 4 x i1> %a, <vscale x 4 x i32> %b, i32 %c) {
; CHECK-LABEL: @select_reverse_splat_false_2(
-; CHECK-NEXT: [[B_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[B:%.*]])
+; CHECK-NEXT: [[B_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[B:%.*]])
; CHECK-NEXT: [[C_INSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[C:%.*]], i64 0
; CHECK-NEXT: [[C_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[C_INSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: call void @use_nxv4i32(<vscale x 4 x i32> [[B_REV]])
; CHECK-NEXT: [[SELECT1:%.*]] = select <vscale x 4 x i1> [[A:%.*]], <vscale x 4 x i32> [[B]], <vscale x 4 x i32> [[C_SPLAT]]
-; CHECK-NEXT: [[SELECT:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[SELECT1]])
+; CHECK-NEXT: [[SELECT:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[SELECT1]])
; CHECK-NEXT: ret <vscale x 4 x i32> [[SELECT]]
;
- %a.rev = tail call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
- %b.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
+ %a.rev = tail call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
+ %b.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
%c.insert = insertelement <vscale x 4 x i32> poison, i32 %c, i32 0
%c.splat = shufflevector <vscale x 4 x i32> %c.insert, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
call void @use_nxv4i32(<vscale x 4 x i32> %b.rev)
@@ -517,8 +517,8 @@ define <vscale x 4 x i32> @select_reverse_splat_false_2(<vscale x 4 x i1> %a, <v
; %a.rev and %b.rev have multiple uses
define <vscale x 4 x i32> @select_reverse_splat_false_3(<vscale x 4 x i1> %a, <vscale x 4 x i32> %b, i32 %c) {
; CHECK-LABEL: @select_reverse_splat_false_3(
-; CHECK-NEXT: [[A_REV:%.*]] = tail call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> [[A:%.*]])
-; CHECK-NEXT: [[B_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[B:%.*]])
+; CHECK-NEXT: [[A_REV:%.*]] = tail call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> [[A:%.*]])
+; CHECK-NEXT: [[B_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[B:%.*]])
; CHECK-NEXT: [[C_INSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[C:%.*]], i64 0
; CHECK-NEXT: [[C_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[C_INSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: call void @use_nxv4i1(<vscale x 4 x i1> [[A_REV]])
@@ -526,8 +526,8 @@ define <vscale x 4 x i32> @select_reverse_splat_false_3(<vscale x 4 x i1> %a, <v
; CHECK-NEXT: [[SELECT:%.*]] = select <vscale x 4 x i1> [[A_REV]], <vscale x 4 x i32> [[B_REV]], <vscale x 4 x i32> [[C_SPLAT]]
; CHECK-NEXT: ret <vscale x 4 x i32> [[SELECT]]
;
- %a.rev = tail call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
- %b.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
+ %a.rev = tail call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
+ %b.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
%c.insert = insertelement <vscale x 4 x i32> poison, i32 %c, i32 0
%c.splat = shufflevector <vscale x 4 x i32> %c.insert, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
call void @use_nxv4i1(<vscale x 4 x i1> %a.rev)
@@ -541,11 +541,11 @@ define <vscale x 4 x i32> @select_reverse_splat_true(<vscale x 4 x i1> %a, <vsca
; CHECK-NEXT: [[C_INSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[C:%.*]], i64 0
; CHECK-NEXT: [[C_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[C_INSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: [[SELECT1:%.*]] = select <vscale x 4 x i1> [[A:%.*]], <vscale x 4 x i32> [[C_SPLAT]], <vscale x 4 x i32> [[B:%.*]]
-; CHECK-NEXT: [[SELECT:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[SELECT1]])
+; CHECK-NEXT: [[SELECT:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[SELECT1]])
; CHECK-NEXT: ret <vscale x 4 x i32> [[SELECT]]
;
- %a.rev = tail call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
- %b.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
+ %a.rev = tail call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
+ %b.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
%c.insert = insertelement <vscale x 4 x i32> poison, i32 %c, i32 0
%c.splat = shufflevector <vscale x 4 x i32> %c.insert, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
%select = select <vscale x 4 x i1> %a.rev, <vscale x 4 x i32> %c.splat, <vscale x 4 x i32> %b.rev
@@ -555,16 +555,16 @@ define <vscale x 4 x i32> @select_reverse_splat_true(<vscale x 4 x i1> %a, <vsca
; %a.rev has multiple uses
define <vscale x 4 x i32> @select_reverse_splat_true_1(<vscale x 4 x i1> %a, <vscale x 4 x i32> %b, i32 %c) {
; CHECK-LABEL: @select_reverse_splat_true_1(
-; CHECK-NEXT: [[A_REV:%.*]] = tail call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> [[A:%.*]])
+; CHECK-NEXT: [[A_REV:%.*]] = tail call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> [[A:%.*]])
; CHECK-NEXT: [[C_INSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[C:%.*]], i64 0
; CHECK-NEXT: [[C_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[C_INSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: call void @use_nxv4i1(<vscale x 4 x i1> [[A_REV]])
; CHECK-NEXT: [[SELECT1:%.*]] = select <vscale x 4 x i1> [[A]], <vscale x 4 x i32> [[C_SPLAT]], <vscale x 4 x i32> [[B:%.*]]
-; CHECK-NEXT: [[SELECT:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[SELECT1]])
+; CHECK-NEXT: [[SELECT:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[SELECT1]])
; CHECK-NEXT: ret <vscale x 4 x i32> [[SELECT]]
;
- %a.rev = tail call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
- %b.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
+ %a.rev = tail call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
+ %b.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
%c.insert = insertelement <vscale x 4 x i32> poison, i32 %c, i32 0
%c.splat = shufflevector <vscale x 4 x i32> %c.insert, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
call void @use_nxv4i1(<vscale x 4 x i1> %a.rev)
@@ -575,16 +575,16 @@ define <vscale x 4 x i32> @select_reverse_splat_true_1(<vscale x 4 x i1> %a, <vs
; %b.rev has multiple uses
define <vscale x 4 x i32> @select_reverse_splat_true_2(<vscale x 4 x i1> %a, <vscale x 4 x i32> %b, i32 %c) {
; CHECK-LABEL: @select_reverse_splat_true_2(
-; CHECK-NEXT: [[B_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[B:%.*]])
+; CHECK-NEXT: [[B_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[B:%.*]])
; CHECK-NEXT: [[C_INSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[C:%.*]], i64 0
; CHECK-NEXT: [[C_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[C_INSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: call void @use_nxv4i32(<vscale x 4 x i32> [[B_REV]])
; CHECK-NEXT: [[SELECT1:%.*]] = select <vscale x 4 x i1> [[A:%.*]], <vscale x 4 x i32> [[C_SPLAT]], <vscale x 4 x i32> [[B]]
-; CHECK-NEXT: [[SELECT:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[SELECT1]])
+; CHECK-NEXT: [[SELECT:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[SELECT1]])
; CHECK-NEXT: ret <vscale x 4 x i32> [[SELECT]]
;
- %a.rev = tail call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
- %b.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
+ %a.rev = tail call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
+ %b.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
%c.insert = insertelement <vscale x 4 x i32> poison, i32 %c, i32 0
%c.splat = shufflevector <vscale x 4 x i32> %c.insert, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
call void @use_nxv4i32(<vscale x 4 x i32> %b.rev)
@@ -595,8 +595,8 @@ define <vscale x 4 x i32> @select_reverse_splat_true_2(<vscale x 4 x i1> %a, <vs
; %a.rev and %b.rev have multiple uses
define <vscale x 4 x i32> @select_reverse_splat_true_3(<vscale x 4 x i1> %a, <vscale x 4 x i32> %b, i32 %c) {
; CHECK-LABEL: @select_reverse_splat_true_3(
-; CHECK-NEXT: [[A_REV:%.*]] = tail call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> [[A:%.*]])
-; CHECK-NEXT: [[B_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[B:%.*]])
+; CHECK-NEXT: [[A_REV:%.*]] = tail call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> [[A:%.*]])
+; CHECK-NEXT: [[B_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[B:%.*]])
; CHECK-NEXT: [[C_INSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[C:%.*]], i64 0
; CHECK-NEXT: [[C_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[C_INSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: call void @use_nxv4i1(<vscale x 4 x i1> [[A_REV]])
@@ -604,8 +604,8 @@ define <vscale x 4 x i32> @select_reverse_splat_true_3(<vscale x 4 x i1> %a, <vs
; CHECK-NEXT: [[SELECT:%.*]] = select <vscale x 4 x i1> [[A_REV]], <vscale x 4 x i32> [[C_SPLAT]], <vscale x 4 x i32> [[B_REV]]
; CHECK-NEXT: ret <vscale x 4 x i32> [[SELECT]]
;
- %a.rev = tail call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
- %b.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
+ %a.rev = tail call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
+ %b.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
%c.insert = insertelement <vscale x 4 x i32> poison, i32 %c, i32 0
%c.splat = shufflevector <vscale x 4 x i32> %c.insert, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
call void @use_nxv4i1(<vscale x 4 x i1> %a.rev)
@@ -622,10 +622,10 @@ define <vscale x 4 x float> @reverse_binop_reverse(<vscale x 4 x float> %a, <vsc
; CHECK-NEXT: [[ADD1:%.*]] = fadd <vscale x 4 x float> [[A:%.*]], [[B:%.*]]
; CHECK-NEXT: ret <vscale x 4 x float> [[ADD1]]
;
- %a.rev = tail call <vscale x 4 x float> @llvm.experimental.vector.reverse.nxv4f32(<vscale x 4 x float> %a)
- %b.rev = tail call <vscale x 4 x float> @llvm.experimental.vector.reverse.nxv4f32(<vscale x 4 x float> %b)
+ %a.rev = tail call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %a)
+ %b.rev = tail call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %b)
%add = fadd <vscale x 4 x float> %a.rev, %b.rev
- %add.rev = tail call <vscale x 4 x float> @llvm.experimental.vector.reverse.nxv4f32(<vscale x 4 x float> %add)
+ %add.rev = tail call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %add)
ret <vscale x 4 x float> %add.rev
}
@@ -636,11 +636,11 @@ define <vscale x 4 x float> @reverse_binop_reverse_splat_RHS(<vscale x 4 x float
; CHECK-NEXT: [[DIV1:%.*]] = fdiv <vscale x 4 x float> [[A:%.*]], [[B_SPLAT]]
; CHECK-NEXT: ret <vscale x 4 x float> [[DIV1]]
;
- %a.rev = tail call <vscale x 4 x float> @llvm.experimental.vector.reverse.nxv4f32(<vscale x 4 x float> %a)
+ %a.rev = tail call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %a)
%b.insert = insertelement <vscale x 4 x float> poison, float %b, i32 0
%b.splat = shufflevector <vscale x 4 x float> %b.insert, <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
%div = fdiv <vscale x 4 x float> %a.rev, %b.splat
- %div.rev = tail call <vscale x 4 x float> @llvm.experimental.vector.reverse.nxv4f32(<vscale x 4 x float> %div)
+ %div.rev = tail call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %div)
ret <vscale x 4 x float> %div.rev
}
@@ -651,11 +651,11 @@ define <vscale x 4 x float> @reverse_binop_reverse_splat_LHS(<vscale x 4 x float
; CHECK-NEXT: [[DIV1:%.*]] = fdiv <vscale x 4 x float> [[B_SPLAT]], [[A:%.*]]
; CHECK-NEXT: ret <vscale x 4 x float> [[DIV1]]
;
- %a.rev = tail call <vscale x 4 x float> @llvm.experimental.vector.reverse.nxv4f32(<vscale x 4 x float> %a)
+ %a.rev = tail call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %a)
%b.insert = insertelement <vscale x 4 x float> poison, float %b, i32 0
%b.splat = shufflevector <vscale x 4 x float> %b.insert, <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
%div = fdiv <vscale x 4 x float> %b.splat, %a.rev
- %div.rev = tail call <vscale x 4 x float> @llvm.experimental.vector.reverse.nxv4f32(<vscale x 4 x float> %div)
+ %div.rev = tail call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %div)
ret <vscale x 4 x float> %div.rev
}
@@ -664,10 +664,10 @@ define <vscale x 4 x i1> @reverse_fcmp_reverse(<vscale x 4 x float> %a, <vscale
; CHECK-NEXT: [[CMP1:%.*]] = fcmp fast olt <vscale x 4 x float> [[A:%.*]], [[B:%.*]]
; CHECK-NEXT: ret <vscale x 4 x i1> [[CMP1]]
;
- %a.rev = tail call <vscale x 4 x float> @llvm.experimental.vector.reverse.nxv4f32(<vscale x 4 x float> %a)
- %b.rev = tail call <vscale x 4 x float> @llvm.experimental.vector.reverse.nxv4f32(<vscale x 4 x float> %b)
+ %a.rev = tail call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %a)
+ %b.rev = tail call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %b)
%cmp = fcmp fast olt <vscale x 4 x float> %a.rev, %b.rev
- %cmp.rev = tail call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %cmp)
+ %cmp.rev = tail call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %cmp)
ret <vscale x 4 x i1> %cmp.rev
}
@@ -676,11 +676,11 @@ define <vscale x 4 x float> @reverse_select_reverse(<vscale x 4 x i1> %a, <vscal
; CHECK-NEXT: [[SELECT1:%.*]] = select fast <vscale x 4 x i1> [[A:%.*]], <vscale x 4 x float> [[B:%.*]], <vscale x 4 x float> [[C:%.*]]
; CHECK-NEXT: ret <vscale x 4 x float> [[SELECT1]]
;
- %a.rev = tail call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
- %b.rev = tail call <vscale x 4 x float> @llvm.experimental.vector.reverse.nxv4f32(<vscale x 4 x float> %b)
- %c.rev = tail call <vscale x 4 x float> @llvm.experimental.vector.reverse.nxv4f32(<vscale x 4 x float> %c)
+ %a.rev = tail call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
+ %b.rev = tail call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %b)
+ %c.rev = tail call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %c)
%select = select fast <vscale x 4 x i1> %a.rev, <vscale x 4 x float> %b.rev, <vscale x 4 x float> %c.rev
- %select.rev = tail call <vscale x 4 x float> @llvm.experimental.vector.reverse.nxv4f32(<vscale x 4 x float> %select)
+ %select.rev = tail call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %select)
ret <vscale x 4 x float> %select.rev
}
@@ -689,9 +689,9 @@ define <vscale x 4 x float> @reverse_unop_reverse(<vscale x 4 x float> %a) {
; CHECK-NEXT: [[NEG1:%.*]] = fneg <vscale x 4 x float> [[A:%.*]]
; CHECK-NEXT: ret <vscale x 4 x float> [[NEG1]]
;
- %a.rev = tail call <vscale x 4 x float> @llvm.experimental.vector.reverse.nxv4f32(<vscale x 4 x float> %a)
+ %a.rev = tail call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %a)
%neg = fneg <vscale x 4 x float> %a.rev
- %neg.rev = tail call <vscale x 4 x float> @llvm.experimental.vector.reverse.nxv4f32(<vscale x 4 x float> %neg)
+ %neg.rev = tail call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %neg)
ret <vscale x 4 x float> %neg.rev
}
@@ -700,6 +700,6 @@ declare void @use_nxv4i1(<vscale x 4 x i1>)
declare void @use_nxv4i32(<vscale x 4 x i32>)
declare void @use_nxv4f32(<vscale x 4 x float>)
-declare <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1>)
-declare <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32>)
-declare <vscale x 4 x float> @llvm.experimental.vector.reverse.nxv4f32(<vscale x 4 x float>)
+declare <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1>)
+declare <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32>)
+declare <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float>)
diff --git a/llvm/test/Transforms/InstSimplify/named-vector-shuffle-reverse.ll b/llvm/test/Transforms/InstSimplify/named-vector-shuffle-reverse.ll
index a26f0a9d87f8..25e99ff0e715 100644
--- a/llvm/test/Transforms/InstSimplify/named-vector-shuffle-reverse.ll
+++ b/llvm/test/Transforms/InstSimplify/named-vector-shuffle-reverse.ll
@@ -6,8 +6,8 @@ define <vscale x 4 x i32> @shuffle_b2b_reverse(<vscale x 4 x i32> %a) {
; CHECK-LABEL: @shuffle_b2b_reverse(
; CHECK-NEXT: ret <vscale x 4 x i32> [[A:%.*]]
;
- %rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
- %rev.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %rev)
+ %rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
+ %rev.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %rev)
ret <vscale x 4 x i32> %rev.rev
}
@@ -20,8 +20,8 @@ define <vscale x 4 x i32> @splat_reverse(i32 %a) {
;
%splat_insert = insertelement <vscale x 4 x i32> poison, i32 %a, i32 0
%splat = shufflevector <vscale x 4 x i32> %splat_insert, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
- %rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %splat)
+ %rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %splat)
ret <vscale x 4 x i32> %rev
}
-declare <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32>)
+declare <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32>)
diff --git a/llvm/test/Transforms/InstSimplify/select.ll b/llvm/test/Transforms/InstSimplify/select.ll
index 40c1460e3ebc..4eb6491eec5a 100644
--- a/llvm/test/Transforms/InstSimplify/select.ll
+++ b/llvm/test/Transforms/InstSimplify/select.ll
@@ -1105,19 +1105,19 @@ define <2 x i32> @select_ctpop_zero_vec(<2 x i32> %x) {
define <2 x i32> @select_vector_reverse(<2 x i32> %x) {
; CHECK-LABEL: @select_vector_reverse(
; CHECK-NEXT: [[CMP:%.*]] = icmp eq <2 x i32> [[X:%.*]], zeroinitializer
-; CHECK-NEXT: [[REV:%.*]] = call <2 x i32> @llvm.experimental.vector.reverse.v2i32(<2 x i32> [[X]])
+; CHECK-NEXT: [[REV:%.*]] = call <2 x i32> @llvm.vector.reverse.v2i32(<2 x i32> [[X]])
; CHECK-NEXT: [[SEL:%.*]] = select <2 x i1> [[CMP]], <2 x i32> zeroinitializer, <2 x i32> [[REV]]
; CHECK-NEXT: ret <2 x i32> [[SEL]]
;
%cmp = icmp eq <2 x i32> %x, zeroinitializer
- %rev = call <2 x i32> @llvm.experimental.vector.reverse.v2i32(<2 x i32> %x)
+ %rev = call <2 x i32> @llvm.vector.reverse.v2i32(<2 x i32> %x)
%sel = select <2 x i1> %cmp, <2 x i32> zeroinitializer, <2 x i32> %rev
ret <2 x i32> %sel
}
declare i32 @llvm.ctpop.i32(i32)
declare <2 x i32> @llvm.ctpop.v2i32(<2 x i32>)
-declare <2 x i32> @llvm.experimental.vector.reverse.v2i32(<2 x i32>)
+declare <2 x i32> @llvm.vector.reverse.v2i32(<2 x i32>)
define <2 x i32> @vec_select_no_equivalence(<2 x i32> %x, <2 x i32> %y) {
; CHECK-LABEL: @vec_select_no_equivalence(
diff --git a/llvm/test/Transforms/InterleavedAccess/AArch64/fixed-deinterleave-intrinsics.ll b/llvm/test/Transforms/InterleavedAccess/AArch64/fixed-deinterleave-intrinsics.ll
index 224a0693bf21..54348d1e2a48 100644
--- a/llvm/test/Transforms/InterleavedAccess/AArch64/fixed-deinterleave-intrinsics.ll
+++ b/llvm/test/Transforms/InterleavedAccess/AArch64/fixed-deinterleave-intrinsics.ll
@@ -15,11 +15,11 @@ define { <16 x i8>, <16 x i8> } @deinterleave_i8_factor2(ptr %ptr) {
; SVE-FIXED-LABEL: define { <16 x i8>, <16 x i8> } @deinterleave_i8_factor2
; SVE-FIXED-SAME: (ptr [[PTR:%.*]]) #[[ATTR0:[0-9]+]] {
; SVE-FIXED-NEXT: [[LOAD:%.*]] = load <32 x i8>, ptr [[PTR]], align 1
-; SVE-FIXED-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <16 x i8>, <16 x i8> } @llvm.experimental.vector.deinterleave2.v32i8(<32 x i8> [[LOAD]])
+; SVE-FIXED-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8> [[LOAD]])
; SVE-FIXED-NEXT: ret { <16 x i8>, <16 x i8> } [[DEINTERLEAVE]]
;
%load = load <32 x i8>, ptr %ptr, align 1
- %deinterleave = tail call { <16 x i8>, <16 x i8> } @llvm.experimental.vector.deinterleave2.v32i8(<32 x i8> %load)
+ %deinterleave = tail call { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8> %load)
ret { <16 x i8>, <16 x i8> } %deinterleave
}
@@ -32,11 +32,11 @@ define { <8 x i16>, <8 x i16> } @deinterleave_i16_factor2(ptr %ptr) {
; SVE-FIXED-LABEL: define { <8 x i16>, <8 x i16> } @deinterleave_i16_factor2
; SVE-FIXED-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
; SVE-FIXED-NEXT: [[LOAD:%.*]] = load <16 x i16>, ptr [[PTR]], align 2
-; SVE-FIXED-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <8 x i16>, <8 x i16> } @llvm.experimental.vector.deinterleave2.v16i16(<16 x i16> [[LOAD]])
+; SVE-FIXED-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <8 x i16>, <8 x i16> } @llvm.vector.deinterleave2.v16i16(<16 x i16> [[LOAD]])
; SVE-FIXED-NEXT: ret { <8 x i16>, <8 x i16> } [[DEINTERLEAVE]]
;
%load = load <16 x i16>, ptr %ptr, align 2
- %deinterleave = tail call { <8 x i16>, <8 x i16> } @llvm.experimental.vector.deinterleave2.v16i16(<16 x i16> %load)
+ %deinterleave = tail call { <8 x i16>, <8 x i16> } @llvm.vector.deinterleave2.v16i16(<16 x i16> %load)
ret { <8 x i16>, <8 x i16> } %deinterleave
}
@@ -49,11 +49,11 @@ define { <4 x i32>, <4 x i32> } @deinterleave_8xi32_factor2(ptr %ptr) {
; SVE-FIXED-LABEL: define { <4 x i32>, <4 x i32> } @deinterleave_8xi32_factor2
; SVE-FIXED-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
; SVE-FIXED-NEXT: [[LOAD:%.*]] = load <8 x i32>, ptr [[PTR]], align 4
-; SVE-FIXED-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <4 x i32>, <4 x i32> } @llvm.experimental.vector.deinterleave2.v8i32(<8 x i32> [[LOAD]])
+; SVE-FIXED-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <4 x i32>, <4 x i32> } @llvm.vector.deinterleave2.v8i32(<8 x i32> [[LOAD]])
; SVE-FIXED-NEXT: ret { <4 x i32>, <4 x i32> } [[DEINTERLEAVE]]
;
%load = load <8 x i32>, ptr %ptr, align 4
- %deinterleave = tail call { <4 x i32>, <4 x i32> } @llvm.experimental.vector.deinterleave2.v8i32(<8 x i32> %load)
+ %deinterleave = tail call { <4 x i32>, <4 x i32> } @llvm.vector.deinterleave2.v8i32(<8 x i32> %load)
ret { <4 x i32>, <4 x i32> } %deinterleave
}
@@ -66,11 +66,11 @@ define { <2 x i64>, <2 x i64> } @deinterleave_i64_factor2(ptr %ptr) {
; SVE-FIXED-LABEL: define { <2 x i64>, <2 x i64> } @deinterleave_i64_factor2
; SVE-FIXED-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
; SVE-FIXED-NEXT: [[LOAD:%.*]] = load <4 x i64>, ptr [[PTR]], align 8
-; SVE-FIXED-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <2 x i64>, <2 x i64> } @llvm.experimental.vector.deinterleave2.v4i64(<4 x i64> [[LOAD]])
+; SVE-FIXED-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <2 x i64>, <2 x i64> } @llvm.vector.deinterleave2.v4i64(<4 x i64> [[LOAD]])
; SVE-FIXED-NEXT: ret { <2 x i64>, <2 x i64> } [[DEINTERLEAVE]]
;
%load = load <4 x i64>, ptr %ptr, align 8
- %deinterleave = tail call { <2 x i64>, <2 x i64> } @llvm.experimental.vector.deinterleave2.v4i64(<4 x i64> %load)
+ %deinterleave = tail call { <2 x i64>, <2 x i64> } @llvm.vector.deinterleave2.v4i64(<4 x i64> %load)
ret { <2 x i64>, <2 x i64> } %deinterleave
}
@@ -83,11 +83,11 @@ define { <4 x float>, <4 x float> } @deinterleave_float_factor2(ptr %ptr) {
; SVE-FIXED-LABEL: define { <4 x float>, <4 x float> } @deinterleave_float_factor2
; SVE-FIXED-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
; SVE-FIXED-NEXT: [[LOAD:%.*]] = load <8 x float>, ptr [[PTR]], align 4
-; SVE-FIXED-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <4 x float>, <4 x float> } @llvm.experimental.vector.deinterleave2.v8f32(<8 x float> [[LOAD]])
+; SVE-FIXED-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <4 x float>, <4 x float> } @llvm.vector.deinterleave2.v8f32(<8 x float> [[LOAD]])
; SVE-FIXED-NEXT: ret { <4 x float>, <4 x float> } [[DEINTERLEAVE]]
;
%load = load <8 x float>, ptr %ptr, align 4
- %deinterleave = tail call { <4 x float>, <4 x float> } @llvm.experimental.vector.deinterleave2.v8f32(<8 x float> %load)
+ %deinterleave = tail call { <4 x float>, <4 x float> } @llvm.vector.deinterleave2.v8f32(<8 x float> %load)
ret { <4 x float>, <4 x float> } %deinterleave
}
@@ -100,11 +100,11 @@ define { <2 x double>, <2 x double> } @deinterleave_double_factor2(ptr %ptr) {
; SVE-FIXED-LABEL: define { <2 x double>, <2 x double> } @deinterleave_double_factor2
; SVE-FIXED-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
; SVE-FIXED-NEXT: [[LOAD:%.*]] = load <4 x double>, ptr [[PTR]], align 8
-; SVE-FIXED-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <2 x double>, <2 x double> } @llvm.experimental.vector.deinterleave2.v4f64(<4 x double> [[LOAD]])
+; SVE-FIXED-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <2 x double>, <2 x double> } @llvm.vector.deinterleave2.v4f64(<4 x double> [[LOAD]])
; SVE-FIXED-NEXT: ret { <2 x double>, <2 x double> } [[DEINTERLEAVE]]
;
%load = load <4 x double>, ptr %ptr, align 8
- %deinterleave = tail call { <2 x double>, <2 x double> } @llvm.experimental.vector.deinterleave2.v4f64(<4 x double> %load)
+ %deinterleave = tail call { <2 x double>, <2 x double> } @llvm.vector.deinterleave2.v4f64(<4 x double> %load)
ret { <2 x double>, <2 x double> } %deinterleave
}
@@ -117,11 +117,11 @@ define { <2 x ptr>, <2 x ptr> } @deinterleave_ptr_factor2(ptr %ptr) {
; SVE-FIXED-LABEL: define { <2 x ptr>, <2 x ptr> } @deinterleave_ptr_factor2
; SVE-FIXED-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
; SVE-FIXED-NEXT: [[LOAD:%.*]] = load <4 x ptr>, ptr [[PTR]], align 8
-; SVE-FIXED-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <2 x ptr>, <2 x ptr> } @llvm.experimental.vector.deinterleave2.v4p0(<4 x ptr> [[LOAD]])
+; SVE-FIXED-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <2 x ptr>, <2 x ptr> } @llvm.vector.deinterleave2.v4p0(<4 x ptr> [[LOAD]])
; SVE-FIXED-NEXT: ret { <2 x ptr>, <2 x ptr> } [[DEINTERLEAVE]]
;
%load = load <4 x ptr>, ptr %ptr, align 8
- %deinterleave = tail call { <2 x ptr>, <2 x ptr> } @llvm.experimental.vector.deinterleave2.v4p0(<4 x ptr> %load)
+ %deinterleave = tail call { <2 x ptr>, <2 x ptr> } @llvm.vector.deinterleave2.v4p0(<4 x ptr> %load)
ret { <2 x ptr>, <2 x ptr> } %deinterleave
}
@@ -133,11 +133,11 @@ define void @interleave_i8_factor2(ptr %ptr, <16 x i8> %l, <16 x i8> %r) {
;
; SVE-FIXED-LABEL: define void @interleave_i8_factor2
; SVE-FIXED-SAME: (ptr [[PTR:%.*]], <16 x i8> [[L:%.*]], <16 x i8> [[R:%.*]]) #[[ATTR0]] {
-; SVE-FIXED-NEXT: [[INTERLEAVE:%.*]] = tail call <32 x i8> @llvm.experimental.vector.interleave2.v32i8(<16 x i8> [[L]], <16 x i8> [[R]])
+; SVE-FIXED-NEXT: [[INTERLEAVE:%.*]] = tail call <32 x i8> @llvm.vector.interleave2.v32i8(<16 x i8> [[L]], <16 x i8> [[R]])
; SVE-FIXED-NEXT: store <32 x i8> [[INTERLEAVE]], ptr [[PTR]], align 1
; SVE-FIXED-NEXT: ret void
;
- %interleave = tail call <32 x i8> @llvm.experimental.vector.interleave2.v32i8(<16 x i8> %l, <16 x i8> %r)
+ %interleave = tail call <32 x i8> @llvm.vector.interleave2.v32i8(<16 x i8> %l, <16 x i8> %r)
store <32 x i8> %interleave, ptr %ptr, align 1
ret void
}
@@ -150,11 +150,11 @@ define void @interleave_i16_factor2(ptr %ptr, <8 x i16> %l, <8 x i16> %r) {
;
; SVE-FIXED-LABEL: define void @interleave_i16_factor2
; SVE-FIXED-SAME: (ptr [[PTR:%.*]], <8 x i16> [[L:%.*]], <8 x i16> [[R:%.*]]) #[[ATTR0]] {
-; SVE-FIXED-NEXT: [[INTERLEAVE:%.*]] = tail call <16 x i16> @llvm.experimental.vector.interleave2.v16i16(<8 x i16> [[L]], <8 x i16> [[R]])
+; SVE-FIXED-NEXT: [[INTERLEAVE:%.*]] = tail call <16 x i16> @llvm.vector.interleave2.v16i16(<8 x i16> [[L]], <8 x i16> [[R]])
; SVE-FIXED-NEXT: store <16 x i16> [[INTERLEAVE]], ptr [[PTR]], align 2
; SVE-FIXED-NEXT: ret void
;
- %interleave = tail call <16 x i16> @llvm.experimental.vector.interleave2.v16i16(<8 x i16> %l, <8 x i16> %r)
+ %interleave = tail call <16 x i16> @llvm.vector.interleave2.v16i16(<8 x i16> %l, <8 x i16> %r)
store <16 x i16> %interleave, ptr %ptr, align 2
ret void
}
@@ -167,11 +167,11 @@ define void @interleave_i32_factor2(ptr %ptr, <4 x i32> %l, <4 x i32> %r) {
;
; SVE-FIXED-LABEL: define void @interleave_i32_factor2
; SVE-FIXED-SAME: (ptr [[PTR:%.*]], <4 x i32> [[L:%.*]], <4 x i32> [[R:%.*]]) #[[ATTR0]] {
-; SVE-FIXED-NEXT: [[INTERLEAVE:%.*]] = tail call <8 x i32> @llvm.experimental.vector.interleave2.v8i32(<4 x i32> [[L]], <4 x i32> [[R]])
+; SVE-FIXED-NEXT: [[INTERLEAVE:%.*]] = tail call <8 x i32> @llvm.vector.interleave2.v8i32(<4 x i32> [[L]], <4 x i32> [[R]])
; SVE-FIXED-NEXT: store <8 x i32> [[INTERLEAVE]], ptr [[PTR]], align 4
; SVE-FIXED-NEXT: ret void
;
- %interleave = tail call <8 x i32> @llvm.experimental.vector.interleave2.v8i32(<4 x i32> %l, <4 x i32> %r)
+ %interleave = tail call <8 x i32> @llvm.vector.interleave2.v8i32(<4 x i32> %l, <4 x i32> %r)
store <8 x i32> %interleave, ptr %ptr, align 4
ret void
}
@@ -184,11 +184,11 @@ define void @interleave_i64_factor2(ptr %ptr, <2 x i64> %l, <2 x i64> %r) {
;
; SVE-FIXED-LABEL: define void @interleave_i64_factor2
; SVE-FIXED-SAME: (ptr [[PTR:%.*]], <2 x i64> [[L:%.*]], <2 x i64> [[R:%.*]]) #[[ATTR0]] {
-; SVE-FIXED-NEXT: [[INTERLEAVE:%.*]] = tail call <4 x i64> @llvm.experimental.vector.interleave2.v4i64(<2 x i64> [[L]], <2 x i64> [[R]])
+; SVE-FIXED-NEXT: [[INTERLEAVE:%.*]] = tail call <4 x i64> @llvm.vector.interleave2.v4i64(<2 x i64> [[L]], <2 x i64> [[R]])
; SVE-FIXED-NEXT: store <4 x i64> [[INTERLEAVE]], ptr [[PTR]], align 8
; SVE-FIXED-NEXT: ret void
;
- %interleave = tail call <4 x i64> @llvm.experimental.vector.interleave2.v4i64(<2 x i64> %l, <2 x i64> %r)
+ %interleave = tail call <4 x i64> @llvm.vector.interleave2.v4i64(<2 x i64> %l, <2 x i64> %r)
store <4 x i64> %interleave, ptr %ptr, align 8
ret void
}
@@ -201,11 +201,11 @@ define void @interleave_float_factor2(ptr %ptr, <4 x float> %l, <4 x float> %r)
;
; SVE-FIXED-LABEL: define void @interleave_float_factor2
; SVE-FIXED-SAME: (ptr [[PTR:%.*]], <4 x float> [[L:%.*]], <4 x float> [[R:%.*]]) #[[ATTR0]] {
-; SVE-FIXED-NEXT: [[INTERLEAVE:%.*]] = tail call <8 x float> @llvm.experimental.vector.interleave2.v8f32(<4 x float> [[L]], <4 x float> [[R]])
+; SVE-FIXED-NEXT: [[INTERLEAVE:%.*]] = tail call <8 x float> @llvm.vector.interleave2.v8f32(<4 x float> [[L]], <4 x float> [[R]])
; SVE-FIXED-NEXT: store <8 x float> [[INTERLEAVE]], ptr [[PTR]], align 4
; SVE-FIXED-NEXT: ret void
;
- %interleave = tail call <8 x float> @llvm.experimental.vector.interleave2.v8f32(<4 x float> %l, <4 x float> %r)
+ %interleave = tail call <8 x float> @llvm.vector.interleave2.v8f32(<4 x float> %l, <4 x float> %r)
store <8 x float> %interleave, ptr %ptr, align 4
ret void
}
@@ -218,11 +218,11 @@ define void @interleave_double_factor2(ptr %ptr, <2 x double> %l, <2 x double> %
;
; SVE-FIXED-LABEL: define void @interleave_double_factor2
; SVE-FIXED-SAME: (ptr [[PTR:%.*]], <2 x double> [[L:%.*]], <2 x double> [[R:%.*]]) #[[ATTR0]] {
-; SVE-FIXED-NEXT: [[INTERLEAVE:%.*]] = tail call <4 x double> @llvm.experimental.vector.interleave2.v4f64(<2 x double> [[L]], <2 x double> [[R]])
+; SVE-FIXED-NEXT: [[INTERLEAVE:%.*]] = tail call <4 x double> @llvm.vector.interleave2.v4f64(<2 x double> [[L]], <2 x double> [[R]])
; SVE-FIXED-NEXT: store <4 x double> [[INTERLEAVE]], ptr [[PTR]], align 4
; SVE-FIXED-NEXT: ret void
;
- %interleave = tail call <4 x double> @llvm.experimental.vector.interleave2.v4f64(<2 x double> %l, <2 x double> %r)
+ %interleave = tail call <4 x double> @llvm.vector.interleave2.v4f64(<2 x double> %l, <2 x double> %r)
store <4 x double> %interleave, ptr %ptr, align 4
ret void
}
@@ -235,11 +235,11 @@ define void @interleave_ptr_factor2(ptr %ptr, <2 x ptr> %l, <2 x ptr> %r) {
;
; SVE-FIXED-LABEL: define void @interleave_ptr_factor2
; SVE-FIXED-SAME: (ptr [[PTR:%.*]], <2 x ptr> [[L:%.*]], <2 x ptr> [[R:%.*]]) #[[ATTR0]] {
-; SVE-FIXED-NEXT: [[INTERLEAVE:%.*]] = tail call <4 x ptr> @llvm.experimental.vector.interleave2.v4p0(<2 x ptr> [[L]], <2 x ptr> [[R]])
+; SVE-FIXED-NEXT: [[INTERLEAVE:%.*]] = tail call <4 x ptr> @llvm.vector.interleave2.v4p0(<2 x ptr> [[L]], <2 x ptr> [[R]])
; SVE-FIXED-NEXT: store <4 x ptr> [[INTERLEAVE]], ptr [[PTR]], align 4
; SVE-FIXED-NEXT: ret void
;
- %interleave = tail call <4 x ptr> @llvm.experimental.vector.interleave2.v4p0(<2 x ptr> %l, <2 x ptr> %r)
+ %interleave = tail call <4 x ptr> @llvm.vector.interleave2.v4p0(<2 x ptr> %l, <2 x ptr> %r)
store <4 x ptr> %interleave, ptr %ptr, align 4
ret void
}
@@ -266,11 +266,11 @@ define { <16 x i16>, <16 x i16> } @deinterleave_wide_i16_factor2(ptr %ptr) #0 {
; SVE-FIXED-LABEL: define { <16 x i16>, <16 x i16> } @deinterleave_wide_i16_factor2
; SVE-FIXED-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
; SVE-FIXED-NEXT: [[LOAD:%.*]] = load <32 x i16>, ptr [[PTR]], align 2
-; SVE-FIXED-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <16 x i16>, <16 x i16> } @llvm.experimental.vector.deinterleave2.v32i16(<32 x i16> [[LOAD]])
+; SVE-FIXED-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <16 x i16>, <16 x i16> } @llvm.vector.deinterleave2.v32i16(<32 x i16> [[LOAD]])
; SVE-FIXED-NEXT: ret { <16 x i16>, <16 x i16> } [[DEINTERLEAVE]]
;
%load = load <32 x i16>, ptr %ptr, align 2
- %deinterleave = tail call { <16 x i16>, <16 x i16> } @llvm.experimental.vector.deinterleave2.v32i16(<32 x i16> %load)
+ %deinterleave = tail call { <16 x i16>, <16 x i16> } @llvm.vector.deinterleave2.v32i16(<32 x i16> %load)
ret { <16 x i16>, <16 x i16> } %deinterleave
}
@@ -297,29 +297,29 @@ define void @interleave_wide_ptr_factor2(ptr %ptr, <8 x ptr> %l, <8 x ptr> %r) {
;
; SVE-FIXED-LABEL: define void @interleave_wide_ptr_factor2
; SVE-FIXED-SAME: (ptr [[PTR:%.*]], <8 x ptr> [[L:%.*]], <8 x ptr> [[R:%.*]]) #[[ATTR0]] {
-; SVE-FIXED-NEXT: [[INTERLEAVE:%.*]] = tail call <16 x ptr> @llvm.experimental.vector.interleave2.v16p0(<8 x ptr> [[L]], <8 x ptr> [[R]])
+; SVE-FIXED-NEXT: [[INTERLEAVE:%.*]] = tail call <16 x ptr> @llvm.vector.interleave2.v16p0(<8 x ptr> [[L]], <8 x ptr> [[R]])
; SVE-FIXED-NEXT: store <16 x ptr> [[INTERLEAVE]], ptr [[PTR]], align 4
; SVE-FIXED-NEXT: ret void
;
- %interleave = tail call <16 x ptr> @llvm.experimental.vector.interleave2.v16p0(<8 x ptr> %l, <8 x ptr> %r)
+ %interleave = tail call <16 x ptr> @llvm.vector.interleave2.v16p0(<8 x ptr> %l, <8 x ptr> %r)
store <16 x ptr> %interleave, ptr %ptr, align 4
ret void
}
-declare { <16 x i8>, <16 x i8> } @llvm.experimental.vector.deinterleave2.v32i8(<32 x i8>)
-declare { <8 x i16>, <8 x i16> } @llvm.experimental.vector.deinterleave2.v16i16(<16 x i16>)
-declare { <4 x i32>, <4 x i32> } @llvm.experimental.vector.deinterleave2.v8i32(<8 x i32>)
-declare { <2 x i64>, <2 x i64> } @llvm.experimental.vector.deinterleave2.v4i64(<4 x i64>)
-declare { <4 x float>, <4 x float> } @llvm.experimental.vector.deinterleave2.v8f32(<8 x float>)
-declare { <2 x double>, <2 x double> } @llvm.experimental.vector.deinterleave2.v4f64(<4 x double>)
-declare { <2 x ptr>, <2 x ptr> } @llvm.experimental.vector.deinterleave2.v4p0(<4 x ptr>)
-declare { <16 x i16>, <16 x i16> } @llvm.experimental.vector.deinterleave2.v32i16(<32 x i16>)
+declare { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8>)
+declare { <8 x i16>, <8 x i16> } @llvm.vector.deinterleave2.v16i16(<16 x i16>)
+declare { <4 x i32>, <4 x i32> } @llvm.vector.deinterleave2.v8i32(<8 x i32>)
+declare { <2 x i64>, <2 x i64> } @llvm.vector.deinterleave2.v4i64(<4 x i64>)
+declare { <4 x float>, <4 x float> } @llvm.vector.deinterleave2.v8f32(<8 x float>)
+declare { <2 x double>, <2 x double> } @llvm.vector.deinterleave2.v4f64(<4 x double>)
+declare { <2 x ptr>, <2 x ptr> } @llvm.vector.deinterleave2.v4p0(<4 x ptr>)
+declare { <16 x i16>, <16 x i16> } @llvm.vector.deinterleave2.v32i16(<32 x i16>)
-declare <32 x i8> @llvm.experimental.vector.interleave2.v32i8(<16 x i8>, <16 x i8>)
-declare <16 x i16> @llvm.experimental.vector.interleave2.v16i16(<8 x i16>, <8 x i16>)
-declare <8 x i32> @llvm.experimental.vector.interleave2.v8i32(<4 x i32>, <4 x i32>)
-declare <4 x i64> @llvm.experimental.vector.interleave2.v4i64(<2 x i64>, <2 x i64>)
-declare <8 x float> @llvm.experimental.vector.interleave2.v8f32(<4 x float>, <4 x float>)
-declare <4 x double> @llvm.experimental.vector.interleave2.v4f64(<2 x double>, <2 x double>)
-declare <4 x ptr> @llvm.experimental.vector.interleave2.v4p0(<2 x ptr>, <2 x ptr>)
-declare <16 x ptr> @llvm.experimental.vector.interleave2.v16p0(<8 x ptr>, <8 x ptr>)
+declare <32 x i8> @llvm.vector.interleave2.v32i8(<16 x i8>, <16 x i8>)
+declare <16 x i16> @llvm.vector.interleave2.v16i16(<8 x i16>, <8 x i16>)
+declare <8 x i32> @llvm.vector.interleave2.v8i32(<4 x i32>, <4 x i32>)
+declare <4 x i64> @llvm.vector.interleave2.v4i64(<2 x i64>, <2 x i64>)
+declare <8 x float> @llvm.vector.interleave2.v8f32(<4 x float>, <4 x float>)
+declare <4 x double> @llvm.vector.interleave2.v4f64(<2 x double>, <2 x double>)
+declare <4 x ptr> @llvm.vector.interleave2.v4p0(<2 x ptr>, <2 x ptr>)
+declare <16 x ptr> @llvm.vector.interleave2.v16p0(<8 x ptr>, <8 x ptr>)
diff --git a/llvm/test/Transforms/InterleavedAccess/AArch64/scalable-deinterleave-intrinsics.ll b/llvm/test/Transforms/InterleavedAccess/AArch64/scalable-deinterleave-intrinsics.ll
index 6353bf10d57c..2a05718cc416 100644
--- a/llvm/test/Transforms/InterleavedAccess/AArch64/scalable-deinterleave-intrinsics.ll
+++ b/llvm/test/Transforms/InterleavedAccess/AArch64/scalable-deinterleave-intrinsics.ll
@@ -11,7 +11,7 @@ define { <vscale x 16 x i8>, <vscale x 16 x i8> } @deinterleave_nxi8_factor2(ptr
; CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[LDN]]
;
%load = load <vscale x 32 x i8>, ptr %ptr, align 1
- %deinterleave = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.experimental.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %load)
+ %deinterleave = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %load)
ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %deinterleave
}
@@ -22,7 +22,7 @@ define { <vscale x 8 x i16>, <vscale x 8 x i16> } @deinterleave_nxi16_factor2(pt
; CHECK-NEXT: ret { <vscale x 8 x i16>, <vscale x 8 x i16> } [[LDN]]
;
%load = load <vscale x 16 x i16>, ptr %ptr, align 2
- %deinterleave = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.experimental.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> %load)
+ %deinterleave = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> %load)
ret { <vscale x 8 x i16>, <vscale x 8 x i16> } %deinterleave
}
@@ -33,7 +33,7 @@ define { <vscale x 4 x i32>, <vscale x 4 x i32> } @deinterleave_nx8xi32_factor2(
; CHECK-NEXT: ret { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]]
;
%load = load <vscale x 8 x i32>, ptr %ptr, align 4
- %deinterleave = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %load)
+ %deinterleave = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %load)
ret { <vscale x 4 x i32>, <vscale x 4 x i32> } %deinterleave
}
@@ -44,7 +44,7 @@ define { <vscale x 2 x i64>, <vscale x 2 x i64> } @deinterleave_nxi64_factor2(pt
; CHECK-NEXT: ret { <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]]
;
%load = load <vscale x 4 x i64>, ptr %ptr, align 8
- %deinterleave = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.experimental.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> %load)
+ %deinterleave = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> %load)
ret { <vscale x 2 x i64>, <vscale x 2 x i64> } %deinterleave
}
@@ -55,7 +55,7 @@ define { <vscale x 4 x float>, <vscale x 4 x float> } @deinterleave_nxfloat_fact
; CHECK-NEXT: ret { <vscale x 4 x float>, <vscale x 4 x float> } [[LDN]]
;
%load = load <vscale x 8 x float>, ptr %ptr, align 4
- %deinterleave = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.experimental.vector.deinterleave2.nxv8f32(<vscale x 8 x float> %load)
+ %deinterleave = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.vector.deinterleave2.nxv8f32(<vscale x 8 x float> %load)
ret { <vscale x 4 x float>, <vscale x 4 x float> } %deinterleave
}
@@ -66,7 +66,7 @@ define { <vscale x 2 x double>, <vscale x 2 x double> } @deinterleave_nxdouble_f
; CHECK-NEXT: ret { <vscale x 2 x double>, <vscale x 2 x double> } [[LDN]]
;
%load = load <vscale x 4 x double>, ptr %ptr, align 8
- %deinterleave = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %load)
+ %deinterleave = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %load)
ret { <vscale x 2 x double>, <vscale x 2 x double> } %deinterleave
}
@@ -77,7 +77,7 @@ define { <vscale x 2 x ptr>, <vscale x 2 x ptr> } @deinterleave_nxptr_factor2(pt
; CHECK-NEXT: ret { <vscale x 2 x ptr>, <vscale x 2 x ptr> } [[LDN]]
;
%load = load <vscale x 4 x ptr>, ptr %ptr, align 8
- %deinterleave = tail call { <vscale x 2 x ptr>, <vscale x 2 x ptr> } @llvm.experimental.vector.deinterleave2.nxv4p0(<vscale x 4 x ptr> %load)
+ %deinterleave = tail call { <vscale x 2 x ptr>, <vscale x 2 x ptr> } @llvm.vector.deinterleave2.nxv4p0(<vscale x 4 x ptr> %load)
ret { <vscale x 2 x ptr>, <vscale x 2 x ptr> } %deinterleave
}
@@ -87,7 +87,7 @@ define void @interleave_nxi8_factor2(ptr %ptr, <vscale x 16 x i8> %l, <vscale x
; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv16i8(<vscale x 16 x i8> [[L]], <vscale x 16 x i8> [[R]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), ptr [[PTR]])
; CHECK-NEXT: ret void
;
- %interleave = tail call <vscale x 32 x i8> @llvm.experimental.vector.interleave2.nxv32i8(<vscale x 16 x i8> %l, <vscale x 16 x i8> %r)
+ %interleave = tail call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> %l, <vscale x 16 x i8> %r)
store <vscale x 32 x i8> %interleave, ptr %ptr, align 1
ret void
}
@@ -98,7 +98,7 @@ define void @interleave_nxi16_factor2(ptr %ptr, <vscale x 8 x i16> %l, <vscale x
; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv8i16(<vscale x 8 x i16> [[L]], <vscale x 8 x i16> [[R]], <vscale x 8 x i1> shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer), ptr [[PTR]])
; CHECK-NEXT: ret void
;
- %interleave = tail call <vscale x 16 x i16> @llvm.experimental.vector.interleave2.nxv16i16(<vscale x 8 x i16> %l, <vscale x 8 x i16> %r)
+ %interleave = tail call <vscale x 16 x i16> @llvm.vector.interleave2.nxv16i16(<vscale x 8 x i16> %l, <vscale x 8 x i16> %r)
store <vscale x 16 x i16> %interleave, ptr %ptr, align 2
ret void
}
@@ -109,7 +109,7 @@ define void @interleave_nxi32_factor2(ptr %ptr, <vscale x 4 x i32> %l, <vscale x
; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv4i32(<vscale x 4 x i32> [[L]], <vscale x 4 x i32> [[R]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[PTR]])
; CHECK-NEXT: ret void
;
- %interleave = tail call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> %l, <vscale x 4 x i32> %r)
+ %interleave = tail call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %l, <vscale x 4 x i32> %r)
store <vscale x 8 x i32> %interleave, ptr %ptr, align 4
ret void
}
@@ -120,7 +120,7 @@ define void @interleave_nxi64_factor2(ptr %ptr, <vscale x 2 x i64> %l, <vscale x
; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv2i64(<vscale x 2 x i64> [[L]], <vscale x 2 x i64> [[R]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), ptr [[PTR]])
; CHECK-NEXT: ret void
;
- %interleave = tail call <vscale x 4 x i64> @llvm.experimental.vector.interleave2.nxv4i64(<vscale x 2 x i64> %l, <vscale x 2 x i64> %r)
+ %interleave = tail call <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64> %l, <vscale x 2 x i64> %r)
store <vscale x 4 x i64> %interleave, ptr %ptr, align 8
ret void
}
@@ -131,7 +131,7 @@ define void @interleave_nxfloat_factor2(ptr %ptr, <vscale x 4 x float> %l, <vsca
; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv4f32(<vscale x 4 x float> [[L]], <vscale x 4 x float> [[R]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[PTR]])
; CHECK-NEXT: ret void
;
- %interleave = tail call <vscale x 8 x float> @llvm.experimental.vector.interleave2.nxv8f32(<vscale x 4 x float> %l, <vscale x 4 x float> %r)
+ %interleave = tail call <vscale x 8 x float> @llvm.vector.interleave2.nxv8f32(<vscale x 4 x float> %l, <vscale x 4 x float> %r)
store <vscale x 8 x float> %interleave, ptr %ptr, align 4
ret void
}
@@ -142,7 +142,7 @@ define void @interleave_nxdouble_factor2(ptr %ptr, <vscale x 2 x double> %l, <vs
; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv2f64(<vscale x 2 x double> [[L]], <vscale x 2 x double> [[R]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), ptr [[PTR]])
; CHECK-NEXT: ret void
;
- %interleave = tail call <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double> %l, <vscale x 2 x double> %r)
+ %interleave = tail call <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double> %l, <vscale x 2 x double> %r)
store <vscale x 4 x double> %interleave, ptr %ptr, align 4
ret void
}
@@ -153,7 +153,7 @@ define void @interleave_nxptr_factor2(ptr %ptr, <vscale x 2 x ptr> %l, <vscale x
; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv2p0(<vscale x 2 x ptr> [[L]], <vscale x 2 x ptr> [[R]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), ptr [[PTR]])
; CHECK-NEXT: ret void
;
- %interleave = tail call <vscale x 4 x ptr> @llvm.experimental.vector.interleave2.nxv4p0(<vscale x 2 x ptr> %l, <vscale x 2 x ptr> %r)
+ %interleave = tail call <vscale x 4 x ptr> @llvm.vector.interleave2.nxv4p0(<vscale x 2 x ptr> %l, <vscale x 2 x ptr> %r)
store <vscale x 4 x ptr> %interleave, ptr %ptr, align 4
ret void
}
@@ -192,7 +192,7 @@ define { <vscale x 16 x i32>, <vscale x 16 x i32> } @deinterleave_wide_nxi32_fac
; CHECK-NEXT: ret { <vscale x 16 x i32>, <vscale x 16 x i32> } [[TMP22]]
;
%load = load <vscale x 32 x i32>, ptr %ptr, align 4
- %deinterleave = tail call { <vscale x 16 x i32>, <vscale x 16 x i32> } @llvm.experimental.vector.deinterleave2.nxv32i32(<vscale x 32 x i32> %load)
+ %deinterleave = tail call { <vscale x 16 x i32>, <vscale x 16 x i32> } @llvm.vector.deinterleave2.nxv32i32(<vscale x 32 x i32> %load)
ret { <vscale x 16 x i32>, <vscale x 16 x i32> } %deinterleave
}
@@ -216,7 +216,7 @@ define { <vscale x 4 x double>, <vscale x 4 x double> } @deinterleave_wide_nxdou
; CHECK-NEXT: ret { <vscale x 4 x double>, <vscale x 4 x double> } [[TMP12]]
;
%load = load <vscale x 8 x double>, ptr %ptr, align 8
- %deinterleave = tail call { <vscale x 4 x double>, <vscale x 4 x double> } @llvm.experimental.vector.deinterleave2.nxv8f64(<vscale x 8 x double> %load)
+ %deinterleave = tail call { <vscale x 4 x double>, <vscale x 4 x double> } @llvm.vector.deinterleave2.nxv8f64(<vscale x 8 x double> %load)
ret { <vscale x 4 x double>, <vscale x 4 x double> } %deinterleave
}
@@ -233,32 +233,32 @@ define void @interleave_wide_nxdouble_factor2(ptr %ptr, <vscale x 4 x double> %l
; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv2f64(<vscale x 2 x double> [[TMP5]], <vscale x 2 x double> [[TMP6]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), ptr [[TMP4]])
; CHECK-NEXT: ret void
;
- %interleave = tail call <vscale x 8 x double> @llvm.experimental.vector.interleave2.nxv8f64(<vscale x 4 x double> %l, <vscale x 4 x double> %r)
+ %interleave = tail call <vscale x 8 x double> @llvm.vector.interleave2.nxv8f64(<vscale x 4 x double> %l, <vscale x 4 x double> %r)
store <vscale x 8 x double> %interleave, ptr %ptr, align 4
ret void
}
-declare { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.experimental.vector.deinterleave2.nxv32i8(<vscale x 32 x i8>)
-declare { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.experimental.vector.deinterleave2.nxv16i16(<vscale x 16 x i16>)
-declare { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32>)
-declare { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.experimental.vector.deinterleave2.nxv4i64(<vscale x 4 x i64>)
-declare { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.experimental.vector.deinterleave2.nxv8f32(<vscale x 8 x float>)
-declare { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
-declare { <vscale x 2 x ptr>, <vscale x 2 x ptr> } @llvm.experimental.vector.deinterleave2.nxv4p0(<vscale x 4 x ptr>)
+declare { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8>)
+declare { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16>)
+declare { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32>)
+declare { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64>)
+declare { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.vector.deinterleave2.nxv8f32(<vscale x 8 x float>)
+declare { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
+declare { <vscale x 2 x ptr>, <vscale x 2 x ptr> } @llvm.vector.deinterleave2.nxv4p0(<vscale x 4 x ptr>)
; Larger deinterleaves to test 'legalization'
-declare { <vscale x 16 x i32>, <vscale x 16 x i32> } @llvm.experimental.vector.deinterleave2.nxv32i32(<vscale x 32 x i32>)
-declare { <vscale x 4 x double>, <vscale x 4 x double> } @llvm.experimental.vector.deinterleave2.nxv8f64(<vscale x 8 x double>)
+declare { <vscale x 16 x i32>, <vscale x 16 x i32> } @llvm.vector.deinterleave2.nxv32i32(<vscale x 32 x i32>)
+declare { <vscale x 4 x double>, <vscale x 4 x double> } @llvm.vector.deinterleave2.nxv8f64(<vscale x 8 x double>)
-declare <vscale x 32 x i8> @llvm.experimental.vector.interleave2.nxv32i8(<vscale x 16 x i8>, <vscale x 16 x i8>)
-declare <vscale x 16 x i16> @llvm.experimental.vector.interleave2.nxv16i16(<vscale x 8 x i16>, <vscale x 8 x i16>)
-declare <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
-declare <vscale x 4 x i64> @llvm.experimental.vector.interleave2.nxv4i64(<vscale x 2 x i64>, <vscale x 2 x i64>)
-declare <vscale x 8 x float> @llvm.experimental.vector.interleave2.nxv8f32(<vscale x 4 x float>, <vscale x 4 x float>)
-declare <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double>, <vscale x 2 x double>)
-declare <vscale x 4 x ptr> @llvm.experimental.vector.interleave2.nxv4p0(<vscale x 2 x ptr>, <vscale x 2 x ptr>)
+declare <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8>, <vscale x 16 x i8>)
+declare <vscale x 16 x i16> @llvm.vector.interleave2.nxv16i16(<vscale x 8 x i16>, <vscale x 8 x i16>)
+declare <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64>, <vscale x 2 x i64>)
+declare <vscale x 8 x float> @llvm.vector.interleave2.nxv8f32(<vscale x 4 x float>, <vscale x 4 x float>)
+declare <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double>, <vscale x 2 x double>)
+declare <vscale x 4 x ptr> @llvm.vector.interleave2.nxv4p0(<vscale x 2 x ptr>, <vscale x 2 x ptr>)
; Larger interleaves to test 'legalization'
-declare <vscale x 8 x double> @llvm.experimental.vector.interleave2.nxv8f64(<vscale x 4 x double>, <vscale x 4 x double>)
+declare <vscale x 8 x double> @llvm.vector.interleave2.nxv8f64(<vscale x 4 x double>, <vscale x 4 x double>)
attributes #0 = { vscale_range(1,16) "target-features"="+sve" }
diff --git a/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleaved-accesses.ll b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleaved-accesses.ll
index 45e2c36836ff..73f26814f3a4 100644
--- a/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleaved-accesses.ll
+++ b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleaved-accesses.ll
@@ -511,7 +511,7 @@ define { <vscale x 4 x double>, <vscale x 4 x double> } @deinterleave_nxptr_fact
; CHECK-NEXT: ret { <vscale x 4 x double>, <vscale x 4 x double> } [[TMP12]]
;
%wide.vec = load <vscale x 8 x double>, ptr %ptr, align 8
- %ldN = tail call { <vscale x 4 x double>, <vscale x 4 x double> } @llvm.experimental.vector.deinterleave2.nxv8f64(<vscale x 8 x double> %wide.vec)
+ %ldN = tail call { <vscale x 4 x double>, <vscale x 4 x double> } @llvm.vector.deinterleave2.nxv8f64(<vscale x 8 x double> %wide.vec)
ret { <vscale x 4 x double>, <vscale x 4 x double> } %ldN
}
diff --git a/llvm/test/Transforms/LoopUnroll/unroll-remove-redundant-dbg.ll b/llvm/test/Transforms/LoopUnroll/unroll-remove-redundant-dbg.ll
new file mode 100644
index 000000000000..66cd4d454443
--- /dev/null
+++ b/llvm/test/Transforms/LoopUnroll/unroll-remove-redundant-dbg.ll
@@ -0,0 +1,45 @@
+; RUN: opt < %s -S -passes=loop-unroll | FileCheck %s
+
+define i64 @d(i1 %tobool.not, i32 %add, i64 %conv23) !dbg !14 {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ ; There should be only one call to "llvm.dbg.value" after loop unrolling
+ ; CHECK: call void @llvm.dbg.value
+ ; CHECK-NOT: call void @llvm.dbg.value
+
+ %k.045 = phi i64 [ 0, %entry ], [ %k.046, %for.body ]
+ tail call void @llvm.dbg.value(metadata i32 0, metadata !13, metadata !DIExpression()), !dbg !17
+ %k.046 = add nuw nsw i64 %k.045, 1
+ %exitcond = icmp ne i64 %k.046, 5
+ br i1 %exitcond, label %for.body, label %for.end22
+
+for.end22: ; preds = %for.body
+ ret i64 %k.046
+}
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare void @llvm.dbg.value(metadata, metadata, metadata)
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!12}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C89, file: !1, producer: "clang version 19.0.0git (https://github.com/llvm/llvm-project.git ec062f5b33ed22c61742e3c1486f6cba915801e0)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, globals: !2, splitDebugInlining: false, nameTableKind: None)
+!1 = !DIFile(filename: "unroll-remove-redundant-dbg.c", directory: "", checksumkind: CSK_MD5, checksum: "aa30a1d8c04deb9b0f3885c258d2b674")
+!2 = !{!3, !8, !10}
+!3 = !DIGlobalVariableExpression(var: !4, expr: !DIExpression())
+!4 = distinct !DIGlobalVariable(name: "a", scope: !0, file: !1, line: 2, type: !5, isLocal: false, isDefinition: true)
+!5 = !DIDerivedType(tag: DW_TAG_typedef, name: "uint32_t", file: !6, line: 198, baseType: !7)
+!6 = !DIFile(filename: "/usr/include/stdint.h", directory: "", checksumkind: CSK_MD5, checksum: "da031bcff2d0c1d65aa92e7e68a44ef3")
+!7 = !DIBasicType(name: "unsigned int", size: 32, encoding: DW_ATE_unsigned)
+!8 = !DIGlobalVariableExpression(var: !9, expr: !DIExpression())
+!9 = distinct !DIGlobalVariable(name: "c", scope: !0, file: !1, line: 2, type: !5, isLocal: false, isDefinition: true)
+!10 = !DIGlobalVariableExpression(var: !11, expr: !DIExpression())
+!11 = distinct !DIGlobalVariable(name: "b", scope: !0, file: !1, line: 2, type: !5, isLocal: false, isDefinition: true)
+!12 = !{i32 2, !"Debug Info Version", i32 3}
+!13 = !DILocalVariable(name: "f", scope: !14, file: !1, line: 4, type: !5)
+!14 = distinct !DISubprogram(name: "d", scope: !1, file: !1, line: 3, type: !15, scopeLine: 3, flags: DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !16)
+!15 = !DISubroutineType(types: !16)
+!16 = !{}
+!17 = !DILocation(line: 0, scope: !14)
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
new file mode 100644
index 000000000000..14b5ee244080
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
@@ -0,0 +1,889 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -p loop-vectorize -S %s | FileCheck --check-prefix=DEFAULT %s
+; RUN: opt -p loop-vectorize -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -S %s | FileCheck --check-prefix=PRED %s
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-apple-macosx14.0.0"
+
+define void @invar_cond_gep_store(ptr %dst, i32 %0) {
+; DEFAULT-LABEL: define void @invar_cond_gep_store(
+; DEFAULT-SAME: ptr [[DST:%.*]], i32 [[TMP0:%.*]]) {
+; DEFAULT-NEXT: entry:
+; DEFAULT-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; DEFAULT: vector.ph:
+; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i64 0
+; DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; DEFAULT-NEXT: br label [[VECTOR_BODY:%.*]]
+; DEFAULT: vector.body:
+; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE14:%.*]] ]
+; DEFAULT-NEXT: [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]]
+; DEFAULT-NEXT: [[TMP1:%.*]] = icmp eq <4 x i32> [[BROADCAST_SPLAT]], zeroinitializer
+; DEFAULT-NEXT: [[TMP2:%.*]] = icmp eq <4 x i32> [[BROADCAST_SPLAT]], zeroinitializer
+; DEFAULT-NEXT: [[TMP3:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0
+; DEFAULT-NEXT: br i1 [[TMP3]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; DEFAULT: pred.store.if:
+; DEFAULT-NEXT: [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 0
+; DEFAULT-NEXT: [[TMP5:%.*]] = add i64 [[TMP4]], 1
+; DEFAULT-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP5]]
+; DEFAULT-NEXT: store i32 1, ptr [[TMP6]], align 4
+; DEFAULT-NEXT: br label [[PRED_STORE_CONTINUE]]
+; DEFAULT: pred.store.continue:
+; DEFAULT-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP1]], i32 1
+; DEFAULT-NEXT: br i1 [[TMP7]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2:%.*]]
+; DEFAULT: pred.store.if1:
+; DEFAULT-NEXT: [[TMP8:%.*]] = add i64 [[OFFSET_IDX]], 1
+; DEFAULT-NEXT: [[TMP9:%.*]] = add i64 [[TMP8]], 1
+; DEFAULT-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP9]]
+; DEFAULT-NEXT: store i32 1, ptr [[TMP10]], align 4
+; DEFAULT-NEXT: br label [[PRED_STORE_CONTINUE2]]
+; DEFAULT: pred.store.continue2:
+; DEFAULT-NEXT: [[TMP11:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2
+; DEFAULT-NEXT: br i1 [[TMP11]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]]
+; DEFAULT: pred.store.if3:
+; DEFAULT-NEXT: [[TMP12:%.*]] = add i64 [[OFFSET_IDX]], 2
+; DEFAULT-NEXT: [[TMP13:%.*]] = add i64 [[TMP12]], 1
+; DEFAULT-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP13]]
+; DEFAULT-NEXT: store i32 1, ptr [[TMP14]], align 4
+; DEFAULT-NEXT: br label [[PRED_STORE_CONTINUE4]]
+; DEFAULT: pred.store.continue4:
+; DEFAULT-NEXT: [[TMP15:%.*]] = extractelement <4 x i1> [[TMP1]], i32 3
+; DEFAULT-NEXT: br i1 [[TMP15]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6:%.*]]
+; DEFAULT: pred.store.if5:
+; DEFAULT-NEXT: [[TMP16:%.*]] = add i64 [[OFFSET_IDX]], 3
+; DEFAULT-NEXT: [[TMP17:%.*]] = add i64 [[TMP16]], 1
+; DEFAULT-NEXT: [[TMP18:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP17]]
+; DEFAULT-NEXT: store i32 1, ptr [[TMP18]], align 4
+; DEFAULT-NEXT: br label [[PRED_STORE_CONTINUE6]]
+; DEFAULT: pred.store.continue6:
+; DEFAULT-NEXT: [[TMP19:%.*]] = extractelement <4 x i1> [[TMP2]], i32 0
+; DEFAULT-NEXT: br i1 [[TMP19]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8:%.*]]
+; DEFAULT: pred.store.if7:
+; DEFAULT-NEXT: [[TMP20:%.*]] = add i64 [[OFFSET_IDX]], 4
+; DEFAULT-NEXT: [[TMP21:%.*]] = add i64 [[TMP20]], 1
+; DEFAULT-NEXT: [[TMP22:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP21]]
+; DEFAULT-NEXT: store i32 1, ptr [[TMP22]], align 4
+; DEFAULT-NEXT: br label [[PRED_STORE_CONTINUE8]]
+; DEFAULT: pred.store.continue8:
+; DEFAULT-NEXT: [[TMP23:%.*]] = extractelement <4 x i1> [[TMP2]], i32 1
+; DEFAULT-NEXT: br i1 [[TMP23]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]]
+; DEFAULT: pred.store.if9:
+; DEFAULT-NEXT: [[TMP24:%.*]] = add i64 [[OFFSET_IDX]], 5
+; DEFAULT-NEXT: [[TMP25:%.*]] = add i64 [[TMP24]], 1
+; DEFAULT-NEXT: [[TMP26:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP25]]
+; DEFAULT-NEXT: store i32 1, ptr [[TMP26]], align 4
+; DEFAULT-NEXT: br label [[PRED_STORE_CONTINUE10]]
+; DEFAULT: pred.store.continue10:
+; DEFAULT-NEXT: [[TMP27:%.*]] = extractelement <4 x i1> [[TMP2]], i32 2
+; DEFAULT-NEXT: br i1 [[TMP27]], label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12:%.*]]
+; DEFAULT: pred.store.if11:
+; DEFAULT-NEXT: [[TMP28:%.*]] = add i64 [[OFFSET_IDX]], 6
+; DEFAULT-NEXT: [[TMP29:%.*]] = add i64 [[TMP28]], 1
+; DEFAULT-NEXT: [[TMP30:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP29]]
+; DEFAULT-NEXT: store i32 1, ptr [[TMP30]], align 4
+; DEFAULT-NEXT: br label [[PRED_STORE_CONTINUE12]]
+; DEFAULT: pred.store.continue12:
+; DEFAULT-NEXT: [[TMP31:%.*]] = extractelement <4 x i1> [[TMP2]], i32 3
+; DEFAULT-NEXT: br i1 [[TMP31]], label [[PRED_STORE_IF13:%.*]], label [[PRED_STORE_CONTINUE14]]
+; DEFAULT: pred.store.if13:
+; DEFAULT-NEXT: [[TMP32:%.*]] = add i64 [[OFFSET_IDX]], 7
+; DEFAULT-NEXT: [[TMP33:%.*]] = add i64 [[TMP32]], 1
+; DEFAULT-NEXT: [[TMP34:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP33]]
+; DEFAULT-NEXT: store i32 1, ptr [[TMP34]], align 4
+; DEFAULT-NEXT: br label [[PRED_STORE_CONTINUE14]]
+; DEFAULT: pred.store.continue14:
+; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; DEFAULT-NEXT: [[TMP35:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96
+; DEFAULT-NEXT: br i1 [[TMP35]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; DEFAULT: middle.block:
+; DEFAULT-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; DEFAULT: scalar.ph:
+; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 97, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
+; DEFAULT-NEXT: br label [[LOOP_HEADER:%.*]]
+; DEFAULT: loop.header:
+; DEFAULT-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
+; DEFAULT-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
+; DEFAULT-NEXT: [[CMP9:%.*]] = icmp eq i32 [[TMP0]], 0
+; DEFAULT-NEXT: br i1 [[CMP9]], label [[THEN:%.*]], label [[LOOP_LATCH]]
+; DEFAULT: then:
+; DEFAULT-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[DST]], i64 [[IV_NEXT]]
+; DEFAULT-NEXT: store i32 1, ptr [[GEP]], align 4
+; DEFAULT-NEXT: br label [[LOOP_LATCH]]
+; DEFAULT: loop.latch:
+; DEFAULT-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], 100
+; DEFAULT-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]]
+; DEFAULT: exit:
+; DEFAULT-NEXT: ret void
+;
+; PRED-LABEL: define void @invar_cond_gep_store(
+; PRED-SAME: ptr [[DST:%.*]], i32 [[TMP0:%.*]]) {
+; PRED-NEXT: entry:
+; PRED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; PRED: vector.ph:
+; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i64 0
+; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; PRED-NEXT: br label [[VECTOR_BODY:%.*]]
+; PRED: vector.body:
+; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ]
+; PRED-NEXT: [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]]
+; PRED-NEXT: [[TMP1:%.*]] = icmp eq <4 x i32> [[BROADCAST_SPLAT]], zeroinitializer
+; PRED-NEXT: [[TMP2:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0
+; PRED-NEXT: br i1 [[TMP2]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; PRED: pred.store.if:
+; PRED-NEXT: [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 0
+; PRED-NEXT: [[TMP4:%.*]] = add i64 [[TMP3]], 1
+; PRED-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP4]]
+; PRED-NEXT: store i32 1, ptr [[TMP5]], align 4
+; PRED-NEXT: br label [[PRED_STORE_CONTINUE]]
+; PRED: pred.store.continue:
+; PRED-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[TMP1]], i32 1
+; PRED-NEXT: br i1 [[TMP6]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2:%.*]]
+; PRED: pred.store.if1:
+; PRED-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 1
+; PRED-NEXT: [[TMP8:%.*]] = add i64 [[TMP7]], 1
+; PRED-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP8]]
+; PRED-NEXT: store i32 1, ptr [[TMP9]], align 4
+; PRED-NEXT: br label [[PRED_STORE_CONTINUE2]]
+; PRED: pred.store.continue2:
+; PRED-NEXT: [[TMP10:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2
+; PRED-NEXT: br i1 [[TMP10]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]]
+; PRED: pred.store.if3:
+; PRED-NEXT: [[TMP11:%.*]] = add i64 [[OFFSET_IDX]], 2
+; PRED-NEXT: [[TMP12:%.*]] = add i64 [[TMP11]], 1
+; PRED-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP12]]
+; PRED-NEXT: store i32 1, ptr [[TMP13]], align 4
+; PRED-NEXT: br label [[PRED_STORE_CONTINUE4]]
+; PRED: pred.store.continue4:
+; PRED-NEXT: [[TMP14:%.*]] = extractelement <4 x i1> [[TMP1]], i32 3
+; PRED-NEXT: br i1 [[TMP14]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6]]
+; PRED: pred.store.if5:
+; PRED-NEXT: [[TMP15:%.*]] = add i64 [[OFFSET_IDX]], 3
+; PRED-NEXT: [[TMP16:%.*]] = add i64 [[TMP15]], 1
+; PRED-NEXT: [[TMP17:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP16]]
+; PRED-NEXT: store i32 1, ptr [[TMP17]], align 4
+; PRED-NEXT: br label [[PRED_STORE_CONTINUE6]]
+; PRED: pred.store.continue6:
+; PRED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; PRED-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; PRED-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; PRED: middle.block:
+; PRED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; PRED: scalar.ph:
+; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 101, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
+; PRED-NEXT: br label [[LOOP_HEADER:%.*]]
+; PRED: loop.header:
+; PRED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
+; PRED-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
+; PRED-NEXT: [[CMP9:%.*]] = icmp eq i32 [[TMP0]], 0
+; PRED-NEXT: br i1 [[CMP9]], label [[THEN:%.*]], label [[LOOP_LATCH]]
+; PRED: then:
+; PRED-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[DST]], i64 [[IV_NEXT]]
+; PRED-NEXT: store i32 1, ptr [[GEP]], align 4
+; PRED-NEXT: br label [[LOOP_LATCH]]
+; PRED: loop.latch:
+; PRED-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], 100
+; PRED-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]]
+; PRED: exit:
+; PRED-NEXT: ret void
+;
+entry:
+ br label %loop.header
+
+loop.header:
+ %iv = phi i64 [ 1, %entry ], [ %iv.next, %loop.latch ]
+ %iv.next = add i64 %iv, 1
+ %cmp9 = icmp eq i32 %0, 0
+ br i1 %cmp9, label %then, label %loop.latch
+
+then:
+ %gep = getelementptr i32, ptr %dst, i64 %iv.next
+ store i32 1, ptr %gep, align 4
+ br label %loop.latch
+
+loop.latch:
+ %ec = icmp eq i64 %iv, 100
+ br i1 %ec, label %exit, label %loop.header
+
+exit:
+ ret void
+}
+
+declare double @llvm.fabs.f64(double) #0
+
+define void @loop_dependent_cond(ptr %src, ptr noalias %dst, i64 %N) {
+; DEFAULT-LABEL: define void @loop_dependent_cond(
+; DEFAULT-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]], i64 [[N:%.*]]) {
+; DEFAULT-NEXT: entry:
+; DEFAULT-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
+; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4
+; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; DEFAULT: vector.ph:
+; DEFAULT-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4
+; DEFAULT-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; DEFAULT-NEXT: br label [[VECTOR_BODY:%.*]]
+; DEFAULT: vector.body:
+; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE7:%.*]] ]
+; DEFAULT-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 0
+; DEFAULT-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2
+; DEFAULT-NEXT: [[TMP3:%.*]] = getelementptr double, ptr [[SRC]], i64 [[TMP1]]
+; DEFAULT-NEXT: [[TMP4:%.*]] = getelementptr double, ptr [[SRC]], i64 [[TMP2]]
+; DEFAULT-NEXT: [[TMP5:%.*]] = getelementptr double, ptr [[TMP3]], i32 0
+; DEFAULT-NEXT: [[TMP6:%.*]] = getelementptr double, ptr [[TMP3]], i32 2
+; DEFAULT-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP5]], align 8
+; DEFAULT-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x double>, ptr [[TMP6]], align 8
+; DEFAULT-NEXT: [[TMP7:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[WIDE_LOAD]])
+; DEFAULT-NEXT: [[TMP8:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[WIDE_LOAD1]])
+; DEFAULT-NEXT: [[TMP9:%.*]] = fcmp ogt <2 x double> [[TMP7]], <double 1.000000e+00, double 1.000000e+00>
+; DEFAULT-NEXT: [[TMP10:%.*]] = fcmp ogt <2 x double> [[TMP8]], <double 1.000000e+00, double 1.000000e+00>
+; DEFAULT-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP9]], i32 0
+; DEFAULT-NEXT: br i1 [[TMP11]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; DEFAULT: pred.store.if:
+; DEFAULT-NEXT: store i32 0, ptr [[DST]], align 4
+; DEFAULT-NEXT: br label [[PRED_STORE_CONTINUE]]
+; DEFAULT: pred.store.continue:
+; DEFAULT-NEXT: [[TMP12:%.*]] = extractelement <2 x i1> [[TMP9]], i32 1
+; DEFAULT-NEXT: br i1 [[TMP12]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3:%.*]]
+; DEFAULT: pred.store.if2:
+; DEFAULT-NEXT: store i32 0, ptr [[DST]], align 4
+; DEFAULT-NEXT: br label [[PRED_STORE_CONTINUE3]]
+; DEFAULT: pred.store.continue3:
+; DEFAULT-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP10]], i32 0
+; DEFAULT-NEXT: br i1 [[TMP13]], label [[PRED_STORE_IF4:%.*]], label [[PRED_STORE_CONTINUE5:%.*]]
+; DEFAULT: pred.store.if4:
+; DEFAULT-NEXT: store i32 0, ptr [[DST]], align 4
+; DEFAULT-NEXT: br label [[PRED_STORE_CONTINUE5]]
+; DEFAULT: pred.store.continue5:
+; DEFAULT-NEXT: [[TMP14:%.*]] = extractelement <2 x i1> [[TMP10]], i32 1
+; DEFAULT-NEXT: br i1 [[TMP14]], label [[PRED_STORE_IF6:%.*]], label [[PRED_STORE_CONTINUE7]]
+; DEFAULT: pred.store.if6:
+; DEFAULT-NEXT: store i32 0, ptr [[DST]], align 4
+; DEFAULT-NEXT: br label [[PRED_STORE_CONTINUE7]]
+; DEFAULT: pred.store.continue7:
+; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; DEFAULT-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; DEFAULT-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; DEFAULT: middle.block:
+; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; DEFAULT-NEXT: br i1 [[CMP_N]], label [[FOR_END123:%.*]], label [[SCALAR_PH]]
+; DEFAULT: scalar.ph:
+; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; DEFAULT-NEXT: br label [[FOR_BODY112:%.*]]
+; DEFAULT: loop.header:
+; DEFAULT-NEXT: [[IV175:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT176:%.*]], [[FOR_INC121:%.*]] ]
+; DEFAULT-NEXT: [[ARRAYIDX114:%.*]] = getelementptr double, ptr [[SRC]], i64 [[IV175]]
+; DEFAULT-NEXT: [[TMP16:%.*]] = load double, ptr [[ARRAYIDX114]], align 8
+; DEFAULT-NEXT: [[TMP17:%.*]] = tail call double @llvm.fabs.f64(double [[TMP16]])
+; DEFAULT-NEXT: [[CMP115:%.*]] = fcmp ogt double [[TMP17]], 1.000000e+00
+; DEFAULT-NEXT: br i1 [[CMP115]], label [[IF_THEN117:%.*]], label [[FOR_INC121]]
+; DEFAULT: then:
+; DEFAULT-NEXT: store i32 0, ptr [[DST]], align 4
+; DEFAULT-NEXT: br label [[FOR_INC121]]
+; DEFAULT: loop.latch:
+; DEFAULT-NEXT: [[IV_NEXT176]] = add i64 [[IV175]], 1
+; DEFAULT-NEXT: [[EXITCOND180_NOT:%.*]] = icmp eq i64 [[IV175]], [[N]]
+; DEFAULT-NEXT: br i1 [[EXITCOND180_NOT]], label [[FOR_END123]], label [[FOR_BODY112]], !llvm.loop [[LOOP5:![0-9]+]]
+; DEFAULT: exit:
+; DEFAULT-NEXT: ret void
+;
+; PRED-LABEL: define void @loop_dependent_cond(
+; PRED-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]], i64 [[N:%.*]]) {
+; PRED-NEXT: entry:
+; PRED-NEXT: br label [[FOR_BODY112:%.*]]
+; PRED: loop.header:
+; PRED-NEXT: [[IV175:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT176:%.*]], [[FOR_INC121:%.*]] ]
+; PRED-NEXT: [[ARRAYIDX114:%.*]] = getelementptr double, ptr [[SRC]], i64 [[IV175]]
+; PRED-NEXT: [[TMP0:%.*]] = load double, ptr [[ARRAYIDX114]], align 8
+; PRED-NEXT: [[TMP1:%.*]] = tail call double @llvm.fabs.f64(double [[TMP0]])
+; PRED-NEXT: [[CMP115:%.*]] = fcmp ogt double [[TMP1]], 1.000000e+00
+; PRED-NEXT: br i1 [[CMP115]], label [[IF_THEN117:%.*]], label [[FOR_INC121]]
+; PRED: then:
+; PRED-NEXT: store i32 0, ptr [[DST]], align 4
+; PRED-NEXT: br label [[FOR_INC121]]
+; PRED: loop.latch:
+; PRED-NEXT: [[IV_NEXT176]] = add i64 [[IV175]], 1
+; PRED-NEXT: [[EXITCOND180_NOT:%.*]] = icmp eq i64 [[IV175]], [[N]]
+; PRED-NEXT: br i1 [[EXITCOND180_NOT]], label [[FOR_END123:%.*]], label [[FOR_BODY112]]
+; PRED: exit:
+; PRED-NEXT: ret void
+;
+entry:
+ br label %loop.header
+
+loop.header:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]
+ %gep = getelementptr double, ptr %src, i64 %iv
+ %l = load double, ptr %gep, align 8
+ %abs = tail call double @llvm.fabs.f64(double %l)
+ %cmp = fcmp ogt double %abs, 1.000000e+00
+ br i1 %cmp, label %then, label %loop.latch
+
+then:
+ store i32 0, ptr %dst, align 4
+ br label %loop.latch
+
+loop.latch:
+ %iv.next = add i64 %iv, 1
+ %ec = icmp eq i64 %iv, %N
+ br i1 %ec, label %exit, label %loop.header
+
+exit:
+ ret void
+}
+
+define void @invar_cond_chain_1(ptr %I, ptr noalias %src, i1 %c) {
+; DEFAULT-LABEL: define void @invar_cond_chain_1(
+; DEFAULT-SAME: ptr [[I:%.*]], ptr noalias [[SRC:%.*]], i1 [[C:%.*]]) {
+; DEFAULT-NEXT: entry:
+; DEFAULT-NEXT: [[SRC2:%.*]] = ptrtoint ptr [[SRC]] to i64
+; DEFAULT-NEXT: [[I1:%.*]] = ptrtoint ptr [[I]] to i64
+; DEFAULT-NEXT: [[TMP29:%.*]] = sub i64 [[I1]], [[SRC2]]
+; DEFAULT-NEXT: [[TMP0:%.*]] = lshr i64 [[TMP29]], 2
+; DEFAULT-NEXT: [[TMP1:%.*]] = add nuw nsw i64 [[TMP0]], 1
+; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 8
+; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; DEFAULT: vector.ph:
+; DEFAULT-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], 8
+; DEFAULT-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]]
+; DEFAULT-NEXT: [[TMP2:%.*]] = mul i64 [[N_VEC]], 4
+; DEFAULT-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP2]]
+; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[C]], i64 0
+; DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
+; DEFAULT-NEXT: br label [[VECTOR_BODY:%.*]]
+; DEFAULT: vector.body:
+; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE17:%.*]] ]
+; DEFAULT-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 4
+; DEFAULT-NEXT: [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 0
+; DEFAULT-NEXT: [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 16
+; DEFAULT-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP3]]
+; DEFAULT-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP4]]
+; DEFAULT-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[NEXT_GEP]], i32 0
+; DEFAULT-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[NEXT_GEP]], i32 4
+; DEFAULT-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP5]], align 4
+; DEFAULT-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i32>, ptr [[TMP6]], align 4
+; DEFAULT-NEXT: [[TMP7:%.*]] = select <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i1> [[BROADCAST_SPLAT]], <4 x i1> zeroinitializer
+; DEFAULT-NEXT: [[TMP8:%.*]] = select <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i1> [[BROADCAST_SPLAT]], <4 x i1> zeroinitializer
+; DEFAULT-NEXT: [[TMP9:%.*]] = or <4 x i1> [[TMP7]], zeroinitializer
+; DEFAULT-NEXT: [[TMP10:%.*]] = or <4 x i1> [[TMP8]], zeroinitializer
+; DEFAULT-NEXT: [[TMP11:%.*]] = extractelement <4 x i1> [[TMP9]], i32 0
+; DEFAULT-NEXT: br i1 [[TMP11]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; DEFAULT: pred.store.if:
+; DEFAULT-NEXT: [[TMP12:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 0
+; DEFAULT-NEXT: store i32 [[TMP12]], ptr [[I]], align 4
+; DEFAULT-NEXT: br label [[PRED_STORE_CONTINUE]]
+; DEFAULT: pred.store.continue:
+; DEFAULT-NEXT: [[TMP13:%.*]] = extractelement <4 x i1> [[TMP9]], i32 1
+; DEFAULT-NEXT: br i1 [[TMP13]], label [[PRED_STORE_IF4:%.*]], label [[PRED_STORE_CONTINUE5:%.*]]
+; DEFAULT: pred.store.if5:
+; DEFAULT-NEXT: [[TMP14:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 1
+; DEFAULT-NEXT: store i32 [[TMP14]], ptr [[I]], align 4
+; DEFAULT-NEXT: br label [[PRED_STORE_CONTINUE5]]
+; DEFAULT: pred.store.continue6:
+; DEFAULT-NEXT: [[TMP15:%.*]] = extractelement <4 x i1> [[TMP9]], i32 2
+; DEFAULT-NEXT: br i1 [[TMP15]], label [[PRED_STORE_IF6:%.*]], label [[PRED_STORE_CONTINUE7:%.*]]
+; DEFAULT: pred.store.if7:
+; DEFAULT-NEXT: [[TMP16:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 2
+; DEFAULT-NEXT: store i32 [[TMP16]], ptr [[I]], align 4
+; DEFAULT-NEXT: br label [[PRED_STORE_CONTINUE7]]
+; DEFAULT: pred.store.continue8:
+; DEFAULT-NEXT: [[TMP17:%.*]] = extractelement <4 x i1> [[TMP9]], i32 3
+; DEFAULT-NEXT: br i1 [[TMP17]], label [[PRED_STORE_IF8:%.*]], label [[PRED_STORE_CONTINUE9:%.*]]
+; DEFAULT: pred.store.if9:
+; DEFAULT-NEXT: [[TMP18:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 3
+; DEFAULT-NEXT: store i32 [[TMP18]], ptr [[I]], align 4
+; DEFAULT-NEXT: br label [[PRED_STORE_CONTINUE9]]
+; DEFAULT: pred.store.continue10:
+; DEFAULT-NEXT: [[TMP19:%.*]] = extractelement <4 x i1> [[TMP10]], i32 0
+; DEFAULT-NEXT: br i1 [[TMP19]], label [[PRED_STORE_IF10:%.*]], label [[PRED_STORE_CONTINUE11:%.*]]
+; DEFAULT: pred.store.if11:
+; DEFAULT-NEXT: [[TMP20:%.*]] = extractelement <4 x i32> [[WIDE_LOAD3]], i32 0
+; DEFAULT-NEXT: store i32 [[TMP20]], ptr [[I]], align 4
+; DEFAULT-NEXT: br label [[PRED_STORE_CONTINUE11]]
+; DEFAULT: pred.store.continue12:
+; DEFAULT-NEXT: [[TMP21:%.*]] = extractelement <4 x i1> [[TMP10]], i32 1
+; DEFAULT-NEXT: br i1 [[TMP21]], label [[PRED_STORE_IF12:%.*]], label [[PRED_STORE_CONTINUE13:%.*]]
+; DEFAULT: pred.store.if13:
+; DEFAULT-NEXT: [[TMP22:%.*]] = extractelement <4 x i32> [[WIDE_LOAD3]], i32 1
+; DEFAULT-NEXT: store i32 [[TMP22]], ptr [[I]], align 4
+; DEFAULT-NEXT: br label [[PRED_STORE_CONTINUE13]]
+; DEFAULT: pred.store.continue14:
+; DEFAULT-NEXT: [[TMP23:%.*]] = extractelement <4 x i1> [[TMP10]], i32 2
+; DEFAULT-NEXT: br i1 [[TMP23]], label [[PRED_STORE_IF14:%.*]], label [[PRED_STORE_CONTINUE15:%.*]]
+; DEFAULT: pred.store.if15:
+; DEFAULT-NEXT: [[TMP24:%.*]] = extractelement <4 x i32> [[WIDE_LOAD3]], i32 2
+; DEFAULT-NEXT: store i32 [[TMP24]], ptr [[I]], align 4
+; DEFAULT-NEXT: br label [[PRED_STORE_CONTINUE15]]
+; DEFAULT: pred.store.continue16:
+; DEFAULT-NEXT: [[TMP25:%.*]] = extractelement <4 x i1> [[TMP10]], i32 3
+; DEFAULT-NEXT: br i1 [[TMP25]], label [[PRED_STORE_IF16:%.*]], label [[PRED_STORE_CONTINUE17]]
+; DEFAULT: pred.store.if17:
+; DEFAULT-NEXT: [[TMP26:%.*]] = extractelement <4 x i32> [[WIDE_LOAD3]], i32 3
+; DEFAULT-NEXT: store i32 [[TMP26]], ptr [[I]], align 4
+; DEFAULT-NEXT: br label [[PRED_STORE_CONTINUE17]]
+; DEFAULT: pred.store.continue18:
+; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; DEFAULT-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; DEFAULT-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; DEFAULT: middle.block:
+; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
+; DEFAULT-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP312_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; DEFAULT: scalar.ph:
+; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[SRC]], [[ENTRY:%.*]] ]
+; DEFAULT-NEXT: br label [[FOR_BODY313:%.*]]
+; DEFAULT: loop.header:
+; DEFAULT-NEXT: [[__BEGIN3_011973:%.*]] = phi ptr [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INCDEC_PTR329:%.*]], [[IF_END327:%.*]] ]
+; DEFAULT-NEXT: [[TMP28:%.*]] = load i32, ptr [[__BEGIN3_011973]], align 4
+; DEFAULT-NEXT: br i1 true, label [[IF_ELSE321:%.*]], label [[IF_THEN316:%.*]]
+; DEFAULT: if:
+; DEFAULT-NEXT: br label [[IF_END327_SINK_SPLIT:%.*]]
+; DEFAULT: else.1:
+; DEFAULT-NEXT: br i1 [[C]], label [[IF_THEN323:%.*]], label [[IF_END327]]
+; DEFAULT: else.2:
+; DEFAULT-NEXT: br label [[IF_END327_SINK_SPLIT]]
+; DEFAULT: split:
+; DEFAULT-NEXT: store i32 [[TMP28]], ptr [[I]], align 4
+; DEFAULT-NEXT: br label [[IF_END327]]
+; DEFAULT: loop.latch:
+; DEFAULT-NEXT: [[INCDEC_PTR329]] = getelementptr inbounds i8, ptr [[__BEGIN3_011973]], i64 4
+; DEFAULT-NEXT: [[CMP311_NOT:%.*]] = icmp eq ptr [[__BEGIN3_011973]], [[I]]
+; DEFAULT-NEXT: br i1 [[CMP311_NOT]], label [[FOR_COND_CLEANUP312_LOOPEXIT]], label [[FOR_BODY313]], !llvm.loop [[LOOP7:![0-9]+]]
+; DEFAULT: exit:
+; DEFAULT-NEXT: ret void
+;
+; PRED-LABEL: define void @invar_cond_chain_1(
+; PRED-SAME: ptr [[I:%.*]], ptr noalias [[SRC:%.*]], i1 [[C:%.*]]) {
+; PRED-NEXT: entry:
+; PRED-NEXT: br label [[FOR_BODY313:%.*]]
+; PRED: loop.header:
+; PRED-NEXT: [[__BEGIN3_011973:%.*]] = phi ptr [ [[SRC]], [[ENTRY:%.*]] ], [ [[INCDEC_PTR329:%.*]], [[IF_END327:%.*]] ]
+; PRED-NEXT: [[TMP0:%.*]] = load i32, ptr [[__BEGIN3_011973]], align 4
+; PRED-NEXT: br i1 true, label [[IF_ELSE321:%.*]], label [[IF_THEN316:%.*]]
+; PRED: if:
+; PRED-NEXT: br label [[IF_END327_SINK_SPLIT:%.*]]
+; PRED: else.1:
+; PRED-NEXT: br i1 [[C]], label [[IF_THEN323:%.*]], label [[IF_END327]]
+; PRED: else.2:
+; PRED-NEXT: br label [[IF_END327_SINK_SPLIT]]
+; PRED: split:
+; PRED-NEXT: store i32 [[TMP0]], ptr [[I]], align 4
+; PRED-NEXT: br label [[IF_END327]]
+; PRED: loop.latch:
+; PRED-NEXT: [[INCDEC_PTR329]] = getelementptr inbounds i8, ptr [[__BEGIN3_011973]], i64 4
+; PRED-NEXT: [[CMP311_NOT:%.*]] = icmp eq ptr [[__BEGIN3_011973]], [[I]]
+; PRED-NEXT: br i1 [[CMP311_NOT]], label [[FOR_COND_CLEANUP312_LOOPEXIT:%.*]], label [[FOR_BODY313]]
+; PRED: exit:
+; PRED-NEXT: ret void
+;
+entry:
+ br label %loop.header
+
+loop.header:
+ %ptr.iv = phi ptr [ %src, %entry ], [ %ptr.iv.next, %loop.latch ]
+ %l = load i32, ptr %ptr.iv, align 4
+ br i1 true, label %else.1, label %if
+
+if:
+ br label %split
+
+else.1:
+ br i1 %c, label %else.2, label %loop.latch
+
+else.2:
+ br label %split
+
+split:
+ store i32 %l, ptr %I, align 4
+ br label %loop.latch
+
+loop.latch:
+ %ptr.iv.next = getelementptr inbounds i8, ptr %ptr.iv, i64 4
+ %ec = icmp eq ptr %ptr.iv, %I
+ br i1 %ec, label %exit, label %loop.header
+
+exit:
+ ret void
+}
+
+define void @invar_cond_chain_2(ptr %I, ptr noalias %src, ptr noalias %dst, i32 %a) {
+; DEFAULT-LABEL: define void @invar_cond_chain_2(
+; DEFAULT-SAME: ptr [[I:%.*]], ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]], i32 [[A:%.*]]) {
+; DEFAULT-NEXT: entry:
+; DEFAULT-NEXT: [[SRC2:%.*]] = ptrtoint ptr [[SRC]] to i64
+; DEFAULT-NEXT: [[I1:%.*]] = ptrtoint ptr [[I]] to i64
+; DEFAULT-NEXT: [[TMP0:%.*]] = sub i64 [[I1]], [[SRC2]]
+; DEFAULT-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 2
+; DEFAULT-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
+; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 8
+; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; DEFAULT: vector.ph:
+; DEFAULT-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 8
+; DEFAULT-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
+; DEFAULT-NEXT: [[TMP3:%.*]] = mul i64 [[N_VEC]], 4
+; DEFAULT-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP3]]
+; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i64 0
+; DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; DEFAULT-NEXT: br label [[VECTOR_BODY:%.*]]
+; DEFAULT: vector.body:
+; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE15:%.*]] ]
+; DEFAULT-NEXT: [[TMP4:%.*]] = icmp sgt <4 x i32> [[BROADCAST_SPLAT]], zeroinitializer
+; DEFAULT-NEXT: [[TMP5:%.*]] = icmp sgt <4 x i32> [[BROADCAST_SPLAT]], zeroinitializer
+; DEFAULT-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[TMP4]], <i1 true, i1 true, i1 true, i1 true>
+; DEFAULT-NEXT: [[TMP7:%.*]] = xor <4 x i1> [[TMP5]], <i1 true, i1 true, i1 true, i1 true>
+; DEFAULT-NEXT: [[TMP8:%.*]] = extractelement <4 x i1> [[TMP6]], i32 0
+; DEFAULT-NEXT: br i1 [[TMP8]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; DEFAULT: pred.store.if:
+; DEFAULT-NEXT: store i32 0, ptr [[DST]], align 4
+; DEFAULT-NEXT: br label [[PRED_STORE_CONTINUE]]
+; DEFAULT: pred.store.continue:
+; DEFAULT-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[TMP6]], i32 1
+; DEFAULT-NEXT: br i1 [[TMP9]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3:%.*]]
+; DEFAULT: pred.store.if3:
+; DEFAULT-NEXT: store i32 0, ptr [[DST]], align 4
+; DEFAULT-NEXT: br label [[PRED_STORE_CONTINUE3]]
+; DEFAULT: pred.store.continue4:
+; DEFAULT-NEXT: [[TMP10:%.*]] = extractelement <4 x i1> [[TMP6]], i32 2
+; DEFAULT-NEXT: br i1 [[TMP10]], label [[PRED_STORE_IF4:%.*]], label [[PRED_STORE_CONTINUE5:%.*]]
+; DEFAULT: pred.store.if5:
+; DEFAULT-NEXT: store i32 0, ptr [[DST]], align 4
+; DEFAULT-NEXT: br label [[PRED_STORE_CONTINUE5]]
+; DEFAULT: pred.store.continue6:
+; DEFAULT-NEXT: [[TMP11:%.*]] = extractelement <4 x i1> [[TMP6]], i32 3
+; DEFAULT-NEXT: br i1 [[TMP11]], label [[PRED_STORE_IF6:%.*]], label [[PRED_STORE_CONTINUE7:%.*]]
+; DEFAULT: pred.store.if7:
+; DEFAULT-NEXT: store i32 0, ptr [[DST]], align 4
+; DEFAULT-NEXT: br label [[PRED_STORE_CONTINUE7]]
+; DEFAULT: pred.store.continue8:
+; DEFAULT-NEXT: [[TMP12:%.*]] = extractelement <4 x i1> [[TMP7]], i32 0
+; DEFAULT-NEXT: br i1 [[TMP12]], label [[PRED_STORE_IF8:%.*]], label [[PRED_STORE_CONTINUE9:%.*]]
+; DEFAULT: pred.store.if9:
+; DEFAULT-NEXT: store i32 0, ptr [[DST]], align 4
+; DEFAULT-NEXT: br label [[PRED_STORE_CONTINUE9]]
+; DEFAULT: pred.store.continue10:
+; DEFAULT-NEXT: [[TMP13:%.*]] = extractelement <4 x i1> [[TMP7]], i32 1
+; DEFAULT-NEXT: br i1 [[TMP13]], label [[PRED_STORE_IF10:%.*]], label [[PRED_STORE_CONTINUE11:%.*]]
+; DEFAULT: pred.store.if11:
+; DEFAULT-NEXT: store i32 0, ptr [[DST]], align 4
+; DEFAULT-NEXT: br label [[PRED_STORE_CONTINUE11]]
+; DEFAULT: pred.store.continue12:
+; DEFAULT-NEXT: [[TMP14:%.*]] = extractelement <4 x i1> [[TMP7]], i32 2
+; DEFAULT-NEXT: br i1 [[TMP14]], label [[PRED_STORE_IF12:%.*]], label [[PRED_STORE_CONTINUE13:%.*]]
+; DEFAULT: pred.store.if13:
+; DEFAULT-NEXT: store i32 0, ptr [[DST]], align 4
+; DEFAULT-NEXT: br label [[PRED_STORE_CONTINUE13]]
+; DEFAULT: pred.store.continue14:
+; DEFAULT-NEXT: [[TMP15:%.*]] = extractelement <4 x i1> [[TMP7]], i32 3
+; DEFAULT-NEXT: br i1 [[TMP15]], label [[PRED_STORE_IF14:%.*]], label [[PRED_STORE_CONTINUE15]]
+; DEFAULT: pred.store.if15:
+; DEFAULT-NEXT: store i32 0, ptr [[DST]], align 4
+; DEFAULT-NEXT: br label [[PRED_STORE_CONTINUE15]]
+; DEFAULT: pred.store.continue16:
+; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; DEFAULT-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; DEFAULT-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; DEFAULT: middle.block:
+; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
+; DEFAULT-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP312_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; DEFAULT: scalar.ph:
+; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[SRC]], [[ENTRY:%.*]] ]
+; DEFAULT-NEXT: br label [[FOR_BODY313:%.*]]
+; DEFAULT: loop.header:
+; DEFAULT-NEXT: [[__BEGIN3_01197:%.*]] = phi ptr [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INCDEC_PTR329:%.*]], [[IF_END327:%.*]] ]
+; DEFAULT-NEXT: [[CMP315_NOT:%.*]] = icmp sgt i32 [[A]], 0
+; DEFAULT-NEXT: br i1 [[CMP315_NOT]], label [[IF_END327]], label [[IF_THEN316:%.*]]
+; DEFAULT: if:
+; DEFAULT-NEXT: br label [[IF_END327_SINK_SPLIT:%.*]]
+; DEFAULT: else:
+; DEFAULT-NEXT: store i32 0, ptr [[DST]], align 4
+; DEFAULT-NEXT: br label [[IF_END327]]
+; DEFAULT: loop.latch:
+; DEFAULT-NEXT: [[INCDEC_PTR329]] = getelementptr inbounds i8, ptr [[__BEGIN3_01197]], i64 4
+; DEFAULT-NEXT: [[CMP311_NOT:%.*]] = icmp eq ptr [[__BEGIN3_01197]], [[I]]
+; DEFAULT-NEXT: br i1 [[CMP311_NOT]], label [[FOR_COND_CLEANUP312_LOOPEXIT]], label [[FOR_BODY313]], !llvm.loop [[LOOP9:![0-9]+]]
+; DEFAULT: exit:
+; DEFAULT-NEXT: ret void
+;
+; PRED-LABEL: define void @invar_cond_chain_2(
+; PRED-SAME: ptr [[I:%.*]], ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]], i32 [[A:%.*]]) {
+; PRED-NEXT: entry:
+; PRED-NEXT: [[SRC2:%.*]] = ptrtoint ptr [[SRC]] to i64
+; PRED-NEXT: [[I1:%.*]] = ptrtoint ptr [[I]] to i64
+; PRED-NEXT: [[TMP0:%.*]] = sub i64 [[I1]], [[SRC2]]
+; PRED-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 2
+; PRED-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
+; PRED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; PRED: vector.ph:
+; PRED-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP2]], 3
+; PRED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4
+; PRED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; PRED-NEXT: [[TMP3:%.*]] = mul i64 [[N_VEC]], 4
+; PRED-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP3]]
+; PRED-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TMP2]], 1
+; PRED-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; PRED-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT2]], <4 x i64> poison, <4 x i32> zeroinitializer
+; PRED-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i64 0
+; PRED-NEXT: [[BROADCAST_SPLAT5:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT4]], <4 x i32> poison, <4 x i32> zeroinitializer
+; PRED-NEXT: br label [[VECTOR_BODY:%.*]]
+; PRED: vector.body:
+; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE11:%.*]] ]
+; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i64 0
+; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; PRED-NEXT: [[VEC_IV:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
+; PRED-NEXT: [[TMP4:%.*]] = icmp ule <4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT3]]
+; PRED-NEXT: [[TMP5:%.*]] = icmp sgt <4 x i32> [[BROADCAST_SPLAT5]], zeroinitializer
+; PRED-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[TMP5]], <i1 true, i1 true, i1 true, i1 true>
+; PRED-NEXT: [[TMP7:%.*]] = select <4 x i1> [[TMP4]], <4 x i1> [[TMP6]], <4 x i1> zeroinitializer
+; PRED-NEXT: [[TMP8:%.*]] = extractelement <4 x i1> [[TMP7]], i32 0
+; PRED-NEXT: br i1 [[TMP8]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; PRED: pred.store.if:
+; PRED-NEXT: store i32 0, ptr [[DST]], align 4
+; PRED-NEXT: br label [[PRED_STORE_CONTINUE]]
+; PRED: pred.store.continue:
+; PRED-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[TMP7]], i32 1
+; PRED-NEXT: br i1 [[TMP9]], label [[PRED_STORE_IF6:%.*]], label [[PRED_STORE_CONTINUE7:%.*]]
+; PRED: pred.store.if7:
+; PRED-NEXT: store i32 0, ptr [[DST]], align 4
+; PRED-NEXT: br label [[PRED_STORE_CONTINUE7]]
+; PRED: pred.store.continue8:
+; PRED-NEXT: [[TMP10:%.*]] = extractelement <4 x i1> [[TMP7]], i32 2
+; PRED-NEXT: br i1 [[TMP10]], label [[PRED_STORE_IF8:%.*]], label [[PRED_STORE_CONTINUE9:%.*]]
+; PRED: pred.store.if9:
+; PRED-NEXT: store i32 0, ptr [[DST]], align 4
+; PRED-NEXT: br label [[PRED_STORE_CONTINUE9]]
+; PRED: pred.store.continue10:
+; PRED-NEXT: [[TMP11:%.*]] = extractelement <4 x i1> [[TMP7]], i32 3
+; PRED-NEXT: br i1 [[TMP11]], label [[PRED_STORE_IF10:%.*]], label [[PRED_STORE_CONTINUE11]]
+; PRED: pred.store.if11:
+; PRED-NEXT: store i32 0, ptr [[DST]], align 4
+; PRED-NEXT: br label [[PRED_STORE_CONTINUE11]]
+; PRED: pred.store.continue12:
+; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; PRED-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; PRED-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; PRED: middle.block:
+; PRED-NEXT: br i1 true, label [[FOR_COND_CLEANUP312_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; PRED: scalar.ph:
+; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[SRC]], [[ENTRY:%.*]] ]
+; PRED-NEXT: br label [[FOR_BODY313:%.*]]
+; PRED: loop.header:
+; PRED-NEXT: [[__BEGIN3_01197:%.*]] = phi ptr [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INCDEC_PTR329:%.*]], [[IF_END327:%.*]] ]
+; PRED-NEXT: [[CMP315_NOT:%.*]] = icmp sgt i32 [[A]], 0
+; PRED-NEXT: br i1 [[CMP315_NOT]], label [[IF_END327]], label [[IF_THEN316:%.*]]
+; PRED: if:
+; PRED-NEXT: br label [[IF_END327_SINK_SPLIT:%.*]]
+; PRED: else:
+; PRED-NEXT: store i32 0, ptr [[DST]], align 4
+; PRED-NEXT: br label [[IF_END327]]
+; PRED: loop.latch:
+; PRED-NEXT: [[INCDEC_PTR329]] = getelementptr inbounds i8, ptr [[__BEGIN3_01197]], i64 4
+; PRED-NEXT: [[CMP311_NOT:%.*]] = icmp eq ptr [[__BEGIN3_01197]], [[I]]
+; PRED-NEXT: br i1 [[CMP311_NOT]], label [[FOR_COND_CLEANUP312_LOOPEXIT]], label [[FOR_BODY313]], !llvm.loop [[LOOP5:![0-9]+]]
+; PRED: exit:
+; PRED-NEXT: ret void
+;
+entry:
+ br label %loop.header
+
+loop.header:
+ %ptr.iv = phi ptr [ %src, %entry ], [ %ptr.iv.next, %loop.latch ]
+ %cmp315.not = icmp sgt i32 %a, 0
+ br i1 %cmp315.not, label %loop.latch, label %if
+
+if:
+ br label %else
+
+else:
+ store i32 0, ptr %dst, align 4
+ br label %loop.latch
+
+loop.latch:
+ %ptr.iv.next = getelementptr inbounds i8, ptr %ptr.iv, i64 4
+ %cmp311.not = icmp eq ptr %ptr.iv, %I
+ br i1 %cmp311.not, label %exit, label %loop.header
+
+exit:
+ ret void
+}
+
+define void @latch_branch_cost(ptr %dst) {
+; DEFAULT-LABEL: define void @latch_branch_cost(
+; DEFAULT-SAME: ptr [[DST:%.*]]) {
+; DEFAULT-NEXT: iter.check:
+; DEFAULT-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; DEFAULT: vector.main.loop.iter.check:
+; DEFAULT-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH1:%.*]]
+; DEFAULT: vector.ph:
+; DEFAULT-NEXT: br label [[VECTOR_BODY:%.*]]
+; DEFAULT: vector.body:
+; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH1]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DEFAULT-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; DEFAULT-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 16
+; DEFAULT-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP0]]
+; DEFAULT-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP1]]
+; DEFAULT-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[TMP2]], i32 0
+; DEFAULT-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP2]], i32 16
+; DEFAULT-NEXT: store <16 x i8> zeroinitializer, ptr [[TMP6]], align 1
+; DEFAULT-NEXT: store <16 x i8> zeroinitializer, ptr [[TMP5]], align 1
+; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
+; DEFAULT-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96
+; DEFAULT-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; DEFAULT: middle.block:
+; DEFAULT-NEXT: br i1 false, label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
+; DEFAULT: vec.epilog.iter.check:
+; DEFAULT-NEXT: br i1 false, label [[SCALAR_PH]], label [[VEC_EPILOG_PH]]
+; DEFAULT: vec.epilog.ph:
+; DEFAULT-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 96, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH]] ]
+; DEFAULT-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
+; DEFAULT: vec.epilog.vector.body:
+; DEFAULT-NEXT: [[INDEX1:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; DEFAULT-NEXT: [[TMP7:%.*]] = add i64 [[INDEX1]], 0
+; DEFAULT-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP7]]
+; DEFAULT-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP8]], i32 0
+; DEFAULT-NEXT: store <4 x i8> zeroinitializer, ptr [[TMP9]], align 1
+; DEFAULT-NEXT: [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], 4
+; DEFAULT-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT2]], 100
+; DEFAULT-NEXT: br i1 [[TMP10]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; DEFAULT: vec.epilog.middle.block:
+; DEFAULT-NEXT: br i1 true, label [[FOR_END]], label [[SCALAR_PH]]
+; DEFAULT: vec.epilog.scalar.ph:
+; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 96, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
+; DEFAULT-NEXT: br label [[FOR_BODY:%.*]]
+; DEFAULT: loop:
+; DEFAULT-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; DEFAULT-NEXT: [[ARRAYIDX:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDVARS_IV]]
+; DEFAULT-NEXT: store i8 0, ptr [[ARRAYIDX]], align 1
+; DEFAULT-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
+; DEFAULT-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 100
+; DEFAULT-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; DEFAULT: exit:
+; DEFAULT-NEXT: ret void
+;
+; PRED-LABEL: define void @latch_branch_cost(
+; PRED-SAME: ptr [[DST:%.*]]) {
+; PRED-NEXT: entry:
+; PRED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; PRED: vector.ph:
+; PRED-NEXT: br label [[VECTOR_BODY:%.*]]
+; PRED: vector.body:
+; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ]
+; PRED-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE6]] ]
+; PRED-NEXT: [[TMP0:%.*]] = icmp ule <8 x i64> [[VEC_IND]], <i64 99, i64 99, i64 99, i64 99, i64 99, i64 99, i64 99, i64 99>
+; PRED-NEXT: [[TMP1:%.*]] = extractelement <8 x i1> [[TMP0]], i32 0
+; PRED-NEXT: br i1 [[TMP1]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; PRED: pred.store.if:
+; PRED-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0
+; PRED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP2]]
+; PRED-NEXT: store i8 0, ptr [[TMP3]], align 1
+; PRED-NEXT: br label [[PRED_STORE_CONTINUE]]
+; PRED: pred.store.continue:
+; PRED-NEXT: [[TMP4:%.*]] = extractelement <8 x i1> [[TMP0]], i32 1
+; PRED-NEXT: br i1 [[TMP4]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2:%.*]]
+; PRED: pred.store.if1:
+; PRED-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 1
+; PRED-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP5]]
+; PRED-NEXT: store i8 0, ptr [[TMP6]], align 1
+; PRED-NEXT: br label [[PRED_STORE_CONTINUE2]]
+; PRED: pred.store.continue2:
+; PRED-NEXT: [[TMP7:%.*]] = extractelement <8 x i1> [[TMP0]], i32 2
+; PRED-NEXT: br i1 [[TMP7]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]]
+; PRED: pred.store.if3:
+; PRED-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 2
+; PRED-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP8]]
+; PRED-NEXT: store i8 0, ptr [[TMP9]], align 1
+; PRED-NEXT: br label [[PRED_STORE_CONTINUE4]]
+; PRED: pred.store.continue4:
+; PRED-NEXT: [[TMP10:%.*]] = extractelement <8 x i1> [[TMP0]], i32 3
+; PRED-NEXT: br i1 [[TMP10]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE7:%.*]]
+; PRED: pred.store.if5:
+; PRED-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 3
+; PRED-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP11]]
+; PRED-NEXT: store i8 0, ptr [[TMP12]], align 1
+; PRED-NEXT: br label [[PRED_STORE_CONTINUE7]]
+; PRED: pred.store.continue6:
+; PRED-NEXT: [[TMP13:%.*]] = extractelement <8 x i1> [[TMP0]], i32 4
+; PRED-NEXT: br i1 [[TMP13]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8:%.*]]
+; PRED: pred.store.if7:
+; PRED-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 4
+; PRED-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP14]]
+; PRED-NEXT: store i8 0, ptr [[TMP15]], align 1
+; PRED-NEXT: br label [[PRED_STORE_CONTINUE8]]
+; PRED: pred.store.continue8:
+; PRED-NEXT: [[TMP16:%.*]] = extractelement <8 x i1> [[TMP0]], i32 5
+; PRED-NEXT: br i1 [[TMP16]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]]
+; PRED: pred.store.if9:
+; PRED-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 5
+; PRED-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP17]]
+; PRED-NEXT: store i8 0, ptr [[TMP18]], align 1
+; PRED-NEXT: br label [[PRED_STORE_CONTINUE10]]
+; PRED: pred.store.continue10:
+; PRED-NEXT: [[TMP19:%.*]] = extractelement <8 x i1> [[TMP0]], i32 6
+; PRED-NEXT: br i1 [[TMP19]], label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12:%.*]]
+; PRED: pred.store.if11:
+; PRED-NEXT: [[TMP20:%.*]] = add i64 [[INDEX]], 6
+; PRED-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP20]]
+; PRED-NEXT: store i8 0, ptr [[TMP21]], align 1
+; PRED-NEXT: br label [[PRED_STORE_CONTINUE12]]
+; PRED: pred.store.continue12:
+; PRED-NEXT: [[TMP22:%.*]] = extractelement <8 x i1> [[TMP0]], i32 7
+; PRED-NEXT: br i1 [[TMP22]], label [[PRED_STORE_IF13:%.*]], label [[PRED_STORE_CONTINUE6]]
+; PRED: pred.store.if13:
+; PRED-NEXT: [[TMP23:%.*]] = add i64 [[INDEX]], 7
+; PRED-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP23]]
+; PRED-NEXT: store i8 0, ptr [[TMP24]], align 1
+; PRED-NEXT: br label [[PRED_STORE_CONTINUE6]]
+; PRED: pred.store.continue14:
+; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8
+; PRED-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], <i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8>
+; PRED-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], 104
+; PRED-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; PRED: middle.block:
+; PRED-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; PRED: scalar.ph:
+; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 104, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; PRED-NEXT: br label [[FOR_BODY:%.*]]
+; PRED: loop:
+; PRED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; PRED-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[IV]]
+; PRED-NEXT: store i8 0, ptr [[GEP]], align 1
+; PRED-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[IV]], 1
+; PRED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 100
+; PRED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; PRED: exit:
+; PRED-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %gep = getelementptr i8, ptr %dst, i64 %iv
+ store i8 0, ptr %gep, align 1
+ %iv.next = add i64 %iv, 1
+ %ec = icmp eq i64 %iv.next, 100
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+;.
+; DEFAULT: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; DEFAULT: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; DEFAULT: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; DEFAULT: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; DEFAULT: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; DEFAULT: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+; DEFAULT: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+; DEFAULT: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
+; DEFAULT: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
+; DEFAULT: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]}
+; DEFAULT: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]}
+; DEFAULT: [[LOOP11]] = distinct !{[[LOOP11]], [[META1]], [[META2]]}
+; DEFAULT: [[LOOP12]] = distinct !{[[LOOP12]], [[META2]], [[META1]]}
+;.
+; PRED: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; PRED: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; PRED: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; PRED: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; PRED: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; PRED: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+; PRED: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+; PRED: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/AArch64/first-order-recurrence.ll
index 2be525a2abc0..2cc0aa2ffca5 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/first-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/first-order-recurrence.ll
@@ -22,8 +22,8 @@ define i32 @PR33613(ptr %b, double %j, i32 %d) #0 {
; CHECK-VF4UF2-LABEL: @PR33613
; CHECK-VF4UF2: vector.body
; CHECK-VF4UF2: %[[VEC_RECUR:.*]] = phi <vscale x 4 x double> [ {{.*}}, %vector.ph ], [ {{.*}}, %vector.body ]
-; CHECK-VF4UF2: %[[SPLICE1:.*]] = call <vscale x 4 x double> @llvm.experimental.vector.splice.nxv4f64(<vscale x 4 x double> %[[VEC_RECUR]], <vscale x 4 x double> {{.*}}, i32 -1)
-; CHECK-VF4UF2-NEXT: %[[SPLICE2:.*]] = call <vscale x 4 x double> @llvm.experimental.vector.splice.nxv4f64(<vscale x 4 x double> %{{.*}}, <vscale x 4 x double> %{{.*}}, i32 -1)
+; CHECK-VF4UF2: %[[SPLICE1:.*]] = call <vscale x 4 x double> @llvm.vector.splice.nxv4f64(<vscale x 4 x double> %[[VEC_RECUR]], <vscale x 4 x double> {{.*}}, i32 -1)
+; CHECK-VF4UF2-NEXT: %[[SPLICE2:.*]] = call <vscale x 4 x double> @llvm.vector.splice.nxv4f64(<vscale x 4 x double> %{{.*}}, <vscale x 4 x double> %{{.*}}, i32 -1)
; CHECK-VF4UF2-NOT: insertelement <vscale x 4 x double>
; CHECK-VF4UF2: middle.block
entry:
@@ -71,7 +71,7 @@ define void @PR34711(ptr %a, ptr %b, ptr %c, i64 %n) #0 {
; CHECK-VF4UF1: vector.body
; CHECK-VF4UF1: %[[VEC_RECUR:.*]] = phi <vscale x 4 x i16> [ %vector.recur.init, %vector.ph ], [ %[[MGATHER:.*]], %vector.body ]
; CHECK-VF4UF1: %[[MGATHER]] = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16.nxv4p0(<vscale x 4 x ptr> {{.*}}, i32 2, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i16> poison)
-; CHECK-VF4UF1-NEXT: %[[SPLICE:.*]] = call <vscale x 4 x i16> @llvm.experimental.vector.splice.nxv4i16(<vscale x 4 x i16> %[[VEC_RECUR]], <vscale x 4 x i16> %[[MGATHER]], i32 -1)
+; CHECK-VF4UF1-NEXT: %[[SPLICE:.*]] = call <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16> %[[VEC_RECUR]], <vscale x 4 x i16> %[[MGATHER]], i32 -1)
; CHECK-VF4UF1-NEXT: %[[SXT1:.*]] = sext <vscale x 4 x i16> %[[SPLICE]] to <vscale x 4 x i32>
; CHECK-VF4UF1-NEXT: %[[SXT2:.*]] = sext <vscale x 4 x i16> %[[MGATHER]] to <vscale x 4 x i32>
; CHECK-VF4UF1-NEXT: mul nsw <vscale x 4 x i32> %[[SXT2]], %[[SXT1]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
new file mode 100644
index 000000000000..c85ae6dba73e
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
@@ -0,0 +1,969 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -p loop-vectorize -S %s | FileCheck --check-prefixes=DEFAULT %s
+; RUN: opt -p loop-vectorize -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -S %s | FileCheck --check-prefixes=PRED %s
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-apple-macosx14.0.0"
+
+define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
+; DEFAULT-LABEL: define void @iv_casts(
+; DEFAULT-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i32 [[X:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; DEFAULT-NEXT: iter.check:
+; DEFAULT-NEXT: [[SRC2:%.*]] = ptrtoint ptr [[SRC]] to i64
+; DEFAULT-NEXT: [[DST1:%.*]] = ptrtoint ptr [[DST]] to i64
+; DEFAULT-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
+; DEFAULT-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; DEFAULT: vector.memcheck:
+; DEFAULT-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 8
+; DEFAULT-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; DEFAULT-NEXT: [[TMP6:%.*]] = sub i64 [[DST1]], [[SRC2]]
+; DEFAULT-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]]
+; DEFAULT-NEXT: br i1 [[DIFF_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
+; DEFAULT: vector.main.loop.iter.check:
+; DEFAULT-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 16
+; DEFAULT-NEXT: [[MIN_ITERS_CHECK3:%.*]] = icmp ult i64 [[TMP0]], [[TMP8]]
+; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK3]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+; DEFAULT: vector.ph:
+; DEFAULT-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 16
+; DEFAULT-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP10]]
+; DEFAULT-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; DEFAULT-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 16
+; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 [[X]], i64 0
+; DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
+; DEFAULT-NEXT: [[TMP13:%.*]] = trunc <vscale x 8 x i32> [[BROADCAST_SPLAT]] to <vscale x 8 x i16>
+; DEFAULT-NEXT: br label [[VECTOR_BODY:%.*]]
+; DEFAULT: vector.body:
+; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DEFAULT-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 0
+; DEFAULT-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 8
+; DEFAULT-NEXT: [[TMP17:%.*]] = add i64 [[TMP16]], 0
+; DEFAULT-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 1
+; DEFAULT-NEXT: [[TMP19:%.*]] = add i64 [[INDEX]], [[TMP18]]
+; DEFAULT-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP14]]
+; DEFAULT-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP19]]
+; DEFAULT-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[TMP20]], i32 0
+; DEFAULT-NEXT: [[TMP23:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT: [[TMP24:%.*]] = mul i64 [[TMP23]], 8
+; DEFAULT-NEXT: [[TMP25:%.*]] = getelementptr i8, ptr [[TMP20]], i64 [[TMP24]]
+; DEFAULT-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP22]], align 1
+; DEFAULT-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x i8>, ptr [[TMP25]], align 1
+; DEFAULT-NEXT: [[TMP26:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i16>
+; DEFAULT-NEXT: [[TMP27:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD4]] to <vscale x 8 x i16>
+; DEFAULT-NEXT: [[TMP28:%.*]] = mul <vscale x 8 x i16> [[TMP26]], [[TMP13]]
+; DEFAULT-NEXT: [[TMP29:%.*]] = mul <vscale x 8 x i16> [[TMP27]], [[TMP13]]
+; DEFAULT-NEXT: [[TMP30:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i16>
+; DEFAULT-NEXT: [[TMP31:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD4]] to <vscale x 8 x i16>
+; DEFAULT-NEXT: [[TMP32:%.*]] = or <vscale x 8 x i16> [[TMP28]], [[TMP30]]
+; DEFAULT-NEXT: [[TMP33:%.*]] = or <vscale x 8 x i16> [[TMP29]], [[TMP31]]
+; DEFAULT-NEXT: [[TMP34:%.*]] = lshr <vscale x 8 x i16> [[TMP32]], trunc (<vscale x 8 x i32> shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 1, i64 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer) to <vscale x 8 x i16>)
+; DEFAULT-NEXT: [[TMP35:%.*]] = lshr <vscale x 8 x i16> [[TMP33]], trunc (<vscale x 8 x i32> shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 1, i64 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer) to <vscale x 8 x i16>)
+; DEFAULT-NEXT: [[TMP36:%.*]] = trunc <vscale x 8 x i16> [[TMP34]] to <vscale x 8 x i8>
+; DEFAULT-NEXT: [[TMP37:%.*]] = trunc <vscale x 8 x i16> [[TMP35]] to <vscale x 8 x i8>
+; DEFAULT-NEXT: [[TMP38:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP14]]
+; DEFAULT-NEXT: [[TMP39:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP19]]
+; DEFAULT-NEXT: [[TMP40:%.*]] = getelementptr i8, ptr [[TMP38]], i32 0
+; DEFAULT-NEXT: [[TMP41:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT: [[TMP42:%.*]] = mul i64 [[TMP41]], 8
+; DEFAULT-NEXT: [[TMP43:%.*]] = getelementptr i8, ptr [[TMP38]], i64 [[TMP42]]
+; DEFAULT-NEXT: store <vscale x 8 x i8> [[TMP36]], ptr [[TMP40]], align 1
+; DEFAULT-NEXT: store <vscale x 8 x i8> [[TMP37]], ptr [[TMP43]], align 1
+; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]]
+; DEFAULT-NEXT: [[TMP44:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; DEFAULT-NEXT: br i1 [[TMP44]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; DEFAULT: middle.block:
+; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; DEFAULT-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
+; DEFAULT: vec.epilog.iter.check:
+; DEFAULT-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]]
+; DEFAULT-NEXT: [[TMP45:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT: [[TMP46:%.*]] = mul i64 [[TMP45]], 4
+; DEFAULT-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], [[TMP46]]
+; DEFAULT-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
+; DEFAULT: vec.epilog.ph:
+; DEFAULT-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; DEFAULT-NEXT: [[TMP47:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT: [[TMP48:%.*]] = mul i64 [[TMP47]], 4
+; DEFAULT-NEXT: [[N_MOD_VF5:%.*]] = urem i64 [[TMP0]], [[TMP48]]
+; DEFAULT-NEXT: [[N_VEC6:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF5]]
+; DEFAULT-NEXT: [[TMP49:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT: [[TMP50:%.*]] = mul i64 [[TMP49]], 4
+; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[X]], i64 0
+; DEFAULT-NEXT: [[BROADCAST_SPLAT9:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT8]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; DEFAULT-NEXT: [[TMP51:%.*]] = trunc <vscale x 4 x i32> [[BROADCAST_SPLAT9]] to <vscale x 4 x i16>
+; DEFAULT-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
+; DEFAULT: vec.epilog.vector.body:
+; DEFAULT-NEXT: [[INDEX10:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT12:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; DEFAULT-NEXT: [[TMP52:%.*]] = add i64 [[INDEX10]], 0
+; DEFAULT-NEXT: [[TMP53:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP52]]
+; DEFAULT-NEXT: [[TMP54:%.*]] = getelementptr i8, ptr [[TMP53]], i32 0
+; DEFAULT-NEXT: [[WIDE_LOAD11:%.*]] = load <vscale x 4 x i8>, ptr [[TMP54]], align 1
+; DEFAULT-NEXT: [[TMP55:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD11]] to <vscale x 4 x i16>
+; DEFAULT-NEXT: [[TMP56:%.*]] = mul <vscale x 4 x i16> [[TMP55]], [[TMP51]]
+; DEFAULT-NEXT: [[TMP57:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD11]] to <vscale x 4 x i16>
+; DEFAULT-NEXT: [[TMP58:%.*]] = or <vscale x 4 x i16> [[TMP56]], [[TMP57]]
+; DEFAULT-NEXT: [[TMP59:%.*]] = lshr <vscale x 4 x i16> [[TMP58]], trunc (<vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer) to <vscale x 4 x i16>)
+; DEFAULT-NEXT: [[TMP60:%.*]] = trunc <vscale x 4 x i16> [[TMP59]] to <vscale x 4 x i8>
+; DEFAULT-NEXT: [[TMP61:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP52]]
+; DEFAULT-NEXT: [[TMP62:%.*]] = getelementptr i8, ptr [[TMP61]], i32 0
+; DEFAULT-NEXT: store <vscale x 4 x i8> [[TMP60]], ptr [[TMP62]], align 1
+; DEFAULT-NEXT: [[INDEX_NEXT12]] = add nuw i64 [[INDEX10]], [[TMP50]]
+; DEFAULT-NEXT: [[TMP63:%.*]] = icmp eq i64 [[INDEX_NEXT12]], [[N_VEC6]]
+; DEFAULT-NEXT: br i1 [[TMP63]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; DEFAULT: vec.epilog.middle.block:
+; DEFAULT-NEXT: [[CMP_N7:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC6]]
+; DEFAULT-NEXT: br i1 [[CMP_N7]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
+; DEFAULT: vec.epilog.scalar.ph:
+; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC6]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
+; DEFAULT-NEXT: br label [[LOOP:%.*]]
+; DEFAULT: loop:
+; DEFAULT-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; DEFAULT-NEXT: [[GEP_SRC:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[IV]]
+; DEFAULT-NEXT: [[L:%.*]] = load i8, ptr [[GEP_SRC]], align 1
+; DEFAULT-NEXT: [[L_EXT:%.*]] = zext i8 [[L]] to i32
+; DEFAULT-NEXT: [[MUL16_US:%.*]] = mul i32 [[L_EXT]], [[X]]
+; DEFAULT-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
+; DEFAULT-NEXT: [[CONV25_US:%.*]] = zext i8 [[L]] to i32
+; DEFAULT-NEXT: [[ADD34_US:%.*]] = or i32 [[MUL16_US]], [[CONV25_US]]
+; DEFAULT-NEXT: [[SHR35_US:%.*]] = lshr i32 [[ADD34_US]], 1
+; DEFAULT-NEXT: [[CONV36_US:%.*]] = trunc i32 [[SHR35_US]] to i8
+; DEFAULT-NEXT: [[GEP_DST:%.*]] = getelementptr i8, ptr [[DST]], i64 [[IV]]
+; DEFAULT-NEXT: store i8 [[CONV36_US]], ptr [[GEP_DST]], align 1
+; DEFAULT-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], [[N]]
+; DEFAULT-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP4:![0-9]+]]
+; DEFAULT: exit:
+; DEFAULT-NEXT: ret void
+;
+; PRED-LABEL: define void @iv_casts(
+; PRED-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i32 [[X:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; PRED-NEXT: entry:
+; PRED-NEXT: [[SRC2:%.*]] = ptrtoint ptr [[SRC]] to i64
+; PRED-NEXT: [[DST1:%.*]] = ptrtoint ptr [[DST]] to i64
+; PRED-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
+; PRED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; PRED: vector.memcheck:
+; PRED-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; PRED-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 8
+; PRED-NEXT: [[TMP3:%.*]] = sub i64 [[DST1]], [[SRC2]]
+; PRED-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP3]], [[TMP2]]
+; PRED-NEXT: br i1 [[DIFF_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; PRED: vector.ph:
+; PRED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; PRED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; PRED-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; PRED-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 8
+; PRED-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1
+; PRED-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP0]], [[TMP8]]
+; PRED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
+; PRED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; PRED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; PRED-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 8
+; PRED-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
+; PRED-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 8
+; PRED-NEXT: [[TMP13:%.*]] = sub i64 [[TMP0]], [[TMP12]]
+; PRED-NEXT: [[TMP14:%.*]] = icmp ugt i64 [[TMP0]], [[TMP12]]
+; PRED-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i64 [[TMP13]], i64 0
+; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[TMP0]])
+; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 [[X]], i64 0
+; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
+; PRED-NEXT: [[TMP16:%.*]] = trunc <vscale x 8 x i32> [[BROADCAST_SPLAT]] to <vscale x 8 x i16>
+; PRED-NEXT: br label [[VECTOR_BODY:%.*]]
+; PRED: vector.body:
+; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; PRED-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 0
+; PRED-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP17]]
+; PRED-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[TMP18]], i32 0
+; PRED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr [[TMP19]], i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x i8> poison)
+; PRED-NEXT: [[TMP20:%.*]] = zext <vscale x 8 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 8 x i16>
+; PRED-NEXT: [[TMP21:%.*]] = mul <vscale x 8 x i16> [[TMP20]], [[TMP16]]
+; PRED-NEXT: [[TMP22:%.*]] = zext <vscale x 8 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 8 x i16>
+; PRED-NEXT: [[TMP23:%.*]] = or <vscale x 8 x i16> [[TMP21]], [[TMP22]]
+; PRED-NEXT: [[TMP24:%.*]] = lshr <vscale x 8 x i16> [[TMP23]], trunc (<vscale x 8 x i32> shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 1, i64 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer) to <vscale x 8 x i16>)
+; PRED-NEXT: [[TMP25:%.*]] = trunc <vscale x 8 x i16> [[TMP24]] to <vscale x 8 x i8>
+; PRED-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP17]]
+; PRED-NEXT: [[TMP27:%.*]] = getelementptr i8, ptr [[TMP26]], i32 0
+; PRED-NEXT: call void @llvm.masked.store.nxv8i8.p0(<vscale x 8 x i8> [[TMP25]], ptr [[TMP27]], i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]])
+; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]]
+; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP15]])
+; PRED-NEXT: [[TMP28:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
+; PRED-NEXT: [[TMP29:%.*]] = extractelement <vscale x 8 x i1> [[TMP28]], i32 0
+; PRED-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; PRED: middle.block:
+; PRED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; PRED: scalar.ph:
+; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; PRED-NEXT: br label [[LOOP:%.*]]
+; PRED: loop:
+; PRED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; PRED-NEXT: [[GEP_SRC:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[IV]]
+; PRED-NEXT: [[L:%.*]] = load i8, ptr [[GEP_SRC]], align 1
+; PRED-NEXT: [[L_EXT:%.*]] = zext i8 [[L]] to i32
+; PRED-NEXT: [[MUL16_US:%.*]] = mul i32 [[L_EXT]], [[X]]
+; PRED-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
+; PRED-NEXT: [[CONV25_US:%.*]] = zext i8 [[L]] to i32
+; PRED-NEXT: [[ADD34_US:%.*]] = or i32 [[MUL16_US]], [[CONV25_US]]
+; PRED-NEXT: [[SHR35_US:%.*]] = lshr i32 [[ADD34_US]], 1
+; PRED-NEXT: [[CONV36_US:%.*]] = trunc i32 [[SHR35_US]] to i8
+; PRED-NEXT: [[GEP_DST:%.*]] = getelementptr i8, ptr [[DST]], i64 [[IV]]
+; PRED-NEXT: store i8 [[CONV36_US]], ptr [[GEP_DST]], align 1
+; PRED-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], [[N]]
+; PRED-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; PRED: exit:
+; PRED-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %gep.src = getelementptr i8, ptr %src, i64 %iv
+ %l = load i8, ptr %gep.src, align 1
+ %l.ext = zext i8 %l to i32
+ %mul = mul i32 %l.ext, %x
+ %iv.next = add i64 %iv, 1
+ %l.ext.2 = zext i8 %l to i32
+ %or = or i32 %mul, %l.ext.2
+ %lshr = lshr i32 %or, 1
+ %trunc = trunc i32 %lshr to i8
+ %gep.dst = getelementptr i8, ptr %dst, i64 %iv
+ store i8 %trunc, ptr %gep.dst, align 1
+ %ec = icmp eq i64 %iv, %N
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+define void @iv_trunc(i32 %x, ptr %dst, i64 %N) #0 {
+; DEFAULT-LABEL: define void @iv_trunc(
+; DEFAULT-SAME: i32 [[X:%.*]], ptr [[DST:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; DEFAULT-NEXT: entry:
+; DEFAULT-NEXT: [[MUL_X:%.*]] = add i32 [[X]], 1
+; DEFAULT-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
+; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 2
+; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; DEFAULT: vector.scevcheck:
+; DEFAULT-NEXT: [[TMP1:%.*]] = sub i32 -1, [[X]]
+; DEFAULT-NEXT: [[TMP2:%.*]] = icmp slt i32 [[MUL_X]], 0
+; DEFAULT-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i32 [[TMP1]], i32 [[MUL_X]]
+; DEFAULT-NEXT: [[TMP4:%.*]] = trunc i64 [[N]] to i32
+; DEFAULT-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[TMP3]], i32 [[TMP4]])
+; DEFAULT-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0
+; DEFAULT-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1
+; DEFAULT-NEXT: [[TMP5:%.*]] = sub i32 0, [[MUL_RESULT]]
+; DEFAULT-NEXT: [[TMP6:%.*]] = icmp ugt i32 [[TMP5]], 0
+; DEFAULT-NEXT: [[TMP7:%.*]] = select i1 [[TMP2]], i1 [[TMP6]], i1 false
+; DEFAULT-NEXT: [[TMP8:%.*]] = or i1 [[TMP7]], [[MUL_OVERFLOW]]
+; DEFAULT-NEXT: [[TMP9:%.*]] = icmp ugt i64 [[N]], 4294967295
+; DEFAULT-NEXT: [[TMP10:%.*]] = icmp ne i32 [[MUL_X]], 0
+; DEFAULT-NEXT: [[TMP11:%.*]] = and i1 [[TMP9]], [[TMP10]]
+; DEFAULT-NEXT: [[TMP12:%.*]] = or i1 [[TMP8]], [[TMP11]]
+; DEFAULT-NEXT: br i1 [[TMP12]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; DEFAULT: vector.ph:
+; DEFAULT-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 2
+; DEFAULT-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; DEFAULT-NEXT: br label [[VECTOR_BODY:%.*]]
+; DEFAULT: vector.body:
+; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DEFAULT-NEXT: [[TMP13:%.*]] = trunc i64 [[INDEX]] to i32
+; DEFAULT-NEXT: [[TMP14:%.*]] = add i32 [[TMP13]], 0
+; DEFAULT-NEXT: [[TMP15:%.*]] = add i32 [[TMP13]], 1
+; DEFAULT-NEXT: [[TMP16:%.*]] = mul i32 [[MUL_X]], [[TMP14]]
+; DEFAULT-NEXT: [[TMP17:%.*]] = mul i32 [[MUL_X]], [[TMP15]]
+; DEFAULT-NEXT: [[TMP18:%.*]] = zext i32 [[TMP16]] to i64
+; DEFAULT-NEXT: [[TMP19:%.*]] = zext i32 [[TMP17]] to i64
+; DEFAULT-NEXT: [[TMP20:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP18]]
+; DEFAULT-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP19]]
+; DEFAULT-NEXT: store i32 1, ptr [[TMP20]], align 4
+; DEFAULT-NEXT: store i32 1, ptr [[TMP21]], align 4
+; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; DEFAULT-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; DEFAULT-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; DEFAULT: middle.block:
+; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; DEFAULT-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; DEFAULT: scalar.ph:
+; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
+; DEFAULT-NEXT: br label [[FOR_BODY:%.*]]
+; DEFAULT: for.body:
+; DEFAULT-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; DEFAULT-NEXT: [[TRUNC_IV:%.*]] = trunc i64 [[IV]] to i32
+; DEFAULT-NEXT: [[ADD_I:%.*]] = mul i32 [[MUL_X]], [[TRUNC_IV]]
+; DEFAULT-NEXT: [[IV_MUL:%.*]] = zext i32 [[ADD_I]] to i64
+; DEFAULT-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[DST]], i64 [[IV_MUL]]
+; DEFAULT-NEXT: store i32 1, ptr [[GEP]], align 4
+; DEFAULT-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
+; DEFAULT-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], [[N]]
+; DEFAULT-NEXT: br i1 [[EC]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; DEFAULT: exit:
+; DEFAULT-NEXT: ret void
+;
+; PRED-LABEL: define void @iv_trunc(
+; PRED-SAME: i32 [[X:%.*]], ptr [[DST:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; PRED-NEXT: entry:
+; PRED-NEXT: [[MUL_X:%.*]] = add i32 [[X]], 1
+; PRED-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
+; PRED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; PRED: vector.scevcheck:
+; PRED-NEXT: [[TMP1:%.*]] = sub i32 -1, [[X]]
+; PRED-NEXT: [[TMP2:%.*]] = icmp slt i32 [[MUL_X]], 0
+; PRED-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i32 [[TMP1]], i32 [[MUL_X]]
+; PRED-NEXT: [[TMP4:%.*]] = trunc i64 [[N]] to i32
+; PRED-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[TMP3]], i32 [[TMP4]])
+; PRED-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0
+; PRED-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1
+; PRED-NEXT: [[TMP5:%.*]] = sub i32 0, [[MUL_RESULT]]
+; PRED-NEXT: [[TMP6:%.*]] = icmp ugt i32 [[TMP5]], 0
+; PRED-NEXT: [[TMP7:%.*]] = select i1 [[TMP2]], i1 [[TMP6]], i1 false
+; PRED-NEXT: [[TMP8:%.*]] = or i1 [[TMP7]], [[MUL_OVERFLOW]]
+; PRED-NEXT: [[TMP9:%.*]] = icmp ugt i64 [[N]], 4294967295
+; PRED-NEXT: [[TMP10:%.*]] = icmp ne i32 [[MUL_X]], 0
+; PRED-NEXT: [[TMP11:%.*]] = and i1 [[TMP9]], [[TMP10]]
+; PRED-NEXT: [[TMP12:%.*]] = or i1 [[TMP8]], [[TMP11]]
+; PRED-NEXT: br i1 [[TMP12]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; PRED: vector.ph:
+; PRED-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP0]], 1
+; PRED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 2
+; PRED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; PRED-NEXT: [[TMP13:%.*]] = sub i64 [[TMP0]], 2
+; PRED-NEXT: [[TMP14:%.*]] = icmp ugt i64 [[TMP0]], 2
+; PRED-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i64 [[TMP13]], i64 0
+; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 0, i64 [[TMP0]])
+; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[MUL_X]], i64 0
+; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
+; PRED-NEXT: br label [[VECTOR_BODY:%.*]]
+; PRED: vector.body:
+; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ]
+; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[PRED_STORE_CONTINUE2]] ]
+; PRED-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE2]] ]
+; PRED-NEXT: [[TMP16:%.*]] = mul <2 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]]
+; PRED-NEXT: [[TMP17:%.*]] = zext <2 x i32> [[TMP16]] to <2 x i64>
+; PRED-NEXT: [[TMP18:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK]], i32 0
+; PRED-NEXT: br i1 [[TMP18]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; PRED: pred.store.if:
+; PRED-NEXT: [[TMP19:%.*]] = extractelement <2 x i64> [[TMP17]], i32 0
+; PRED-NEXT: [[TMP20:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP19]]
+; PRED-NEXT: store i32 1, ptr [[TMP20]], align 4
+; PRED-NEXT: br label [[PRED_STORE_CONTINUE]]
+; PRED: pred.store.continue:
+; PRED-NEXT: [[TMP21:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK]], i32 1
+; PRED-NEXT: br i1 [[TMP21]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]]
+; PRED: pred.store.if1:
+; PRED-NEXT: [[TMP22:%.*]] = extractelement <2 x i64> [[TMP17]], i32 1
+; PRED-NEXT: [[TMP23:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP22]]
+; PRED-NEXT: store i32 1, ptr [[TMP23]], align 4
+; PRED-NEXT: br label [[PRED_STORE_CONTINUE2]]
+; PRED: pred.store.continue2:
+; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2
+; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 [[INDEX]], i64 [[TMP15]])
+; PRED-NEXT: [[TMP24:%.*]] = xor <2 x i1> [[ACTIVE_LANE_MASK_NEXT]], <i1 true, i1 true>
+; PRED-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
+; PRED-NEXT: [[TMP25:%.*]] = extractelement <2 x i1> [[TMP24]], i32 0
+; PRED-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; PRED: middle.block:
+; PRED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; PRED: scalar.ph:
+; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
+; PRED-NEXT: br label [[FOR_BODY:%.*]]
+; PRED: for.body:
+; PRED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; PRED-NEXT: [[TRUNC_IV:%.*]] = trunc i64 [[IV]] to i32
+; PRED-NEXT: [[ADD_I:%.*]] = mul i32 [[MUL_X]], [[TRUNC_IV]]
+; PRED-NEXT: [[IV_MUL:%.*]] = zext i32 [[ADD_I]] to i64
+; PRED-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[DST]], i64 [[IV_MUL]]
+; PRED-NEXT: store i32 1, ptr [[GEP]], align 4
+; PRED-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
+; PRED-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], [[N]]
+; PRED-NEXT: br i1 [[EC]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; PRED: exit:
+; PRED-NEXT: ret void
+;
+entry:
+ %mul.x = add i32 %x, 1
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %trunc.iv = trunc i64 %iv to i32
+ %add.i = mul i32 %mul.x, %trunc.iv
+ %iv.mul = zext i32 %add.i to i64
+ %gep = getelementptr i32, ptr %dst, i64 %iv.mul
+ store i32 1, ptr %gep, align 4
+ %iv.next = add i64 %iv, 1
+ %ec = icmp eq i64 %iv, %N
+ br i1 %ec, label %exit, label %for.body
+
+exit:
+ ret void
+}
+
+define void @trunc_ivs_and_store(i32 %x, ptr %dst, i64 %N) #0 {
+; DEFAULT-LABEL: define void @trunc_ivs_and_store(
+; DEFAULT-SAME: i32 [[X:%.*]], ptr [[DST:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; DEFAULT-NEXT: entry:
+; DEFAULT-NEXT: [[MUL:%.*]] = mul i32 [[X]], [[X]]
+; DEFAULT-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
+; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 2
+; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; DEFAULT: vector.scevcheck:
+; DEFAULT-NEXT: [[TMP1:%.*]] = mul i32 [[X]], [[X]]
+; DEFAULT-NEXT: [[TMP2:%.*]] = sub i32 0, [[TMP1]]
+; DEFAULT-NEXT: [[TMP3:%.*]] = icmp slt i32 [[MUL]], 0
+; DEFAULT-NEXT: [[TMP4:%.*]] = select i1 [[TMP3]], i32 [[TMP2]], i32 [[MUL]]
+; DEFAULT-NEXT: [[TMP5:%.*]] = trunc i64 [[N]] to i32
+; DEFAULT-NEXT: [[MUL1:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[TMP4]], i32 [[TMP5]])
+; DEFAULT-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL1]], 0
+; DEFAULT-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL1]], 1
+; DEFAULT-NEXT: [[TMP6:%.*]] = sub i32 0, [[MUL_RESULT]]
+; DEFAULT-NEXT: [[TMP7:%.*]] = icmp ugt i32 [[TMP6]], 0
+; DEFAULT-NEXT: [[TMP8:%.*]] = select i1 [[TMP3]], i1 [[TMP7]], i1 false
+; DEFAULT-NEXT: [[TMP9:%.*]] = or i1 [[TMP8]], [[MUL_OVERFLOW]]
+; DEFAULT-NEXT: [[TMP10:%.*]] = icmp ugt i64 [[N]], 4294967295
+; DEFAULT-NEXT: [[TMP11:%.*]] = icmp ne i32 [[MUL]], 0
+; DEFAULT-NEXT: [[TMP12:%.*]] = and i1 [[TMP10]], [[TMP11]]
+; DEFAULT-NEXT: [[TMP13:%.*]] = or i1 [[TMP9]], [[TMP12]]
+; DEFAULT-NEXT: br i1 [[TMP13]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; DEFAULT: vector.ph:
+; DEFAULT-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 2
+; DEFAULT-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; DEFAULT-NEXT: [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i32
+; DEFAULT-NEXT: br label [[VECTOR_BODY:%.*]]
+; DEFAULT: vector.body:
+; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DEFAULT-NEXT: [[OFFSET_IDX:%.*]] = trunc i64 [[INDEX]] to i32
+; DEFAULT-NEXT: [[TMP14:%.*]] = add i32 [[OFFSET_IDX]], 0
+; DEFAULT-NEXT: [[TMP15:%.*]] = add i32 [[OFFSET_IDX]], 1
+; DEFAULT-NEXT: [[TMP16:%.*]] = trunc i64 [[INDEX]] to i32
+; DEFAULT-NEXT: [[TMP17:%.*]] = add i32 [[TMP16]], 0
+; DEFAULT-NEXT: [[TMP18:%.*]] = add i32 [[TMP16]], 1
+; DEFAULT-NEXT: [[TMP19:%.*]] = mul i32 [[MUL]], [[TMP17]]
+; DEFAULT-NEXT: [[TMP20:%.*]] = mul i32 [[MUL]], [[TMP18]]
+; DEFAULT-NEXT: [[TMP21:%.*]] = zext i32 [[TMP19]] to i64
+; DEFAULT-NEXT: [[TMP22:%.*]] = zext i32 [[TMP20]] to i64
+; DEFAULT-NEXT: [[TMP23:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP21]]
+; DEFAULT-NEXT: [[TMP24:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP22]]
+; DEFAULT-NEXT: store i32 [[TMP14]], ptr [[TMP23]], align 4
+; DEFAULT-NEXT: store i32 [[TMP15]], ptr [[TMP24]], align 4
+; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; DEFAULT-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; DEFAULT-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; DEFAULT: middle.block:
+; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; DEFAULT-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; DEFAULT: scalar.ph:
+; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
+; DEFAULT-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
+; DEFAULT-NEXT: br label [[LOOP:%.*]]
+; DEFAULT: loop:
+; DEFAULT-NEXT: [[IV_1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], [[LOOP]] ]
+; DEFAULT-NEXT: [[IV_2:%.*]] = phi i32 [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ], [ [[IV_2_NEXT:%.*]], [[LOOP]] ]
+; DEFAULT-NEXT: [[IV_1_TRUNC:%.*]] = trunc i64 [[IV_1]] to i32
+; DEFAULT-NEXT: [[IV_1_MUL:%.*]] = mul i32 [[MUL]], [[IV_1_TRUNC]]
+; DEFAULT-NEXT: [[IV_2_NEXT]] = add i32 [[IV_2]], 1
+; DEFAULT-NEXT: [[MUL_EXT:%.*]] = zext i32 [[IV_1_MUL]] to i64
+; DEFAULT-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[DST]], i64 [[MUL_EXT]]
+; DEFAULT-NEXT: store i32 [[IV_2]], ptr [[GEP]], align 4
+; DEFAULT-NEXT: [[IV_1_NEXT]] = add i64 [[IV_1]], 1
+; DEFAULT-NEXT: [[EXITCOND_3_NOT:%.*]] = icmp eq i64 [[IV_1]], [[N]]
+; DEFAULT-NEXT: br i1 [[EXITCOND_3_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP8:![0-9]+]]
+; DEFAULT: exit:
+; DEFAULT-NEXT: ret void
+;
+; PRED-LABEL: define void @trunc_ivs_and_store(
+; PRED-SAME: i32 [[X:%.*]], ptr [[DST:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; PRED-NEXT: entry:
+; PRED-NEXT: [[MUL:%.*]] = mul i32 [[X]], [[X]]
+; PRED-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
+; PRED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; PRED: vector.scevcheck:
+; PRED-NEXT: [[TMP1:%.*]] = mul i32 [[X]], [[X]]
+; PRED-NEXT: [[TMP2:%.*]] = sub i32 0, [[TMP1]]
+; PRED-NEXT: [[TMP3:%.*]] = icmp slt i32 [[MUL]], 0
+; PRED-NEXT: [[TMP4:%.*]] = select i1 [[TMP3]], i32 [[TMP2]], i32 [[MUL]]
+; PRED-NEXT: [[TMP5:%.*]] = trunc i64 [[N]] to i32
+; PRED-NEXT: [[MUL1:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[TMP4]], i32 [[TMP5]])
+; PRED-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL1]], 0
+; PRED-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL1]], 1
+; PRED-NEXT: [[TMP6:%.*]] = sub i32 0, [[MUL_RESULT]]
+; PRED-NEXT: [[TMP7:%.*]] = icmp ugt i32 [[TMP6]], 0
+; PRED-NEXT: [[TMP8:%.*]] = select i1 [[TMP3]], i1 [[TMP7]], i1 false
+; PRED-NEXT: [[TMP9:%.*]] = or i1 [[TMP8]], [[MUL_OVERFLOW]]
+; PRED-NEXT: [[TMP10:%.*]] = icmp ugt i64 [[N]], 4294967295
+; PRED-NEXT: [[TMP11:%.*]] = icmp ne i32 [[MUL]], 0
+; PRED-NEXT: [[TMP12:%.*]] = and i1 [[TMP10]], [[TMP11]]
+; PRED-NEXT: [[TMP13:%.*]] = or i1 [[TMP9]], [[TMP12]]
+; PRED-NEXT: br i1 [[TMP13]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; PRED: vector.ph:
+; PRED-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP0]], 3
+; PRED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4
+; PRED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; PRED-NEXT: [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i32
+; PRED-NEXT: [[TMP14:%.*]] = sub i64 [[TMP0]], 4
+; PRED-NEXT: [[TMP15:%.*]] = icmp ugt i64 [[TMP0]], 4
+; PRED-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i64 [[TMP14]], i64 0
+; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 0, i64 [[TMP0]])
+; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[MUL]], i64 0
+; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; PRED-NEXT: br label [[VECTOR_BODY:%.*]]
+; PRED: vector.body:
+; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE8:%.*]] ]
+; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[PRED_STORE_CONTINUE8]] ]
+; PRED-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE8]] ]
+; PRED-NEXT: [[OFFSET_IDX:%.*]] = trunc i64 [[INDEX]] to i32
+; PRED-NEXT: [[TMP17:%.*]] = mul <4 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]]
+; PRED-NEXT: [[TMP18:%.*]] = zext <4 x i32> [[TMP17]] to <4 x i64>
+; PRED-NEXT: [[TMP19:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 0
+; PRED-NEXT: br i1 [[TMP19]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; PRED: pred.store.if:
+; PRED-NEXT: [[TMP20:%.*]] = extractelement <4 x i64> [[TMP18]], i32 0
+; PRED-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP20]]
+; PRED-NEXT: [[TMP22:%.*]] = add i32 [[OFFSET_IDX]], 0
+; PRED-NEXT: store i32 [[TMP22]], ptr [[TMP21]], align 4
+; PRED-NEXT: br label [[PRED_STORE_CONTINUE]]
+; PRED: pred.store.continue:
+; PRED-NEXT: [[TMP23:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 1
+; PRED-NEXT: br i1 [[TMP23]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]]
+; PRED: pred.store.if3:
+; PRED-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP18]], i32 1
+; PRED-NEXT: [[TMP25:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP24]]
+; PRED-NEXT: [[TMP26:%.*]] = add i32 [[OFFSET_IDX]], 1
+; PRED-NEXT: store i32 [[TMP26]], ptr [[TMP25]], align 4
+; PRED-NEXT: br label [[PRED_STORE_CONTINUE4]]
+; PRED: pred.store.continue4:
+; PRED-NEXT: [[TMP27:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 2
+; PRED-NEXT: br i1 [[TMP27]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6:%.*]]
+; PRED: pred.store.if5:
+; PRED-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP18]], i32 2
+; PRED-NEXT: [[TMP29:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP28]]
+; PRED-NEXT: [[TMP30:%.*]] = add i32 [[OFFSET_IDX]], 2
+; PRED-NEXT: store i32 [[TMP30]], ptr [[TMP29]], align 4
+; PRED-NEXT: br label [[PRED_STORE_CONTINUE6]]
+; PRED: pred.store.continue6:
+; PRED-NEXT: [[TMP31:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 3
+; PRED-NEXT: br i1 [[TMP31]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8]]
+; PRED: pred.store.if7:
+; PRED-NEXT: [[TMP32:%.*]] = extractelement <4 x i64> [[TMP18]], i32 3
+; PRED-NEXT: [[TMP33:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP32]]
+; PRED-NEXT: [[TMP34:%.*]] = add i32 [[OFFSET_IDX]], 3
+; PRED-NEXT: store i32 [[TMP34]], ptr [[TMP33]], align 4
+; PRED-NEXT: br label [[PRED_STORE_CONTINUE8]]
+; PRED: pred.store.continue8:
+; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX]], i64 [[TMP16]])
+; PRED-NEXT: [[TMP35:%.*]] = xor <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], <i1 true, i1 true, i1 true, i1 true>
+; PRED-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
+; PRED-NEXT: [[TMP36:%.*]] = extractelement <4 x i1> [[TMP35]], i32 0
+; PRED-NEXT: br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; PRED: middle.block:
+; PRED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; PRED: scalar.ph:
+; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
+; PRED-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
+; PRED-NEXT: br label [[LOOP:%.*]]
+; PRED: loop:
+; PRED-NEXT: [[IV_1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], [[LOOP]] ]
+; PRED-NEXT: [[IV_2:%.*]] = phi i32 [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ], [ [[IV_2_NEXT:%.*]], [[LOOP]] ]
+; PRED-NEXT: [[IV_1_TRUNC:%.*]] = trunc i64 [[IV_1]] to i32
+; PRED-NEXT: [[IV_1_MUL:%.*]] = mul i32 [[MUL]], [[IV_1_TRUNC]]
+; PRED-NEXT: [[IV_2_NEXT]] = add i32 [[IV_2]], 1
+; PRED-NEXT: [[MUL_EXT:%.*]] = zext i32 [[IV_1_MUL]] to i64
+; PRED-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[DST]], i64 [[MUL_EXT]]
+; PRED-NEXT: store i32 [[IV_2]], ptr [[GEP]], align 4
+; PRED-NEXT: [[IV_1_NEXT]] = add i64 [[IV_1]], 1
+; PRED-NEXT: [[EXITCOND_3_NOT:%.*]] = icmp eq i64 [[IV_1]], [[N]]
+; PRED-NEXT: br i1 [[EXITCOND_3_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
+; PRED: exit:
+; PRED-NEXT: ret void
+;
+entry:
+ %mul = mul i32 %x, %x
+ br label %loop
+
+loop:
+ %iv.1 = phi i64 [ 0, %entry ], [ %iv.1.next, %loop ]
+ %iv.2 = phi i32 [ 0, %entry ], [ %iv.2.next, %loop ]
+ %iv.1.trunc = trunc i64 %iv.1 to i32
+ %iv.1.mul = mul i32 %mul, %iv.1.trunc
+ %iv.2.next = add i32 %iv.2, 1
+ %mul.ext = zext i32 %iv.1.mul to i64
+ %gep = getelementptr i32, ptr %dst, i64 %mul.ext
+ store i32 %iv.2, ptr %gep, align 4
+ %iv.1.next = add i64 %iv.1, 1
+ %exitcond.3.not = icmp eq i64 %iv.1, %N
+ br i1 %exitcond.3.not, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+define void @ivs_trunc_and_ext(i32 %x, ptr %dst, i64 %N) #0 {
+; DEFAULT-LABEL: define void @ivs_trunc_and_ext(
+; DEFAULT-SAME: i32 [[X:%.*]], ptr [[DST:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; DEFAULT-NEXT: entry:
+; DEFAULT-NEXT: [[ADD:%.*]] = add i32 [[X]], 1
+; DEFAULT-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
+; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 2
+; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; DEFAULT: vector.scevcheck:
+; DEFAULT-NEXT: [[TMP1:%.*]] = sub i32 -1, [[X]]
+; DEFAULT-NEXT: [[TMP2:%.*]] = icmp slt i32 [[ADD]], 0
+; DEFAULT-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i32 [[TMP1]], i32 [[ADD]]
+; DEFAULT-NEXT: [[TMP4:%.*]] = trunc i64 [[N]] to i32
+; DEFAULT-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[TMP3]], i32 [[TMP4]])
+; DEFAULT-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0
+; DEFAULT-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1
+; DEFAULT-NEXT: [[TMP5:%.*]] = sub i32 0, [[MUL_RESULT]]
+; DEFAULT-NEXT: [[TMP6:%.*]] = icmp ugt i32 [[TMP5]], 0
+; DEFAULT-NEXT: [[TMP7:%.*]] = select i1 [[TMP2]], i1 [[TMP6]], i1 false
+; DEFAULT-NEXT: [[TMP8:%.*]] = or i1 [[TMP7]], [[MUL_OVERFLOW]]
+; DEFAULT-NEXT: [[TMP9:%.*]] = icmp ugt i64 [[N]], 4294967295
+; DEFAULT-NEXT: [[TMP10:%.*]] = icmp ne i32 [[ADD]], 0
+; DEFAULT-NEXT: [[TMP11:%.*]] = and i1 [[TMP9]], [[TMP10]]
+; DEFAULT-NEXT: [[TMP12:%.*]] = or i1 [[TMP8]], [[TMP11]]
+; DEFAULT-NEXT: br i1 [[TMP12]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; DEFAULT: vector.ph:
+; DEFAULT-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 2
+; DEFAULT-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; DEFAULT-NEXT: [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i32
+; DEFAULT-NEXT: br label [[VECTOR_BODY:%.*]]
+; DEFAULT: vector.body:
+; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DEFAULT-NEXT: [[OFFSET_IDX:%.*]] = trunc i64 [[INDEX]] to i32
+; DEFAULT-NEXT: [[TMP13:%.*]] = add i32 [[OFFSET_IDX]], 0
+; DEFAULT-NEXT: [[TMP14:%.*]] = add i32 [[OFFSET_IDX]], 1
+; DEFAULT-NEXT: [[TMP15:%.*]] = trunc i64 [[INDEX]] to i32
+; DEFAULT-NEXT: [[TMP16:%.*]] = add i32 [[TMP15]], 0
+; DEFAULT-NEXT: [[TMP17:%.*]] = add i32 [[TMP15]], 1
+; DEFAULT-NEXT: [[TMP18:%.*]] = mul i32 [[ADD]], [[TMP16]]
+; DEFAULT-NEXT: [[TMP19:%.*]] = mul i32 [[ADD]], [[TMP17]]
+; DEFAULT-NEXT: [[TMP20:%.*]] = zext i32 [[TMP18]] to i64
+; DEFAULT-NEXT: [[TMP21:%.*]] = zext i32 [[TMP19]] to i64
+; DEFAULT-NEXT: [[TMP22:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP20]]
+; DEFAULT-NEXT: [[TMP23:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP21]]
+; DEFAULT-NEXT: store i32 [[TMP13]], ptr [[TMP22]], align 4
+; DEFAULT-NEXT: store i32 [[TMP14]], ptr [[TMP23]], align 4
+; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; DEFAULT-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; DEFAULT-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; DEFAULT: middle.block:
+; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; DEFAULT-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; DEFAULT: scalar.ph:
+; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
+; DEFAULT-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
+; DEFAULT-NEXT: br label [[LOOP:%.*]]
+; DEFAULT: loop:
+; DEFAULT-NEXT: [[IV_1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], [[LOOP]] ]
+; DEFAULT-NEXT: [[IV_2:%.*]] = phi i32 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[IV_2_NEXT:%.*]], [[LOOP]] ]
+; DEFAULT-NEXT: [[IV_TRUNC:%.*]] = trunc i64 [[IV_1]] to i32
+; DEFAULT-NEXT: [[IV_MUL:%.*]] = mul i32 [[ADD]], [[IV_TRUNC]]
+; DEFAULT-NEXT: [[IV_2_NEXT]] = add i32 [[IV_2]], 1
+; DEFAULT-NEXT: [[EXT:%.*]] = zext i32 [[IV_MUL]] to i64
+; DEFAULT-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[DST]], i64 [[EXT]]
+; DEFAULT-NEXT: store i32 [[IV_2]], ptr [[GEP]], align 4
+; DEFAULT-NEXT: [[IV_1_NEXT]] = add i64 [[IV_1]], 1
+; DEFAULT-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_1]], [[N]]
+; DEFAULT-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP10:![0-9]+]]
+; DEFAULT: exit:
+; DEFAULT-NEXT: ret void
+;
+; PRED-LABEL: define void @ivs_trunc_and_ext(
+; PRED-SAME: i32 [[X:%.*]], ptr [[DST:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; PRED-NEXT: entry:
+; PRED-NEXT: [[ADD:%.*]] = add i32 [[X]], 1
+; PRED-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
+; PRED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; PRED: vector.scevcheck:
+; PRED-NEXT: [[TMP1:%.*]] = sub i32 -1, [[X]]
+; PRED-NEXT: [[TMP2:%.*]] = icmp slt i32 [[ADD]], 0
+; PRED-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i32 [[TMP1]], i32 [[ADD]]
+; PRED-NEXT: [[TMP4:%.*]] = trunc i64 [[N]] to i32
+; PRED-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[TMP3]], i32 [[TMP4]])
+; PRED-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0
+; PRED-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1
+; PRED-NEXT: [[TMP5:%.*]] = sub i32 0, [[MUL_RESULT]]
+; PRED-NEXT: [[TMP6:%.*]] = icmp ugt i32 [[TMP5]], 0
+; PRED-NEXT: [[TMP7:%.*]] = select i1 [[TMP2]], i1 [[TMP6]], i1 false
+; PRED-NEXT: [[TMP8:%.*]] = or i1 [[TMP7]], [[MUL_OVERFLOW]]
+; PRED-NEXT: [[TMP9:%.*]] = icmp ugt i64 [[N]], 4294967295
+; PRED-NEXT: [[TMP10:%.*]] = icmp ne i32 [[ADD]], 0
+; PRED-NEXT: [[TMP11:%.*]] = and i1 [[TMP9]], [[TMP10]]
+; PRED-NEXT: [[TMP12:%.*]] = or i1 [[TMP8]], [[TMP11]]
+; PRED-NEXT: br i1 [[TMP12]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; PRED: vector.ph:
+; PRED-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP0]], 3
+; PRED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4
+; PRED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; PRED-NEXT: [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i32
+; PRED-NEXT: [[TMP13:%.*]] = sub i64 [[TMP0]], 4
+; PRED-NEXT: [[TMP14:%.*]] = icmp ugt i64 [[TMP0]], 4
+; PRED-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i64 [[TMP13]], i64 0
+; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 0, i64 [[TMP0]])
+; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[ADD]], i64 0
+; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; PRED-NEXT: br label [[VECTOR_BODY:%.*]]
+; PRED: vector.body:
+; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE7:%.*]] ]
+; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[PRED_STORE_CONTINUE7]] ]
+; PRED-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE7]] ]
+; PRED-NEXT: [[OFFSET_IDX:%.*]] = trunc i64 [[INDEX]] to i32
+; PRED-NEXT: [[TMP16:%.*]] = mul <4 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]]
+; PRED-NEXT: [[TMP17:%.*]] = zext <4 x i32> [[TMP16]] to <4 x i64>
+; PRED-NEXT: [[TMP18:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 0
+; PRED-NEXT: br i1 [[TMP18]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; PRED: pred.store.if:
+; PRED-NEXT: [[TMP19:%.*]] = extractelement <4 x i64> [[TMP17]], i32 0
+; PRED-NEXT: [[TMP20:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP19]]
+; PRED-NEXT: [[TMP21:%.*]] = add i32 [[OFFSET_IDX]], 0
+; PRED-NEXT: store i32 [[TMP21]], ptr [[TMP20]], align 4
+; PRED-NEXT: br label [[PRED_STORE_CONTINUE]]
+; PRED: pred.store.continue:
+; PRED-NEXT: [[TMP22:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 1
+; PRED-NEXT: br i1 [[TMP22]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3:%.*]]
+; PRED: pred.store.if2:
+; PRED-NEXT: [[TMP23:%.*]] = extractelement <4 x i64> [[TMP17]], i32 1
+; PRED-NEXT: [[TMP24:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP23]]
+; PRED-NEXT: [[TMP25:%.*]] = add i32 [[OFFSET_IDX]], 1
+; PRED-NEXT: store i32 [[TMP25]], ptr [[TMP24]], align 4
+; PRED-NEXT: br label [[PRED_STORE_CONTINUE3]]
+; PRED: pred.store.continue3:
+; PRED-NEXT: [[TMP26:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 2
+; PRED-NEXT: br i1 [[TMP26]], label [[PRED_STORE_IF4:%.*]], label [[PRED_STORE_CONTINUE5:%.*]]
+; PRED: pred.store.if4:
+; PRED-NEXT: [[TMP27:%.*]] = extractelement <4 x i64> [[TMP17]], i32 2
+; PRED-NEXT: [[TMP28:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP27]]
+; PRED-NEXT: [[TMP29:%.*]] = add i32 [[OFFSET_IDX]], 2
+; PRED-NEXT: store i32 [[TMP29]], ptr [[TMP28]], align 4
+; PRED-NEXT: br label [[PRED_STORE_CONTINUE5]]
+; PRED: pred.store.continue5:
+; PRED-NEXT: [[TMP30:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 3
+; PRED-NEXT: br i1 [[TMP30]], label [[PRED_STORE_IF6:%.*]], label [[PRED_STORE_CONTINUE7]]
+; PRED: pred.store.if6:
+; PRED-NEXT: [[TMP31:%.*]] = extractelement <4 x i64> [[TMP17]], i32 3
+; PRED-NEXT: [[TMP32:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP31]]
+; PRED-NEXT: [[TMP33:%.*]] = add i32 [[OFFSET_IDX]], 3
+; PRED-NEXT: store i32 [[TMP33]], ptr [[TMP32]], align 4
+; PRED-NEXT: br label [[PRED_STORE_CONTINUE7]]
+; PRED: pred.store.continue7:
+; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX]], i64 [[TMP15]])
+; PRED-NEXT: [[TMP34:%.*]] = xor <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], <i1 true, i1 true, i1 true, i1 true>
+; PRED-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
+; PRED-NEXT: [[TMP35:%.*]] = extractelement <4 x i1> [[TMP34]], i32 0
+; PRED-NEXT: br i1 [[TMP35]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; PRED: middle.block:
+; PRED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; PRED: scalar.ph:
+; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
+; PRED-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
+; PRED-NEXT: br label [[LOOP:%.*]]
+; PRED: loop:
+; PRED-NEXT: [[IV_1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], [[LOOP]] ]
+; PRED-NEXT: [[IV_2:%.*]] = phi i32 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[IV_2_NEXT:%.*]], [[LOOP]] ]
+; PRED-NEXT: [[IV_TRUNC:%.*]] = trunc i64 [[IV_1]] to i32
+; PRED-NEXT: [[IV_MUL:%.*]] = mul i32 [[ADD]], [[IV_TRUNC]]
+; PRED-NEXT: [[IV_2_NEXT]] = add i32 [[IV_2]], 1
+; PRED-NEXT: [[EXT:%.*]] = zext i32 [[IV_MUL]] to i64
+; PRED-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[DST]], i64 [[EXT]]
+; PRED-NEXT: store i32 [[IV_2]], ptr [[GEP]], align 4
+; PRED-NEXT: [[IV_1_NEXT]] = add i64 [[IV_1]], 1
+; PRED-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_1]], [[N]]
+; PRED-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
+; PRED: exit:
+; PRED-NEXT: ret void
+;
+entry:
+ %add = add i32 %x, 1
+ br label %loop
+
+loop:
+ %iv.1 = phi i64 [ 0, %entry ], [ %iv.1.next, %loop ]
+ %iv.2 = phi i32 [ 0, %entry ], [ %iv.2.next, %loop ]
+ %iv.trunc = trunc i64 %iv.1 to i32
+ %iv.mul = mul i32 %add, %iv.trunc
+ %iv.2.next = add i32 %iv.2, 1
+ %ext = zext i32 %iv.mul to i64
+ %gep = getelementptr i32, ptr %dst, i64 %ext
+ store i32 %iv.2, ptr %gep, align 4
+ %iv.1.next = add i64 %iv.1, 1
+ %ec = icmp eq i64 %iv.1, %N
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+define void @exit_cond_zext_iv(ptr %dst, i64 %N) {
+; DEFAULT-LABEL: define void @exit_cond_zext_iv(
+; DEFAULT-SAME: ptr [[DST:%.*]], i64 [[N:%.*]]) {
+; DEFAULT-NEXT: entry:
+; DEFAULT-NEXT: [[UMAX1:%.*]] = call i64 @llvm.umax.i64(i64 [[N]], i64 1)
+; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX1]], 2
+; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; DEFAULT: vector.scevcheck:
+; DEFAULT-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N]], i64 1)
+; DEFAULT-NEXT: [[TMP0:%.*]] = add i64 [[UMAX]], -1
+; DEFAULT-NEXT: [[TMP1:%.*]] = icmp ugt i64 [[TMP0]], 4294967295
+; DEFAULT-NEXT: [[TMP2:%.*]] = trunc i64 [[TMP0]] to i32
+; DEFAULT-NEXT: [[TMP3:%.*]] = add i32 1, [[TMP2]]
+; DEFAULT-NEXT: [[TMP4:%.*]] = icmp ult i32 [[TMP3]], 1
+; DEFAULT-NEXT: [[TMP5:%.*]] = icmp ugt i64 [[TMP0]], 4294967295
+; DEFAULT-NEXT: [[TMP6:%.*]] = or i1 [[TMP4]], [[TMP5]]
+; DEFAULT-NEXT: br i1 [[TMP6]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; DEFAULT: vector.ph:
+; DEFAULT-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[UMAX1]], 2
+; DEFAULT-NEXT: [[N_VEC:%.*]] = sub i64 [[UMAX1]], [[N_MOD_VF]]
+; DEFAULT-NEXT: [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i32
+; DEFAULT-NEXT: br label [[VECTOR_BODY:%.*]]
+; DEFAULT: vector.body:
+; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DEFAULT-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0
+; DEFAULT-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 1
+; DEFAULT-NEXT: [[TMP9:%.*]] = getelementptr { [100 x i32], i32, i32 }, ptr [[DST]], i64 [[TMP7]], i32 2
+; DEFAULT-NEXT: [[TMP10:%.*]] = getelementptr { [100 x i32], i32, i32 }, ptr [[DST]], i64 [[TMP8]], i32 2
+; DEFAULT-NEXT: store i32 0, ptr [[TMP9]], align 8
+; DEFAULT-NEXT: store i32 0, ptr [[TMP10]], align 8
+; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; DEFAULT-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; DEFAULT-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; DEFAULT: middle.block:
+; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[UMAX1]], [[N_VEC]]
+; DEFAULT-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; DEFAULT: scalar.ph:
+; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
+; DEFAULT-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
+; DEFAULT-NEXT: br label [[LOOP:%.*]]
+; DEFAULT: loop:
+; DEFAULT-NEXT: [[IV_1:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], [[LOOP]] ]
+; DEFAULT-NEXT: [[IV_CONV:%.*]] = phi i64 [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ], [ [[IV_EXT:%.*]], [[LOOP]] ]
+; DEFAULT-NEXT: [[GEP:%.*]] = getelementptr { [100 x i32], i32, i32 }, ptr [[DST]], i64 [[IV_CONV]], i32 2
+; DEFAULT-NEXT: store i32 0, ptr [[GEP]], align 8
+; DEFAULT-NEXT: [[IV_1_NEXT]] = add i32 [[IV_1]], 1
+; DEFAULT-NEXT: [[IV_EXT]] = zext i32 [[IV_1_NEXT]] to i64
+; DEFAULT-NEXT: [[C:%.*]] = icmp ult i64 [[IV_EXT]], [[N]]
+; DEFAULT-NEXT: br i1 [[C]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP12:![0-9]+]]
+; DEFAULT: exit:
+; DEFAULT-NEXT: ret void
+;
+; PRED-LABEL: define void @exit_cond_zext_iv(
+; PRED-SAME: ptr [[DST:%.*]], i64 [[N:%.*]]) {
+; PRED-NEXT: entry:
+; PRED-NEXT: [[UMAX1:%.*]] = call i64 @llvm.umax.i64(i64 [[N]], i64 1)
+; PRED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; PRED: vector.scevcheck:
+; PRED-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N]], i64 1)
+; PRED-NEXT: [[TMP0:%.*]] = add i64 [[UMAX]], -1
+; PRED-NEXT: [[TMP1:%.*]] = icmp ugt i64 [[TMP0]], 4294967295
+; PRED-NEXT: [[TMP2:%.*]] = trunc i64 [[TMP0]] to i32
+; PRED-NEXT: [[TMP3:%.*]] = add i32 1, [[TMP2]]
+; PRED-NEXT: [[TMP4:%.*]] = icmp ult i32 [[TMP3]], 1
+; PRED-NEXT: [[TMP5:%.*]] = icmp ugt i64 [[TMP0]], 4294967295
+; PRED-NEXT: [[TMP6:%.*]] = or i1 [[TMP4]], [[TMP5]]
+; PRED-NEXT: br i1 [[TMP6]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; PRED: vector.ph:
+; PRED-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX1]], 1
+; PRED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 2
+; PRED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; PRED-NEXT: [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i32
+; PRED-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[UMAX1]], 1
+; PRED-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <2 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; PRED-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT3]], <2 x i64> poison, <2 x i32> zeroinitializer
+; PRED-NEXT: br label [[LOOP:%.*]]
+; PRED: vector.body:
+; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ]
+; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[INDEX]], i64 0
+; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
+; PRED-NEXT: [[VEC_IV:%.*]] = add <2 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1>
+; PRED-NEXT: [[TMP7:%.*]] = icmp ule <2 x i64> [[VEC_IV]], [[BROADCAST_SPLAT4]]
+; PRED-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0
+; PRED-NEXT: br i1 [[TMP8]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; PRED: pred.store.if:
+; PRED-NEXT: [[IV_CONV:%.*]] = add i64 [[INDEX]], 0
+; PRED-NEXT: [[GEP:%.*]] = getelementptr { [100 x i32], i32, i32 }, ptr [[DST]], i64 [[IV_CONV]], i32 2
+; PRED-NEXT: store i32 0, ptr [[GEP]], align 8
+; PRED-NEXT: br label [[PRED_STORE_CONTINUE]]
+; PRED: pred.store.continue:
+; PRED-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1
+; PRED-NEXT: br i1 [[TMP11]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6]]
+; PRED: pred.store.if5:
+; PRED-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 1
+; PRED-NEXT: [[TMP13:%.*]] = getelementptr { [100 x i32], i32, i32 }, ptr [[DST]], i64 [[TMP12]], i32 2
+; PRED-NEXT: store i32 0, ptr [[TMP13]], align 8
+; PRED-NEXT: br label [[PRED_STORE_CONTINUE6]]
+; PRED: pred.store.continue6:
+; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2
+; PRED-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; PRED-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP10:![0-9]+]]
+; PRED: middle.block:
+; PRED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; PRED: scalar.ph:
+; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
+; PRED-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
+; PRED-NEXT: br label [[LOOP1:%.*]]
+; PRED: loop:
+; PRED-NEXT: [[IV_1:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], [[LOOP1]] ]
+; PRED-NEXT: [[IV_CONV1:%.*]] = phi i64 [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ], [ [[IV_EXT:%.*]], [[LOOP1]] ]
+; PRED-NEXT: [[GEP1:%.*]] = getelementptr { [100 x i32], i32, i32 }, ptr [[DST]], i64 [[IV_CONV1]], i32 2
+; PRED-NEXT: store i32 0, ptr [[GEP1]], align 8
+; PRED-NEXT: [[IV_1_NEXT]] = add i32 [[IV_1]], 1
+; PRED-NEXT: [[IV_EXT]] = zext i32 [[IV_1_NEXT]] to i64
+; PRED-NEXT: [[C:%.*]] = icmp ult i64 [[IV_EXT]], [[N]]
+; PRED-NEXT: br i1 [[C]], label [[LOOP1]], label [[EXIT]], !llvm.loop [[LOOP11:![0-9]+]]
+; PRED: exit:
+; PRED-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv.1 = phi i32 [ 0, %entry ], [ %iv.1.next, %loop ]
+ %iv.conv = phi i64 [ 0, %entry ], [ %iv.ext, %loop ]
+  %gep = getelementptr { [100 x i32], i32, i32 }, ptr %dst, i64 %iv.conv, i32 2
+ store i32 0, ptr %gep, align 8
+ %iv.1.next = add i32 %iv.1, 1
+ %iv.ext = zext i32 %iv.1.next to i64
+ %c = icmp ult i64 %iv.ext, %N
+ br i1 %c, label %loop, label %exit
+
+exit:
+ ret void
+}
+
+attributes #0 = { "target-features"="+sve" }
+
+;.
+; DEFAULT: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; DEFAULT: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; DEFAULT: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; DEFAULT: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]}
+; DEFAULT: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]]}
+; DEFAULT: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]]}
+; DEFAULT: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]]}
+; DEFAULT: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]], [[META2]]}
+; DEFAULT: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]]}
+; DEFAULT: [[LOOP9]] = distinct !{[[LOOP9]], [[META1]], [[META2]]}
+; DEFAULT: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]]}
+; DEFAULT: [[LOOP11]] = distinct !{[[LOOP11]], [[META1]], [[META2]]}
+; DEFAULT: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]]}
+;.
+; PRED: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; PRED: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; PRED: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; PRED: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]}
+; PRED: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; PRED: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]]}
+; PRED: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+; PRED: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]]}
+; PRED: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
+; PRED: [[LOOP9]] = distinct !{[[LOOP9]], [[META1]]}
+; PRED: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]}
+; PRED: [[LOOP11]] = distinct !{[[LOOP11]], [[META1]]}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll
index 931ab4f77618..4d9c850abdf3 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll
@@ -9,37 +9,37 @@ define i32 @multi_exit_iv_uniform(i32 %a, i64 %N, ptr %dst) {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[N]], i64 2147483648)
; CHECK-NEXT: [[TMP0:%.*]] = add nuw nsw i64 [[UMIN]], 1
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP0]], 4
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP0]], 8
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 8
; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
-; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i64 4, i64 [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i64 8, i64 [[N_MOD_VF]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[TMP2]]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[A]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 2
+; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP3]]
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP7:%.*]] = zext <2 x i32> [[BROADCAST_SPLAT]] to <2 x i64>
+; CHECK-NEXT: [[TMP7:%.*]] = zext <4 x i32> [[BROADCAST_SPLAT]] to <4 x i64>
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[TMP5]], i32 0
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[TMP5]], i32 2
-; CHECK-NEXT: store <2 x i64> [[TMP7]], ptr [[TMP8]], align 8
-; CHECK-NEXT: store <2 x i64> [[TMP7]], ptr [[TMP9]], align 8
-; CHECK-NEXT: [[TMP10]] = add <2 x i32> [[VEC_PHI]], <i32 -1, i32 -1>
-; CHECK-NEXT: [[TMP11]] = add <2 x i32> [[VEC_PHI1]], <i32 -1, i32 -1>
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[TMP5]], i32 4
+; CHECK-NEXT: store <4 x i64> [[TMP7]], ptr [[TMP8]], align 8
+; CHECK-NEXT: store <4 x i64> [[TMP7]], ptr [[TMP9]], align 8
+; CHECK-NEXT: [[TMP10]] = add <4 x i32> [[VEC_PHI]], <i32 -1, i32 -1, i32 -1, i32 -1>
+; CHECK-NEXT: [[TMP11]] = add <4 x i32> [[VEC_PHI1]], <i32 -1, i32 -1, i32 -1, i32 -1>
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: [[BIN_RDX:%.*]] = add <2 x i32> [[TMP11]], [[TMP10]]
-; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[BIN_RDX]])
+; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP11]], [[TMP10]]
+; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
@@ -285,6 +285,264 @@ loop:
exit:
ret void
}
+
+define i64 @test_ptr_ivs_and_widened_ivs(ptr %src, i32 %N) {
+; DEFAULT-LABEL: define i64 @test_ptr_ivs_and_widened_ivs(
+; DEFAULT-SAME: ptr [[SRC:%.*]], i32 [[N:%.*]]) {
+; DEFAULT-NEXT: entry:
+; DEFAULT-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1
+; DEFAULT-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; DEFAULT-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
+; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 8
+; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; DEFAULT: vector.ph:
+; DEFAULT-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 8
+; DEFAULT-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
+; DEFAULT-NEXT: [[TMP3:%.*]] = mul i64 [[N_VEC]], 4
+; DEFAULT-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP3]]
+; DEFAULT-NEXT: [[IND_END1:%.*]] = trunc i64 [[N_VEC]] to i32
+; DEFAULT-NEXT: [[IND_END3:%.*]] = trunc i64 [[N_VEC]] to i32
+; DEFAULT-NEXT: br label [[VECTOR_BODY:%.*]]
+; DEFAULT: vector.body:
+; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DEFAULT-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i64> [ <i64 poison, i64 poison, i64 poison, i64 0>, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
+; DEFAULT-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DEFAULT-NEXT: [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
+; DEFAULT-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 4
+; DEFAULT-NEXT: [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 0
+; DEFAULT-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 16
+; DEFAULT-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP4]]
+; DEFAULT-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP5]]
+; DEFAULT-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[NEXT_GEP]], i32 0
+; DEFAULT-NEXT: [[TMP7:%.*]] = getelementptr i32, ptr [[NEXT_GEP]], i32 4
+; DEFAULT-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP6]], align 4
+; DEFAULT-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i32>, ptr [[TMP7]], align 4
+; DEFAULT-NEXT: [[TMP8:%.*]] = xor <4 x i32> [[WIDE_LOAD]], <i32 1, i32 1, i32 1, i32 1>
+; DEFAULT-NEXT: [[TMP9:%.*]] = xor <4 x i32> [[WIDE_LOAD7]], <i32 1, i32 1, i32 1, i32 1>
+; DEFAULT-NEXT: [[TMP10:%.*]] = zext <4 x i32> [[TMP8]] to <4 x i64>
+; DEFAULT-NEXT: [[TMP11:%.*]] = zext <4 x i32> [[TMP9]] to <4 x i64>
+; DEFAULT-NEXT: [[TMP12:%.*]] = zext <4 x i32> [[VEC_IND]] to <4 x i64>
+; DEFAULT-NEXT: [[TMP13:%.*]] = zext <4 x i32> [[STEP_ADD]] to <4 x i64>
+; DEFAULT-NEXT: [[TMP14:%.*]] = shl <4 x i64> [[TMP10]], [[TMP12]]
+; DEFAULT-NEXT: [[TMP15]] = shl <4 x i64> [[TMP11]], [[TMP13]]
+; DEFAULT-NEXT: [[TMP16:%.*]] = shufflevector <4 x i64> [[VECTOR_RECUR]], <4 x i64> [[TMP14]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; DEFAULT-NEXT: [[TMP17:%.*]] = shufflevector <4 x i64> [[TMP14]], <4 x i64> [[TMP15]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; DEFAULT-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD]], <i32 4, i32 4, i32 4, i32 4>
+; DEFAULT-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; DEFAULT-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; DEFAULT: middle.block:
+; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
+; DEFAULT-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i64> [[TMP15]], i32 3
+; DEFAULT-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i64> [[TMP15]], i32 2
+; DEFAULT-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; DEFAULT: scalar.ph:
+; DEFAULT-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
+; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[SRC]], [[ENTRY]] ]
+; DEFAULT-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i32 [ [[IND_END1]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; DEFAULT-NEXT: [[BC_RESUME_VAL4:%.*]] = phi i32 [ [[IND_END3]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; DEFAULT-NEXT: br label [[LOOP:%.*]]
+; DEFAULT: loop:
+; DEFAULT-NEXT: [[SCALAR_RECUR:%.*]] = phi i64 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[SHL:%.*]], [[LOOP]] ]
+; DEFAULT-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], [[LOOP]] ]
+; DEFAULT-NEXT: [[IV_1:%.*]] = phi i32 [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], [[LOOP]] ]
+; DEFAULT-NEXT: [[IV_2:%.*]] = phi i32 [ [[BC_RESUME_VAL4]], [[SCALAR_PH]] ], [ [[IV_2_NEXT:%.*]], [[LOOP]] ]
+; DEFAULT-NEXT: [[L:%.*]] = load i32, ptr [[PTR_IV]], align 4
+; DEFAULT-NEXT: [[NOT:%.*]] = xor i32 [[L]], 1
+; DEFAULT-NEXT: [[NOT_EXT:%.*]] = zext i32 [[NOT]] to i64
+; DEFAULT-NEXT: [[IV_EXT:%.*]] = zext i32 [[IV_1]] to i64
+; DEFAULT-NEXT: [[SHL]] = shl i64 [[NOT_EXT]], [[IV_EXT]]
+; DEFAULT-NEXT: [[PTR_IV_NEXT]] = getelementptr i8, ptr [[PTR_IV]], i64 4
+; DEFAULT-NEXT: [[IV_1_NEXT]] = add i32 [[IV_1]], 1
+; DEFAULT-NEXT: [[IV_2_NEXT]] = add i32 [[IV_2]], 1
+; DEFAULT-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_2_NEXT]], [[N]]
+; DEFAULT-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
+; DEFAULT: exit:
+; DEFAULT-NEXT: [[P_LCSSA:%.*]] = phi i64 [ [[SCALAR_RECUR]], [[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], [[MIDDLE_BLOCK]] ]
+; DEFAULT-NEXT: ret i64 [[P_LCSSA]]
+;
+; PRED-LABEL: define i64 @test_ptr_ivs_and_widened_ivs(
+; PRED-SAME: ptr [[SRC:%.*]], i32 [[N:%.*]]) {
+; PRED-NEXT: entry:
+; PRED-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1
+; PRED-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; PRED-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
+; PRED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 8
+; PRED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; PRED: vector.ph:
+; PRED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 8
+; PRED-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
+; PRED-NEXT: [[TMP3:%.*]] = mul i64 [[N_VEC]], 4
+; PRED-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP3]]
+; PRED-NEXT: [[IND_END1:%.*]] = trunc i64 [[N_VEC]] to i32
+; PRED-NEXT: [[IND_END3:%.*]] = trunc i64 [[N_VEC]] to i32
+; PRED-NEXT: br label [[VECTOR_BODY:%.*]]
+; PRED: vector.body:
+; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; PRED-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i64> [ <i64 poison, i64 poison, i64 poison, i64 0>, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
+; PRED-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; PRED-NEXT: [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
+; PRED-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 4
+; PRED-NEXT: [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 0
+; PRED-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 16
+; PRED-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP4]]
+; PRED-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP5]]
+; PRED-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[NEXT_GEP]], i32 0
+; PRED-NEXT: [[TMP7:%.*]] = getelementptr i32, ptr [[NEXT_GEP]], i32 4
+; PRED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP6]], align 4
+; PRED-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i32>, ptr [[TMP7]], align 4
+; PRED-NEXT: [[TMP8:%.*]] = xor <4 x i32> [[WIDE_LOAD]], <i32 1, i32 1, i32 1, i32 1>
+; PRED-NEXT: [[TMP9:%.*]] = xor <4 x i32> [[WIDE_LOAD7]], <i32 1, i32 1, i32 1, i32 1>
+; PRED-NEXT: [[TMP10:%.*]] = zext <4 x i32> [[TMP8]] to <4 x i64>
+; PRED-NEXT: [[TMP11:%.*]] = zext <4 x i32> [[TMP9]] to <4 x i64>
+; PRED-NEXT: [[TMP12:%.*]] = zext <4 x i32> [[VEC_IND]] to <4 x i64>
+; PRED-NEXT: [[TMP13:%.*]] = zext <4 x i32> [[STEP_ADD]] to <4 x i64>
+; PRED-NEXT: [[TMP14:%.*]] = shl <4 x i64> [[TMP10]], [[TMP12]]
+; PRED-NEXT: [[TMP15]] = shl <4 x i64> [[TMP11]], [[TMP13]]
+; PRED-NEXT: [[TMP16:%.*]] = shufflevector <4 x i64> [[VECTOR_RECUR]], <4 x i64> [[TMP14]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; PRED-NEXT: [[TMP17:%.*]] = shufflevector <4 x i64> [[TMP14]], <4 x i64> [[TMP15]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; PRED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; PRED-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD]], <i32 4, i32 4, i32 4, i32 4>
+; PRED-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; PRED-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; PRED: middle.block:
+; PRED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
+; PRED-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i64> [[TMP15]], i32 3
+; PRED-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i64> [[TMP15]], i32 2
+; PRED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; PRED: scalar.ph:
+; PRED-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
+; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[SRC]], [[ENTRY]] ]
+; PRED-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i32 [ [[IND_END1]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; PRED-NEXT: [[BC_RESUME_VAL4:%.*]] = phi i32 [ [[IND_END3]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; PRED-NEXT: br label [[LOOP:%.*]]
+; PRED: loop:
+; PRED-NEXT: [[SCALAR_RECUR:%.*]] = phi i64 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[SHL:%.*]], [[LOOP]] ]
+; PRED-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], [[LOOP]] ]
+; PRED-NEXT: [[IV_1:%.*]] = phi i32 [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], [[LOOP]] ]
+; PRED-NEXT: [[IV_2:%.*]] = phi i32 [ [[BC_RESUME_VAL4]], [[SCALAR_PH]] ], [ [[IV_2_NEXT:%.*]], [[LOOP]] ]
+; PRED-NEXT: [[L:%.*]] = load i32, ptr [[PTR_IV]], align 4
+; PRED-NEXT: [[NOT:%.*]] = xor i32 [[L]], 1
+; PRED-NEXT: [[NOT_EXT:%.*]] = zext i32 [[NOT]] to i64
+; PRED-NEXT: [[IV_EXT:%.*]] = zext i32 [[IV_1]] to i64
+; PRED-NEXT: [[SHL]] = shl i64 [[NOT_EXT]], [[IV_EXT]]
+; PRED-NEXT: [[PTR_IV_NEXT]] = getelementptr i8, ptr [[PTR_IV]], i64 4
+; PRED-NEXT: [[IV_1_NEXT]] = add i32 [[IV_1]], 1
+; PRED-NEXT: [[IV_2_NEXT]] = add i32 [[IV_2]], 1
+; PRED-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_2_NEXT]], [[N]]
+; PRED-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; PRED: exit:
+; PRED-NEXT: [[P_LCSSA:%.*]] = phi i64 [ [[SCALAR_RECUR]], [[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], [[MIDDLE_BLOCK]] ]
+; PRED-NEXT: ret i64 [[P_LCSSA]]
+;
+entry:
+ br label %loop
+
+loop:
+ %p = phi i64 [ 0, %entry ], [ %shl, %loop ]
+ %ptr.iv = phi ptr [ %src, %entry ], [ %ptr.iv.next, %loop ]
+ %iv.1 = phi i32 [ 0, %entry ], [ %iv.1.next, %loop ]
+ %iv.2 = phi i32 [ 0, %entry ], [ %iv.2.next, %loop ]
+ %l = load i32, ptr %ptr.iv, align 4
+ %not = xor i32 %l, 1
+ %not.ext = zext i32 %not to i64
+ %iv.ext = zext i32 %iv.1 to i64
+  %shl = shl i64 %not.ext, %iv.ext
+ %ptr.iv.next = getelementptr i8, ptr %ptr.iv, i64 4
+ %iv.1.next = add i32 %iv.1, 1
+ %iv.2.next = add i32 %iv.2, 1
+ %ec = icmp eq i32 %iv.2.next, %N
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret i64 %p
+}
+
+define void @zext_iv_increment(ptr %dst, i64 %N) {
+; DEFAULT-LABEL: define void @zext_iv_increment(
+; DEFAULT-SAME: ptr [[DST:%.*]], i64 [[N:%.*]]) {
+; DEFAULT-NEXT: entry:
+; DEFAULT-NEXT: [[UMAX1:%.*]] = call i64 @llvm.umax.i64(i64 [[N]], i64 1)
+; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX1]], 2
+; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; DEFAULT: vector.scevcheck:
+; DEFAULT-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N]], i64 1)
+; DEFAULT-NEXT: [[TMP0:%.*]] = add i64 [[UMAX]], -1
+; DEFAULT-NEXT: [[TMP1:%.*]] = icmp ugt i64 [[TMP0]], 4294967295
+; DEFAULT-NEXT: [[TMP2:%.*]] = trunc i64 [[TMP0]] to i32
+; DEFAULT-NEXT: [[TMP3:%.*]] = add i32 1, [[TMP2]]
+; DEFAULT-NEXT: [[TMP4:%.*]] = icmp ult i32 [[TMP3]], 1
+; DEFAULT-NEXT: [[TMP5:%.*]] = icmp ugt i64 [[TMP0]], 4294967295
+; DEFAULT-NEXT: [[TMP6:%.*]] = or i1 [[TMP4]], [[TMP5]]
+; DEFAULT-NEXT: br i1 [[TMP6]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; DEFAULT: vector.ph:
+; DEFAULT-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[UMAX1]], 2
+; DEFAULT-NEXT: [[N_VEC:%.*]] = sub i64 [[UMAX1]], [[N_MOD_VF]]
+; DEFAULT-NEXT: [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i32
+; DEFAULT-NEXT: br label [[VECTOR_BODY:%.*]]
+; DEFAULT: vector.body:
+; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DEFAULT-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0
+; DEFAULT-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 1
+; DEFAULT-NEXT: [[TMP9:%.*]] = getelementptr { i32, i32, i32 }, ptr [[DST]], i64 [[TMP7]], i32 2
+; DEFAULT-NEXT: [[TMP10:%.*]] = getelementptr { i32, i32, i32 }, ptr [[DST]], i64 [[TMP8]], i32 2
+; DEFAULT-NEXT: store i32 0, ptr [[TMP9]], align 8
+; DEFAULT-NEXT: store i32 0, ptr [[TMP10]], align 8
+; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; DEFAULT-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; DEFAULT-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; DEFAULT: middle.block:
+; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[UMAX1]], [[N_VEC]]
+; DEFAULT-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP173_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; DEFAULT: scalar.ph:
+; DEFAULT-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
+; DEFAULT-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
+; DEFAULT-NEXT: br label [[FOR_BODY174:%.*]]
+; DEFAULT: loop:
+; DEFAULT-NEXT: [[I167_0800:%.*]] = phi i32 [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ], [ [[INC179:%.*]], [[FOR_BODY174]] ]
+; DEFAULT-NEXT: [[CONV169801:%.*]] = phi i64 [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ], [ [[CONV169:%.*]], [[FOR_BODY174]] ]
+; DEFAULT-NEXT: [[PATCH_INDEX:%.*]] = getelementptr { i32, i32, i32 }, ptr [[DST]], i64 [[CONV169801]], i32 2
+; DEFAULT-NEXT: store i32 0, ptr [[PATCH_INDEX]], align 8
+; DEFAULT-NEXT: [[INC179]] = add i32 [[I167_0800]], 1
+; DEFAULT-NEXT: [[CONV169]] = zext i32 [[INC179]] to i64
+; DEFAULT-NEXT: [[CMP172:%.*]] = icmp ult i64 [[CONV169]], [[N]]
+; DEFAULT-NEXT: br i1 [[CMP172]], label [[FOR_BODY174]], label [[FOR_COND_CLEANUP173_LOOPEXIT]], !llvm.loop [[LOOP9:![0-9]+]]
+; DEFAULT: exit:
+; DEFAULT-NEXT: ret void
+;
+; PRED-LABEL: define void @zext_iv_increment(
+; PRED-SAME: ptr [[DST:%.*]], i64 [[N:%.*]]) {
+; PRED-NEXT: entry:
+; PRED-NEXT: br label [[FOR_BODY174:%.*]]
+; PRED: loop:
+; PRED-NEXT: [[I167_0800:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC179:%.*]], [[FOR_BODY174]] ]
+; PRED-NEXT: [[CONV169801:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[CONV169:%.*]], [[FOR_BODY174]] ]
+; PRED-NEXT: [[PATCH_INDEX:%.*]] = getelementptr { i32, i32, i32 }, ptr [[DST]], i64 [[CONV169801]], i32 2
+; PRED-NEXT: store i32 0, ptr [[PATCH_INDEX]], align 8
+; PRED-NEXT: [[INC179]] = add i32 [[I167_0800]], 1
+; PRED-NEXT: [[CONV169]] = zext i32 [[INC179]] to i64
+; PRED-NEXT: [[CMP172:%.*]] = icmp ult i64 [[CONV169]], [[N]]
+; PRED-NEXT: br i1 [[CMP172]], label [[FOR_BODY174]], label [[FOR_COND_CLEANUP173_LOOPEXIT:%.*]]
+; PRED: exit:
+; PRED-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+ %iv.wide = phi i64 [ 0, %entry ], [ %iv.next.ext, %loop ]
+ %patch_index = getelementptr { i32, i32, i32 }, ptr %dst, i64 %iv.wide, i32 2
+ store i32 0, ptr %patch_index, align 8
+ %iv.next = add i32 %iv, 1
+ %iv.next.ext = zext i32 %iv.next to i64
+ %ec = icmp ult i64 %iv.next.ext, %N
+ br i1 %ec, label %loop, label %exit
+
+exit:
+ ret void
+}
+
;.
; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll
new file mode 100644
index 000000000000..c24c1a38177d
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll
@@ -0,0 +1,482 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -p loop-vectorize -S %s | FileCheck --check-prefixes=DEFAULT %s
+; RUN: opt -p loop-vectorize -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -S %s | FileCheck --check-prefixes=PRED %s
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-apple-macosx14.0.0"
+
+define i32 @chained_recurrences(i32 %x, i64 %y, ptr %src.1, i32 %z, ptr %src.2) #0 {
+; DEFAULT-LABEL: define i32 @chained_recurrences(
+; DEFAULT-SAME: i32 [[X:%.*]], i64 [[Y:%.*]], ptr [[SRC_1:%.*]], i32 [[Z:%.*]], ptr [[SRC_2:%.*]]) #[[ATTR0:[0-9]+]] {
+; DEFAULT-NEXT: entry:
+; DEFAULT-NEXT: [[TMP0:%.*]] = add i64 [[Y]], 1
+; DEFAULT-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 8
+; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; DEFAULT: vector.ph:
+; DEFAULT-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 8
+; DEFAULT-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP4]]
+; DEFAULT-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; DEFAULT-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8
+; DEFAULT-NEXT: [[TMP7:%.*]] = call i32 @llvm.vscale.i32()
+; DEFAULT-NEXT: [[TMP8:%.*]] = mul i32 [[TMP7]], 4
+; DEFAULT-NEXT: [[TMP9:%.*]] = sub i32 [[TMP8]], 1
+; DEFAULT-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 0, i32 [[TMP9]]
+; DEFAULT-NEXT: [[TMP10:%.*]] = call i32 @llvm.vscale.i32()
+; DEFAULT-NEXT: [[TMP11:%.*]] = mul i32 [[TMP10]], 4
+; DEFAULT-NEXT: [[TMP12:%.*]] = sub i32 [[TMP11]], 1
+; DEFAULT-NEXT: [[VECTOR_RECUR_INIT1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 0, i32 [[TMP12]]
+; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[X]], i64 0
+; DEFAULT-NEXT: [[BROADCAST_SPLAT7:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT6]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[Z]], i64 0
+; DEFAULT-NEXT: [[BROADCAST_SPLAT9:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT8]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; DEFAULT-NEXT: br label [[VECTOR_BODY:%.*]]
+; DEFAULT: vector.body:
+; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DEFAULT-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 4 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[BROADCAST_SPLAT5:%.*]], [[VECTOR_BODY]] ]
+; DEFAULT-NEXT: [[VECTOR_RECUR2:%.*]] = phi <vscale x 4 x i32> [ [[VECTOR_RECUR_INIT1]], [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ]
+; DEFAULT-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP57:%.*]], [[VECTOR_BODY]] ]
+; DEFAULT-NEXT: [[VEC_PHI3:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP58:%.*]], [[VECTOR_BODY]] ]
+; DEFAULT-NEXT: [[TMP13:%.*]] = add i64 [[Y]], 1
+; DEFAULT-NEXT: [[TMP14:%.*]] = add i64 [[Y]], 1
+; DEFAULT-NEXT: [[TMP15:%.*]] = getelementptr i32, ptr [[SRC_1]], i64 [[TMP13]]
+; DEFAULT-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[SRC_1]], i64 [[TMP14]]
+; DEFAULT-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP15]], align 4
+; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP17]], i64 0
+; DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; DEFAULT-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP16]], align 4
+; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP18]], i64 0
+; DEFAULT-NEXT: [[BROADCAST_SPLAT5]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT4]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; DEFAULT-NEXT: [[TMP19:%.*]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> [[VECTOR_RECUR]], <vscale x 4 x i32> [[BROADCAST_SPLAT]], i32 -1)
+; DEFAULT-NEXT: [[TMP20]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> [[BROADCAST_SPLAT]], <vscale x 4 x i32> [[BROADCAST_SPLAT5]], i32 -1)
+; DEFAULT-NEXT: [[TMP21:%.*]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> [[VECTOR_RECUR2]], <vscale x 4 x i32> [[TMP19]], i32 -1)
+; DEFAULT-NEXT: [[TMP22:%.*]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> [[TMP19]], <vscale x 4 x i32> [[TMP20]], i32 -1)
+; DEFAULT-NEXT: [[TMP23:%.*]] = or <vscale x 4 x i32> [[TMP21]], [[BROADCAST_SPLAT7]]
+; DEFAULT-NEXT: [[TMP24:%.*]] = or <vscale x 4 x i32> [[TMP22]], [[BROADCAST_SPLAT7]]
+; DEFAULT-NEXT: [[TMP25:%.*]] = lshr <vscale x 4 x i32> [[BROADCAST_SPLAT7]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; DEFAULT-NEXT: [[TMP26:%.*]] = lshr <vscale x 4 x i32> [[BROADCAST_SPLAT7]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; DEFAULT-NEXT: [[TMP27:%.*]] = shl <vscale x 4 x i32> [[TMP23]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; DEFAULT-NEXT: [[TMP28:%.*]] = shl <vscale x 4 x i32> [[TMP24]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; DEFAULT-NEXT: [[TMP29:%.*]] = or <vscale x 4 x i32> [[TMP27]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; DEFAULT-NEXT: [[TMP30:%.*]] = or <vscale x 4 x i32> [[TMP28]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; DEFAULT-NEXT: [[TMP31:%.*]] = shl <vscale x 4 x i32> [[BROADCAST_SPLAT7]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; DEFAULT-NEXT: [[TMP32:%.*]] = shl <vscale x 4 x i32> [[BROADCAST_SPLAT7]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; DEFAULT-NEXT: [[TMP33:%.*]] = or <vscale x 4 x i32> [[TMP25]], [[TMP31]]
+; DEFAULT-NEXT: [[TMP34:%.*]] = or <vscale x 4 x i32> [[TMP26]], [[TMP32]]
+; DEFAULT-NEXT: [[TMP35:%.*]] = or <vscale x 4 x i32> [[TMP33]], [[TMP29]]
+; DEFAULT-NEXT: [[TMP36:%.*]] = or <vscale x 4 x i32> [[TMP34]], [[TMP30]]
+; DEFAULT-NEXT: [[TMP37:%.*]] = or <vscale x 4 x i32> [[TMP35]], [[BROADCAST_SPLAT7]]
+; DEFAULT-NEXT: [[TMP38:%.*]] = or <vscale x 4 x i32> [[TMP36]], [[BROADCAST_SPLAT7]]
+; DEFAULT-NEXT: [[TMP39:%.*]] = or <vscale x 4 x i32> [[BROADCAST_SPLAT9]], [[BROADCAST_SPLAT7]]
+; DEFAULT-NEXT: [[TMP40:%.*]] = or <vscale x 4 x i32> [[BROADCAST_SPLAT9]], [[BROADCAST_SPLAT7]]
+; DEFAULT-NEXT: [[TMP41:%.*]] = and <vscale x 4 x i32> [[TMP39]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; DEFAULT-NEXT: [[TMP42:%.*]] = and <vscale x 4 x i32> [[TMP40]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; DEFAULT-NEXT: [[TMP43:%.*]] = xor <vscale x 4 x i32> [[TMP41]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; DEFAULT-NEXT: [[TMP44:%.*]] = xor <vscale x 4 x i32> [[TMP42]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; DEFAULT-NEXT: [[TMP45:%.*]] = zext <vscale x 4 x i32> [[TMP43]] to <vscale x 4 x i64>
+; DEFAULT-NEXT: [[TMP46:%.*]] = zext <vscale x 4 x i32> [[TMP44]] to <vscale x 4 x i64>
+; DEFAULT-NEXT: [[TMP47:%.*]] = getelementptr i32, ptr [[SRC_2]], <vscale x 4 x i64> [[TMP45]]
+; DEFAULT-NEXT: [[TMP48:%.*]] = getelementptr i32, ptr [[SRC_2]], <vscale x 4 x i64> [[TMP46]]
+; DEFAULT-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP47]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32> poison)
+; DEFAULT-NEXT: [[WIDE_MASKED_GATHER10:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP48]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32> poison)
+; DEFAULT-NEXT: [[TMP49:%.*]] = lshr <vscale x 4 x i32> [[TMP37]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; DEFAULT-NEXT: [[TMP50:%.*]] = lshr <vscale x 4 x i32> [[TMP38]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; DEFAULT-NEXT: [[TMP51:%.*]] = zext <vscale x 4 x i32> [[TMP49]] to <vscale x 4 x i64>
+; DEFAULT-NEXT: [[TMP52:%.*]] = zext <vscale x 4 x i32> [[TMP50]] to <vscale x 4 x i64>
+; DEFAULT-NEXT: [[TMP53:%.*]] = getelementptr i32, ptr [[SRC_2]], <vscale x 4 x i64> [[TMP51]]
+; DEFAULT-NEXT: [[TMP54:%.*]] = getelementptr i32, ptr [[SRC_2]], <vscale x 4 x i64> [[TMP52]]
+; DEFAULT-NEXT: [[WIDE_MASKED_GATHER11:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP53]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32> poison)
+; DEFAULT-NEXT: [[WIDE_MASKED_GATHER12:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP54]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32> poison)
+; DEFAULT-NEXT: [[TMP55:%.*]] = or <vscale x 4 x i32> [[WIDE_MASKED_GATHER]], [[VEC_PHI]]
+; DEFAULT-NEXT: [[TMP56:%.*]] = or <vscale x 4 x i32> [[WIDE_MASKED_GATHER10]], [[VEC_PHI3]]
+; DEFAULT-NEXT: [[TMP57]] = or <vscale x 4 x i32> [[TMP55]], [[WIDE_MASKED_GATHER11]]
+; DEFAULT-NEXT: [[TMP58]] = or <vscale x 4 x i32> [[TMP56]], [[WIDE_MASKED_GATHER12]]
+; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
+; DEFAULT-NEXT: [[TMP59:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; DEFAULT-NEXT: br i1 [[TMP59]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; DEFAULT: middle.block:
+; DEFAULT-NEXT: [[BIN_RDX:%.*]] = or <vscale x 4 x i32> [[TMP58]], [[TMP57]]
+; DEFAULT-NEXT: [[TMP60:%.*]] = call i32 @llvm.vector.reduce.or.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]])
+; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; DEFAULT-NEXT: [[TMP61:%.*]] = call i32 @llvm.vscale.i32()
+; DEFAULT-NEXT: [[TMP62:%.*]] = mul i32 [[TMP61]], 4
+; DEFAULT-NEXT: [[TMP63:%.*]] = sub i32 [[TMP62]], 1
+; DEFAULT-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[BROADCAST_SPLAT5]], i32 [[TMP63]]
+; DEFAULT-NEXT: [[TMP64:%.*]] = call i32 @llvm.vscale.i32()
+; DEFAULT-NEXT: [[TMP65:%.*]] = mul i32 [[TMP64]], 4
+; DEFAULT-NEXT: [[TMP66:%.*]] = sub i32 [[TMP65]], 1
+; DEFAULT-NEXT: [[VECTOR_RECUR_EXTRACT13:%.*]] = extractelement <vscale x 4 x i32> [[TMP20]], i32 [[TMP66]]
+; DEFAULT-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; DEFAULT: scalar.ph:
+; DEFAULT-NEXT: [[SCALAR_RECUR_INIT14:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT13]], [[MIDDLE_BLOCK]] ]
+; DEFAULT-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
+; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; DEFAULT-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP60]], [[MIDDLE_BLOCK]] ]
+; DEFAULT-NEXT: br label [[LOOP:%.*]]
+; DEFAULT: loop:
+; DEFAULT-NEXT: [[SCALAR_RECUR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP68:%.*]], [[LOOP]] ]
+; DEFAULT-NEXT: [[SCALAR_RECUR15:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT14]], [[SCALAR_PH]] ], [ [[SCALAR_RECUR]], [[LOOP]] ]
+; DEFAULT-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; DEFAULT-NEXT: [[SUM_RED:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[RED_2:%.*]], [[LOOP]] ]
+; DEFAULT-NEXT: [[TMP67:%.*]] = add i64 [[Y]], 1
+; DEFAULT-NEXT: [[GEP_1:%.*]] = getelementptr i32, ptr [[SRC_1]], i64 [[TMP67]]
+; DEFAULT-NEXT: [[TMP68]] = load i32, ptr [[GEP_1]], align 4
+; DEFAULT-NEXT: [[OR3:%.*]] = or i32 [[SCALAR_RECUR15]], [[X]]
+; DEFAULT-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
+; DEFAULT-NEXT: [[SHR:%.*]] = lshr i32 [[X]], 1
+; DEFAULT-NEXT: [[TMP69:%.*]] = shl i32 [[OR3]], 1
+; DEFAULT-NEXT: [[TMP70:%.*]] = or i32 [[TMP69]], 2
+; DEFAULT-NEXT: [[SHL19:%.*]] = shl i32 [[X]], 1
+; DEFAULT-NEXT: [[TMP71:%.*]] = or i32 [[SHR]], [[SHL19]]
+; DEFAULT-NEXT: [[TMP72:%.*]] = or i32 [[TMP71]], [[TMP70]]
+; DEFAULT-NEXT: [[TMP73:%.*]] = or i32 [[TMP72]], [[X]]
+; DEFAULT-NEXT: [[OR20:%.*]] = or i32 [[Z]], [[X]]
+; DEFAULT-NEXT: [[NOT:%.*]] = and i32 [[OR20]], 1
+; DEFAULT-NEXT: [[AND:%.*]] = xor i32 [[NOT]], 1
+; DEFAULT-NEXT: [[IDX_EXT_1:%.*]] = zext i32 [[AND]] to i64
+; DEFAULT-NEXT: [[GEP_2:%.*]] = getelementptr i32, ptr [[SRC_2]], i64 [[IDX_EXT_1]]
+; DEFAULT-NEXT: [[TMP74:%.*]] = load i32, ptr [[GEP_2]], align 4
+; DEFAULT-NEXT: [[SHR24:%.*]] = lshr i32 [[TMP73]], 1
+; DEFAULT-NEXT: [[IDX_EXT_2:%.*]] = zext i32 [[SHR24]] to i64
+; DEFAULT-NEXT: [[GEP_3:%.*]] = getelementptr i32, ptr [[SRC_2]], i64 [[IDX_EXT_2]]
+; DEFAULT-NEXT: [[TMP75:%.*]] = load i32, ptr [[GEP_3]], align 4
+; DEFAULT-NEXT: [[RED_1:%.*]] = or i32 [[TMP74]], [[SUM_RED]]
+; DEFAULT-NEXT: [[RED_2]] = or i32 [[RED_1]], [[TMP75]]
+; DEFAULT-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], [[Y]]
+; DEFAULT-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; DEFAULT: exit:
+; DEFAULT-NEXT: [[RED_2_LCSSA:%.*]] = phi i32 [ [[RED_2]], [[LOOP]] ], [ [[TMP60]], [[MIDDLE_BLOCK]] ]
+; DEFAULT-NEXT: ret i32 [[RED_2_LCSSA]]
+;
+; PRED-LABEL: define i32 @chained_recurrences(
+; PRED-SAME: i32 [[X:%.*]], i64 [[Y:%.*]], ptr [[SRC_1:%.*]], i32 [[Z:%.*]], ptr [[SRC_2:%.*]]) #[[ATTR0:[0-9]+]] {
+; PRED-NEXT: entry:
+; PRED-NEXT: [[TMP0:%.*]] = add i64 [[Y]], 1
+; PRED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; PRED: vector.ph:
+; PRED-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; PRED-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; PRED-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; PRED-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; PRED-NEXT: [[TMP5:%.*]] = sub i64 [[TMP4]], 1
+; PRED-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP0]], [[TMP5]]
+; PRED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP2]]
+; PRED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; PRED-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; PRED-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; PRED-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; PRED-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 4
+; PRED-NEXT: [[TMP10:%.*]] = sub i64 [[TMP0]], [[TMP9]]
+; PRED-NEXT: [[TMP11:%.*]] = icmp ugt i64 [[TMP0]], [[TMP9]]
+; PRED-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i64 [[TMP10]], i64 0
+; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[TMP0]])
+; PRED-NEXT: [[TMP13:%.*]] = call i32 @llvm.vscale.i32()
+; PRED-NEXT: [[TMP14:%.*]] = mul i32 [[TMP13]], 4
+; PRED-NEXT: [[TMP15:%.*]] = sub i32 [[TMP14]], 1
+; PRED-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 0, i32 [[TMP15]]
+; PRED-NEXT: [[TMP16:%.*]] = call i32 @llvm.vscale.i32()
+; PRED-NEXT: [[TMP17:%.*]] = mul i32 [[TMP16]], 4
+; PRED-NEXT: [[TMP18:%.*]] = sub i32 [[TMP17]], 1
+; PRED-NEXT: [[VECTOR_RECUR_INIT1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 0, i32 [[TMP18]]
+; PRED-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[X]], i64 0
+; PRED-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT3]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; PRED-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[Z]], i64 0
+; PRED-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT5]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; PRED-NEXT: br label [[VECTOR_BODY:%.*]]
+; PRED: vector.body:
+; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; PRED-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 4 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[BROADCAST_SPLAT:%.*]], [[VECTOR_BODY]] ]
+; PRED-NEXT: [[VECTOR_RECUR2:%.*]] = phi <vscale x 4 x i32> [ [[VECTOR_RECUR_INIT1]], [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ]
+; PRED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP42:%.*]], [[VECTOR_BODY]] ]
+; PRED-NEXT: [[TMP19:%.*]] = add i64 [[Y]], 1
+; PRED-NEXT: [[TMP20:%.*]] = getelementptr i32, ptr [[SRC_1]], i64 [[TMP19]]
+; PRED-NEXT: [[TMP21:%.*]] = load i32, ptr [[TMP20]], align 4
+; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP21]], i64 0
+; PRED-NEXT: [[BROADCAST_SPLAT]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; PRED-NEXT: [[TMP22]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> [[VECTOR_RECUR]], <vscale x 4 x i32> [[BROADCAST_SPLAT]], i32 -1)
+; PRED-NEXT: [[TMP23:%.*]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> [[VECTOR_RECUR2]], <vscale x 4 x i32> [[TMP22]], i32 -1)
+; PRED-NEXT: [[TMP24:%.*]] = or <vscale x 4 x i32> [[TMP23]], [[BROADCAST_SPLAT4]]
+; PRED-NEXT: [[TMP25:%.*]] = lshr <vscale x 4 x i32> [[BROADCAST_SPLAT4]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; PRED-NEXT: [[TMP26:%.*]] = shl <vscale x 4 x i32> [[TMP24]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; PRED-NEXT: [[TMP27:%.*]] = or <vscale x 4 x i32> [[TMP26]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; PRED-NEXT: [[TMP28:%.*]] = shl <vscale x 4 x i32> [[BROADCAST_SPLAT4]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; PRED-NEXT: [[TMP29:%.*]] = or <vscale x 4 x i32> [[TMP25]], [[TMP28]]
+; PRED-NEXT: [[TMP30:%.*]] = or <vscale x 4 x i32> [[TMP29]], [[TMP27]]
+; PRED-NEXT: [[TMP31:%.*]] = or <vscale x 4 x i32> [[TMP30]], [[BROADCAST_SPLAT4]]
+; PRED-NEXT: [[TMP32:%.*]] = or <vscale x 4 x i32> [[BROADCAST_SPLAT6]], [[BROADCAST_SPLAT4]]
+; PRED-NEXT: [[TMP33:%.*]] = and <vscale x 4 x i32> [[TMP32]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; PRED-NEXT: [[TMP34:%.*]] = xor <vscale x 4 x i32> [[TMP33]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; PRED-NEXT: [[TMP35:%.*]] = zext <vscale x 4 x i32> [[TMP34]] to <vscale x 4 x i64>
+; PRED-NEXT: [[TMP36:%.*]] = getelementptr i32, ptr [[SRC_2]], <vscale x 4 x i64> [[TMP35]]
+; PRED-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP36]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
+; PRED-NEXT: [[TMP37:%.*]] = lshr <vscale x 4 x i32> [[TMP31]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; PRED-NEXT: [[TMP38:%.*]] = zext <vscale x 4 x i32> [[TMP37]] to <vscale x 4 x i64>
+; PRED-NEXT: [[TMP39:%.*]] = getelementptr i32, ptr [[SRC_2]], <vscale x 4 x i64> [[TMP38]]
+; PRED-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP39]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
+; PRED-NEXT: [[TMP40:%.*]] = or <vscale x 4 x i32> [[WIDE_MASKED_GATHER]], [[VEC_PHI]]
+; PRED-NEXT: [[TMP41:%.*]] = or <vscale x 4 x i32> [[TMP40]], [[WIDE_MASKED_GATHER7]]
+; PRED-NEXT: [[TMP42]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> [[TMP41]], <vscale x 4 x i32> [[VEC_PHI]]
+; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP7]]
+; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP12]])
+; PRED-NEXT: [[TMP43:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; PRED-NEXT: [[TMP44:%.*]] = extractelement <vscale x 4 x i1> [[TMP43]], i32 0
+; PRED-NEXT: br i1 [[TMP44]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; PRED: middle.block:
+; PRED-NEXT: [[TMP45:%.*]] = call i32 @llvm.vector.reduce.or.nxv4i32(<vscale x 4 x i32> [[TMP42]])
+; PRED-NEXT: [[TMP46:%.*]] = call i32 @llvm.vscale.i32()
+; PRED-NEXT: [[TMP47:%.*]] = mul i32 [[TMP46]], 4
+; PRED-NEXT: [[TMP48:%.*]] = sub i32 [[TMP47]], 1
+; PRED-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[BROADCAST_SPLAT]], i32 [[TMP48]]
+; PRED-NEXT: [[TMP49:%.*]] = call i32 @llvm.vscale.i32()
+; PRED-NEXT: [[TMP50:%.*]] = mul i32 [[TMP49]], 4
+; PRED-NEXT: [[TMP51:%.*]] = sub i32 [[TMP50]], 1
+; PRED-NEXT: [[VECTOR_RECUR_EXTRACT8:%.*]] = extractelement <vscale x 4 x i32> [[TMP22]], i32 [[TMP51]]
+; PRED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; PRED: scalar.ph:
+; PRED-NEXT: [[SCALAR_RECUR_INIT9:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT8]], [[MIDDLE_BLOCK]] ]
+; PRED-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
+; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; PRED-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP45]], [[MIDDLE_BLOCK]] ]
+; PRED-NEXT: br label [[LOOP:%.*]]
+; PRED: loop:
+; PRED-NEXT: [[SCALAR_RECUR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP53:%.*]], [[LOOP]] ]
+; PRED-NEXT: [[SCALAR_RECUR10:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT9]], [[SCALAR_PH]] ], [ [[SCALAR_RECUR]], [[LOOP]] ]
+; PRED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; PRED-NEXT: [[SUM_RED:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[RED_2:%.*]], [[LOOP]] ]
+; PRED-NEXT: [[TMP52:%.*]] = add i64 [[Y]], 1
+; PRED-NEXT: [[GEP_1:%.*]] = getelementptr i32, ptr [[SRC_1]], i64 [[TMP52]]
+; PRED-NEXT: [[TMP53]] = load i32, ptr [[GEP_1]], align 4
+; PRED-NEXT: [[OR3:%.*]] = or i32 [[SCALAR_RECUR10]], [[X]]
+; PRED-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
+; PRED-NEXT: [[SHR:%.*]] = lshr i32 [[X]], 1
+; PRED-NEXT: [[TMP54:%.*]] = shl i32 [[OR3]], 1
+; PRED-NEXT: [[TMP55:%.*]] = or i32 [[TMP54]], 2
+; PRED-NEXT: [[SHL19:%.*]] = shl i32 [[X]], 1
+; PRED-NEXT: [[TMP56:%.*]] = or i32 [[SHR]], [[SHL19]]
+; PRED-NEXT: [[TMP57:%.*]] = or i32 [[TMP56]], [[TMP55]]
+; PRED-NEXT: [[TMP58:%.*]] = or i32 [[TMP57]], [[X]]
+; PRED-NEXT: [[OR20:%.*]] = or i32 [[Z]], [[X]]
+; PRED-NEXT: [[NOT:%.*]] = and i32 [[OR20]], 1
+; PRED-NEXT: [[AND:%.*]] = xor i32 [[NOT]], 1
+; PRED-NEXT: [[IDX_EXT_1:%.*]] = zext i32 [[AND]] to i64
+; PRED-NEXT: [[GEP_2:%.*]] = getelementptr i32, ptr [[SRC_2]], i64 [[IDX_EXT_1]]
+; PRED-NEXT: [[TMP59:%.*]] = load i32, ptr [[GEP_2]], align 4
+; PRED-NEXT: [[SHR24:%.*]] = lshr i32 [[TMP58]], 1
+; PRED-NEXT: [[IDX_EXT_2:%.*]] = zext i32 [[SHR24]] to i64
+; PRED-NEXT: [[GEP_3:%.*]] = getelementptr i32, ptr [[SRC_2]], i64 [[IDX_EXT_2]]
+; PRED-NEXT: [[TMP60:%.*]] = load i32, ptr [[GEP_3]], align 4
+; PRED-NEXT: [[RED_1:%.*]] = or i32 [[TMP59]], [[SUM_RED]]
+; PRED-NEXT: [[RED_2]] = or i32 [[RED_1]], [[TMP60]]
+; PRED-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], [[Y]]
+; PRED-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; PRED: exit:
+; PRED-NEXT: [[RED_2_LCSSA:%.*]] = phi i32 [ [[RED_2]], [[LOOP]] ], [ [[TMP45]], [[MIDDLE_BLOCK]] ]
+; PRED-NEXT: ret i32 [[RED_2_LCSSA]]
+;
+entry:
+ br label %loop
+
+loop:
+ %2 = phi i32 [ 0, %entry ], [ %5, %loop ]
+ %3 = phi i32 [ 0, %entry ], [ %2, %loop ]
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %sum.red = phi i32 [ 0, %entry ], [ %red.2, %loop ]
+ %4 = add i64 %y, 1
+ %gep.1 = getelementptr i32, ptr %src.1, i64 %4
+ %5 = load i32, ptr %gep.1, align 4
+ %or3 = or i32 %3, %x
+ %iv.next = add i64 %iv, 1
+ %shr = lshr i32 %x, 1
+ %6 = shl i32 %or3, 1
+ %7 = or i32 %6, 2
+ %shl19 = shl i32 %x, 1
+ %8 = or i32 %shr, %shl19
+ %9 = or i32 %8, %7
+ %10 = or i32 %9, %x
+ %or20 = or i32 %z, %x
+ %not = and i32 %or20, 1
+ %and = xor i32 %not, 1
+ %idx.ext.1 = zext i32 %and to i64
+ %gep.2 = getelementptr i32, ptr %src.2, i64 %idx.ext.1
+ %11 = load i32, ptr %gep.2, align 4
+ %shr24 = lshr i32 %10, 1
+ %idx.ext.2 = zext i32 %shr24 to i64
+ %gep.3 = getelementptr i32, ptr %src.2, i64 %idx.ext.2
+ %12 = load i32, ptr %gep.3, align 4
+ %red.1 = or i32 %11, %sum.red
+ %red.2 = or i32 %red.1, %12
+ %ec = icmp eq i64 %iv, %y
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret i32 %red.2
+}
+
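+; Exercises a udiv feeding an or-reduction. Under tail folding (PRED) the
+; divisor is selected to 1 for inactive lanes so the widened udiv cannot
+; divide by zero.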
+define i16 @reduce_udiv(ptr %src, i16 %x, i64 %N) #0 {
+; DEFAULT-LABEL: define i16 @reduce_udiv(
+; DEFAULT-SAME: ptr [[SRC:%.*]], i16 [[X:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; DEFAULT-NEXT: entry:
+; DEFAULT-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
+; DEFAULT-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 8
+; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; DEFAULT: vector.ph:
+; DEFAULT-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 8
+; DEFAULT-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP4]]
+; DEFAULT-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; DEFAULT-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8
+; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i16> poison, i16 [[X]], i64 0
+; DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i16> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
+; DEFAULT-NEXT: br label [[VECTOR_BODY:%.*]]
+; DEFAULT: vector.body:
+; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DEFAULT-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i16> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
+; DEFAULT-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i16> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ]
+; DEFAULT-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0
+; DEFAULT-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 4
+; DEFAULT-NEXT: [[TMP10:%.*]] = add i64 [[TMP9]], 0
+; DEFAULT-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 1
+; DEFAULT-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], [[TMP11]]
+; DEFAULT-NEXT: [[TMP13:%.*]] = getelementptr i16, ptr [[SRC]], i64 [[TMP7]]
+; DEFAULT-NEXT: [[TMP14:%.*]] = getelementptr i16, ptr [[SRC]], i64 [[TMP12]]
+; DEFAULT-NEXT: [[TMP15:%.*]] = getelementptr i16, ptr [[TMP13]], i32 0
+; DEFAULT-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 4
+; DEFAULT-NEXT: [[TMP18:%.*]] = getelementptr i16, ptr [[TMP13]], i64 [[TMP17]]
+; DEFAULT-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i16>, ptr [[TMP15]], align 2
+; DEFAULT-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i16>, ptr [[TMP18]], align 2
+; DEFAULT-NEXT: [[TMP19:%.*]] = udiv <vscale x 4 x i16> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; DEFAULT-NEXT: [[TMP20:%.*]] = udiv <vscale x 4 x i16> [[WIDE_LOAD2]], [[BROADCAST_SPLAT]]
+; DEFAULT-NEXT: [[TMP21]] = or <vscale x 4 x i16> [[TMP19]], [[VEC_PHI]]
+; DEFAULT-NEXT: [[TMP22]] = or <vscale x 4 x i16> [[TMP20]], [[VEC_PHI1]]
+; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
+; DEFAULT-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; DEFAULT-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; DEFAULT: middle.block:
+; DEFAULT-NEXT: [[BIN_RDX:%.*]] = or <vscale x 4 x i16> [[TMP22]], [[TMP21]]
+; DEFAULT-NEXT: [[TMP24:%.*]] = call i16 @llvm.vector.reduce.or.nxv4i16(<vscale x 4 x i16> [[BIN_RDX]])
+; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; DEFAULT-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; DEFAULT: scalar.ph:
+; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; DEFAULT-NEXT: [[BC_MERGE_RDX:%.*]] = phi i16 [ 0, [[ENTRY]] ], [ [[TMP24]], [[MIDDLE_BLOCK]] ]
+; DEFAULT-NEXT: br label [[LOOP:%.*]]
+; DEFAULT: loop:
+; DEFAULT-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; DEFAULT-NEXT: [[RED:%.*]] = phi i16 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], [[LOOP]] ]
+; DEFAULT-NEXT: [[GEP:%.*]] = getelementptr i16, ptr [[SRC]], i64 [[IV]]
+; DEFAULT-NEXT: [[L:%.*]] = load i16, ptr [[GEP]], align 2
+; DEFAULT-NEXT: [[DIV:%.*]] = udiv i16 [[L]], [[X]]
+; DEFAULT-NEXT: [[RED_NEXT]] = or i16 [[DIV]], [[RED]]
+; DEFAULT-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
+; DEFAULT-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], [[N]]
+; DEFAULT-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; DEFAULT: exit:
+; DEFAULT-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i16 [ [[RED_NEXT]], [[LOOP]] ], [ [[TMP24]], [[MIDDLE_BLOCK]] ]
+; DEFAULT-NEXT: ret i16 [[RED_NEXT_LCSSA]]
+;
+; PRED-LABEL: define i16 @reduce_udiv(
+; PRED-SAME: ptr [[SRC:%.*]], i16 [[X:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; PRED-NEXT: entry:
+; PRED-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
+; PRED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; PRED: vector.ph:
+; PRED-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; PRED-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 8
+; PRED-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; PRED-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 8
+; PRED-NEXT: [[TMP5:%.*]] = sub i64 [[TMP4]], 1
+; PRED-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP0]], [[TMP5]]
+; PRED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP2]]
+; PRED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; PRED-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; PRED-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 8
+; PRED-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; PRED-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 8
+; PRED-NEXT: [[TMP10:%.*]] = sub i64 [[TMP0]], [[TMP9]]
+; PRED-NEXT: [[TMP11:%.*]] = icmp ugt i64 [[TMP0]], [[TMP9]]
+; PRED-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i64 [[TMP10]], i64 0
+; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[TMP0]])
+; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i16> poison, i16 [[X]], i64 0
+; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i16> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
+; PRED-NEXT: br label [[VECTOR_BODY:%.*]]
+; PRED: vector.body:
+; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; PRED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i16> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
+; PRED-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 0
+; PRED-NEXT: [[TMP14:%.*]] = getelementptr i16, ptr [[SRC]], i64 [[TMP13]]
+; PRED-NEXT: [[TMP15:%.*]] = getelementptr i16, ptr [[TMP14]], i32 0
+; PRED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr [[TMP15]], i32 2, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x i16> poison)
+; PRED-NEXT: [[TMP16:%.*]] = select <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x i16> [[BROADCAST_SPLAT]], <vscale x 8 x i16> shufflevector (<vscale x 8 x i16> insertelement (<vscale x 8 x i16> poison, i16 1, i64 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer)
+; PRED-NEXT: [[TMP17:%.*]] = udiv <vscale x 8 x i16> [[WIDE_MASKED_LOAD]], [[TMP16]]
+; PRED-NEXT: [[TMP18:%.*]] = or <vscale x 8 x i16> [[TMP17]], [[VEC_PHI]]
+; PRED-NEXT: [[TMP19]] = select <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x i16> [[TMP18]], <vscale x 8 x i16> [[VEC_PHI]]
+; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP7]]
+; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP12]])
+; PRED-NEXT: [[TMP20:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
+; PRED-NEXT: [[TMP21:%.*]] = extractelement <vscale x 8 x i1> [[TMP20]], i32 0
+; PRED-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; PRED: middle.block:
+; PRED-NEXT: [[TMP22:%.*]] = call i16 @llvm.vector.reduce.or.nxv8i16(<vscale x 8 x i16> [[TMP19]])
+; PRED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; PRED: scalar.ph:
+; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; PRED-NEXT: [[BC_MERGE_RDX:%.*]] = phi i16 [ 0, [[ENTRY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ]
+; PRED-NEXT: br label [[LOOP:%.*]]
+; PRED: loop:
+; PRED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; PRED-NEXT: [[RED:%.*]] = phi i16 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], [[LOOP]] ]
+; PRED-NEXT: [[GEP:%.*]] = getelementptr i16, ptr [[SRC]], i64 [[IV]]
+; PRED-NEXT: [[L:%.*]] = load i16, ptr [[GEP]], align 2
+; PRED-NEXT: [[DIV:%.*]] = udiv i16 [[L]], [[X]]
+; PRED-NEXT: [[RED_NEXT]] = or i16 [[DIV]], [[RED]]
+; PRED-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
+; PRED-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], [[N]]
+; PRED-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; PRED: exit:
+; PRED-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i16 [ [[RED_NEXT]], [[LOOP]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ]
+; PRED-NEXT: ret i16 [[RED_NEXT_LCSSA]]
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %red = phi i16 [ 0, %entry ], [ %red.next, %loop ]
+ %gep = getelementptr i16, ptr %src, i64 %iv
+ %l = load i16, ptr %gep, align 2
+ %div = udiv i16 %l, %x
+ %red.next = or i16 %div, %red
+ %iv.next = add i64 %iv, 1
+ %ec = icmp eq i64 %iv, %N
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret i16 %red.next
+}
+
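+; "+sve" enables scalable vectors, so both loops above are vectorized with
+; <vscale x N> types.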
+attributes #0 = { "target-features"="+sve" }
+;.
+; DEFAULT: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; DEFAULT: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; DEFAULT: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; DEFAULT: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; DEFAULT: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; DEFAULT: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+;.
+; PRED: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; PRED: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; PRED: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; PRED: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; PRED: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; PRED: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
index ad6e8534f318..ddc004657ed5 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
@@ -588,7 +588,7 @@ define void @fadd_strict_interleave(ptr noalias nocapture readonly %a, ptr noali
; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP9]]
; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i32 0
; CHECK-UNORDERED-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 8 x float>, ptr [[TMP11]], align 4
-; CHECK-UNORDERED-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.experimental.vector.deinterleave2.nxv8f32(<vscale x 8 x float> [[WIDE_VEC]])
+; CHECK-UNORDERED-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.vector.deinterleave2.nxv8f32(<vscale x 8 x float> [[WIDE_VEC]])
; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 0
; CHECK-UNORDERED-NEXT: [[TMP13:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 1
; CHECK-UNORDERED-NEXT: [[TMP14]] = fadd <vscale x 4 x float> [[TMP12]], [[VEC_PHI1]]
@@ -658,7 +658,7 @@ define void @fadd_strict_interleave(ptr noalias nocapture readonly %a, ptr noali
; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP7]]
; CHECK-ORDERED-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 0
; CHECK-ORDERED-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 8 x float>, ptr [[TMP9]], align 4
-; CHECK-ORDERED-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.experimental.vector.deinterleave2.nxv8f32(<vscale x 8 x float> [[WIDE_VEC]])
+; CHECK-ORDERED-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.vector.deinterleave2.nxv8f32(<vscale x 8 x float> [[WIDE_VEC]])
; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 0
; CHECK-ORDERED-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 1
; CHECK-ORDERED-NEXT: [[TMP12]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], <vscale x 4 x float> [[TMP11]])
@@ -733,9 +733,9 @@ define void @fadd_strict_interleave(ptr noalias nocapture readonly %a, ptr noali
; CHECK-ORDERED-TF-NEXT: [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 0
; CHECK-ORDERED-TF-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP13]]
; CHECK-ORDERED-TF-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i32 0
-; CHECK-ORDERED-TF-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 8 x i1> @llvm.experimental.vector.interleave2.nxv8i1(<vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-ORDERED-TF-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 8 x i1> @llvm.vector.interleave2.nxv8i1(<vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP15]], i32 4, <vscale x 8 x i1> [[INTERLEAVED_MASK]], <vscale x 8 x float> poison)
-; CHECK-ORDERED-TF-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.experimental.vector.deinterleave2.nxv8f32(<vscale x 8 x float> [[WIDE_MASKED_VEC]])
+; CHECK-ORDERED-TF-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.vector.deinterleave2.nxv8f32(<vscale x 8 x float> [[WIDE_MASKED_VEC]])
; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 0
; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 1
; CHECK-ORDERED-TF-NEXT: [[TMP18:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> [[TMP17]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float -0.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll
new file mode 100644
index 000000000000..1e13d70083ff
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll
@@ -0,0 +1,352 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -p loop-vectorize -S %s | FileCheck --check-prefixes=DEFAULT %s
+; RUN: opt -p loop-vectorize -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -S %s | FileCheck --check-prefixes=PRED %s
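+; DEFAULT checks the default strategy (scalar epilogue allowed); PRED forces
+; tail folding with an active lane mask via -prefer-predicate-over-epilogue.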
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-apple-macosx14.0.0"
+
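+; Store of i8 with trip count 101: DEFAULT picks a <vscale x 16 x i8> main
+; vector loop plus a <vscale x 8 x i8> epilogue vector loop, while PRED
+; emits a single masked-store loop.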
+define void @cost_store_i8(ptr %dst) #0 {
+; DEFAULT-LABEL: define void @cost_store_i8(
+; DEFAULT-SAME: ptr [[DST:%.*]]) #[[ATTR0:[0-9]+]] {
+; DEFAULT-NEXT: iter.check:
+; DEFAULT-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 101, [[TMP1]]
+; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
+; DEFAULT: vector.main.loop.iter.check:
+; DEFAULT-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16
+; DEFAULT-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 101, [[TMP3]]
+; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+; DEFAULT: vector.ph:
+; DEFAULT-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16
+; DEFAULT-NEXT: [[N_MOD_VF:%.*]] = urem i64 101, [[TMP5]]
+; DEFAULT-NEXT: [[N_VEC:%.*]] = sub i64 101, [[N_MOD_VF]]
+; DEFAULT-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 16
+; DEFAULT-NEXT: br label [[VECTOR_BODY:%.*]]
+; DEFAULT: vector.body:
+; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DEFAULT-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 0
+; DEFAULT-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP8]]
+; DEFAULT-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP9]], i32 0
+; DEFAULT-NEXT: store <vscale x 16 x i8> zeroinitializer, ptr [[TMP10]], align 1
+; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]]
+; DEFAULT-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; DEFAULT-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; DEFAULT: middle.block:
+; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 101, [[N_VEC]]
+; DEFAULT-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
+; DEFAULT: vec.epilog.iter.check:
+; DEFAULT-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 101, [[N_VEC]]
+; DEFAULT-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 8
+; DEFAULT-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], [[TMP13]]
+; DEFAULT-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
+; DEFAULT: vec.epilog.ph:
+; DEFAULT-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; DEFAULT-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 8
+; DEFAULT-NEXT: [[N_MOD_VF2:%.*]] = urem i64 101, [[TMP15]]
+; DEFAULT-NEXT: [[N_VEC3:%.*]] = sub i64 101, [[N_MOD_VF2]]
+; DEFAULT-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 8
+; DEFAULT-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
+; DEFAULT: vec.epilog.vector.body:
+; DEFAULT-NEXT: [[INDEX5:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; DEFAULT-NEXT: [[TMP18:%.*]] = add i64 [[INDEX5]], 0
+; DEFAULT-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP18]]
+; DEFAULT-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[TMP19]], i32 0
+; DEFAULT-NEXT: store <vscale x 8 x i8> zeroinitializer, ptr [[TMP20]], align 1
+; DEFAULT-NEXT: [[INDEX_NEXT6]] = add nuw i64 [[INDEX5]], [[TMP17]]
+; DEFAULT-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT6]], [[N_VEC3]]
+; DEFAULT-NEXT: br i1 [[TMP21]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; DEFAULT: vec.epilog.middle.block:
+; DEFAULT-NEXT: [[CMP_N4:%.*]] = icmp eq i64 101, [[N_VEC3]]
+; DEFAULT-NEXT: br i1 [[CMP_N4]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
+; DEFAULT: vec.epilog.scalar.ph:
+; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
+; DEFAULT-NEXT: br label [[LOOP:%.*]]
+; DEFAULT: loop:
+; DEFAULT-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; DEFAULT-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[IV]]
+; DEFAULT-NEXT: store i8 0, ptr [[GEP]], align 1
+; DEFAULT-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
+; DEFAULT-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], 100
+; DEFAULT-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP4:![0-9]+]]
+; DEFAULT: exit:
+; DEFAULT-NEXT: ret void
+;
+; PRED-LABEL: define void @cost_store_i8(
+; PRED-SAME: ptr [[DST:%.*]]) #[[ATTR0:[0-9]+]] {
+; PRED-NEXT: entry:
+; PRED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; PRED: vector.ph:
+; PRED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; PRED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; PRED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; PRED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16
+; PRED-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1
+; PRED-NEXT: [[N_RND_UP:%.*]] = add i64 101, [[TMP4]]
+; PRED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
+; PRED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; PRED-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; PRED-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 16
+; PRED-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; PRED-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 16
+; PRED-NEXT: [[TMP9:%.*]] = sub i64 101, [[TMP8]]
+; PRED-NEXT: [[TMP10:%.*]] = icmp ugt i64 101, [[TMP8]]
+; PRED-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i64 [[TMP9]], i64 0
+; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 101)
+; PRED-NEXT: br label [[VECTOR_BODY:%.*]]
+; PRED: vector.body:
+; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; PRED-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 0
+; PRED-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP12]]
+; PRED-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[TMP13]], i32 0
+; PRED-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> zeroinitializer, ptr [[TMP14]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
+; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
+; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP11]])
+; PRED-NEXT: [[TMP15:%.*]] = xor <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer)
+; PRED-NEXT: [[TMP16:%.*]] = extractelement <vscale x 16 x i1> [[TMP15]], i32 0
+; PRED-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; PRED: middle.block:
+; PRED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; PRED: scalar.ph:
+; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; PRED-NEXT: br label [[LOOP:%.*]]
+; PRED: loop:
+; PRED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; PRED-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[IV]]
+; PRED-NEXT: store i8 0, ptr [[GEP]], align 1
+; PRED-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
+; PRED-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], 100
+; PRED-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; PRED: exit:
+; PRED-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %gep = getelementptr i8, ptr %dst, i64 %iv
+ store i8 0, ptr %gep, align 1
+ %iv.next = add i64 %iv, 1
+ %ec = icmp eq i64 %iv, 100
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
+
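+; Truncating store with a possible src/dst overlap: both configurations
+; guard the vector loop with a runtime alias check (vector.memcheck).
+; DEFAULT uses a fixed-width <16 x i8> main loop (interleaved by 2) with a
+; <vscale x 2 x i8> epilogue.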
+define void @trunc_store(ptr %dst, ptr %src, i16 %x) #1 {
+; DEFAULT-LABEL: define void @trunc_store(
+; DEFAULT-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i16 [[X:%.*]]) #[[ATTR1:[0-9]+]] {
+; DEFAULT-NEXT: iter.check:
+; DEFAULT-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP1]]
+; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; DEFAULT: vector.memcheck:
+; DEFAULT-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[SRC]], i64 8
+; DEFAULT-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP]]
+; DEFAULT-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[SRC]], [[DST]]
+; DEFAULT-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; DEFAULT-NEXT: br i1 [[FOUND_CONFLICT]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
+; DEFAULT: vector.main.loop.iter.check:
+; DEFAULT-NEXT: br i1 true, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+; DEFAULT: vector.ph:
+; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <16 x i16> poison, i16 [[X]], i64 0
+; DEFAULT-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <16 x i16> [[BROADCAST_SPLATINSERT3]], <16 x i16> poison, <16 x i32> zeroinitializer
+; DEFAULT-NEXT: br label [[VECTOR_BODY:%.*]]
+; DEFAULT: vector.body:
+; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DEFAULT-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0
+; DEFAULT-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 16
+; DEFAULT-NEXT: [[TMP4:%.*]] = load i64, ptr [[SRC]], align 8, !alias.scope [[META5:![0-9]+]]
+; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <16 x i64> poison, i64 [[TMP4]], i64 0
+; DEFAULT-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT1]], <16 x i64> poison, <16 x i32> zeroinitializer
+; DEFAULT-NEXT: [[TMP5:%.*]] = trunc <16 x i64> [[BROADCAST_SPLAT2]] to <16 x i8>
+; DEFAULT-NEXT: [[TMP6:%.*]] = trunc <16 x i64> [[BROADCAST_SPLAT2]] to <16 x i8>
+; DEFAULT-NEXT: [[TMP7:%.*]] = trunc <16 x i16> [[BROADCAST_SPLAT4]] to <16 x i8>
+; DEFAULT-NEXT: [[TMP8:%.*]] = and <16 x i8> [[TMP5]], [[TMP7]]
+; DEFAULT-NEXT: [[TMP9:%.*]] = and <16 x i8> [[TMP6]], [[TMP7]]
+; DEFAULT-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP2]]
+; DEFAULT-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP3]]
+; DEFAULT-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0
+; DEFAULT-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP10]], i32 16
+; DEFAULT-NEXT: store <16 x i8> [[TMP8]], ptr [[TMP12]], align 1, !alias.scope [[META8:![0-9]+]], !noalias [[META5]]
+; DEFAULT-NEXT: store <16 x i8> [[TMP9]], ptr [[TMP13]], align 1, !alias.scope [[META8]], !noalias [[META5]]
+; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
+; DEFAULT-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
+; DEFAULT-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; DEFAULT: middle.block:
+; DEFAULT-NEXT: br i1 true, label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
+; DEFAULT: vec.epilog.iter.check:
+; DEFAULT-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 2
+; DEFAULT-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP16]]
+; DEFAULT-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
+; DEFAULT: vec.epilog.ph:
+; DEFAULT-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 0, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; DEFAULT-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 2
+; DEFAULT-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP18]]
+; DEFAULT-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
+; DEFAULT-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 2
+; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <vscale x 2 x i16> poison, i16 [[X]], i64 0
+; DEFAULT-NEXT: [[BROADCAST_SPLAT7:%.*]] = shufflevector <vscale x 2 x i16> [[BROADCAST_SPLATINSERT6]], <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer
+; DEFAULT-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
+; DEFAULT: vec.epilog.vector.body:
+; DEFAULT-NEXT: [[INDEX5:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT8:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; DEFAULT-NEXT: [[TMP21:%.*]] = add i64 [[INDEX5]], 0
+; DEFAULT-NEXT: [[TMP22:%.*]] = load i64, ptr [[SRC]], align 8, !alias.scope [[META11:![0-9]+]]
+; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP22]], i64 0
+; DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; DEFAULT-NEXT: [[TMP23:%.*]] = trunc <vscale x 2 x i64> [[BROADCAST_SPLAT]] to <vscale x 2 x i8>
+; DEFAULT-NEXT: [[TMP24:%.*]] = trunc <vscale x 2 x i16> [[BROADCAST_SPLAT7]] to <vscale x 2 x i8>
+; DEFAULT-NEXT: [[TMP25:%.*]] = and <vscale x 2 x i8> [[TMP23]], [[TMP24]]
+; DEFAULT-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP21]]
+; DEFAULT-NEXT: [[TMP27:%.*]] = getelementptr i8, ptr [[TMP26]], i32 0
+; DEFAULT-NEXT: store <vscale x 2 x i8> [[TMP25]], ptr [[TMP27]], align 1, !alias.scope [[META14:![0-9]+]], !noalias [[META11]]
+; DEFAULT-NEXT: [[INDEX_NEXT8]] = add nuw i64 [[INDEX5]], [[TMP20]]
+; DEFAULT-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT8]], [[N_VEC]]
+; DEFAULT-NEXT: br i1 [[TMP28]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; DEFAULT: vec.epilog.middle.block:
+; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 0, [[N_VEC]]
+; DEFAULT-NEXT: br i1 [[CMP_N]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
+; DEFAULT: vec.epilog.scalar.ph:
+; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
+; DEFAULT-NEXT: br label [[LOOP:%.*]]
+; DEFAULT: loop:
+; DEFAULT-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; DEFAULT-NEXT: [[X_EXT:%.*]] = zext i16 [[X]] to i64
+; DEFAULT-NEXT: [[L:%.*]] = load i64, ptr [[SRC]], align 8
+; DEFAULT-NEXT: [[AND:%.*]] = and i64 [[L]], [[X_EXT]]
+; DEFAULT-NEXT: [[TRUNC:%.*]] = trunc i64 [[AND]] to i8
+; DEFAULT-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[IV]]
+; DEFAULT-NEXT: store i8 [[TRUNC]], ptr [[GEP]], align 1
+; DEFAULT-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
+; DEFAULT-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 0
+; DEFAULT-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP17:![0-9]+]]
+; DEFAULT: exit:
+; DEFAULT-NEXT: ret void
+;
+; PRED-LABEL: define void @trunc_store(
+; PRED-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i16 [[X:%.*]]) #[[ATTR1:[0-9]+]] {
+; PRED-NEXT: entry:
+; PRED-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; PRED: vector.memcheck:
+; PRED-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[SRC]], i64 8
+; PRED-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP]]
+; PRED-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[SRC]], [[DST]]
+; PRED-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; PRED-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; PRED: vector.ph:
+; PRED-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <16 x i16> poison, i16 [[X]], i64 0
+; PRED-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <16 x i16> [[BROADCAST_SPLATINSERT1]], <16 x i16> poison, <16 x i32> zeroinitializer
+; PRED-NEXT: br label [[VECTOR_BODY:%.*]]
+; PRED: vector.body:
+; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; PRED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; PRED-NEXT: [[TMP1:%.*]] = load i64, ptr [[SRC]], align 8, !alias.scope [[META4:![0-9]+]]
+; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i64> poison, i64 [[TMP1]], i64 0
+; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT]], <16 x i64> poison, <16 x i32> zeroinitializer
+; PRED-NEXT: [[TMP2:%.*]] = trunc <16 x i64> [[BROADCAST_SPLAT]] to <16 x i8>
+; PRED-NEXT: [[TMP3:%.*]] = trunc <16 x i16> [[BROADCAST_SPLAT2]] to <16 x i8>
+; PRED-NEXT: [[TMP4:%.*]] = and <16 x i8> [[TMP2]], [[TMP3]]
+; PRED-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP0]]
+; PRED-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[TMP5]], i32 0
+; PRED-NEXT: store <16 x i8> [[TMP4]], ptr [[TMP6]], align 1, !alias.scope [[META7:![0-9]+]], !noalias [[META4]]
+; PRED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; PRED-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
+; PRED-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; PRED: middle.block:
+; PRED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; PRED: scalar.ph:
+; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; PRED-NEXT: br label [[LOOP:%.*]]
+; PRED: loop:
+; PRED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; PRED-NEXT: [[X_EXT:%.*]] = zext i16 [[X]] to i64
+; PRED-NEXT: [[L:%.*]] = load i64, ptr [[SRC]], align 8
+; PRED-NEXT: [[AND:%.*]] = and i64 [[L]], [[X_EXT]]
+; PRED-NEXT: [[TRUNC:%.*]] = trunc i64 [[AND]] to i8
+; PRED-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[IV]]
+; PRED-NEXT: store i8 [[TRUNC]], ptr [[GEP]], align 1
+; PRED-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
+; PRED-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 0
+; PRED-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP10:![0-9]+]]
+; PRED: exit:
+; PRED-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %x.ext = zext i16 %x to i64
+ %l = load i64, ptr %src, align 8
+ %and = and i64 %l, %x.ext
+ %trunc = trunc i64 %and to i8
+ %gep = getelementptr i8, ptr %dst, i64 %iv
+ store i8 %trunc, ptr %gep, align 1
+ %iv.next = add i64 %iv, 1
+ %ec = icmp eq i64 %iv.next, 0
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
+
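+; #0 enables SVE; #1 additionally bounds vscale with vscale_range(1,16),
+; which gives the cost model an upper bound on the scalable VF.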
+attributes #0 = { "target-features"="+sve" }
+attributes #1 = { vscale_range(1,16) "target-features"="+sve" }
+
+
+;.
+; DEFAULT: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; DEFAULT: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; DEFAULT: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; DEFAULT: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]}
+; DEFAULT: [[LOOP4]] = distinct !{[[LOOP4]], [[META2]], [[META1]]}
+; DEFAULT: [[META5]] = !{[[META6:![0-9]+]]}
+; DEFAULT: [[META6]] = distinct !{[[META6]], [[META7:![0-9]+]]}
+; DEFAULT: [[META7]] = distinct !{[[META7]], !"LVerDomain"}
+; DEFAULT: [[META8]] = !{[[META9:![0-9]+]]}
+; DEFAULT: [[META9]] = distinct !{[[META9]], [[META7]]}
+; DEFAULT: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]}
+; DEFAULT: [[META11]] = !{[[META12:![0-9]+]]}
+; DEFAULT: [[META12]] = distinct !{[[META12]], [[META13:![0-9]+]]}
+; DEFAULT: [[META13]] = distinct !{[[META13]], !"LVerDomain"}
+; DEFAULT: [[META14]] = !{[[META15:![0-9]+]]}
+; DEFAULT: [[META15]] = distinct !{[[META15]], [[META13]]}
+; DEFAULT: [[LOOP16]] = distinct !{[[LOOP16]], [[META1]], [[META2]]}
+; DEFAULT: [[LOOP17]] = distinct !{[[LOOP17]], [[META1]]}
+;.
+; PRED: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; PRED: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; PRED: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; PRED: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; PRED: [[META4]] = !{[[META5:![0-9]+]]}
+; PRED: [[META5]] = distinct !{[[META5]], [[META6:![0-9]+]]}
+; PRED: [[META6]] = distinct !{[[META6]], !"LVerDomain"}
+; PRED: [[META7]] = !{[[META8:![0-9]+]]}
+; PRED: [[META8]] = distinct !{[[META8]], [[META6]]}
+; PRED: [[LOOP9]] = distinct !{[[LOOP9]], [[META1]], [[META2]]}
+; PRED: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]]}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
index c07b3c8d4922..1853e551806b 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
@@ -38,7 +38,7 @@ define void @test_array_load2_store2(i32 %C, i32 %D) #1 {
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1024 x i32], ptr @AB, i64 0, i64 [[OFFSET_IDX]]
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[TMP2]], align 4
-; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
; CHECK-NEXT: [[TMP5:%.*]] = or disjoint i64 [[OFFSET_IDX]], 1
@@ -46,7 +46,7 @@ define void @test_array_load2_store2(i32 %C, i32 %D) #1 {
; CHECK-NEXT: [[TMP7:%.*]] = mul nsw <vscale x 4 x i32> [[TMP4]], [[BROADCAST_SPLAT2]]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x i32], ptr @CD, i64 0, i64 [[TMP5]]
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i64 -4
-; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]])
+; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]])
; CHECK-NEXT: store <vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP9]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]]
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512
@@ -134,7 +134,7 @@ define void @test_array_load2_i16_store2(i32 %C, i32 %D) #1 {
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <vscale x 4 x i64> [[TMP7]], i64 0
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [1024 x i32], ptr @CD, i64 0, i64 [[TMP13]]
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i64 -4
-; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[TMP10]], <vscale x 4 x i32> [[TMP12]])
+; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[TMP10]], <vscale x 4 x i32> [[TMP12]])
; CHECK-NEXT: store <vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP15]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]]
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
@@ -216,7 +216,7 @@ define void @test_array_load2_store2_i16(i32 noundef %C, i32 noundef %D) #1 {
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1024 x i32], ptr @AB, i64 0, i64 [[OFFSET_IDX]]
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[TMP6]], align 4
-; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
; CHECK-NEXT: [[TMP9:%.*]] = or disjoint <vscale x 4 x i64> [[VEC_IND]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
@@ -401,11 +401,11 @@ define void @test_reversed_load2_store2(ptr noalias nocapture readonly %A, ptr n
; CHECK-NEXT: [[TMP8:%.*]] = sext i32 [[TMP7]] to i64
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 [[TMP8]]
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[TMP9]], align 4
-; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
-; CHECK-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP10]])
+; CHECK-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP10]])
; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
-; CHECK-NEXT: [[REVERSE1:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP11]])
+; CHECK-NEXT: [[REVERSE1:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP11]])
; CHECK-NEXT: [[TMP12:%.*]] = add nsw <vscale x 4 x i32> [[REVERSE]], [[VEC_IND]]
; CHECK-NEXT: [[TMP13:%.*]] = sub nsw <vscale x 4 x i32> [[REVERSE1]], [[VEC_IND]]
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_ST2]], ptr [[B:%.*]], i64 [[OFFSET_IDX]], i32 1
@@ -414,9 +414,9 @@ define void @test_reversed_load2_store2(ptr noalias nocapture readonly %A, ptr n
; CHECK-NEXT: [[TMP17:%.*]] = sub nsw i32 1, [[TMP16]]
; CHECK-NEXT: [[TMP18:%.*]] = sext i32 [[TMP17]] to i64
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i64 [[TMP18]]
-; CHECK-NEXT: [[REVERSE2:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP12]])
-; CHECK-NEXT: [[REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP13]])
-; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[REVERSE2]], <vscale x 4 x i32> [[REVERSE3]])
+; CHECK-NEXT: [[REVERSE2:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP12]])
+; CHECK-NEXT: [[REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP13]])
+; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[REVERSE2]], <vscale x 4 x i32> [[REVERSE3]])
; CHECK-NEXT: store <vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP19]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]]
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i32> [[VEC_IND]], [[DOTSPLAT]]
@@ -483,7 +483,7 @@ define void @even_load_static_tc(ptr noalias nocapture readonly %A, ptr noalias
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[OFFSET_IDX]]
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[TMP4]], align 4
-; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
; CHECK-NEXT: [[TMP6:%.*]] = shl nsw <vscale x 4 x i32> [[TMP5]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP7:%.*]] = and i64 [[INDEX]], 9223372036854775804
@@ -569,7 +569,7 @@ define void @even_load_dynamic_tc(ptr noalias nocapture readonly %A, ptr noalias
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[OFFSET_IDX]]
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[TMP12]], align 4
-; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
; CHECK-NEXT: [[TMP13:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
; CHECK-NEXT: [[TMP14:%.*]] = shl nsw <vscale x 4 x i32> [[TMP13]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP15:%.*]] = and i64 [[INDEX]], 9223372036854775804
@@ -717,18 +717,18 @@ define void @mixed_load2_store2(ptr noalias nocapture readonly %A, ptr noalias n
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[OFFSET_IDX]]
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[TMP2]], align 4
-; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
; CHECK-NEXT: [[TMP5:%.*]] = or disjoint i64 [[OFFSET_IDX]], 1
; CHECK-NEXT: [[TMP6:%.*]] = mul nsw <vscale x 4 x i32> [[TMP4]], [[TMP3]]
-; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
+; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC2]], 0
; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC2]], 1
; CHECK-NEXT: [[TMP9:%.*]] = add nsw <vscale x 4 x i32> [[TMP8]], [[TMP7]]
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP5]]
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i64 -4
-; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP9]])
+; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP9]])
; CHECK-NEXT: store <vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP11]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]]
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512
@@ -811,7 +811,7 @@ define void @int_float_struct(ptr nocapture readonly %p) #0 {
; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ insertelement (<vscale x 4 x i32> zeroinitializer, i32 undef, i32 0), [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_INTFLOAT:%.*]], ptr [[P:%.*]], i64 [[INDEX]], i32 0
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[TMP2]], align 4
-; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
; CHECK-NEXT: [[TMP5:%.*]] = bitcast <vscale x 4 x i32> [[TMP4]] to <vscale x 4 x float>
@@ -910,7 +910,7 @@ define void @PR27626_0(ptr %p, i32 %z, i64 %n) #1 {
; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], <vscale x 4 x ptr> [[TMP12]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
; CHECK-NEXT: [[TMP14:%.*]] = extractelement <vscale x 4 x ptr> [[TMP12]], i64 0
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[TMP14]], align 4
-; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
; CHECK-NEXT: [[TMP15:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP15]], <vscale x 4 x ptr> [[TMP13]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
@@ -994,12 +994,12 @@ define i32 @PR27626_1(ptr %p, i64 %n) #1 {
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P:%.*]], i64 [[INDEX]], i32 0
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], <vscale x 4 x i64> [[VEC_IND]], i32 1
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[TMP12]], align 4
-; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP14]], <vscale x 4 x ptr> [[TMP13]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
; CHECK-NEXT: [[TMP15:%.*]] = extractelement <vscale x 4 x ptr> [[TMP13]], i64 0
; CHECK-NEXT: [[WIDE_VEC1:%.*]] = load <vscale x 8 x i32>, ptr [[TMP15]], align 4
-; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC1]])
+; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC1]])
; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC2]], 0
; CHECK-NEXT: [[TMP17]] = add <vscale x 4 x i32> [[TMP16]], [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
@@ -1092,7 +1092,7 @@ define void @PR27626_2(ptr %p, i64 %n, i32 %z) #1 {
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], <vscale x 4 x i64> [[VEC_IND]], i32 1
; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], <vscale x 4 x ptr> [[TMP12]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[TMP13]], align 4
-; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
; CHECK-NEXT: [[TMP15:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP15]], <vscale x 4 x ptr> [[TMP14]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
@@ -1181,11 +1181,11 @@ define i32 @PR27626_3(ptr %p, i64 %n, i32 %z) #1 {
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[INDEX]], i32 1
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], <vscale x 4 x i64> [[TMP12]], i32 1
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[TMP13]], align 4
-; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP16]], <vscale x 4 x ptr> [[TMP15]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
; CHECK-NEXT: [[WIDE_VEC1:%.*]] = load <vscale x 8 x i32>, ptr [[TMP14]], align 4
-; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC1]])
+; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC1]])
; CHECK-NEXT: [[TMP17:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC2]], 0
; CHECK-NEXT: [[TMP18]] = add <vscale x 4 x i32> [[TMP17]], [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
@@ -1291,7 +1291,7 @@ define void @PR27626_4(ptr %a, i32 %x, i32 %y, i32 %z, i64 %n) #1 {
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP12]]
; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], <vscale x 4 x ptr> [[TMP13]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i64 -4
-; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[BROADCAST_SPLAT2]], <vscale x 4 x i32> [[BROADCAST_SPLAT4]])
+; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[BROADCAST_SPLAT2]], <vscale x 4 x i32> [[BROADCAST_SPLAT4]])
; CHECK-NEXT: store <vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP15]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]]
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
@@ -1497,7 +1497,7 @@ define void @PR34743(ptr %a, ptr %b, i64 %n) #1 {
; CHECK-NEXT: [[TMP21:%.*]] = sext <vscale x 4 x i16> [[WIDE_MASKED_GATHER]] to <vscale x 4 x i32>
; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i16, ptr [[A]], <vscale x 4 x i64> [[TMP19]]
; CHECK-NEXT: [[WIDE_MASKED_GATHER4]] = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16.nxv4p0(<vscale x 4 x ptr> [[TMP22]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i16> poison), !alias.scope [[META34]]
-; CHECK-NEXT: [[TMP23:%.*]] = call <vscale x 4 x i16> @llvm.experimental.vector.splice.nxv4i16(<vscale x 4 x i16> [[VECTOR_RECUR]], <vscale x 4 x i16> [[WIDE_MASKED_GATHER4]], i32 -1)
+; CHECK-NEXT: [[TMP23:%.*]] = call <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16> [[VECTOR_RECUR]], <vscale x 4 x i16> [[WIDE_MASKED_GATHER4]], i32 -1)
; CHECK-NEXT: [[TMP24:%.*]] = sext <vscale x 4 x i16> [[TMP23]] to <vscale x 4 x i32>
; CHECK-NEXT: [[TMP25:%.*]] = sext <vscale x 4 x i16> [[WIDE_MASKED_GATHER4]] to <vscale x 4 x i32>
; CHECK-NEXT: [[TMP26:%.*]] = mul nsw <vscale x 4 x i32> [[TMP24]], [[TMP21]]
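
For reference, a minimal standalone sketch of the renamed interleave intrinsics that the updated CHECK lines above expect. The function name and element types here are illustrative, not taken from the patch; only the intrinsic names changed (the "experimental." prefix was dropped), the signatures are unchanged:

define void @interleave2_roundtrip(<vscale x 4 x i32> %even, <vscale x 4 x i32> %odd, ptr %p) {
  ; Interleave the two inputs lane-wise into [e0, o0, e1, o1, ...].
  %wide = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %even, <vscale x 4 x i32> %odd)
  store <vscale x 8 x i32> %wide, ptr %p, align 4
  ; Deinterleave splits a wide vector back into its even/odd halves.
  %halves = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %wide)
  ret void
}
declare <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
declare { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32>)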
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll
index 3ba91360850e..726d98f4d37d 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll
@@ -52,9 +52,9 @@ define dso_local void @masked_strided1(ptr noalias nocapture readonly %p, ptr no
; SCALAR_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = shl i32 [[INDEX]], 1
; SCALAR_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64
; SCALAR_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP9]]
-; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 32 x i1> @llvm.experimental.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
+; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
; SCALAR_TAIL_FOLDING-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.masked.load.nxv32i8.p0(ptr [[TMP10]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK]], <vscale x 32 x i8> poison)
-; SCALAR_TAIL_FOLDING-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.experimental.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[WIDE_MASKED_VEC]])
+; SCALAR_TAIL_FOLDING-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[WIDE_MASKED_VEC]])
; SCALAR_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 0
; SCALAR_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 1
; SCALAR_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = or disjoint i32 [[TMP8]], 1
@@ -63,8 +63,8 @@ define dso_local void @masked_strided1(ptr noalias nocapture readonly %p, ptr no
; SCALAR_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = sext i32 [[TMP13]] to i64
; SCALAR_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP16]]
; SCALAR_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP17]], i64 -1
-; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.experimental.vector.interleave2.nxv32i8(<vscale x 16 x i8> [[TMP14]], <vscale x 16 x i8> [[TMP15]])
-; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK1:%.*]] = call <vscale x 32 x i1> @llvm.experimental.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
+; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> [[TMP14]], <vscale x 16 x i8> [[TMP15]])
+; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK1:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv32i8.p0(<vscale x 32 x i8> [[INTERLEAVED_VEC]], ptr [[TMP18]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK1]])
; SCALAR_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP20]]
; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[DOTSPLAT]]
@@ -134,9 +134,9 @@ define dso_local void @masked_strided1(ptr noalias nocapture readonly %p, ptr no
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = shl i32 [[INDEX]], 1
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = sext i32 [[TMP7]] to i64
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP8]]
-; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 32 x i1> @llvm.experimental.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP10]], <vscale x 16 x i1> [[TMP10]])
+; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP10]], <vscale x 16 x i1> [[TMP10]])
; PREDICATED_TAIL_FOLDING-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.masked.load.nxv32i8.p0(ptr [[TMP9]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK]], <vscale x 32 x i8> poison)
-; PREDICATED_TAIL_FOLDING-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.experimental.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[WIDE_MASKED_VEC]])
+; PREDICATED_TAIL_FOLDING-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[WIDE_MASKED_VEC]])
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 0
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 1
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = or disjoint i32 [[TMP7]], 1
@@ -145,8 +145,8 @@ define dso_local void @masked_strided1(ptr noalias nocapture readonly %p, ptr no
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = sext i32 [[TMP13]] to i64
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP16]]
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP17]], i64 -1
-; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.experimental.vector.interleave2.nxv32i8(<vscale x 16 x i8> [[TMP14]], <vscale x 16 x i8> [[TMP15]])
-; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK1:%.*]] = call <vscale x 32 x i1> @llvm.experimental.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP10]], <vscale x 16 x i1> [[TMP10]])
+; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> [[TMP14]], <vscale x 16 x i8> [[TMP15]])
+; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK1:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP10]], <vscale x 16 x i1> [[TMP10]])
; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv32i8.p0(<vscale x 32 x i8> [[INTERLEAVED_VEC]], ptr [[TMP18]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK1]])
; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP20]]
; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP2]])
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-option.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-option.ll
index 1dfa7f8fe18b..cf4d65318b7e 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-option.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-option.ll
@@ -178,7 +178,7 @@ define void @add_recur(ptr noalias %dst, ptr noalias %src, i64 %n) #0 {
; CHECK-NOTF-NOT: %{{.*}} = phi <vscale x 4 x i1>
; CHECK-NOTF: %[[VECTOR_RECUR:.*]] = phi <vscale x 4 x i32> [ %[[RECUR_INIT]], %vector.ph ], [ %[[LOAD:.*]], %vector.body ]
; CHECK-NOTF: %[[LOAD]] = load <vscale x 4 x i32>
-; CHECK-NOTF: %[[SPLICE:.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %[[VECTOR_RECUR]], <vscale x 4 x i32> %[[LOAD]], i32 -1)
+; CHECK-NOTF: %[[SPLICE:.*]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> %[[VECTOR_RECUR]], <vscale x 4 x i32> %[[LOAD]], i32 -1)
; CHECK-NOTF: %[[ADD:.*]] = add nsw <vscale x 4 x i32> %[[LOAD]], %[[SPLICE]]
; CHECK-NOTF: store <vscale x 4 x i32> %[[ADD]]
@@ -191,7 +191,7 @@ define void @add_recur(ptr noalias %dst, ptr noalias %src, i64 %n) #0 {
; CHECK-TF-NORED: %[[ACTIVE_LANE_MASK:.*]] = phi <vscale x 4 x i1>
; CHECK-TF-NORED: %[[VECTOR_RECUR:.*]] = phi <vscale x 4 x i32> [ %[[RECUR_INIT]], %vector.ph ], [ %[[LOAD:.*]], %vector.body ]
; CHECK-TF-NORED: %[[LOAD]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0({{.*}} %[[ACTIVE_LANE_MASK]]
-; CHECK-TF-NORED: %[[SPLICE:.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %[[VECTOR_RECUR]], <vscale x 4 x i32> %[[LOAD]], i32 -1)
+; CHECK-TF-NORED: %[[SPLICE:.*]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> %[[VECTOR_RECUR]], <vscale x 4 x i32> %[[LOAD]], i32 -1)
; CHECK-TF-NORED: %[[ADD:.*]] = add nsw <vscale x 4 x i32> %[[LOAD]], %[[SPLICE]]
; CHECK-TF-NORED: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> %[[ADD]], {{.*}} <vscale x 4 x i1> %[[ACTIVE_LANE_MASK]])
@@ -204,7 +204,7 @@ define void @add_recur(ptr noalias %dst, ptr noalias %src, i64 %n) #0 {
; CHECK-TF-NOREC-NOT: %{{.*}} = phi <vscale x 4 x i1>
; CHECK-TF-NOREC: %[[VECTOR_RECUR:.*]] = phi <vscale x 4 x i32> [ %[[RECUR_INIT]], %vector.ph ], [ %[[LOAD:.*]], %vector.body ]
; CHECK-TF-NOREC: %[[LOAD]] = load <vscale x 4 x i32>
-; CHECK-TF-NOREC: %[[SPLICE:.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %[[VECTOR_RECUR]], <vscale x 4 x i32> %[[LOAD]], i32 -1)
+; CHECK-TF-NOREC: %[[SPLICE:.*]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> %[[VECTOR_RECUR]], <vscale x 4 x i32> %[[LOAD]], i32 -1)
; CHECK-TF-NOREC: %[[ADD:.*]] = add nsw <vscale x 4 x i32> %[[LOAD]], %[[SPLICE]]
; CHECK-TF-NOREC: store <vscale x 4 x i32> %[[ADD]]
@@ -217,7 +217,7 @@ define void @add_recur(ptr noalias %dst, ptr noalias %src, i64 %n) #0 {
; CHECK-TF-NOREV: %[[ACTIVE_LANE_MASK:.*]] = phi <vscale x 4 x i1>
; CHECK-TF-NOREV: %[[VECTOR_RECUR:.*]] = phi <vscale x 4 x i32> [ %[[RECUR_INIT]], %vector.ph ], [ %[[LOAD:.*]], %vector.body ]
; CHECK-TF-NOREV: %[[LOAD]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0({{.*}} %[[ACTIVE_LANE_MASK]]
-; CHECK-TF-NOREV: %[[SPLICE:.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %[[VECTOR_RECUR]], <vscale x 4 x i32> %[[LOAD]], i32 -1)
+; CHECK-TF-NOREV: %[[SPLICE:.*]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> %[[VECTOR_RECUR]], <vscale x 4 x i32> %[[LOAD]], i32 -1)
; CHECK-TF-NOREV: %[[ADD:.*]] = add nsw <vscale x 4 x i32> %[[LOAD]], %[[SPLICE]]
; CHECK-TF-NOREV: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> %[[ADD]], {{.*}} <vscale x 4 x i1> %[[ACTIVE_LANE_MASK]])
@@ -230,7 +230,7 @@ define void @add_recur(ptr noalias %dst, ptr noalias %src, i64 %n) #0 {
; CHECK-TF: %[[ACTIVE_LANE_MASK:.*]] = phi <vscale x 4 x i1>
; CHECK-TF: %[[VECTOR_RECUR:.*]] = phi <vscale x 4 x i32> [ %[[RECUR_INIT]], %vector.ph ], [ %[[LOAD:.*]], %vector.body ]
; CHECK-TF: %[[LOAD]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0({{.*}} %[[ACTIVE_LANE_MASK]]
-; CHECK-TF: %[[SPLICE:.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %[[VECTOR_RECUR]], <vscale x 4 x i32> %[[LOAD]], i32 -1)
+; CHECK-TF: %[[SPLICE:.*]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> %[[VECTOR_RECUR]], <vscale x 4 x i32> %[[LOAD]], i32 -1)
; CHECK-TF: %[[ADD:.*]] = add nsw <vscale x 4 x i32> %[[LOAD]], %[[SPLICE]]
; CHECK-TF: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> %[[ADD]], {{.*}} <vscale x 4 x i1> %[[ACTIVE_LANE_MASK]])
@@ -243,7 +243,7 @@ define void @add_recur(ptr noalias %dst, ptr noalias %src, i64 %n) #0 {
; CHECK-TF-ONLYRED-NOT: %{{.*}} = phi <vscale x 4 x i1>
; CHECK-TF-ONLYRED: %[[VECTOR_RECUR:.*]] = phi <vscale x 4 x i32> [ %[[RECUR_INIT]], %vector.ph ], [ %[[LOAD:.*]], %vector.body ]
; CHECK-TF-ONLYRED: %[[LOAD]] = load <vscale x 4 x i32>
-; CHECK-TF-ONLYRED: %[[SPLICE:.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %[[VECTOR_RECUR]], <vscale x 4 x i32> %[[LOAD]], i32 -1)
+; CHECK-TF-ONLYRED: %[[SPLICE:.*]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> %[[VECTOR_RECUR]], <vscale x 4 x i32> %[[LOAD]], i32 -1)
; CHECK-TF-ONLYRED: %[[ADD:.*]] = add nsw <vscale x 4 x i32> %[[LOAD]], %[[SPLICE]]
; CHECK-TF-ONLYRED: store <vscale x 4 x i32> %[[ADD]]
@@ -256,7 +256,7 @@ define void @add_recur(ptr noalias %dst, ptr noalias %src, i64 %n) #0 {
; CHECK-NEOVERSE-V1-NOT: %{{.*}} = phi <vscale x 4 x i1>
; CHECK-NEOVERSE-V1: %[[VECTOR_RECUR:.*]] = phi <vscale x 4 x i32> [ %[[RECUR_INIT]], %vector.ph ], [ %[[LOAD:.*]], %vector.body ]
; CHECK-NEOVERSE-V1: %[[LOAD]] = load <vscale x 4 x i32>
-; CHECK-NEOVERSE-V1: %[[SPLICE:.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %[[VECTOR_RECUR]], <vscale x 4 x i32> %[[LOAD]], i32 -1)
+; CHECK-NEOVERSE-V1: %[[SPLICE:.*]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> %[[VECTOR_RECUR]], <vscale x 4 x i32> %[[LOAD]], i32 -1)
; CHECK-NEOVERSE-V1: %[[ADD:.*]] = add nsw <vscale x 4 x i32> %[[LOAD]], %[[SPLICE]]
; CHECK-NEOVERSE-V1: store <vscale x 4 x i32> %[[ADD]]
@@ -350,30 +350,30 @@ define void @reverse(ptr noalias %dst, ptr noalias %src) #0 {
; CHECK-NOTF: vector.body:
; CHECK-NOTF-NOT: %{{.*}} = phi <vscale x 4 x i1>
; CHECK-NOTF: %[[LOAD:.*]] = load <vscale x 2 x double>, ptr
-; CHECK-NOTF: %{{.*}} = call <vscale x 2 x double> @llvm.experimental.vector.reverse.nxv2f64(<vscale x 2 x double> %[[LOAD]])
+; CHECK-NOTF: %{{.*}} = call <vscale x 2 x double> @llvm.vector.reverse.nxv2f64(<vscale x 2 x double> %[[LOAD]])
; CHECK-TF-NOREV-LABEL: @reverse(
; CHECK-TF-NOREV: vector.body:
; CHECK-TF-NOREV-NOT: %{{.*}} = phi <vscale x 4 x i1>
; CHECK-TF-NOREV: %[[LOAD:.*]] = load <vscale x 2 x double>, ptr
-; CHECK-TF-NOREV: %{{.*}} = call <vscale x 2 x double> @llvm.experimental.vector.reverse.nxv2f64(<vscale x 2 x double> %[[LOAD]])
+; CHECK-TF-NOREV: %{{.*}} = call <vscale x 2 x double> @llvm.vector.reverse.nxv2f64(<vscale x 2 x double> %[[LOAD]])
; CHECK-TF-LABEL: @reverse(
; CHECK-TF: vector.body:
; CHECK-TF: %[[ACTIVE_LANE_MASK:.*]] = phi <vscale x 2 x i1>
-; CHECK-TF: %[[REVERSE_MASK:.*]] = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> %[[ACTIVE_LANE_MASK]])
+; CHECK-TF: %[[REVERSE_MASK:.*]] = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> %[[ACTIVE_LANE_MASK]])
; CHECK-TF: %[[MASKED_LOAD:.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0({{.*}} <vscale x 2 x i1> %reverse
; CHECK-TF-NORED-LABEL: @reverse(
; CHECK-TF-NORED: vector.body:
; CHECK-TF-NORED: %[[ACTIVE_LANE_MASK:.*]] = phi <vscale x 2 x i1>
-; CHECK-TF-NORED: %[[REVERSE_MASK:.*]] = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> %[[ACTIVE_LANE_MASK]])
+; CHECK-TF-NORED: %[[REVERSE_MASK:.*]] = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> %[[ACTIVE_LANE_MASK]])
; CHECK-TF-NORED: %[[MASKED_LOAD:.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0({{.*}} <vscale x 2 x i1> %reverse
; CHECK-TF-NOREC-LABEL: @reverse(
; CHECK-TF-NOREC: vector.body:
; CHECK-TF-NOREC: %[[ACTIVE_LANE_MASK:.*]] = phi <vscale x 2 x i1>
-; CHECK-TF-NOREC: %[[REVERSE_MASK:.*]] = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> %[[ACTIVE_LANE_MASK]])
+; CHECK-TF-NOREC: %[[REVERSE_MASK:.*]] = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> %[[ACTIVE_LANE_MASK]])
; CHECK-TF-NOREC: %[[MASKED_LOAD:.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0({{.*}} <vscale x 2 x i1> %reverse
entry:
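
A minimal sketch of the renamed splice intrinsic checked in the add_recur tests above, with illustrative value names. A splice with an offset of -1 is the "previous value" rotation the vectorizer uses for first-order recurrences:

define <vscale x 4 x i32> @splice_prev(<vscale x 4 x i32> %recur, <vscale x 4 x i32> %cur) {
  ; Result = the last element of %recur followed by all but the last
  ; element of %cur (the trailing-element count is given by the -1 offset).
  %s = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> %recur, <vscale x 4 x i32> %cur, i32 -1)
  ret <vscale x 4 x i32> %s
}
declare <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, i32)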
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse-mask4.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse-mask4.ll
index 70833e44b075..9485d827ced4 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse-mask4.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse-mask4.ll
@@ -1,5 +1,5 @@
; This is the loop in c++ being vectorized in this file with
-; experimental.vector.reverse
+; vector.reverse
;#pragma clang loop vectorize_width(4, scalable)
; for (long int i = N - 1; i >= 0; i--)
@@ -18,12 +18,12 @@ target triple = "aarch64-unknown-linux-gnu"
define void @vector_reverse_mask_nxv4i1(ptr %a, ptr %cond, i64 %N) #0 {
; CHECK-LABEL: vector.body:
-; CHECK: %[[REVERSE6:.*]] = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %{{.*}})
+; CHECK: %[[REVERSE6:.*]] = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %{{.*}})
; CHECK: %[[WIDEMSKLOAD:.*]] = call <vscale x 4 x double> @llvm.masked.load.nxv4f64.p0(ptr %{{.*}}, i32 8, <vscale x 4 x i1> %[[REVERSE6]], <vscale x 4 x double> poison)
-; CHECK: %[[REVERSE7:.*]] = call <vscale x 4 x double> @llvm.experimental.vector.reverse.nxv4f64(<vscale x 4 x double> %[[WIDEMSKLOAD]])
+; CHECK: %[[REVERSE7:.*]] = call <vscale x 4 x double> @llvm.vector.reverse.nxv4f64(<vscale x 4 x double> %[[WIDEMSKLOAD]])
; CHECK: %[[FADD:.*]] = fadd <vscale x 4 x double> %[[REVERSE7]]
-; CHECK: %[[REVERSE9:.*]] = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %{{.*}})
-; CHECK: %[[REVERSE8:.*]] = call <vscale x 4 x double> @llvm.experimental.vector.reverse.nxv4f64(<vscale x 4 x double> %[[FADD]])
+; CHECK: %[[REVERSE9:.*]] = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %{{.*}})
+; CHECK: %[[REVERSE8:.*]] = call <vscale x 4 x double> @llvm.vector.reverse.nxv4f64(<vscale x 4 x double> %[[FADD]])
; CHECK: call void @llvm.masked.store.nxv4f64.p0(<vscale x 4 x double> %[[REVERSE8]], ptr %{{.*}}, i32 8, <vscale x 4 x i1> %[[REVERSE9]]
entry:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll
index e35a4db36905..e3bba1338e1d 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; This is the loop in c++ being vectorized in this file with
-;experimental.vector.reverse
+;vector.reverse
; #pragma clang loop vectorize_width(8, scalable) interleave_count(2)
; for (int i = N-1; i >= 0; --i)
; a[i] = b[i] + 1.0;
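
As a rough illustration of the comment above, one vector iteration of that count-down loop lowers to a contiguous load, a lane reversal, the add, and a reversal back before the store. The sketch below assumes a double element type and <vscale x 8 x double> vectors to match vectorize_width(8, scalable); it is not copied from the test:

define void @reversed_iteration(ptr %a, ptr %b, i64 %idx) {
  ; Contiguous load of a vector's worth of elements ending at the
  ; current (descending) induction position.
  %gep.b = getelementptr inbounds double, ptr %b, i64 %idx
  %v = load <vscale x 8 x double>, ptr %gep.b, align 8
  ; Memory order is ascending, so reverse lanes to match i = N-1, N-2, ...
  %rev = call <vscale x 8 x double> @llvm.vector.reverse.nxv8f64(<vscale x 8 x double> %v)
  ; Splat 1.0 and perform the add from the scalar loop body.
  %ins = insertelement <vscale x 8 x double> poison, double 1.0, i64 0
  %one = shufflevector <vscale x 8 x double> %ins, <vscale x 8 x double> poison, <vscale x 8 x i32> zeroinitializer
  %add = fadd <vscale x 8 x double> %rev, %one
  ; Reverse back so the result can be stored contiguously.
  %out = call <vscale x 8 x double> @llvm.vector.reverse.nxv8f64(<vscale x 8 x double> %add)
  %gep.a = getelementptr inbounds double, ptr %a, i64 %idx
  store <vscale x 8 x double> %out, ptr %gep.a, align 8
  ret void
}
declare <vscale x 8 x double> @llvm.vector.reverse.nxv8f64(<vscale x 8 x double>)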
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll
index 126ceac7325a..61105e51cb94 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll
@@ -40,10 +40,10 @@ define void @widen_ptr_phi_unrolled(ptr noalias nocapture %a, ptr noalias nocapt
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP9]], i64 [[TMP7]]
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[TMP8]], align 4
; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <vscale x 8 x i32>, ptr [[TMP10]], align 4
-; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
-; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC2]])
+; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC2]])
; CHECK-NEXT: [[TMP13:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC3]], 0
; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC3]], 1
; CHECK-NEXT: [[TMP15:%.*]] = add nsw <vscale x 4 x i32> [[TMP11]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-counting-down.ll b/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-counting-down.ll
index 5c1966fa7a2d..0f524561eadc 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-counting-down.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-counting-down.ll
@@ -107,9 +107,10 @@ while.body:
%1 = load i8, ptr %b.addr.07, align 1
%add = add i8 %1, %0
%incdec.ptr4 = getelementptr inbounds i8, ptr %c.addr.08, i32 1
- store i8 %add, ptr %c.addr.08, align 1
%cmp = icmp sgt i32 %N.addr.09, 1
%select = select i1 %cmp, i8 %0, i8 %1
+ %add2 = add i8 %add, %select
+ store i8 %add2, ptr %c.addr.08, align 1
br i1 %cmp, label %while.body, label %while.end.loopexit
while.end.loopexit:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
index 72d9691b2bb8..c3374fceb1fb 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
; This is the loop in c++ being vectorized in this file with
-;experimental.vector.reverse
+;vector.reverse
; #pragma clang loop vectorize_width(4, scalable)
; for (int i = N-1; i >= 0; --i)
; a[i] = b[i] + 1.0;
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
index f2222e0a1f93..0dee4a9b8585 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
@@ -46,9 +46,9 @@ define void @reverse_load_store(i64 %startval, ptr noalias %ptr, ptr noalias %pt
; IF-EVL-NEXT: [[TMP16:%.*]] = sub i64 1, [[TMP14]]
; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i64 [[TMP15]]
; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i64 [[TMP16]]
-; IF-EVL-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> [[TMP10]])
+; IF-EVL-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> [[TMP10]])
; IF-EVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP18]], i32 4, <vscale x 4 x i1> [[REVERSE]], <vscale x 4 x i32> poison)
-; IF-EVL-NEXT: [[REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[WIDE_MASKED_LOAD]])
+; IF-EVL-NEXT: [[REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[WIDE_MASKED_LOAD]])
; IF-EVL-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[PTR2:%.*]], i64 [[TMP11]]
; IF-EVL-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
; IF-EVL-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 4
@@ -56,8 +56,8 @@ define void @reverse_load_store(i64 %startval, ptr noalias %ptr, ptr noalias %pt
; IF-EVL-NEXT: [[TMP23:%.*]] = sub i64 1, [[TMP21]]
; IF-EVL-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i64 [[TMP22]]
; IF-EVL-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i64 [[TMP23]]
-; IF-EVL-NEXT: [[REVERSE4:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> [[TMP10]])
-; IF-EVL-NEXT: [[REVERSE5:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[REVERSE3]])
+; IF-EVL-NEXT: [[REVERSE4:%.*]] = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> [[TMP10]])
+; IF-EVL-NEXT: [[REVERSE5:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[REVERSE3]])
; IF-EVL-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[REVERSE5]], ptr [[TMP25]], i32 4, <vscale x 4 x i1> [[REVERSE4]])
; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
; IF-EVL-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/runtime-checks-hoist.ll b/llvm/test/Transforms/LoopVectorize/runtime-checks-hoist.ll
index 0b9b592627c6..c4f9c404a926 100644
--- a/llvm/test/Transforms/LoopVectorize/runtime-checks-hoist.ll
+++ b/llvm/test/Transforms/LoopVectorize/runtime-checks-hoist.ll
@@ -1328,13 +1328,11 @@ define void @unknown_inner_stride(ptr nocapture noundef %dst, ptr nocapture noun
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP15:%.*]] = mul nsw i64 [[TMP14]], [[TMP0]]
-; CHECK-NEXT: [[TMP16:%.*]] = add nsw i64 [[TMP15]], [[TMP11]]
+; CHECK-NEXT: [[TMP16:%.*]] = add nsw i64 [[TMP14]], [[TMP11]]
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP16]]
; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 0
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP18]], align 4, !alias.scope [[META60:![0-9]+]]
-; CHECK-NEXT: [[TMP19:%.*]] = mul nsw i64 [[TMP14]], [[TMP1]]
-; CHECK-NEXT: [[TMP20:%.*]] = add nsw i64 [[TMP19]], [[TMP12]]
+; CHECK-NEXT: [[TMP20:%.*]] = add nsw i64 [[TMP14]], [[TMP12]]
; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP20]]
; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i32 0
; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i32>, ptr [[TMP22]], align 4, !alias.scope [[META63:![0-9]+]], !noalias [[META60]]
diff --git a/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll
index 3be31c011eaa..d64755999635 100644
--- a/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll
@@ -21,7 +21,7 @@ define i32 @recurrence_1(ptr nocapture readonly %a, ptr nocapture %b, i32 %n) {
; CHECK-VF4UF1: %[[INDEX:.*]] = phi i64 [ 0, %vector.ph ], [ %[[NEXT_IDX:.*]], %vector.body ]
; CHECK-VF4UF1: %[[VEC_RECUR:.*]] = phi <vscale x 4 x i32> [ %[[VEC_RECUR_INIT]], %vector.ph ], [ %[[LOAD:.*]], %vector.body ]
; CHECK-VF4UF1: %[[LOAD]] = load <vscale x 4 x i32>, ptr
-; CHECK-VF4UF1: %[[SPLICE:.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %[[VEC_RECUR]], <vscale x 4 x i32> %[[LOAD]], i32 -1)
+; CHECK-VF4UF1: %[[SPLICE:.*]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> %[[VEC_RECUR]], <vscale x 4 x i32> %[[LOAD]], i32 -1)
; CHECK-VF4UF1: middle.block:
; CHECK-VF4UF1: %[[VSCALE2:.*]] = call i32 @llvm.vscale.i32()
; CHECK-VF4UF1: %[[MUL2:.*]] = mul i32 %[[VSCALE2]], 4
@@ -70,7 +70,7 @@ define i32 @recurrence_2(ptr nocapture readonly %a, i32 %n) {
; CHECK-VF4UF1: vector.body:
; CHECK-VF4UF1: %[[VEC_RECUR:.*]] = phi <vscale x 4 x i32> [ %[[VEC_RECUR_INIT]], %vector.ph ], [ %[[LOAD:.*]], %vector.body ]
; CHECK-VF4UF1: %[[LOAD]] = load <vscale x 4 x i32>, ptr
-; CHECK-VF4UF1: %[[REVERSE:.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %[[VEC_RECUR]], <vscale x 4 x i32> %[[LOAD]], i32 -1)
+; CHECK-VF4UF1: %[[REVERSE:.*]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> %[[VEC_RECUR]], <vscale x 4 x i32> %[[LOAD]], i32 -1)
; CHECK-VF4UF1: middle.block:
; CHECK-VF4UF1: %[[VSCALE2:.*]] = call i32 @llvm.vscale.i32()
; CHECK-VF4UF1: %[[MUL2:.*]] = mul i32 %[[VSCALE2]], 4
@@ -119,7 +119,7 @@ define void @recurrence_3(ptr nocapture readonly %a, ptr nocapture %b, i32 %n, f
; CHECK-VF4UF1: vector.body:
; CHECK-VF4UF1: %vector.recur = phi <vscale x 4 x i16> [ %vector.recur.init, %vector.ph ], [ %[[L1:.*]], %vector.body ]
; CHECK-VF4UF1: %[[L1]] = load <vscale x 4 x i16>, ptr
-; CHECK-VF4UF1: %[[SPLICE:.*]] = call <vscale x 4 x i16> @llvm.experimental.vector.splice.nxv4i16(<vscale x 4 x i16> %vector.recur, <vscale x 4 x i16> %[[L1]], i32 -1)
+; CHECK-VF4UF1: %[[SPLICE:.*]] = call <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16> %vector.recur, <vscale x 4 x i16> %[[L1]], i32 -1)
; Check also that the casts were not moved needlessly.
; CHECK-VF4UF1: sitofp <vscale x 4 x i16> %[[L1]] to <vscale x 4 x double>
; CHECK-VF4UF1: sitofp <vscale x 4 x i16> %[[SPLICE]] to <vscale x 4 x double>
@@ -169,8 +169,8 @@ define i64 @constant_folded_previous_value() {
; CHECK-VF4UF2-LABEL: @constant_folded_previous_value
; CHECK-VF4UF2: vector.body
; CHECK-VF4UF2: %[[VECTOR_RECUR:.*]] = phi <vscale x 4 x i64> [ %vector.recur.init, %vector.ph ], [ shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer), %vector.body ]
-; CHECK-VF4UF2: %[[SPLICE1:.*]] = call <vscale x 4 x i64> @llvm.experimental.vector.splice.nxv4i64(<vscale x 4 x i64> %vector.recur, <vscale x 4 x i64> shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer), i32 -1)
-; CHECK-VF4UF2: %[[SPLICE2:.*]] = call <vscale x 4 x i64> @llvm.experimental.vector.splice.nxv4i64(<vscale x 4 x i64> shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i64> shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer), i32 -1)
+; CHECK-VF4UF2: %[[SPLICE1:.*]] = call <vscale x 4 x i64> @llvm.vector.splice.nxv4i64(<vscale x 4 x i64> %vector.recur, <vscale x 4 x i64> shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer), i32 -1)
+; CHECK-VF4UF2: %[[SPLICE2:.*]] = call <vscale x 4 x i64> @llvm.vector.splice.nxv4i64(<vscale x 4 x i64> shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i64> shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer), i32 -1)
; CHECK-VF4UF2: br i1 {{.*}}, label %middle.block, label %vector.body
entry:
br label %scalar.body
@@ -242,7 +242,7 @@ define void @sink_after(ptr %a, ptr %b, i64 %n) {
; CHECK-VF4UF1: vector.body
; CHECK-VF4UF1: %[[VEC_RECUR:.*]] = phi <vscale x 4 x i16> [ %vector.recur.init, %vector.ph ], [ %[[LOAD:.*]], %vector.body ]
; CHECK-VF4UF1: %[[LOAD]] = load <vscale x 4 x i16>, ptr
-; CHECK-VF4UF1-NEXT: %[[SPLICE:.*]] = call <vscale x 4 x i16> @llvm.experimental.vector.splice.nxv4i16(<vscale x 4 x i16> %[[VEC_RECUR]], <vscale x 4 x i16> %[[LOAD]], i32 -1)
+; CHECK-VF4UF1-NEXT: %[[SPLICE:.*]] = call <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16> %[[VEC_RECUR]], <vscale x 4 x i16> %[[LOAD]], i32 -1)
; CHECK-VF4UF1-NEXT: sext <vscale x 4 x i16> %[[SPLICE]] to <vscale x 4 x i32>
; CHECK-VF4UF1-NEXT: sext <vscale x 4 x i16> %[[LOAD]] to <vscale x 4 x i32>
entry:
diff --git a/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll b/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll
new file mode 100644
index 000000000000..d09066fa2d70
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll
@@ -0,0 +1,428 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -passes=loop-vectorize -force-vector-width=4 -S %s | FileCheck %s
+
+target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+
+define void @test_versioned_with_sext_use(i32 %offset, ptr %dst) {
+; CHECK-LABEL: define void @test_versioned_with_sext_use(
+; CHECK-SAME: i32 [[OFFSET:%.*]], ptr [[DST:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[OFFSET_EXT:%.*]] = sext i32 [[OFFSET]] to i64
+; CHECK-NEXT: br label [[OUTER_HEADER:%.*]]
+; CHECK: outer.header.loopexit:
+; CHECK-NEXT: [[IV_2_NEXT_LCSSA:%.*]] = phi i64 [ [[IV_2_NEXT:%.*]], [[INNER_LOOP:%.*]] ], [ [[IND_END:%.*]], [[MIDDLE_BLOCK:%.*]] ]
+; CHECK-NEXT: br label [[OUTER_HEADER]]
+; CHECK: outer.header:
+; CHECK-NEXT: [[IV_1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_2_NEXT_LCSSA]], [[OUTER_HEADER_LOOPEXIT:%.*]] ]
+; CHECK-NEXT: [[C:%.*]] = call i1 @cond()
+; CHECK-NEXT: br i1 [[C]], label [[INNER_LOOP_PREHEADER:%.*]], label [[EXIT:%.*]]
+; CHECK: inner.loop.preheader:
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; CHECK: vector.scevcheck:
+; CHECK-NEXT: [[IDENT_CHECK:%.*]] = icmp ne i32 [[OFFSET]], 1
+; CHECK-NEXT: br i1 [[IDENT_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[TMP0:%.*]] = mul i64 200, [[OFFSET_EXT]]
+; CHECK-NEXT: [[IND_END]] = add i64 [[IV_1]], [[TMP0]]
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[INDEX]], [[OFFSET_EXT]]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[IV_1]], [[TMP1]]
+; CHECK-NEXT: [[TMP2:%.*]] = mul i64 0, [[OFFSET_EXT]]
+; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[TMP4]], i32 0
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr [[TMP5]], align 8
+; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[TMP3]], 1
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200
+; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: br i1 false, label [[OUTER_HEADER_LOOPEXIT]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[IV_1]], [[INNER_LOOP_PREHEADER]] ], [ [[IV_1]], [[VECTOR_SCEVCHECK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ 200, [[MIDDLE_BLOCK]] ], [ 0, [[INNER_LOOP_PREHEADER]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
+; CHECK-NEXT: br label [[INNER_LOOP]]
+; CHECK: inner.loop:
+; CHECK-NEXT: [[IV_2:%.*]] = phi i64 [ [[IV_2_NEXT]], [[INNER_LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[IV_3:%.*]] = phi i32 [ [[IV_3_NEXT:%.*]], [[INNER_LOOP]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[DST]], i64 [[IV_2]]
+; CHECK-NEXT: store i32 0, ptr [[GEP]], align 8
+; CHECK-NEXT: [[IV_2_NEXT]] = add i64 [[IV_2]], [[OFFSET_EXT]]
+; CHECK-NEXT: [[IV_3_NEXT]] = add i32 [[IV_3]], 1
+; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_3]], 200
+; CHECK-NEXT: br i1 [[EC]], label [[OUTER_HEADER_LOOPEXIT]], label [[INNER_LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK: exit:
+; CHECK-NEXT: ret void
+;
+entry:
+ %offset.ext = sext i32 %offset to i64
+ br label %outer.header
+
+outer.header:
+ %iv.1 = phi i64 [ 0, %entry ], [ %iv.2.next, %inner.loop ]
+ %c = call i1 @cond()
+ br i1 %c, label %inner.loop, label %exit
+
+inner.loop:
+ %iv.2 = phi i64 [ %iv.1, %outer.header ], [ %iv.2.next, %inner.loop ]
+ %iv.3 = phi i32 [ 0, %outer.header ], [ %iv.3.next, %inner.loop ]
+ %gep = getelementptr i32, ptr %dst, i64 %iv.2
+ store i32 0, ptr %gep, align 8
+ %iv.2.next = add i64 %iv.2, %offset.ext
+ %iv.3.next = add i32 %iv.3, 1
+ %ec = icmp eq i32 %iv.3, 200
+ br i1 %ec, label %outer.header, label %inner.loop
+
+exit:
+ ret void
+}
+
+define void @test_versioned_with_zext_use(i32 %offset, ptr %dst) {
+; CHECK-LABEL: define void @test_versioned_with_zext_use(
+; CHECK-SAME: i32 [[OFFSET:%.*]], ptr [[DST:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[OFFSET_EXT:%.*]] = zext i32 [[OFFSET]] to i64
+; CHECK-NEXT: br label [[OUTER_HEADER:%.*]]
+; CHECK: outer.header.loopexit:
+; CHECK-NEXT: [[IV_2_NEXT_LCSSA:%.*]] = phi i64 [ [[IV_2_NEXT:%.*]], [[INNER_LOOP:%.*]] ], [ [[IND_END:%.*]], [[MIDDLE_BLOCK:%.*]] ]
+; CHECK-NEXT: br label [[OUTER_HEADER]]
+; CHECK: outer.header:
+; CHECK-NEXT: [[IV_1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_2_NEXT_LCSSA]], [[OUTER_HEADER_LOOPEXIT:%.*]] ]
+; CHECK-NEXT: [[C:%.*]] = call i1 @cond()
+; CHECK-NEXT: br i1 [[C]], label [[INNER_LOOP_PREHEADER:%.*]], label [[EXIT:%.*]]
+; CHECK: inner.loop.preheader:
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; CHECK: vector.scevcheck:
+; CHECK-NEXT: [[IDENT_CHECK:%.*]] = icmp ne i32 [[OFFSET]], 1
+; CHECK-NEXT: br i1 [[IDENT_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[TMP0:%.*]] = mul i64 200, [[OFFSET_EXT]]
+; CHECK-NEXT: [[IND_END]] = add i64 [[IV_1]], [[TMP0]]
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[INDEX]], [[OFFSET_EXT]]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[IV_1]], [[TMP1]]
+; CHECK-NEXT: [[TMP2:%.*]] = mul i64 0, [[OFFSET_EXT]]
+; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[TMP4]], i32 0
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr [[TMP5]], align 8
+; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[TMP3]], 1
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200
+; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: br i1 false, label [[OUTER_HEADER_LOOPEXIT]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[IV_1]], [[INNER_LOOP_PREHEADER]] ], [ [[IV_1]], [[VECTOR_SCEVCHECK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ 200, [[MIDDLE_BLOCK]] ], [ 0, [[INNER_LOOP_PREHEADER]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
+; CHECK-NEXT: br label [[INNER_LOOP]]
+; CHECK: inner.loop:
+; CHECK-NEXT: [[IV_2:%.*]] = phi i64 [ [[IV_2_NEXT]], [[INNER_LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[IV_3:%.*]] = phi i32 [ [[IV_3_NEXT:%.*]], [[INNER_LOOP]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[DST]], i64 [[IV_2]]
+; CHECK-NEXT: store i32 0, ptr [[GEP]], align 8
+; CHECK-NEXT: [[IV_2_NEXT]] = add i64 [[IV_2]], [[OFFSET_EXT]]
+; CHECK-NEXT: [[IV_3_NEXT]] = add i32 [[IV_3]], 1
+; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_3]], 200
+; CHECK-NEXT: br i1 [[EC]], label [[OUTER_HEADER_LOOPEXIT]], label [[INNER_LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK: exit:
+; CHECK-NEXT: ret void
+;
+entry:
+ %offset.ext = zext i32 %offset to i64
+ br label %outer.header
+
+outer.header:
+ %iv.1 = phi i64 [ 0, %entry ], [ %iv.2.next, %inner.loop ]
+ %c = call i1 @cond()
+ br i1 %c, label %inner.loop, label %exit
+
+inner.loop:
+ %iv.2 = phi i64 [ %iv.1, %outer.header ], [ %iv.2.next, %inner.loop ]
+ %iv.3 = phi i32 [ 0, %outer.header ], [ %iv.3.next, %inner.loop ]
+ %gep = getelementptr i32, ptr %dst, i64 %iv.2
+ store i32 0, ptr %gep, align 8
+ %iv.2.next = add i64 %iv.2, %offset.ext
+ %iv.3.next = add i32 %iv.3, 1
+ %ec = icmp eq i32 %iv.3, 200
+ br i1 %ec, label %outer.header, label %inner.loop
+
+exit:
+ ret void
+}
+
+define void @versioned_sext_use_in_gep(i32 %scale, ptr %dst, i64 %scale.2) {
+; CHECK-LABEL: define void @versioned_sext_use_in_gep(
+; CHECK-SAME: i32 [[SCALE:%.*]], ptr [[DST:%.*]], i64 [[SCALE_2:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[SCALE_EXT:%.*]] = sext i32 [[SCALE]] to i64
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; CHECK: vector.scevcheck:
+; CHECK-NEXT: [[IDENT_CHECK:%.*]] = icmp ne i32 [[SCALE]], 1
+; CHECK-NEXT: br i1 [[IDENT_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 1
+; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 2
+; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[INDEX]], 3
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP10]]
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP12]]
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP14]]
+; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP16]]
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[DST]], i64 [[SCALE_2]]
+; CHECK-NEXT: store ptr [[TMP8]], ptr [[TMP11]], align 8
+; CHECK-NEXT: store ptr [[TMP8]], ptr [[TMP13]], align 8
+; CHECK-NEXT: store ptr [[TMP8]], ptr [[TMP15]], align 8
+; CHECK-NEXT: store ptr [[TMP8]], ptr [[TMP17]], align 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
+; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 256, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[IV_MUL:%.*]] = mul i64 [[IV]], [[SCALE_EXT]]
+; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr i8, ptr [[DST]], i64 [[IV_MUL]]
+; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-NEXT: [[SCALE_MUL:%.*]] = mul i64 [[SCALE_EXT]], [[SCALE_2]]
+; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr i8, ptr [[DST]], i64 [[SCALE_MUL]]
+; CHECK-NEXT: store ptr [[GEP_2]], ptr [[GEP_1]], align 8
+; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 256
+; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK: exit:
+; CHECK-NEXT: ret void
+;
+entry:
+ %scale.ext = sext i32 %scale to i64
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %iv.mul = mul i64 %iv, %scale.ext
+ %gep.1 = getelementptr i8, ptr %dst, i64 %iv.mul
+ %iv.next = add i64 %iv, 1
+ %scale.mul = mul i64 %scale.ext, %scale.2
+ %gep.2 = getelementptr i8, ptr %dst, i64 %scale.mul
+ store ptr %gep.2, ptr %gep.1, align 8
+ %ec = icmp eq i64 %iv.next, 256
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+declare i1 @cond()
+
+define void @test_versioned_with_different_uses(i32 %offset, ptr noalias %dst.1, ptr %dst.2) {
+; CHECK-LABEL: define void @test_versioned_with_different_uses(
+; CHECK-SAME: i32 [[OFFSET:%.*]], ptr noalias [[DST_1:%.*]], ptr [[DST_2:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[OFFSET_EXT:%.*]] = zext i32 [[OFFSET]] to i64
+; CHECK-NEXT: br label [[OUTER_HEADER:%.*]]
+; CHECK: outer.header.loopexit:
+; CHECK-NEXT: [[IV_2_NEXT_LCSSA:%.*]] = phi i64 [ [[IV_2_NEXT:%.*]], [[INNER_LOOP:%.*]] ], [ [[IND_END:%.*]], [[MIDDLE_BLOCK:%.*]] ]
+; CHECK-NEXT: br label [[OUTER_HEADER]]
+; CHECK: outer.header:
+; CHECK-NEXT: [[IV_1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_2_NEXT_LCSSA]], [[OUTER_HEADER_LOOPEXIT:%.*]] ]
+; CHECK-NEXT: [[C:%.*]] = call i1 @cond()
+; CHECK-NEXT: br i1 [[C]], label [[INNER_LOOP_PREHEADER:%.*]], label [[EXIT:%.*]]
+; CHECK: inner.loop.preheader:
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; CHECK: vector.scevcheck:
+; CHECK-NEXT: [[IDENT_CHECK:%.*]] = icmp ne i32 [[OFFSET]], 1
+; CHECK-NEXT: br i1 [[IDENT_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[TMP0:%.*]] = mul i64 200, [[OFFSET_EXT]]
+; CHECK-NEXT: [[IND_END]] = add i64 [[IV_1]], [[TMP0]]
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[INDEX]], [[OFFSET_EXT]]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[IV_1]], [[TMP1]]
+; CHECK-NEXT: [[TMP2:%.*]] = mul i64 0, [[OFFSET_EXT]]
+; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], [[TMP2]]
+; CHECK-NEXT: [[OFFSET_IDX2:%.*]] = trunc i64 [[INDEX]] to i32
+; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[OFFSET_IDX2]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[OFFSET_IDX2]], 1
+; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[OFFSET_IDX2]], 2
+; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[OFFSET_IDX2]], 3
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[DST_1]], i32 [[TMP4]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[DST_1]], i32 [[TMP5]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[DST_1]], i32 [[TMP6]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[DST_1]], i32 [[TMP7]]
+; CHECK-NEXT: store i32 0, ptr [[TMP8]], align 8
+; CHECK-NEXT: store i32 0, ptr [[TMP9]], align 8
+; CHECK-NEXT: store i32 0, ptr [[TMP10]], align 8
+; CHECK-NEXT: store i32 0, ptr [[TMP11]], align 8
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[DST_2]], i64 [[TMP3]]
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP12]], i32 0
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr [[TMP13]], align 8
+; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[TMP3]], 1
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200
+; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: br i1 false, label [[OUTER_HEADER_LOOPEXIT]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[IV_1]], [[INNER_LOOP_PREHEADER]] ], [ [[IV_1]], [[VECTOR_SCEVCHECK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ 200, [[MIDDLE_BLOCK]] ], [ 0, [[INNER_LOOP_PREHEADER]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
+; CHECK-NEXT: br label [[INNER_LOOP]]
+; CHECK: inner.loop:
+; CHECK-NEXT: [[IV_2:%.*]] = phi i64 [ [[IV_2_NEXT]], [[INNER_LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[IV_3:%.*]] = phi i32 [ [[IV_3_NEXT:%.*]], [[INNER_LOOP]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[IV_MUL:%.*]] = mul i32 [[IV_3]], [[OFFSET]]
+; CHECK-NEXT: [[GEP_MUL:%.*]] = getelementptr i8, ptr [[DST_1]], i32 [[IV_MUL]]
+; CHECK-NEXT: store i32 0, ptr [[GEP_MUL]], align 8
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[DST_2]], i64 [[IV_2]]
+; CHECK-NEXT: store i32 0, ptr [[GEP]], align 8
+; CHECK-NEXT: [[IV_2_NEXT]] = add i64 [[IV_2]], [[OFFSET_EXT]]
+; CHECK-NEXT: [[IV_3_NEXT]] = add i32 [[IV_3]], 1
+; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_3]], 200
+; CHECK-NEXT: br i1 [[EC]], label [[OUTER_HEADER_LOOPEXIT]], label [[INNER_LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK: exit:
+; CHECK-NEXT: ret void
+;
+entry:
+ %offset.ext = zext i32 %offset to i64
+ br label %outer.header
+
+outer.header:
+ %iv.1 = phi i64 [ 0, %entry ], [ %iv.2.next, %inner.loop ]
+ %c = call i1 @cond()
+ br i1 %c, label %inner.loop, label %exit
+
+inner.loop:
+ %iv.2 = phi i64 [ %iv.1, %outer.header ], [ %iv.2.next, %inner.loop ]
+ %iv.3 = phi i32 [ 0, %outer.header ], [ %iv.3.next, %inner.loop ]
+ %iv.mul = mul i32 %iv.3, %offset
+ %gep.mul = getelementptr i8, ptr %dst.1, i32 %iv.mul
+ store i32 0, ptr %gep.mul, align 8
+ %gep = getelementptr i32, ptr %dst.2, i64 %iv.2
+ store i32 0, ptr %gep, align 8
+ %iv.2.next = add i64 %iv.2, %offset.ext
+ %iv.3.next = add i32 %iv.3, 1
+ %ec = icmp eq i32 %iv.3, 200
+ br i1 %ec, label %outer.header, label %inner.loop
+
+exit:
+ ret void
+}
+
+define void @test_versioned_with_non_ex_use(i32 %offset, ptr noalias %dst.1, ptr %dst.2) {
+; CHECK-LABEL: define void @test_versioned_with_non_ex_use(
+; CHECK-SAME: i32 [[OFFSET:%.*]], ptr noalias [[DST_1:%.*]], ptr [[DST_2:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[OFFSET_EXT:%.*]] = zext i32 [[OFFSET]] to i64
+; CHECK-NEXT: [[ADD:%.*]] = add i32 [[OFFSET]], 3
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; CHECK: vector.scevcheck:
+; CHECK-NEXT: [[TMP0:%.*]] = sub i32 -3, [[OFFSET]]
+; CHECK-NEXT: [[TMP1:%.*]] = icmp slt i32 [[ADD]], 0
+; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 [[TMP0]], i32 [[ADD]]
+; CHECK-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[TMP2]], i32 200)
+; CHECK-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0
+; CHECK-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = sub i32 0, [[MUL_RESULT]]
+; CHECK-NEXT: [[TMP4:%.*]] = icmp slt i32 [[MUL_RESULT]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = icmp sgt i32 [[TMP3]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP1]], i1 [[TMP5]], i1 [[TMP4]]
+; CHECK-NEXT: [[TMP7:%.*]] = or i1 [[TMP6]], [[MUL_OVERFLOW]]
+; CHECK-NEXT: [[IDENT_CHECK:%.*]] = icmp ne i32 [[OFFSET]], 1
+; CHECK-NEXT: [[TMP8:%.*]] = or i1 [[TMP7]], [[IDENT_CHECK]]
+; CHECK-NEXT: br i1 [[TMP8]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[ADD]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP10:%.*]] = mul <4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i32> [[TMP10]], i32 0
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[DST_1]], i32 [[TMP11]]
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i32> [[TMP10]], i32 1
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[DST_1]], i32 [[TMP13]]
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i32> [[TMP10]], i32 2
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[DST_1]], i32 [[TMP15]]
+; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x i32> [[TMP10]], i32 3
+; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[DST_1]], i32 [[TMP17]]
+; CHECK-NEXT: store i32 0, ptr [[TMP12]], align 8
+; CHECK-NEXT: store i32 0, ptr [[TMP14]], align 8
+; CHECK-NEXT: store i32 0, ptr [[TMP16]], align 8
+; CHECK-NEXT: store i32 0, ptr [[TMP18]], align 8
+; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i32, ptr [[DST_2]], i64 [[TMP9]]
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr [[TMP20]], i32 0
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr [[TMP21]], align 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200
+; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 200, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ 200, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[IV_2:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_2_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[IV_3:%.*]] = phi i32 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[IV_3_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[IV_MUL:%.*]] = mul i32 [[IV_3]], [[ADD]]
+; CHECK-NEXT: [[GEP_MUL:%.*]] = getelementptr i8, ptr [[DST_1]], i32 [[IV_MUL]]
+; CHECK-NEXT: store i32 0, ptr [[GEP_MUL]], align 8
+; CHECK-NEXT: [[IV_2_MUL:%.*]] = mul i64 [[IV_2]], [[OFFSET_EXT]]
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[DST_2]], i64 [[IV_2_MUL]]
+; CHECK-NEXT: store i32 0, ptr [[GEP]], align 8
+; CHECK-NEXT: [[IV_2_NEXT]] = add i64 [[IV_2]], 1
+; CHECK-NEXT: [[IV_3_NEXT]] = add i32 [[IV_3]], 1
+; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_3]], 200
+; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK: exit:
+; CHECK-NEXT: ret void
+;
+entry:
+ %offset.ext = zext i32 %offset to i64
+ %add = add i32 %offset, 3
+ br label %loop
+
+loop:
+ %iv.2 = phi i64 [ 0, %entry ], [ %iv.2.next, %loop ]
+ %iv.3 = phi i32 [ 0, %entry ], [ %iv.3.next, %loop ]
+ %iv.mul = mul i32 %iv.3, %add
+ %gep.mul = getelementptr i8, ptr %dst.1, i32 %iv.mul
+ store i32 0, ptr %gep.mul, align 8
+ %iv.2.mul = mul i64 %iv.2, %offset.ext
+ %gep = getelementptr i32, ptr %dst.2, i64 %iv.2.mul
+ store i32 0, ptr %gep, align 8
+ %iv.2.next = add i64 %iv.2, 1
+ %iv.3.next = add i32 %iv.3, 1
+ %ec = icmp eq i32 %iv.3, 200
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]}
+; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]]}
+; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]]}
+; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
+; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META1]]}
+; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]}
+; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META1]]}
+;.
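
The LoopVectorize checks above all follow one pattern worth spelling out: the vectorizer versions a loop whose stride is only known symbolically, guarding the vector body with the "vector.scevcheck" ident check (icmp ne i32 %offset, 1) and falling back to the scalar loop otherwise. A minimal sketch of the kind of input that produces this shape — hypothetical names, not part of the patch:

define void @stride_sketch(ptr %dst, i64 %stride) {
entry:
  br label %loop

loop:                             ; strided store; vectorizable if %stride == 1
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
  %off = mul i64 %iv, %stride
  %gep = getelementptr i32, ptr %dst, i64 %off
  store i32 0, ptr %gep, align 4
  %iv.next = add i64 %iv, 1
  %ec = icmp eq i64 %iv.next, 256
  br i1 %ec, label %exit, label %loop

exit:
  ret void
}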
diff --git a/llvm/test/Transforms/LowerTypeTests/cfi-nounwind-direct-call.ll b/llvm/test/Transforms/LowerTypeTests/cfi-nounwind-direct-call.ll
index 4c88f4acc12f..2795333effd7 100644
--- a/llvm/test/Transforms/LowerTypeTests/cfi-nounwind-direct-call.ll
+++ b/llvm/test/Transforms/LowerTypeTests/cfi-nounwind-direct-call.ll
@@ -109,8 +109,8 @@ attributes #6 = { noreturn nounwind }
!11 = !{}
!12 = !{!"branch_weights", i32 1048575, i32 1}
; CHECK: Function Attrs: minsize mustprogress nofree norecurse nosync nounwind optsize willreturn memory(none)
-; CHECK-LABEL: define dso_local noundef i32 @_Z9nothrow_ei
-; CHECK-SAME: (i32 noundef [[NUM:%.*]]) #[[ATTR0:[0-9]+]] !type !4 !type !5 !type !6 {
+; CHECK-LABEL: define dso_local noundef range(i32 0, 2) i32 @_Z9nothrow_ei
+; CHECK-SAME: (i32 noundef [[NUM:%.*]]) #[[ATTR0:[0-9]+]] !type [[META4:![0-9]+]] !type [[META5:![0-9]+]] !type [[META6:![0-9]+]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp ne i32 [[NUM]], 0
; CHECK-NEXT: [[DOT:%.*]] = zext i1 [[TOBOOL_NOT]] to i32
@@ -118,8 +118,8 @@ attributes #6 = { noreturn nounwind }
;
;
; CHECK: Function Attrs: minsize mustprogress nofree norecurse nosync nounwind optsize willreturn memory(write, argmem: none, inaccessiblemem: none)
-; CHECK-LABEL: define dso_local noundef i32 @_Z10call_catchi
-; CHECK-SAME: (i32 noundef [[NUM:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] !type !4 !type !5 !type !6 {
+; CHECK-LABEL: define dso_local noundef range(i32 0, 2) i32 @_Z10call_catchi
+; CHECK-SAME: (i32 noundef [[NUM:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] !type [[META4]] !type [[META5]] !type [[META6]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: store ptr @_Z9nothrow_ei.cfi_jt, ptr @catch_ptr, align 8, !tbaa [[TBAA7:![0-9]+]]
; CHECK-NEXT: [[TOBOOL_NOT_I:%.*]] = icmp ne i32 [[NUM]], 0
@@ -131,17 +131,17 @@ attributes #6 = { noreturn nounwind }
; CHECK-LABEL: define weak_odr hidden void @__cfi_check_fail
; CHECK-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR2:[0-9]+]] {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq ptr [[TMP0]], null, !nosanitize !11
-; CHECK-NEXT: br i1 [[DOTNOT]], label [[TRAP:%.*]], label [[CONT:%.*]], !nosanitize !11
+; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq ptr [[TMP0]], null, !nosanitize [[META11:![0-9]+]]
+; CHECK-NEXT: br i1 [[DOTNOT]], label [[TRAP:%.*]], label [[CONT:%.*]], !nosanitize [[META11]]
; CHECK: trap:
-; CHECK-NEXT: tail call void @llvm.ubsantrap(i8 2) #[[ATTR5:[0-9]+]], !nosanitize !11
-; CHECK-NEXT: unreachable, !nosanitize !11
+; CHECK-NEXT: tail call void @llvm.ubsantrap(i8 2) #[[ATTR6:[0-9]+]], !nosanitize [[META11]]
+; CHECK-NEXT: unreachable, !nosanitize [[META11]]
; CHECK: cont:
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr [[TMP0]], align 4, !nosanitize !11
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr [[TMP0]], align 4, !nosanitize [[META11]]
; CHECK-NEXT: [[SWITCH:%.*]] = icmp ult i8 [[TMP2]], 5
; CHECK-NEXT: br i1 [[SWITCH]], label [[TRAP]], label [[CONT6:%.*]]
; CHECK: cont6:
-; CHECK-NEXT: ret void, !nosanitize !11
+; CHECK-NEXT: ret void, !nosanitize [[META11]]
;
;
; CHECK-LABEL: define weak void @__cfi_check
@@ -153,8 +153,8 @@ attributes #6 = { noreturn nounwind }
;
; CHECK: Function Attrs: naked nocf_check noinline nounwind
; CHECK-LABEL: define internal void @_Z9nothrow_ei.cfi_jt
-; CHECK-SAME: () #[[ATTR4:[0-9]+]] align 8 {
+; CHECK-SAME: () #[[ATTR5:[0-9]+]] align 8 {
; CHECK-NEXT: entry:
-; CHECK-NEXT: tail call void asm sideeffect "jmp ${0:c}@plt\0Aint3\0Aint3\0Aint3\0A", "s"(ptr nonnull @_Z9nothrow_ei) #[[ATTR6:[0-9]+]]
+; CHECK-NEXT: tail call void asm sideeffect "jmp ${0:c}@plt\0Aint3\0Aint3\0Aint3\0A", "s"(ptr nonnull @_Z9nothrow_ei) #[[ATTR7:[0-9]+]]
; CHECK-NEXT: unreachable
;
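
The new range(i32 0, 2) return attributes in the checks above follow directly from the function bodies: each returns the zext of an i1, and range bounds are half-open, so [0, 2) says the result can only be 0 or 1. A minimal sketch with a hypothetical name:

define range(i32 0, 2) i32 @bool_to_i32(i32 %num) {
entry:
  %tobool = icmp ne i32 %num, 0   ; i1, so its zext is 0 or 1
  %ret = zext i1 %tobool to i32
  ret i32 %ret
}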
diff --git a/llvm/test/Transforms/OpenMP/add_attributes.ll b/llvm/test/Transforms/OpenMP/add_attributes.ll
index 47ff5cad4e7e..ebcca3067f04 100644
--- a/llvm/test/Transforms/OpenMP/add_attributes.ll
+++ b/llvm/test/Transforms/OpenMP/add_attributes.ll
@@ -641,8 +641,6 @@ declare i32 @__tgt_target_teams_mapper(ptr, i64, ptr, i32, ptr, ptr, ptr, ptr, p
declare i32 @__tgt_target_teams_nowait_mapper(ptr, i64, ptr, i32, ptr, ptr, ptr, ptr, ptr, ptr, i32, i32, i32, ptr, i32, ptr)
-declare void @__tgt_register_requires(i64)
-
declare void @__tgt_target_data_begin_mapper(ptr, i64, i32, ptr, ptr, ptr, ptr, ptr, ptr)
declare void @__tgt_target_data_begin_nowait_mapper(ptr, i64, i32, ptr, ptr, ptr, ptr, ptr, ptr)
@@ -1249,9 +1247,6 @@ declare i32 @__tgt_target_kernel_nowait(ptr, i64, i32, i32, ptr, ptr, i32, ptr,
; CHECK-NEXT: declare i32 @__tgt_target_teams_nowait_mapper(ptr, i64, ptr, i32, ptr, ptr, ptr, ptr, ptr, ptr, i32, i32, i32, ptr, i32, ptr)
; CHECK: ; Function Attrs: nounwind
-; CHECK-NEXT: declare void @__tgt_register_requires(i64)
-
-; CHECK: ; Function Attrs: nounwind
; CHECK-NEXT: declare void @__tgt_target_data_begin_mapper(ptr, i64, i32, ptr, ptr, ptr, ptr, ptr, ptr)
; CHECK: ; Function Attrs: nounwind
@@ -1894,9 +1889,6 @@ declare i32 @__tgt_target_kernel_nowait(ptr, i64, i32, i32, ptr, ptr, i32, ptr,
; OPTIMISTIC-NEXT: declare i32 @__tgt_target_teams_nowait_mapper(ptr, i64, ptr, i32, ptr, ptr, ptr, ptr, ptr, ptr, i32, i32, i32, ptr, i32, ptr)
; OPTIMISTIC: ; Function Attrs: nounwind
-; OPTIMISTIC-NEXT: declare void @__tgt_register_requires(i64)
-
-; OPTIMISTIC: ; Function Attrs: nounwind
; OPTIMISTIC-NEXT: declare void @__tgt_target_data_begin_mapper(ptr, i64, i32, ptr, ptr, ptr, ptr, ptr, ptr)
; OPTIMISTIC: ; Function Attrs: nounwind
@@ -2552,9 +2544,6 @@ declare i32 @__tgt_target_kernel_nowait(ptr, i64, i32, i32, ptr, ptr, i32, ptr,
; EXT-NEXT: declare signext i32 @__tgt_target_teams_nowait_mapper(ptr, i64, ptr, i32 signext, ptr, ptr, ptr, ptr, ptr, ptr, i32 signext, i32 signext, i32 signext, ptr, i32 signext, ptr)
; EXT: ; Function Attrs: nounwind
-; EXT-NEXT: declare void @__tgt_register_requires(i64)
-
-; EXT: ; Function Attrs: nounwind
; EXT-NEXT: declare void @__tgt_target_data_begin_mapper(ptr, i64, i32 signext, ptr, ptr, ptr, ptr, ptr, ptr)
; EXT: ; Function Attrs: nounwind
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/quant_4x4.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/quant_4x4.ll
index 9206893cb234..c133852f6693 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/quant_4x4.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/quant_4x4.ll
@@ -7,7 +7,7 @@ target triple = "aarch64"
; Check that the function gets vectorized.
define i32 @quant_4x4(ptr noundef %dct, ptr noundef %mf, ptr noundef %bias) {
-; CHECK-LABEL: define i32 @quant_4x4
+; CHECK-LABEL: define range(i32 0, 2) i32 @quant_4x4
; CHECK-SAME: (ptr nocapture noundef [[DCT:%.*]], ptr nocapture noundef readonly [[MF:%.*]], ptr nocapture noundef readonly [[BIAS:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DCT]], i64 32
diff --git a/llvm/test/Transforms/PhaseOrdering/icmp-ashr-breaking-select-idiom.ll b/llvm/test/Transforms/PhaseOrdering/icmp-ashr-breaking-select-idiom.ll
index 67d721b23d6f..35d5ceeb9195 100644
--- a/llvm/test/Transforms/PhaseOrdering/icmp-ashr-breaking-select-idiom.ll
+++ b/llvm/test/Transforms/PhaseOrdering/icmp-ashr-breaking-select-idiom.ll
@@ -2,7 +2,7 @@
; RUN: opt -O1 -S < %s | FileCheck %s
define i32 @testa(i32 %mul) {
-; CHECK-LABEL: define i32 @testa(
+; CHECK-LABEL: define range(i32 -65536, 65536) i32 @testa(
; CHECK-SAME: i32 [[MUL:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[SHR:%.*]] = ashr i32 [[MUL]], 15
; CHECK-NEXT: [[SPEC_SELECT_I:%.*]] = tail call i32 @llvm.smin.i32(i32 [[SHR]], i32 32767)
@@ -16,7 +16,7 @@ define i32 @testa(i32 %mul) {
}
define i32 @testb(i32 %mul) {
-; CHECK-LABEL: define i32 @testb(
+; CHECK-LABEL: define range(i32 -16777216, 16777216) i32 @testb(
; CHECK-SAME: i32 [[MUL:%.*]]) local_unnamed_addr #[[ATTR0]] {
; CHECK-NEXT: [[SHR102:%.*]] = ashr i32 [[MUL]], 7
; CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.smax.i32(i32 [[SHR102]], i32 -128)
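
The two new ranges can be checked by hand: an arithmetic shift right maps the full i32 range onto a proportionally narrower interval, and the later smin/smax clamps stay inside it. Worked out:

  ashr i32 %mul, 15  ->  [-2^16, 2^16 - 1]  =  range(i32 -65536, 65536)
  ashr i32 %mul, 7   ->  [-2^24, 2^24 - 1]  =  range(i32 -16777216, 16777216)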
diff --git a/llvm/test/Transforms/PhaseOrdering/min_max_loop.ll b/llvm/test/Transforms/PhaseOrdering/min_max_loop.ll
index fb338a6507eb..63cfef6f3d09 100644
--- a/llvm/test/Transforms/PhaseOrdering/min_max_loop.ll
+++ b/llvm/test/Transforms/PhaseOrdering/min_max_loop.ll
@@ -19,7 +19,7 @@
;; }
define i16 @vecreduce_smin_v2i16(i32 %n, ptr %v) {
-; CHECK-LABEL: define i16 @vecreduce_smin_v2i16(
+; CHECK-LABEL: define range(i16 -32768, 1) i16 @vecreduce_smin_v2i16(
; CHECK: @llvm.smin.v2i16
entry:
@@ -65,7 +65,7 @@ for.end: ; preds = %for.cond
}
define i16 @vecreduce_smax_v2i16(i32 %n, ptr %v) {
-; CHECK-LABEL: define i16 @vecreduce_smax_v2i16(
+; CHECK-LABEL: define range(i16 0, -32768) i16 @vecreduce_smax_v2i16(
; CHECK: @llvm.smax.v2i16
entry:
diff --git a/llvm/test/Transforms/SCCP/and-add-shl.ll b/llvm/test/Transforms/SCCP/and-add-shl.ll
index 7c037ffa6bf6..7af563f13a18 100644
--- a/llvm/test/Transforms/SCCP/and-add-shl.ll
+++ b/llvm/test/Transforms/SCCP/and-add-shl.ll
@@ -59,7 +59,7 @@ define i8 @and_not_shl_1(i8 %x) {
; Negative test: https://alive2.llvm.org/ce/z/Zv4Pyu
define i8 @and_add_shl_overlap(i8 %x) {
-; CHECK-LABEL: define i8 @and_add_shl_overlap
+; CHECK-LABEL: define range(i8 0, 33) i8 @and_add_shl_overlap
; CHECK-SAME: (i8 [[X:%.*]]) {
; CHECK-NEXT: [[OP1_P2:%.*]] = icmp ule i8 [[X]], 6
; CHECK-NEXT: call void @llvm.assume(i1 [[OP1_P2]])
@@ -77,7 +77,7 @@ define i8 @and_add_shl_overlap(i8 %x) {
}
define i8 @and_not_shl_overlap(i8 %x) {
-; CHECK-LABEL: define i8 @and_not_shl_overlap
+; CHECK-LABEL: define range(i8 0, 5) i8 @and_not_shl_overlap
; CHECK-SAME: (i8 [[X:%.*]]) {
; CHECK-NEXT: [[OP1_P2:%.*]] = icmp ule i8 [[X]], 3
; CHECK-NEXT: call void @llvm.assume(i1 [[OP1_P2]])
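
Note that range attributes may wrap: range(i16 -32768, 1) on the smin reduction reads as [-32768, 0] (the reduction starts at 0 and can only decrease), while range(i16 0, -32768) on the smax reduction is a wrapped interval. Treating the upper bound as a bit pattern:

  range(i16 0, -32768)  ->  [0, 0x8000)  ->  [0, 32767], the non-negative i16 values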
diff --git a/llvm/test/Transforms/SCCP/ip-add-range-to-call.ll b/llvm/test/Transforms/SCCP/ip-add-range-to-call.ll
index 64c1b9020a05..c24c554102dd 100644
--- a/llvm/test/Transforms/SCCP/ip-add-range-to-call.ll
+++ b/llvm/test/Transforms/SCCP/ip-add-range-to-call.ll
@@ -1,20 +1,21 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
; RUN: opt -passes=ipsccp -S %s | FileCheck %s
; Test 1.
; Both arguments and return value of @callee can be tracked. The inferred range
; can be added to call sites.
define internal i32 @callee(i32 %x) {
-; CHECK-LABEL: @callee(
-; CHECK-NEXT: ret i32 [[X:%.*]]
+; CHECK-LABEL: define internal range(i32 0, 21) i32 @callee(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT: ret i32 [[X]]
;
ret i32 %x
}
define i32 @caller1() {
-; CHECK-LABEL: @caller1(
-; CHECK-NEXT: [[C1:%.*]] = call i32 @callee(i32 10), !range [[RNG0:![0-9]+]]
-; CHECK-NEXT: [[C2:%.*]] = call i32 @callee(i32 20), !range [[RNG0]]
+; CHECK-LABEL: define range(i32 0, 41) i32 @caller1() {
+; CHECK-NEXT: [[C1:%.*]] = call i32 @callee(i32 10)
+; CHECK-NEXT: [[C2:%.*]] = call i32 @callee(i32 20)
; CHECK-NEXT: [[A:%.*]] = add nuw nsw i32 [[C1]], [[C2]]
; CHECK-NEXT: ret i32 [[A]]
;
@@ -25,9 +26,10 @@ define i32 @caller1() {
}
define i32 @caller2(i32 %x) {
-; CHECK-LABEL: @caller2(
-; CHECK-NEXT: [[X_15:%.*]] = and i32 [[X:%.*]], 15
-; CHECK-NEXT: [[C:%.*]] = call i32 @callee(i32 [[X_15]]), !range [[RNG0]]
+; CHECK-LABEL: define range(i32 0, 21) i32 @caller2(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT: [[X_15:%.*]] = and i32 [[X]], 15
+; CHECK-NEXT: [[C:%.*]] = call i32 @callee(i32 [[X_15]])
; CHECK-NEXT: ret i32 [[C]]
;
%x.15 = and i32 %x, 15
@@ -43,14 +45,15 @@ define i32 @caller2(i32 %x) {
declare void @use_cb1(ptr)
define internal i32 @callee2(i32 %x) {
-; CHECK-LABEL: @callee2(
-; CHECK-NEXT: ret i32 [[X:%.*]]
+; CHECK-LABEL: define internal i32 @callee2(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT: ret i32 [[X]]
;
ret i32 %x
}
define void @caller_cb1() {
-; CHECK-LABEL: @caller_cb1(
+; CHECK-LABEL: define void @caller_cb1() {
; CHECK-NEXT: [[C1:%.*]] = call i32 @callee2(i32 9)
; CHECK-NEXT: [[C2:%.*]] = call i32 @callee2(i32 10)
; CHECK-NEXT: call void @use_cb1(ptr @callee2)
@@ -70,8 +73,9 @@ define void @caller_cb1() {
declare void @use_cb2(ptr)
define internal i32 @callee3(i32 %x) {
-; CHECK-LABEL: @callee3(
-; CHECK-NEXT: [[C:%.*]] = icmp eq i32 [[X:%.*]], 10
+; CHECK-LABEL: define internal range(i32 500, 601) i32 @callee3(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT: [[C:%.*]] = icmp eq i32 [[X]], 10
; CHECK-NEXT: [[S:%.*]] = select i1 [[C]], i32 500, i32 600
; CHECK-NEXT: ret i32 [[S]]
;
@@ -81,9 +85,9 @@ define internal i32 @callee3(i32 %x) {
}
define void @caller_cb2() {
-; CHECK-LABEL: @caller_cb2(
-; CHECK-NEXT: [[C1:%.*]] = call i32 @callee3(i32 9), !range [[RNG1:![0-9]+]]
-; CHECK-NEXT: [[C2:%.*]] = call i32 @callee3(i32 10), !range [[RNG1]]
+; CHECK-LABEL: define void @caller_cb2() {
+; CHECK-NEXT: [[C1:%.*]] = call i32 @callee3(i32 9)
+; CHECK-NEXT: [[C2:%.*]] = call i32 @callee3(i32 10)
; CHECK-NEXT: call void @use_cb2(ptr @callee3)
; CHECK-NEXT: ret void
;
@@ -100,9 +104,10 @@ define void @caller_cb2() {
declare void @use_cb3(ptr)
define internal i32 @callee4(i32 %x, i32 %y) {
-; CHECK-LABEL: @callee4(
-; CHECK-NEXT: [[C:%.*]] = icmp eq i32 [[X:%.*]], 10
-; CHECK-NEXT: [[S:%.*]] = select i1 [[C]], i32 500, i32 [[Y:%.*]]
+; CHECK-LABEL: define internal i32 @callee4(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT: [[C:%.*]] = icmp eq i32 [[X]], 10
+; CHECK-NEXT: [[S:%.*]] = select i1 [[C]], i32 500, i32 [[Y]]
; CHECK-NEXT: ret i32 [[S]]
;
%c = icmp eq i32 %x, 10
@@ -111,11 +116,9 @@ define internal i32 @callee4(i32 %x, i32 %y) {
}
define void @caller_cb3() {
-; CHECK-LABEL: @caller_cb3(
+; CHECK-LABEL: define void @caller_cb3() {
; CHECK-NEXT: [[C1:%.*]] = call i32 @callee4(i32 11, i32 30)
-; CHECK-NOT: !range
; CHECK-NEXT: [[C2:%.*]] = call i32 @callee4(i32 12, i32 40)
-; CHECK-NOT: !range
; CHECK-NEXT: call void @use_cb3(ptr @callee4)
; CHECK-NEXT: ret void
;
@@ -129,15 +132,16 @@ define void @caller_cb3() {
; Range for the return value of callee5 includes undef. No range metadata
; should be added at call sites.
define internal i32 @callee5(i32 %x, i32 %y) {
-; CHECK-LABEL: @callee5(
-; CHECK-NEXT: [[C:%.*]] = icmp slt i32 [[X:%.*]], 15
+; CHECK-LABEL: define internal i32 @callee5(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT: [[C:%.*]] = icmp slt i32 [[X]], 15
; CHECK-NEXT: br i1 [[C]], label [[BB1:%.*]], label [[BB2:%.*]]
; CHECK: bb1:
; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: bb2:
; CHECK-NEXT: br label [[EXIT]]
; CHECK: exit:
-; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[Y:%.*]], [[BB1]] ], [ undef, [[BB2]] ]
+; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[Y]], [[BB1]] ], [ undef, [[BB2]] ]
; CHECK-NEXT: ret i32 [[RES]]
;
%c = icmp slt i32 %x, 15
@@ -155,11 +159,9 @@ exit:
}
define i32 @caller5() {
-; CHECK-LABEL: @caller5(
+; CHECK-LABEL: define range(i32 200, 401) i32 @caller5() {
; CHECK-NEXT: [[C1:%.*]] = call i32 @callee5(i32 10, i32 100)
-; CHECK-NOT: !range
; CHECK-NEXT: [[C2:%.*]] = call i32 @callee5(i32 20, i32 200)
-; CHECK-NOT: !range
; CHECK-NEXT: [[A:%.*]] = add i32 [[C1]], [[C2]]
; CHECK-NEXT: ret i32 [[A]]
;
@@ -170,8 +172,9 @@ define i32 @caller5() {
}
define internal <2 x i64> @ctlz(<2 x i64> %arg) {
-; CHECK-LABEL: @ctlz(
-; CHECK-NEXT: [[RES:%.*]] = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> [[ARG:%.*]], i1 false)
+; CHECK-LABEL: define internal range(i64 0, 65) <2 x i64> @ctlz(
+; CHECK-SAME: <2 x i64> [[ARG:%.*]]) {
+; CHECK-NEXT: [[RES:%.*]] = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> [[ARG]], i1 false)
; CHECK-NEXT: ret <2 x i64> [[RES]]
;
%res = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %arg, i1 false)
@@ -179,8 +182,9 @@ define internal <2 x i64> @ctlz(<2 x i64> %arg) {
}
define <2 x i64> @ctlz_caller(<2 x i64> %arg) {
-; CHECK-LABEL: @ctlz_caller(
-; CHECK-NEXT: [[RES:%.*]] = call <2 x i64> @ctlz(<2 x i64> [[ARG:%.*]]), !range [[RNG2:![0-9]+]]
+; CHECK-LABEL: define range(i64 0, 65) <2 x i64> @ctlz_caller(
+; CHECK-SAME: <2 x i64> [[ARG:%.*]]) {
+; CHECK-NEXT: [[RES:%.*]] = call <2 x i64> @ctlz(<2 x i64> [[ARG]])
; CHECK-NEXT: ret <2 x i64> [[RES]]
;
%res = call <2 x i64> @ctlz(<2 x i64> %arg)
@@ -189,6 +193,3 @@ define <2 x i64> @ctlz_caller(<2 x i64> %arg) {
declare <2 x i64> @llvm.ctlz.v2i64(<2 x i64>, i1)
-; CHECK: [[RNG0]] = !{i32 0, i32 21}
-; CHECK: [[RNG1]] = !{i32 500, i32 601}
-; CHECK: [[RNG2]] = !{i64 0, i64 65}
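
This file shows the core of the IPSCCP change in the patch: the inferred interval used to be repeated as !range metadata at every call site, and is now stated once as a range(...) attribute on the definition, from which call results pick it up. A minimal before/after sketch using the file's own @callee:

; before: metadata repeated per call site
%c = call i32 @callee(i32 10), !range !0
!0 = !{i32 0, i32 21}

; after: one attribute on the definition
define internal range(i32 0, 21) i32 @callee(i32 %x) {
  ret i32 %x
}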
diff --git a/llvm/test/Transforms/SCCP/ip-ranges-casts.ll b/llvm/test/Transforms/SCCP/ip-ranges-casts.ll
index 80d90922c2fb..05fa04a9fbe0 100644
--- a/llvm/test/Transforms/SCCP/ip-ranges-casts.ll
+++ b/llvm/test/Transforms/SCCP/ip-ranges-casts.ll
@@ -1,10 +1,11 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
; RUN: opt < %s -passes=ipsccp -S | FileCheck %s
; x = [100, 301)
define internal i1 @f.trunc(i32 %x) {
-; CHECK-LABEL: @f.trunc(
-; CHECK-NEXT: [[T_1:%.*]] = trunc nuw nsw i32 [[X:%.*]] to i16
+; CHECK-LABEL: define internal i1 @f.trunc(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT: [[T_1:%.*]] = trunc nuw nsw i32 [[X]] to i16
; CHECK-NEXT: [[C_2:%.*]] = icmp sgt i16 [[T_1]], 299
; CHECK-NEXT: [[C_4:%.*]] = icmp slt i16 [[T_1]], 101
; CHECK-NEXT: [[RES_1:%.*]] = add nuw nsw i1 false, [[C_2]]
@@ -43,7 +44,7 @@ define internal i1 @f.trunc(i32 %x) {
}
define i1 @caller1() {
-; CHECK-LABEL: @caller1(
+; CHECK-LABEL: define i1 @caller1() {
; CHECK-NEXT: [[CALL_1:%.*]] = tail call i1 @f.trunc(i32 100)
; CHECK-NEXT: [[CALL_2:%.*]] = tail call i1 @f.trunc(i32 300)
; CHECK-NEXT: [[RES:%.*]] = and i1 [[CALL_1]], [[CALL_2]]
@@ -58,14 +59,15 @@ define i1 @caller1() {
; x = [100, 301)
define internal i1 @f.zext(i32 %x, i32 %y) {
-; CHECK-LABEL: @f.zext(
-; CHECK-NEXT: [[T_1:%.*]] = zext nneg i32 [[X:%.*]] to i64
+; CHECK-LABEL: define internal i1 @f.zext(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT: [[T_1:%.*]] = zext nneg i32 [[X]] to i64
; CHECK-NEXT: [[C_2:%.*]] = icmp sgt i64 [[T_1]], 299
; CHECK-NEXT: [[C_4:%.*]] = icmp slt i64 [[T_1]], 101
; CHECK-NEXT: [[RES_1:%.*]] = add nuw nsw i1 false, [[C_2]]
; CHECK-NEXT: [[RES_2:%.*]] = add nuw nsw i1 [[RES_1]], false
; CHECK-NEXT: [[RES_3:%.*]] = add i1 [[RES_2]], [[C_4]]
-; CHECK-NEXT: [[T_2:%.*]] = zext i32 [[Y:%.*]] to i64
+; CHECK-NEXT: [[T_2:%.*]] = zext i32 [[Y]] to i64
; CHECK-NEXT: [[C_5:%.*]] = icmp sgt i64 [[T_2]], 300
; CHECK-NEXT: [[C_6:%.*]] = icmp sgt i64 [[T_2]], 299
; CHECK-NEXT: [[C_8:%.*]] = icmp slt i64 [[T_2]], 1
@@ -97,7 +99,7 @@ define internal i1 @f.zext(i32 %x, i32 %y) {
}
define i1 @caller.zext() {
-; CHECK-LABEL: @caller.zext(
+; CHECK-LABEL: define i1 @caller.zext() {
; CHECK-NEXT: [[CALL_1:%.*]] = tail call i1 @f.zext(i32 100, i32 -120)
; CHECK-NEXT: [[CALL_2:%.*]] = tail call i1 @f.zext(i32 300, i32 900)
; CHECK-NEXT: [[RES:%.*]] = and i1 [[CALL_1]], [[CALL_2]]
@@ -111,14 +113,15 @@ define i1 @caller.zext() {
; x = [100, 301)
define internal i1 @f.sext(i32 %x, i32 %y) {
-; CHECK-LABEL: @f.sext(
-; CHECK-NEXT: [[T_1:%.*]] = zext nneg i32 [[X:%.*]] to i64
+; CHECK-LABEL: define internal i1 @f.sext(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT: [[T_1:%.*]] = zext nneg i32 [[X]] to i64
; CHECK-NEXT: [[C_2:%.*]] = icmp sgt i64 [[T_1]], 299
; CHECK-NEXT: [[C_4:%.*]] = icmp slt i64 [[T_1]], 101
; CHECK-NEXT: [[RES_1:%.*]] = add nuw nsw i1 false, [[C_2]]
; CHECK-NEXT: [[RES_2:%.*]] = add nuw nsw i1 [[RES_1]], false
; CHECK-NEXT: [[RES_3:%.*]] = add i1 [[RES_2]], [[C_4]]
-; CHECK-NEXT: [[T_2:%.*]] = sext i32 [[Y:%.*]] to i64
+; CHECK-NEXT: [[T_2:%.*]] = sext i32 [[Y]] to i64
; CHECK-NEXT: [[C_6:%.*]] = icmp sgt i64 [[T_2]], 899
; CHECK-NEXT: [[C_8:%.*]] = icmp slt i64 [[T_2]], -119
; CHECK-NEXT: [[RES_4:%.*]] = add nuw nsw i1 [[RES_3]], false
@@ -148,7 +151,7 @@ define internal i1 @f.sext(i32 %x, i32 %y) {
}
define i1 @caller.sext() {
-; CHECK-LABEL: @caller.sext(
+; CHECK-LABEL: define i1 @caller.sext() {
; CHECK-NEXT: [[CALL_1:%.*]] = tail call i1 @f.sext(i32 100, i32 -120)
; CHECK-NEXT: [[CALL_2:%.*]] = tail call i1 @f.sext(i32 300, i32 900)
; CHECK-NEXT: [[RES:%.*]] = and i1 [[CALL_1]], [[CALL_2]]
@@ -162,8 +165,9 @@ define i1 @caller.sext() {
; There's nothing we can do besides going to the full range or overdefined.
define internal i1 @f.fptosi(i32 %x) {
-; CHECK-LABEL: @f.fptosi(
-; CHECK-NEXT: [[TO_DOUBLE:%.*]] = sitofp i32 [[X:%.*]] to double
+; CHECK-LABEL: define internal i1 @f.fptosi(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT: [[TO_DOUBLE:%.*]] = sitofp i32 [[X]] to double
; CHECK-NEXT: [[ADD:%.*]] = fadd double 0.000000e+00, [[TO_DOUBLE]]
; CHECK-NEXT: [[TO_I32:%.*]] = fptosi double [[ADD]] to i32
; CHECK-NEXT: [[C_1:%.*]] = icmp sgt i32 [[TO_I32]], 300
@@ -189,7 +193,7 @@ define internal i1 @f.fptosi(i32 %x) {
}
define i1 @caller.fptosi() {
-; CHECK-LABEL: @caller.fptosi(
+; CHECK-LABEL: define i1 @caller.fptosi() {
; CHECK-NEXT: [[CALL_1:%.*]] = tail call i1 @f.fptosi(i32 100)
; CHECK-NEXT: [[CALL_2:%.*]] = tail call i1 @f.fptosi(i32 300)
; CHECK-NEXT: [[RES:%.*]] = and i1 [[CALL_1]], [[CALL_2]]
@@ -203,8 +207,9 @@ define i1 @caller.fptosi() {
; There's nothing we can do besides going to the full range or overdefined.
define internal i1 @f.fpext(i16 %x) {
-; CHECK-LABEL: @f.fpext(
-; CHECK-NEXT: [[TO_FLOAT:%.*]] = sitofp i16 [[X:%.*]] to float
+; CHECK-LABEL: define internal i1 @f.fpext(
+; CHECK-SAME: i16 [[X:%.*]]) {
+; CHECK-NEXT: [[TO_FLOAT:%.*]] = sitofp i16 [[X]] to float
; CHECK-NEXT: [[TO_DOUBLE:%.*]] = fpext float [[TO_FLOAT]] to double
; CHECK-NEXT: [[TO_I64:%.*]] = fptoui float [[TO_FLOAT]] to i64
; CHECK-NEXT: [[C_1:%.*]] = icmp sgt i64 [[TO_I64]], 300
@@ -231,7 +236,7 @@ define internal i1 @f.fpext(i16 %x) {
; There's nothing we can do besides going to the full range or overdefined.
define i1 @caller.fpext() {
-; CHECK-LABEL: @caller.fpext(
+; CHECK-LABEL: define i1 @caller.fpext() {
; CHECK-NEXT: [[CALL_1:%.*]] = tail call i1 @f.fpext(i16 100)
; CHECK-NEXT: [[CALL_2:%.*]] = tail call i1 @f.fpext(i16 300)
; CHECK-NEXT: [[RES:%.*]] = and i1 [[CALL_1]], [[CALL_2]]
@@ -245,8 +250,9 @@ define i1 @caller.fpext() {
; There's nothing we can do besides going to the full range or overdefined.
define internal i1 @f.inttoptr.ptrtoint(i64 %x) {
-; CHECK-LABEL: @f.inttoptr.ptrtoint(
-; CHECK-NEXT: [[TO_PTR:%.*]] = inttoptr i64 [[X:%.*]] to ptr
+; CHECK-LABEL: define internal i1 @f.inttoptr.ptrtoint(
+; CHECK-SAME: i64 [[X:%.*]]) {
+; CHECK-NEXT: [[TO_PTR:%.*]] = inttoptr i64 [[X]] to ptr
; CHECK-NEXT: [[TO_I64:%.*]] = ptrtoint ptr [[TO_PTR]] to i64
; CHECK-NEXT: [[C_1:%.*]] = icmp sgt i64 [[TO_I64]], 300
; CHECK-NEXT: [[C_2:%.*]] = icmp sgt i64 [[TO_I64]], 299
@@ -270,7 +276,7 @@ define internal i1 @f.inttoptr.ptrtoint(i64 %x) {
}
define i1 @caller.inttoptr.ptrtoint() {
-; CHECK-LABEL: @caller.inttoptr.ptrtoint(
+; CHECK-LABEL: define i1 @caller.inttoptr.ptrtoint() {
; CHECK-NEXT: [[CALL_1:%.*]] = tail call i1 @f.inttoptr.ptrtoint(i64 100)
; CHECK-NEXT: [[CALL_2:%.*]] = tail call i1 @f.inttoptr.ptrtoint(i64 300)
; CHECK-NEXT: [[RES:%.*]] = and i1 [[CALL_1]], [[CALL_2]]
@@ -284,8 +290,9 @@ define i1 @caller.inttoptr.ptrtoint() {
; Make sure we do not create constant ranges for int to fp casts.
define i1 @int_range_to_double_cast(i32 %a) {
-; CHECK-LABEL: @int_range_to_double_cast(
-; CHECK-NEXT: [[R:%.*]] = and i32 [[A:%.*]], 255
+; CHECK-LABEL: define i1 @int_range_to_double_cast(
+; CHECK-SAME: i32 [[A:%.*]]) {
+; CHECK-NEXT: [[R:%.*]] = and i32 [[A]], 255
; CHECK-NEXT: [[T4:%.*]] = sitofp i32 [[R]] to double
; CHECK-NEXT: [[T10:%.*]] = fadd double 0.000000e+00, [[T4]]
; CHECK-NEXT: [[T11:%.*]] = fcmp olt double [[T4]], [[T10]]
@@ -300,7 +307,7 @@ define i1 @int_range_to_double_cast(i32 %a) {
; Make sure we do not use ranges to propagate info from vectors.
define i16 @vector_binop_and_cast() {
-; CHECK-LABEL: @vector_binop_and_cast(
+; CHECK-LABEL: define i16 @vector_binop_and_cast() {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[VECINIT7:%.*]] = insertelement <8 x i16> <i16 undef, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, i16 undef, i32 0
; CHECK-NEXT: [[REM:%.*]] = srem <8 x i16> <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>, [[VECINIT7]]
@@ -317,8 +324,9 @@ entry:
}
define internal i64 @f.sext_to_zext(i32 %t) {
-; CHECK-LABEL: @f.sext_to_zext(
-; CHECK-NEXT: [[A:%.*]] = zext nneg i32 [[T:%.*]] to i64
+; CHECK-LABEL: define internal range(i64 0, 2) i64 @f.sext_to_zext(
+; CHECK-SAME: i32 [[T:%.*]]) {
+; CHECK-NEXT: [[A:%.*]] = zext nneg i32 [[T]] to i64
; CHECK-NEXT: ret i64 [[A]]
;
%a = sext i32 %t to i64
@@ -326,10 +334,11 @@ define internal i64 @f.sext_to_zext(i32 %t) {
}
define i64 @caller.sext_to_zext(i32 %i) {
-; CHECK-LABEL: @caller.sext_to_zext(
-; CHECK-NEXT: [[CMP:%.*]] = icmp sle i32 [[I:%.*]], 9
+; CHECK-LABEL: define range(i64 0, 2) i64 @caller.sext_to_zext(
+; CHECK-SAME: i32 [[I:%.*]]) {
+; CHECK-NEXT: [[CMP:%.*]] = icmp sle i32 [[I]], 9
; CHECK-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
-; CHECK-NEXT: [[T:%.*]] = call i64 @f.sext_to_zext(i32 [[CONV]]), !range [[RNG0:![0-9]+]]
+; CHECK-NEXT: [[T:%.*]] = call i64 @f.sext_to_zext(i32 [[CONV]])
; CHECK-NEXT: ret i64 [[T]]
;
%cmp = icmp sle i32 %i, 9
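
The @f.sext_to_zext case doubles as a worked example of range propagation through casts: the only call site passes a zext i1 value, so %t is known to lie in [0, 2); a sext of a non-negative value equals a zext, hence the zext nneg in the body and the range(i64 0, 2) on both functions. In short:

  %conv = zext i1 %cmp to i32    ; %conv in [0, 2)
  %a    = sext i32 %conv to i64  ; non-negative input => same as zext nneg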
diff --git a/llvm/test/Transforms/SCCP/ipsccp-basic.ll b/llvm/test/Transforms/SCCP/ipsccp-basic.ll
index 71c042b9b294..6a7ab8ac2864 100644
--- a/llvm/test/Transforms/SCCP/ipsccp-basic.ll
+++ b/llvm/test/Transforms/SCCP/ipsccp-basic.ll
@@ -71,7 +71,7 @@ define void @test3a() {
}
define i32 @test3b() {
-; CHECK-LABEL: define i32 @test3b() {
+; CHECK-LABEL: define range(i32 0, 18) i32 @test3b() {
; CHECK-NEXT: [[V:%.*]] = load i32, ptr @G, align 4
; CHECK-NEXT: [[C:%.*]] = icmp eq i32 [[V]], 17
; CHECK-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]]
@@ -105,7 +105,7 @@ define internal {i64,i64} @test4a() {
}
define i64 @test4b() personality ptr @__gxx_personality_v0 {
-; CHECK-LABEL: define i64 @test4b() personality ptr @__gxx_personality_v0 {
+; CHECK-LABEL: define range(i64 0, 6) i64 @test4b() personality ptr @__gxx_personality_v0 {
; CHECK-NEXT: [[A:%.*]] = invoke { i64, i64 } @test4a()
; CHECK-NEXT: to label [[A:%.*]] unwind label [[B:%.*]]
; CHECK: A:
@@ -149,7 +149,7 @@ define internal {i64,i64} @test5a() {
}
define i64 @test5b() personality ptr @__gxx_personality_v0 {
-; CHECK-LABEL: define i64 @test5b() personality ptr @__gxx_personality_v0 {
+; CHECK-LABEL: define range(i64 0, 6) i64 @test5b() personality ptr @__gxx_personality_v0 {
; CHECK-NEXT: [[A:%.*]] = invoke { i64, i64 } @test5a()
; CHECK-NEXT: to label [[A:%.*]] unwind label [[B:%.*]]
; CHECK: A:
diff --git a/llvm/test/Transforms/SCCP/switch.ll b/llvm/test/Transforms/SCCP/switch.ll
index 306f0eebf2b4..5208213de210 100644
--- a/llvm/test/Transforms/SCCP/switch.ll
+++ b/llvm/test/Transforms/SCCP/switch.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
; RUN: opt -S -passes=ipsccp < %s | FileCheck %s
; Make sure we always consider the default edge executable for a switch
@@ -7,7 +7,7 @@ declare void @foo()
declare i32 @g(i32)
define void @test1() {
-; CHECK-LABEL: @test1(
+; CHECK-LABEL: define void @test1() {
; CHECK-NEXT: switch i32 undef, label [[D:%.*]] [
; CHECK-NEXT: ]
; CHECK: d:
@@ -21,15 +21,16 @@ d:
}
define i32 @test_duplicate_successors_phi(i1 %c, i32 %x) {
-; CHECK-LABEL: @test_duplicate_successors_phi(
+; CHECK-LABEL: define i32 @test_duplicate_successors_phi(
+; CHECK-SAME: i1 [[C:%.*]], i32 [[X:%.*]]) {
; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 [[C:%.*]], label [[SWITCH:%.*]], label [[END:%.*]]
+; CHECK-NEXT: br i1 [[C]], label [[SWITCH:%.*]], label [[END:%.*]]
; CHECK: switch:
; CHECK-NEXT: br label [[SWITCH_DEFAULT:%.*]]
; CHECK: switch.default:
; CHECK-NEXT: ret i32 -1
; CHECK: end:
-; CHECK-NEXT: ret i32 [[X:%.*]]
+; CHECK-NEXT: ret i32 [[X]]
;
entry:
br i1 %c, label %switch, label %end
@@ -49,13 +50,14 @@ end:
}
define i32 @test_duplicate_successors_phi_2(i1 %c, i32 %x) {
-; CHECK-LABEL: @test_duplicate_successors_phi_2(
+; CHECK-LABEL: define i32 @test_duplicate_successors_phi_2(
+; CHECK-SAME: i1 [[C:%.*]], i32 [[X:%.*]]) {
; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 [[C:%.*]], label [[SWITCH:%.*]], label [[END:%.*]]
+; CHECK-NEXT: br i1 [[C]], label [[SWITCH:%.*]], label [[END:%.*]]
; CHECK: switch:
; CHECK-NEXT: br label [[END]]
; CHECK: end:
-; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ [[X:%.*]], [[ENTRY:%.*]] ], [ 1, [[SWITCH]] ]
+; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ [[X]], [[ENTRY:%.*]] ], [ 1, [[SWITCH]] ]
; CHECK-NEXT: ret i32 [[PHI]]
;
entry:
@@ -76,22 +78,23 @@ end:
}
define i32 @test_duplicate_successors_phi_3(i1 %c1, ptr %p, i32 %y) {
-; CHECK-LABEL: @test_duplicate_successors_phi_3(
+; CHECK-LABEL: define i32 @test_duplicate_successors_phi_3(
+; CHECK-SAME: i1 [[C1:%.*]], ptr [[P:%.*]], i32 [[Y:%.*]]) {
; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 [[C1:%.*]], label [[SWITCH:%.*]], label [[SWITCH_1:%.*]]
+; CHECK-NEXT: br i1 [[C1]], label [[SWITCH:%.*]], label [[SWITCH_1:%.*]]
; CHECK: switch:
-; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[P:%.*]], align 4, !range [[RNG0:![0-9]+]]
+; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[P]], align 4, !range [[RNG0:![0-9]+]]
; CHECK-NEXT: switch i32 [[X]], label [[SWITCH_DEFAULT:%.*]] [
-; CHECK-NEXT: i32 0, label [[SWITCH_DEFAULT]]
-; CHECK-NEXT: i32 1, label [[SWITCH_0:%.*]]
-; CHECK-NEXT: i32 2, label [[SWITCH_0]]
+; CHECK-NEXT: i32 0, label [[SWITCH_DEFAULT]]
+; CHECK-NEXT: i32 1, label [[SWITCH_0:%.*]]
+; CHECK-NEXT: i32 2, label [[SWITCH_0]]
; CHECK-NEXT: ]
; CHECK: switch.default:
; CHECK-NEXT: ret i32 -1
; CHECK: switch.0:
; CHECK-NEXT: ret i32 0
; CHECK: switch.1:
-; CHECK-NEXT: ret i32 [[Y:%.*]]
+; CHECK-NEXT: ret i32 [[Y]]
;
entry:
br i1 %c1, label %switch, label %switch.1
@@ -118,12 +121,13 @@ switch.1:
}
define i32 @test_local_range(ptr %p) {
-; CHECK-LABEL: @test_local_range(
-; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[P:%.*]], align 4, !range [[RNG0]]
+; CHECK-LABEL: define range(i32 0, 3) i32 @test_local_range(
+; CHECK-SAME: ptr [[P:%.*]]) {
+; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[P]], align 4, !range [[RNG0]]
; CHECK-NEXT: switch i32 [[X]], label [[DEFAULT_UNREACHABLE:%.*]] [
-; CHECK-NEXT: i32 0, label [[SWITCH_0:%.*]]
-; CHECK-NEXT: i32 1, label [[SWITCH_1:%.*]]
-; CHECK-NEXT: i32 2, label [[SWITCH_2:%.*]]
+; CHECK-NEXT: i32 0, label [[SWITCH_0:%.*]]
+; CHECK-NEXT: i32 1, label [[SWITCH_1:%.*]]
+; CHECK-NEXT: i32 2, label [[SWITCH_2:%.*]]
; CHECK-NEXT: ]
; CHECK: default.unreachable:
; CHECK-NEXT: unreachable
@@ -160,13 +164,14 @@ switch.3:
; TODO: Determine that case i32 3 is dead, even though the edge is shared?
define i32 @test_duplicate_successors(ptr %p) {
-; CHECK-LABEL: @test_duplicate_successors(
-; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[P:%.*]], align 4, !range [[RNG0]]
+; CHECK-LABEL: define range(i32 0, 2) i32 @test_duplicate_successors(
+; CHECK-SAME: ptr [[P:%.*]]) {
+; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[P]], align 4, !range [[RNG0]]
; CHECK-NEXT: switch i32 [[X]], label [[DEFAULT_UNREACHABLE:%.*]] [
-; CHECK-NEXT: i32 0, label [[SWITCH_0:%.*]]
-; CHECK-NEXT: i32 1, label [[SWITCH_0]]
-; CHECK-NEXT: i32 2, label [[SWITCH_1:%.*]]
-; CHECK-NEXT: i32 3, label [[SWITCH_1]]
+; CHECK-NEXT: i32 0, label [[SWITCH_0:%.*]]
+; CHECK-NEXT: i32 1, label [[SWITCH_0]]
+; CHECK-NEXT: i32 2, label [[SWITCH_1:%.*]]
+; CHECK-NEXT: i32 3, label [[SWITCH_1]]
; CHECK-NEXT: ]
; CHECK: default.unreachable:
; CHECK-NEXT: unreachable
@@ -201,11 +206,12 @@ switch.2:
; Case i32 2 is dead as well, but this cannot be determined based on
; range information.
define internal i32 @test_ip_range(i32 %x) {
-; CHECK-LABEL: @test_ip_range(
-; CHECK-NEXT: switch i32 [[X:%.*]], label [[DEFAULT_UNREACHABLE:%.*]] [
-; CHECK-NEXT: i32 3, label [[SWITCH_3:%.*]]
-; CHECK-NEXT: i32 1, label [[SWITCH_1:%.*]]
-; CHECK-NEXT: i32 2, label [[SWITCH_2:%.*]]
+; CHECK-LABEL: define internal range(i32 1, 4) i32 @test_ip_range(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT: switch i32 [[X]], label [[DEFAULT_UNREACHABLE:%.*]] [
+; CHECK-NEXT: i32 3, label [[SWITCH_3:%.*]]
+; CHECK-NEXT: i32 1, label [[SWITCH_1:%.*]]
+; CHECK-NEXT: i32 2, label [[SWITCH_2:%.*]]
; CHECK-NEXT: ], !prof [[PROF1:![0-9]+]]
; CHECK: default.unreachable:
; CHECK-NEXT: unreachable
@@ -240,9 +246,9 @@ switch.3:
}
define void @call_test_ip_range() {
-; CHECK-LABEL: @call_test_ip_range(
-; CHECK-NEXT: [[TMP1:%.*]] = call i32 @test_ip_range(i32 1), !range [[RNG2:![0-9]+]]
-; CHECK-NEXT: [[TMP2:%.*]] = call i32 @test_ip_range(i32 3), !range [[RNG2]]
+; CHECK-LABEL: define void @call_test_ip_range() {
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @test_ip_range(i32 1)
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @test_ip_range(i32 3)
; CHECK-NEXT: ret void
;
call i32 @test_ip_range(i32 1)
@@ -251,11 +257,12 @@ define void @call_test_ip_range() {
}
define i32 @test_switch_range_may_include_undef(i1 %c.1, i1 %c.2, i32 %x) {
-; CHECK-LABEL: @test_switch_range_may_include_undef(
+; CHECK-LABEL: define range(i32 -1, 21) i32 @test_switch_range_may_include_undef(
+; CHECK-SAME: i1 [[C_1:%.*]], i1 [[C_2:%.*]], i32 [[X:%.*]]) {
; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 [[C_1:%.*]], label [[THEN_1:%.*]], label [[ELSE_1:%.*]]
+; CHECK-NEXT: br i1 [[C_1]], label [[THEN_1:%.*]], label [[ELSE_1:%.*]]
; CHECK: then.1:
-; CHECK-NEXT: br i1 [[C_2:%.*]], label [[SWITCH:%.*]], label [[ELSE_2:%.*]]
+; CHECK-NEXT: br i1 [[C_2]], label [[SWITCH:%.*]], label [[ELSE_2:%.*]]
; CHECK: else.1:
; CHECK-NEXT: br label [[SWITCH]]
; CHECK: else.2:
@@ -263,8 +270,8 @@ define i32 @test_switch_range_may_include_undef(i1 %c.1, i1 %c.2, i32 %x) {
; CHECK: switch:
; CHECK-NEXT: [[P:%.*]] = phi i32 [ 0, [[THEN_1]] ], [ 2, [[ELSE_1]] ], [ undef, [[ELSE_2]] ]
; CHECK-NEXT: switch i32 [[P]], label [[SWITCH_DEFAULT:%.*]] [
-; CHECK-NEXT: i32 0, label [[END_1:%.*]]
-; CHECK-NEXT: i32 3, label [[END_2:%.*]]
+; CHECK-NEXT: i32 0, label [[END_1:%.*]]
+; CHECK-NEXT: i32 3, label [[END_2:%.*]]
; CHECK-NEXT: ]
; CHECK: switch.default:
; CHECK-NEXT: ret i32 -1
@@ -303,9 +310,10 @@ end.2:
}
define i32 @test_default_unreachable_by_dom_cond(i32 %x) {
-; CHECK-LABEL: @test_default_unreachable_by_dom_cond(
+; CHECK-LABEL: define i32 @test_default_unreachable_by_dom_cond(
+; CHECK-SAME: i32 [[X:%.*]]) {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[OR_COND:%.*]] = icmp ult i32 [[X:%.*]], 4
+; CHECK-NEXT: [[OR_COND:%.*]] = icmp ult i32 [[X]], 4
; CHECK-NEXT: br i1 [[OR_COND]], label [[IF_THEN:%.*]], label [[RETURN:%.*]]
; CHECK: if.then:
; CHECK-NEXT: switch i32 [[X]], label [[DEFAULT_UNREACHABLE:%.*]] [
@@ -371,4 +379,7 @@ return:
declare void @llvm.assume(i1)
-; CHECK: !1 = !{!"branch_weights", i32 1, i32 5, i32 3, i32 4}
+;.
+; CHECK: [[RNG0]] = !{i32 0, i32 3}
+; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 5, i32 3, i32 4}
+;.
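
Every test in this file leans on the same inference: once the scrutinee's range rules out the default, SCCP retargets the default edge to an unreachable block. A minimal sketch of that shape, with hypothetical names, mirroring the !range [0, 3) loads above:

define i32 @switch_sketch(ptr %p) {
entry:
  %x = load i32, ptr %p, align 4, !range !0
  switch i32 %x, label %default.unreachable [
    i32 0, label %done
    i32 1, label %done
    i32 2, label %done
  ]
default.unreachable:                ; no in-range value reaches the default
  unreachable
done:
  ret i32 0
}
!0 = !{i32 0, i32 3}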
diff --git a/llvm/test/Transforms/SCCP/trunc-nuw-nsw-flags.ll b/llvm/test/Transforms/SCCP/trunc-nuw-nsw-flags.ll
index fc3e56011d46..d3bac0d68a97 100644
--- a/llvm/test/Transforms/SCCP/trunc-nuw-nsw-flags.ll
+++ b/llvm/test/Transforms/SCCP/trunc-nuw-nsw-flags.ll
@@ -16,7 +16,7 @@ entry:
}
define i8 @range_from_or_nsw(i16 %a) {
-; CHECK-LABEL: define i8 @range_from_or_nsw(
+; CHECK-LABEL: define range(i8 -128, 0) i8 @range_from_or_nsw(
; CHECK-SAME: i16 [[A:%.*]]) {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[AND1:%.*]] = or i16 [[A]], -128
@@ -30,7 +30,7 @@ entry:
}
define i16 @range_from_and_nuw_nsw(i32 %a) {
-; CHECK-LABEL: define i16 @range_from_and_nuw_nsw(
+; CHECK-LABEL: define range(i16 0, -32768) i16 @range_from_and_nuw_nsw(
; CHECK-SAME: i32 [[A:%.*]]) {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[AND1:%.*]] = and i32 [[A]], 32767
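
Both new ranges trace through the masks: or i16 %a, -128 forces the value into [-128, -1], which the i8 trunc preserves; and i32 %a, 32767 forces [0, 32767], which the i16 trunc preserves as a wrapped interval covering exactly the non-negative i16 values:

  or  i16 %a, -128   ->  [-128, -1]  =>  range(i8 -128, 0)
  and i32 %a, 32767  ->  [0, 32767]  =>  range(i16 0, -32768)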
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-buildvector-with-minbitwidth-user.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-buildvector-with-minbitwidth-user.ll
index 690772472975..3771ec4bda88 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-buildvector-with-minbitwidth-user.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-buildvector-with-minbitwidth-user.ll
@@ -5,12 +5,7 @@ define void @h() {
; CHECK-LABEL: define void @h() {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr i8, ptr null, i64 16
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i32> <i32 undef, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, i32 0, i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = trunc <8 x i32> [[TMP0]] to <8 x i1>
-; CHECK-NEXT: [[TMP2:%.*]] = or <8 x i1> zeroinitializer, [[TMP1]]
-; CHECK-NEXT: [[TMP4:%.*]] = or <8 x i1> [[TMP2]], zeroinitializer
-; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i1> [[TMP4]] to <8 x i16>
-; CHECK-NEXT: store <8 x i16> [[TMP3]], ptr [[ARRAYIDX2]], align 2
+; CHECK-NEXT: store <8 x i16> zeroinitializer, ptr [[ARRAYIDX2]], align 2
; CHECK-NEXT: ret void
;
entry:
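
The removed CHECK lines and the new single store compute the same thing: every lane of the or chain was built from known-zero inputs (the insertelement writes 0 over the one undef lane), so after constant folding the stored <8 x i16> is zeroinitializer. The simplified check is just the folded form:

  trunc <8 x i32> zeroinitializer -> <8 x i1> zeroinitializer
  or/zext over all-false lanes    -> <8 x i16> zeroinitializer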
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-with-minbith-user.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-with-minbith-user.ll
index d51ef0bce3a4..76bb882171b1 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-with-minbith-user.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-with-minbith-user.ll
@@ -5,7 +5,8 @@ define void @h() {
; CHECK-LABEL: define void @h() {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr i8, ptr null, i64 16
-; CHECK-NEXT: [[TMP0:%.*]] = trunc <8 x i32> zeroinitializer to <8 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = trunc i32 0 to i1
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i1> <i1 false, i1 false, i1 false, i1 false, i1 poison, i1 false, i1 false, i1 false>, i1 [[TMP6]], i32 4
; CHECK-NEXT: [[TMP1:%.*]] = sub <8 x i1> [[TMP0]], zeroinitializer
; CHECK-NEXT: [[TMP2:%.*]] = add <8 x i1> [[TMP0]], zeroinitializer
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/user-node-not-in-bitwidths.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/user-node-not-in-bitwidths.ll
index 6404cf4a2cd1..2ab6e919c23b 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/user-node-not-in-bitwidths.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/user-node-not-in-bitwidths.ll
@@ -5,7 +5,12 @@ define void @h() {
; CHECK-LABEL: define void @h() {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr i8, ptr null, i64 16
-; CHECK-NEXT: store <8 x i16> zeroinitializer, ptr [[ARRAYIDX2]], align 2
+; CHECK-NEXT: [[TMP0:%.*]] = trunc i32 0 to i1
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i1> <i1 false, i1 false, i1 false, i1 false, i1 poison, i1 false, i1 false, i1 false>, i1 [[TMP0]], i32 4
+; CHECK-NEXT: [[TMP2:%.*]] = or <8 x i1> zeroinitializer, [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = or <8 x i1> zeroinitializer, [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = zext <8 x i1> [[TMP3]] to <8 x i16>
+; CHECK-NEXT: store <8 x i16> [[TMP4]], ptr [[ARRAYIDX2]], align 2
; CHECK-NEXT: ret void
;
entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll
index 47d918eabdfe..9bbd314a27cb 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll
@@ -537,24 +537,18 @@ entry:
}
define void @vec3_extract(<3 x i16> %pixel.sroa.0.4.vec.insert606, ptr %call3.i536) {
-; NON-POW2-LABEL: define void @vec3_extract(
-; NON-POW2-SAME: <3 x i16> [[PIXEL_SROA_0_4_VEC_INSERT606:%.*]], ptr [[CALL3_I536:%.*]]) {
-; NON-POW2-NEXT: entry:
-; NON-POW2-NEXT: store <3 x i16> [[PIXEL_SROA_0_4_VEC_INSERT606]], ptr [[CALL3_I536]], align 2
-; NON-POW2-NEXT: ret void
-;
-; POW2-ONLY-LABEL: define void @vec3_extract(
-; POW2-ONLY-SAME: <3 x i16> [[PIXEL_SROA_0_4_VEC_INSERT606:%.*]], ptr [[CALL3_I536:%.*]]) {
-; POW2-ONLY-NEXT: entry:
-; POW2-ONLY-NEXT: [[PIXEL_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <3 x i16> [[PIXEL_SROA_0_4_VEC_INSERT606]], i64 2
-; POW2-ONLY-NEXT: [[RED668:%.*]] = getelementptr i16, ptr [[CALL3_I536]], i64 2
-; POW2-ONLY-NEXT: store i16 [[PIXEL_SROA_0_4_VEC_EXTRACT]], ptr [[RED668]], align 2
-; POW2-ONLY-NEXT: [[PIXEL_SROA_0_2_VEC_EXTRACT:%.*]] = extractelement <3 x i16> [[PIXEL_SROA_0_4_VEC_INSERT606]], i64 1
-; POW2-ONLY-NEXT: [[GREEN670:%.*]] = getelementptr i16, ptr [[CALL3_I536]], i64 1
-; POW2-ONLY-NEXT: store i16 [[PIXEL_SROA_0_2_VEC_EXTRACT]], ptr [[GREEN670]], align 2
-; POW2-ONLY-NEXT: [[PIXEL_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <3 x i16> [[PIXEL_SROA_0_4_VEC_INSERT606]], i64 0
-; POW2-ONLY-NEXT: store i16 [[PIXEL_SROA_0_0_VEC_EXTRACT]], ptr [[CALL3_I536]], align 2
-; POW2-ONLY-NEXT: ret void
+; CHECK-LABEL: define void @vec3_extract(
+; CHECK-SAME: <3 x i16> [[PIXEL_SROA_0_4_VEC_INSERT606:%.*]], ptr [[CALL3_I536:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[PIXEL_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <3 x i16> [[PIXEL_SROA_0_4_VEC_INSERT606]], i64 2
+; CHECK-NEXT: [[RED668:%.*]] = getelementptr i16, ptr [[CALL3_I536]], i64 2
+; CHECK-NEXT: store i16 [[PIXEL_SROA_0_4_VEC_EXTRACT]], ptr [[RED668]], align 2
+; CHECK-NEXT: [[PIXEL_SROA_0_2_VEC_EXTRACT:%.*]] = extractelement <3 x i16> [[PIXEL_SROA_0_4_VEC_INSERT606]], i64 1
+; CHECK-NEXT: [[GREEN670:%.*]] = getelementptr i16, ptr [[CALL3_I536]], i64 1
+; CHECK-NEXT: store i16 [[PIXEL_SROA_0_2_VEC_EXTRACT]], ptr [[GREEN670]], align 2
+; CHECK-NEXT: [[PIXEL_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <3 x i16> [[PIXEL_SROA_0_4_VEC_INSERT606]], i64 0
+; CHECK-NEXT: store i16 [[PIXEL_SROA_0_0_VEC_EXTRACT]], ptr [[CALL3_I536]], align 2
+; CHECK-NEXT: ret void
;
entry:
%pixel.sroa.0.4.vec.extract = extractelement <3 x i16> %pixel.sroa.0.4.vec.insert606, i64 2
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/minbw-with-and-and-scalar-trunc.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/minbw-with-and-and-scalar-trunc.ll
new file mode 100644
index 000000000000..fc977585614b
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/minbw-with-and-and-scalar-trunc.ll
@@ -0,0 +1,44 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S --passes=slp-vectorizer -mtriple=riscv64-unknown-linux-gnu -mattr=+v < %s | FileCheck %s
+@c = global [12 x i64] zeroinitializer
+
+; FIXME: after minbitwidth analysis and i32 conversion, 65535 is transformed
+; to and <4 x i16>, -1, which must be dropped.
+; FIXME: need to adjust the cost of the final transformation, since the user is
+; just a trunc to i16 (it must be free).
+define i16 @test() {
+; CHECK-LABEL: define i16 @test(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = call <4 x i64> @llvm.experimental.vp.strided.load.v4i64.p0.i64(ptr align 8 @c, i64 24, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 4)
+; CHECK-NEXT: [[TMP1:%.*]] = trunc <4 x i64> [[TMP0]] to <4 x i16>
+; CHECK-NEXT: [[TMP2:%.*]] = and <4 x i16> [[TMP1]], <i16 -1, i16 -1, i16 -1, i16 -1>
+; CHECK-NEXT: [[TMP3:%.*]] = xor <4 x i16> [[TMP2]], <i16 -1, i16 -1, i16 -1, i16 -1>
+; CHECK-NEXT: [[TMP4:%.*]] = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> [[TMP3]])
+; CHECK-NEXT: [[TMP5:%.*]] = zext i16 [[TMP4]] to i32
+; CHECK-NEXT: [[T:%.*]] = trunc i32 [[TMP5]] to i16
+; CHECK-NEXT: ret i16 [[T]]
+;
+entry:
+ %0 = load i64, ptr @c, align 8
+ %conv = trunc i64 %0 to i32
+ %conv3 = and i32 %conv, 65535
+ %conv4 = xor i32 %conv3, 65535
+ %1 = load i64, ptr getelementptr inbounds ([12 x i64], ptr @c, i64 0, i64 3), align 8
+ %conv.1 = trunc i64 %1 to i32
+ %conv3.1 = and i32 %conv.1, 65535
+ %conv4.1 = xor i32 %conv3.1, 65535
+ %.conv4.1 = tail call i32 @llvm.umax.i32(i32 %conv4, i32 %conv4.1)
+ %2 = load i64, ptr getelementptr inbounds ([12 x i64], ptr @c, i64 0, i64 6), align 8
+ %conv.2 = trunc i64 %2 to i32
+ %conv3.2 = and i32 %conv.2, 65535
+ %conv4.2 = xor i32 %conv3.2, 65535
+ %.conv4.2 = tail call i32 @llvm.umax.i32(i32 %.conv4.1, i32 %conv4.2)
+ %3 = load i64, ptr getelementptr inbounds ([12 x i64], ptr @c, i64 0, i64 9), align 8
+ %conv.3 = trunc i64 %3 to i32
+ %conv3.3 = and i32 %conv.3, 65535
+ %conv4.3 = xor i32 %conv3.3, 65535
+ %.conv4.3 = tail call i32 @llvm.umax.i32(i32 %.conv4.2, i32 %conv4.3)
+ %t = trunc i32 %.conv4.3 to i16
+ ret i16 %t
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-stores-vectorized.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-stores-vectorized.ll
new file mode 100644
index 000000000000..0dfa45da9d87
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-stores-vectorized.ll
@@ -0,0 +1,65 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=slp-vectorizer -S < %s -mtriple=riscv64-unknown-linux -mattr=+v | FileCheck %s
+
+define void @store_reverse(ptr %p3) {
+; CHECK-LABEL: @store_reverse(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[P3:%.*]], align 8
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[P3]], i64 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8
+; CHECK-NEXT: [[SHL:%.*]] = shl i64 [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, ptr [[P3]], i64 7
+; CHECK-NEXT: store i64 [[SHL]], ptr [[ARRAYIDX2]], align 8
+; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, ptr [[P3]], i64 1
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr [[ARRAYIDX3]], align 8
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, ptr [[P3]], i64 9
+; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr [[ARRAYIDX4]], align 8
+; CHECK-NEXT: [[SHL5:%.*]] = shl i64 [[TMP2]], [[TMP3]]
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i64, ptr [[P3]], i64 6
+; CHECK-NEXT: store i64 [[SHL5]], ptr [[ARRAYIDX6]], align 8
+; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i64, ptr [[P3]], i64 2
+; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr [[ARRAYIDX7]], align 8
+; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i64, ptr [[P3]], i64 10
+; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[ARRAYIDX8]], align 8
+; CHECK-NEXT: [[SHL9:%.*]] = shl i64 [[TMP4]], [[TMP5]]
+; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i64, ptr [[P3]], i64 5
+; CHECK-NEXT: store i64 [[SHL9]], ptr [[ARRAYIDX10]], align 8
+; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds i64, ptr [[P3]], i64 3
+; CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr [[ARRAYIDX11]], align 8
+; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i64, ptr [[P3]], i64 11
+; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr [[ARRAYIDX12]], align 8
+; CHECK-NEXT: [[SHL13:%.*]] = shl i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i64, ptr [[P3]], i64 4
+; CHECK-NEXT: store i64 [[SHL13]], ptr [[ARRAYIDX14]], align 8
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = load i64, ptr %p3, align 8
+ %arrayidx1 = getelementptr inbounds i64, ptr %p3, i64 8
+ %1 = load i64, ptr %arrayidx1, align 8
+ %shl = shl i64 %0, %1
+ %arrayidx2 = getelementptr inbounds i64, ptr %p3, i64 7
+ store i64 %shl, ptr %arrayidx2, align 8
+ %arrayidx3 = getelementptr inbounds i64, ptr %p3, i64 1
+ %2 = load i64, ptr %arrayidx3, align 8
+ %arrayidx4 = getelementptr inbounds i64, ptr %p3, i64 9
+ %3 = load i64, ptr %arrayidx4, align 8
+ %shl5 = shl i64 %2, %3
+ %arrayidx6 = getelementptr inbounds i64, ptr %p3, i64 6
+ store i64 %shl5, ptr %arrayidx6, align 8
+ %arrayidx7 = getelementptr inbounds i64, ptr %p3, i64 2
+ %4 = load i64, ptr %arrayidx7, align 8
+ %arrayidx8 = getelementptr inbounds i64, ptr %p3, i64 10
+ %5 = load i64, ptr %arrayidx8, align 8
+ %shl9 = shl i64 %4, %5
+ %arrayidx10 = getelementptr inbounds i64, ptr %p3, i64 5
+ store i64 %shl9, ptr %arrayidx10, align 8
+ %arrayidx11 = getelementptr inbounds i64, ptr %p3, i64 3
+ %6 = load i64, ptr %arrayidx11, align 8
+ %arrayidx12 = getelementptr inbounds i64, ptr %p3, i64 11
+ %7 = load i64, ptr %arrayidx12, align 8
+ %shl13 = shl i64 %6, %7
+ %arrayidx14 = getelementptr inbounds i64, ptr %p3, i64 4
+ store i64 %shl13, ptr %arrayidx14, align 8
+ ret void
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/unsigned-icmp-signed-op.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/unsigned-icmp-signed-op.ll
new file mode 100644
index 000000000000..5ec6b4f1040d
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/unsigned-icmp-signed-op.ll
@@ -0,0 +1,43 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S --passes=slp-vectorizer -mtriple=riscv64-unknown-linux-gnu -mattr=+v < %s | FileCheck %s
+
+define i32 @test(ptr %f, i16 %0) {
+; CHECK-LABEL: define i32 @test(
+; CHECK-SAME: ptr [[F:%.*]], i16 [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr [[F]], align 2
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i16> <i16 0, i16 poison, i16 0, i16 0>, i16 [[TMP0]], i32 1
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i16> <i16 0, i16 poison, i16 0, i16 0>, i16 [[TMP1]], i32 1
+; CHECK-NEXT: [[TMP6:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32>
+; CHECK-NEXT: [[TMP7:%.*]] = sext <4 x i16> [[TMP2]] to <4 x i32>
+; CHECK-NEXT: [[TMP4:%.*]] = icmp ule <4 x i32> [[TMP6]], [[TMP7]]
+; CHECK-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP4]])
+; CHECK-NEXT: [[ZEXT_4:%.*]] = zext i1 [[TMP5]] to i32
+; CHECK-NEXT: ret i32 [[ZEXT_4]]
+;
+entry:
+ %1 = load i16, ptr %f, align 2
+
+ %zext.0 = zext i16 %1 to i32
+ %sext.0 = sext i16 %0 to i32
+
+ %zext.1 = zext i16 0 to i32
+ %sext.1 = sext i16 0 to i32
+ %zext.2 = zext i16 0 to i32
+ %sext.2 = sext i16 0 to i32
+ %zext.3 = zext i16 0 to i32
+ %sext.3 = sext i16 0 to i32
+
+ %cmp.0 = icmp ule i32 %zext.0, %sext.0
+ %cmp.1 = icmp ule i32 %zext.1, %sext.1
+ %cmp.2 = icmp ule i32 %zext.2, %sext.2
+ %cmp.3 = icmp ule i32 %zext.3, %sext.3
+
+ %and.0 = and i1 %cmp.0, %cmp.1
+ %and.1 = and i1 %and.0, %cmp.2
+ %and.2 = and i1 %and.1, %cmp.3
+
+ %zext.4 = zext i1 %and.2 to i32
+
+ ret i32 %zext.4
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/minbitwidth-root-trunc.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/minbitwidth-root-trunc.ll
index 7b4e2b0ce911..1bb87bf6205f 100644
--- a/llvm/test/Transforms/SLPVectorizer/SystemZ/minbitwidth-root-trunc.ll
+++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/minbitwidth-root-trunc.ll
@@ -7,9 +7,9 @@ define void @test(ptr %a, i8 %0, i16 %b.promoted.i) {
; CHECK-NEXT: [[TMP2:%.*]] = zext i8 [[TMP0]] to i128
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[B_PROMOTED_I]], i32 0
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i128> poison, i128 [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i128> [[TMP5]], <4 x i128> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP7:%.*]] = trunc <4 x i128> [[TMP6]] to <4 x i16>
+; CHECK-NEXT: [[TMP5:%.*]] = trunc i128 [[TMP2]] to i16
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i16> poison, i16 [[TMP5]], i32 0
+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i16> [[TMP6]], <4 x i16> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP8:%.*]] = or <4 x i16> [[TMP4]], [[TMP7]]
; CHECK-NEXT: [[TMP9:%.*]] = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> [[TMP8]])
; CHECK-NEXT: [[TMP11:%.*]] = zext i16 [[TMP9]] to i64
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/gather-node-same-as-vect-but-order.ll b/llvm/test/Transforms/SLPVectorizer/X86/gather-node-same-as-vect-but-order.ll
new file mode 100644
index 000000000000..d80d7b5ecd4e
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/gather-node-same-as-vect-but-order.ll
@@ -0,0 +1,143 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu -mcpu=x86-64-v4 < %s | FileCheck %s
+
+%struct.rect = type { float, float, float, float }
+
+define void @foo(ptr %i7, i32 %0, i1 %tobool62.not) {
+; CHECK-LABEL: define void @foo(
+; CHECK-SAME: ptr [[I7:%.*]], i32 [[TMP0:%.*]], i1 [[TOBOOL62_NOT:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[RC21:%.*]] = alloca [0 x [0 x %struct.rect]], i32 0, align 4
+; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[RC21]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP4:%.*]] = sitofp <2 x i32> [[TMP3]] to <2 x float>
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
+; CHECK-NEXT: [[X1:%.*]] = getelementptr i8, ptr [[RC21]], i64 4
+; CHECK-NEXT: [[TMP6:%.*]] = load <2 x float>, ptr [[X1]], align 4
+; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[I7]], align 4
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> [[TMP9]], <4 x i32> <i32 0, i32 4, i32 5, i32 poison>
+; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x float> [[TMP10]], float [[TMP7]], i32 3
+; CHECK-NEXT: [[TMP12:%.*]] = fcmp olt <4 x float> [[TMP11]], zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = fcmp olt <4 x float> [[TMP5]], zeroinitializer
+; CHECK-NEXT: [[TMP15:%.*]] = select <4 x i1> [[TMP14]], <4 x float> [[TMP5]], <4 x float> zeroinitializer
+; CHECK-NEXT: [[TMP16:%.*]] = select <4 x i1> [[TMP12]], <4 x float> zeroinitializer, <4 x float> [[TMP15]]
+; CHECK-NEXT: store <4 x float> [[TMP16]], ptr [[RC21]], align 4
+; CHECK-NEXT: br label [[IF_END:%.*]]
+; CHECK: entry.if.end72_crit_edge:
+; CHECK-NEXT: br label [[IF_END72:%.*]]
+; CHECK: if.then63:
+; CHECK-NEXT: br label [[IF_END]]
+; CHECK: if.end:
+; CHECK-NEXT: [[TMP17:%.*]] = phi <4 x float> [ poison, [[IF_THEN63:%.*]] ], [ [[TMP16]], [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[TMP18:%.*]] = call <4 x float> @llvm.round.v4f32(<4 x float> [[TMP17]])
+; CHECK-NEXT: [[TMP19:%.*]] = fptosi <4 x float> [[TMP18]] to <4 x i32>
+; CHECK-NEXT: br label [[IF_END72]]
+; CHECK: if.end72:
+; CHECK-NEXT: [[TMP20:%.*]] = phi <4 x i32> [ poison, [[ENTRY_IF_END72_CRIT_EDGE:%.*]] ], [ [[TMP19]], [[IF_END]] ]
+; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <4 x i32> [[TMP20]], <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-NEXT: br i1 [[TOBOOL62_NOT]], label [[IF_END75:%.*]], label [[IF_THEN74:%.*]]
+; CHECK: if.then74:
+; CHECK-NEXT: br label [[IF_END75]]
+; CHECK: if.end75:
+; CHECK-NEXT: [[TMP22:%.*]] = phi <4 x i32> [ [[TMP20]], [[IF_THEN74]] ], [ [[TMP21]], [[IF_END72]] ]
+; CHECK-NEXT: [[TMP23:%.*]] = or <4 x i32> [[TMP22]], <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP25:%.*]] = mul <4 x i32> [[TMP23]], [[TMP24]]
+; CHECK-NEXT: [[TMP26:%.*]] = sitofp <4 x i32> [[TMP25]] to <4 x float>
+; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <4 x float> [[TMP26]], <4 x float> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-NEXT: store <4 x float> [[TMP27]], ptr [[RC21]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %rc21 = alloca [0 x [0 x %struct.rect]], i32 0, align 4
+ %1 = load float, ptr %rc21, align 4
+ %cmp = fcmp olt float %1, 0.000000e+00
+ %conv = sitofp i32 %0 to float
+ %cmp2 = fcmp olt float %conv, 0.000000e+00
+ %cond = select i1 %cmp2, float %conv, float 0.000000e+00
+ %cond9 = select i1 %cmp, float 0.000000e+00, float %cond
+ store float %cond9, ptr %rc21, align 4
+ %x1 = getelementptr i8, ptr %rc21, i64 4
+ %2 = load float, ptr %x1, align 4
+ %cmp11 = fcmp olt float %2, 0.000000e+00
+ %conv16 = sitofp i32 %0 to float
+ %cmp17 = fcmp olt float %conv16, 0.000000e+00
+ %cond24 = select i1 %cmp17, float %conv16, float 0.000000e+00
+ %cond26 = select i1 %cmp11, float 0.000000e+00, float %cond24
+ store float %cond26, ptr %x1, align 4
+ %y0 = getelementptr i8, ptr %rc21, i64 8
+ %3 = load float, ptr %y0, align 4
+ %cmp28 = fcmp olt float %3, 0.000000e+00
+ %cmp34 = fcmp olt float %conv, 0.000000e+00
+ %cond41 = select i1 %cmp34, float %conv, float 0.000000e+00
+ %cond43 = select i1 %cmp28, float 0.000000e+00, float %cond41
+ store float %cond43, ptr %y0, align 4
+ %y11 = getelementptr i8, ptr %rc21, i64 12
+ %4 = load float, ptr %i7, align 4
+ %cmp45 = fcmp olt float %4, 0.000000e+00
+ %cmp51 = fcmp olt float %conv16, 0.000000e+00
+ %cond58 = select i1 %cmp51, float %conv16, float 0.000000e+00
+ %cond60 = select i1 %cmp45, float 0.000000e+00, float %cond58
+ store float %cond60, ptr %y11, align 4
+ br label %if.end
+
+entry.if.end72_crit_edge:
+ br label %if.end72
+
+if.then63:
+ br label %if.end
+
+if.end:
+ %5 = phi float [ 0.000000e+00, %if.then63 ], [ %cond60, %entry ]
+ %6 = phi float [ 0.000000e+00, %if.then63 ], [ %cond26, %entry ]
+ %7 = phi float [ 0.000000e+00, %if.then63 ], [ %cond43, %entry ]
+ %8 = phi float [ 0.000000e+00, %if.then63 ], [ %cond9, %entry ]
+ %9 = call float @llvm.round.f32(float %8)
+ %conv65 = fptosi float %9 to i32
+ %10 = call float @llvm.round.f32(float %7)
+ %conv67 = fptosi float %10 to i32
+ %11 = call float @llvm.round.f32(float %6)
+ %conv69 = fptosi float %11 to i32
+ %12 = call float @llvm.round.f32(float %5)
+ %conv71 = fptosi float %12 to i32
+ br label %if.end72
+
+if.end72:
+ %.pre100 = phi i32 [ 0, %entry.if.end72_crit_edge ], [ %conv71, %if.end ]
+ %.pre99 = phi i32 [ 0, %entry.if.end72_crit_edge ], [ %conv67, %if.end ]
+ %.pre98 = phi i32 [ 0, %entry.if.end72_crit_edge ], [ %conv69, %if.end ]
+ %.pre97 = phi i32 [ 0, %entry.if.end72_crit_edge ], [ %conv65, %if.end ]
+ br i1 %tobool62.not, label %if.end75, label %if.then74
+
+if.then74:
+ br label %if.end75
+
+if.end75:
+ %13 = phi i32 [ %.pre99, %if.then74 ], [ %.pre100, %if.end72 ]
+ %14 = phi i32 [ %.pre100, %if.then74 ], [ %.pre99, %if.end72 ]
+ %15 = phi i32 [ %.pre97, %if.then74 ], [ %.pre98, %if.end72 ]
+ %16 = phi i32 [ %.pre98, %if.then74 ], [ %.pre97, %if.end72 ]
+ %sub = or i32 %16, 1
+ %mul = mul i32 %sub, %0
+ %conv77 = sitofp i32 %mul to float
+ store float %conv77, ptr %rc21, align 4
+ %x178 = getelementptr i8, ptr %rc21, i64 4
+ %sub79 = or i32 %15, 1
+ %mul80 = mul i32 %sub79, %0
+ %conv81 = sitofp i32 %mul80 to float
+ store float %conv81, ptr %x178, align 4
+ %y082 = getelementptr i8, ptr %rc21, i64 8
+ %sub83 = or i32 %14, 1
+ %mul84 = mul i32 %sub83, %0
+ %conv85 = sitofp i32 %mul84 to float
+ store float %conv85, ptr %y082, align 4
+ %y186 = getelementptr i8, ptr %rc21, i64 12
+ %sub87 = or i32 %13, 1
+ %mul88 = mul i32 %sub87, %0
+ %conv89 = sitofp i32 %mul88 to float
+ store float %conv89, ptr %y186, align 4
+ ret void
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/gep-with-extractelement-many-users.ll b/llvm/test/Transforms/SLPVectorizer/X86/gep-with-extractelement-many-users.ll
new file mode 100644
index 000000000000..37d166953c33
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/gep-with-extractelement-many-users.ll
@@ -0,0 +1,26 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-99999 < %s | FileCheck %s
+
+define void @test() {
+; CHECK-LABEL: define void @test() {
+; CHECK-NEXT: bb:
+; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x ptr> zeroinitializer, i32 0
+; CHECK-NEXT: [[GETELEMENTPTR6:%.*]] = getelementptr i8, ptr [[TMP0]], i64 872
+; CHECK-NEXT: store double 0.000000e+00, ptr [[GETELEMENTPTR6]], align 8
+; CHECK-NEXT: br label [[BB9:%.*]]
+; CHECK: bb9:
+; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x ptr> [ getelementptr (i8, <2 x ptr> zeroinitializer, <2 x i64> <i64 32, i64 872>), [[BB:%.*]] ]
+; CHECK-NEXT: ret void
+;
+bb:
+ %getelementptr = getelementptr i8, ptr null, i64 32
+ %0 = extractelement <2 x ptr> zeroinitializer, i32 0
+ %getelementptr6 = getelementptr i8, ptr %0, i64 872
+ store double 0.000000e+00, ptr %getelementptr6, align 8
+ br label %bb9
+
+bb9:
+ %phi10 = phi ptr [ %getelementptr, %bb ]
+ %phi11 = phi ptr [ %getelementptr6, %bb ]
+ ret void
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-node-with-multi-users.ll b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-node-with-multi-users.ll
index 668d3c3c8c82..0ab56279fe47 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-node-with-multi-users.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-node-with-multi-users.ll
@@ -16,8 +16,7 @@ define void @test() {
; CHECK-NEXT: [[TMP9:%.*]] = trunc <4 x i8> [[TMP8]] to <4 x i1>
; CHECK-NEXT: [[TMP10:%.*]] = or <4 x i1> zeroinitializer, [[TMP15]]
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq <4 x i1> [[TMP9]], [[TMP10]]
-; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x i1> [[TMP15]], <4 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[TMP6:%.*]] = zext <4 x i1> [[TMP16]] to <4 x i32>
+; CHECK-NEXT: [[TMP6:%.*]] = zext <4 x i1> [[TMP15]] to <4 x i32>
; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> <i32 0, i32 0, i32 poison, i32 0>, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
; CHECK-NEXT: [[TMP13:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[TMP13]])
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll
index 75505f632a43..29021150ccd2 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE
-; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE
-; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX
-; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX
-; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=CHECK,AVX
+; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2 -slp-threshold=-1 | FileCheck %s
+; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.2 | FileCheck %s
+; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx | FileCheck %s
+; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 | FileCheck %s
+; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512bw,+avx512vl | FileCheck %s
define void @store_i32(ptr nocapture %0, i32 %1, i32 %2) {
; CHECK-LABEL: @store_i32(
@@ -98,58 +98,19 @@ define void @store_i8(ptr nocapture %0, i32 %1, i32 %2) {
}
define void @store_i64(ptr nocapture %0, i32 %1, i32 %2) {
-; SSE-LABEL: @store_i64(
-; SSE-NEXT: [[TMP4:%.*]] = zext i32 [[TMP1:%.*]] to i64
-; SSE-NEXT: [[TMP5:%.*]] = load i64, ptr [[TMP0:%.*]], align 8, !tbaa [[TBAA5:![0-9]+]]
-; SSE-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], [[TMP4]]
-; SSE-NEXT: [[TMP7:%.*]] = lshr i64 [[TMP6]], 15
-; SSE-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP7]] to i32
-; SSE-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], 255
-; SSE-NEXT: [[TMP10:%.*]] = and i64 [[TMP7]], 4294967295
-; SSE-NEXT: [[TMP11:%.*]] = select i1 [[TMP9]], i64 [[TMP10]], i64 255
-; SSE-NEXT: store i64 [[TMP11]], ptr [[TMP0]], align 8, !tbaa [[TBAA5]]
-; SSE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 8
-; SSE-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP12]], align 8, !tbaa [[TBAA5]]
-; SSE-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], [[TMP4]]
-; SSE-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP14]], 15
-; SSE-NEXT: [[TMP16:%.*]] = trunc i64 [[TMP15]] to i32
-; SSE-NEXT: [[TMP17:%.*]] = icmp ult i32 [[TMP16]], 255
-; SSE-NEXT: [[TMP18:%.*]] = and i64 [[TMP15]], 4294967295
-; SSE-NEXT: [[TMP19:%.*]] = select i1 [[TMP17]], i64 [[TMP18]], i64 255
-; SSE-NEXT: store i64 [[TMP19]], ptr [[TMP12]], align 8, !tbaa [[TBAA5]]
-; SSE-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 16
-; SSE-NEXT: [[TMP21:%.*]] = load i64, ptr [[TMP20]], align 8, !tbaa [[TBAA5]]
-; SSE-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], [[TMP4]]
-; SSE-NEXT: [[TMP23:%.*]] = lshr i64 [[TMP22]], 15
-; SSE-NEXT: [[TMP24:%.*]] = trunc i64 [[TMP23]] to i32
-; SSE-NEXT: [[TMP25:%.*]] = icmp ult i32 [[TMP24]], 255
-; SSE-NEXT: [[TMP26:%.*]] = and i64 [[TMP23]], 4294967295
-; SSE-NEXT: [[TMP27:%.*]] = select i1 [[TMP25]], i64 [[TMP26]], i64 255
-; SSE-NEXT: store i64 [[TMP27]], ptr [[TMP20]], align 8, !tbaa [[TBAA5]]
-; SSE-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 24
-; SSE-NEXT: [[TMP29:%.*]] = load i64, ptr [[TMP28]], align 8, !tbaa [[TBAA5]]
-; SSE-NEXT: [[TMP30:%.*]] = mul i64 [[TMP29]], [[TMP4]]
-; SSE-NEXT: [[TMP31:%.*]] = lshr i64 [[TMP30]], 15
-; SSE-NEXT: [[TMP32:%.*]] = trunc i64 [[TMP31]] to i32
-; SSE-NEXT: [[TMP33:%.*]] = icmp ult i32 [[TMP32]], 255
-; SSE-NEXT: [[TMP34:%.*]] = and i64 [[TMP31]], 4294967295
-; SSE-NEXT: [[TMP35:%.*]] = select i1 [[TMP33]], i64 [[TMP34]], i64 255
-; SSE-NEXT: store i64 [[TMP35]], ptr [[TMP28]], align 8, !tbaa [[TBAA5]]
-; SSE-NEXT: ret void
-;
-; AVX-LABEL: @store_i64(
-; AVX-NEXT: [[TMP4:%.*]] = zext i32 [[TMP1:%.*]] to i64
-; AVX-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 8, !tbaa [[TBAA5:![0-9]+]]
-; AVX-NEXT: [[TMP6:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i64 0
-; AVX-NEXT: [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> poison, <4 x i32> zeroinitializer
-; AVX-NEXT: [[TMP8:%.*]] = mul <4 x i64> [[TMP5]], [[TMP7]]
-; AVX-NEXT: [[TMP9:%.*]] = lshr <4 x i64> [[TMP8]], <i64 15, i64 15, i64 15, i64 15>
-; AVX-NEXT: [[TMP10:%.*]] = trunc <4 x i64> [[TMP9]] to <4 x i32>
-; AVX-NEXT: [[TMP11:%.*]] = icmp ult <4 x i32> [[TMP10]], <i32 255, i32 255, i32 255, i32 255>
-; AVX-NEXT: [[TMP12:%.*]] = and <4 x i64> [[TMP9]], <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
-; AVX-NEXT: [[TMP13:%.*]] = select <4 x i1> [[TMP11]], <4 x i64> [[TMP12]], <4 x i64> <i64 255, i64 255, i64 255, i64 255>
-; AVX-NEXT: store <4 x i64> [[TMP13]], ptr [[TMP0]], align 8, !tbaa [[TBAA5]]
-; AVX-NEXT: ret void
+; CHECK-LABEL: @store_i64(
+; CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP1:%.*]] to i64
+; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 8, !tbaa [[TBAA5:![0-9]+]]
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i64 0
+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = mul <4 x i64> [[TMP5]], [[TMP7]]
+; CHECK-NEXT: [[TMP9:%.*]] = lshr <4 x i64> [[TMP8]], <i64 15, i64 15, i64 15, i64 15>
+; CHECK-NEXT: [[TMP10:%.*]] = trunc <4 x i64> [[TMP9]] to <4 x i32>
+; CHECK-NEXT: [[TMP11:%.*]] = icmp ult <4 x i32> [[TMP10]], <i32 255, i32 255, i32 255, i32 255>
+; CHECK-NEXT: [[TMP12:%.*]] = and <4 x i64> [[TMP9]], <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
+; CHECK-NEXT: [[TMP13:%.*]] = select <4 x i1> [[TMP11]], <4 x i64> [[TMP12]], <4 x i64> <i64 255, i64 255, i64 255, i64 255>
+; CHECK-NEXT: store <4 x i64> [[TMP13]], ptr [[TMP0]], align 8, !tbaa [[TBAA5]]
+; CHECK-NEXT: ret void
;
%4 = zext i32 %1 to i64
%5 = load i64, ptr %0, align 8, !tbaa !7
diff --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-profile.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-profile.ll
index 867a49dbaed2..7258ffca1278 100644
--- a/llvm/test/Transforms/SampleProfile/pseudo-probe-profile.ll
+++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-profile.ll
@@ -1,5 +1,9 @@
-; RUN: opt < %s -passes=pseudo-probe,sample-profile -sample-profile-file=%S/Inputs/pseudo-probe-profile.prof -pass-remarks=sample-profile -pass-remarks-output=%t.opt.yaml -sample-profile-use-profi=0 -S | FileCheck %s
-; RUN: FileCheck %s -check-prefix=YAML < %t.opt.yaml
+; RUN: opt < %s -passes=pseudo-probe,sample-profile -sample-profile-file=%S/Inputs/pseudo-probe-profile.prof -pass-remarks=sample-profile -pass-remarks-output=%t.opt.yaml -sample-profile-use-profi=0 -S -o %t
+; RUN: FileCheck %s --input-file %t
+; RUN: FileCheck %s -check-prefix=YAML --input-file %t.opt.yaml
+; RUN: opt < %t -passes=sample-profile -sample-profile-file=%S/Inputs/pseudo-probe-profile.prof -sample-profile-remove-probe -S | FileCheck %s -check-prefix=REMOVE-PROBE
+
+; REMOVE-PROBE-NOT: call void @llvm.pseudoprobe
define dso_local i32 @foo(i32 %x, ptr %f) #0 !dbg !4 {
entry:
diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/endless-unswitch.ll b/llvm/test/Transforms/SimpleLoopUnswitch/endless-unswitch.ll
index 0d3aa8b24310..e70bea2d2f7a 100644
--- a/llvm/test/Transforms/SimpleLoopUnswitch/endless-unswitch.ll
+++ b/llvm/test/Transforms/SimpleLoopUnswitch/endless-unswitch.ll
@@ -106,3 +106,96 @@ for.inc: ; preds = %for.cond5
store i8 0, ptr @b, align 1
br label %for.cond5
}
+
+define void @e(ptr %p) {
+; CHECK-LABEL: @e(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[FOR_COND:%.*]]
+; CHECK: for.cond:
+; CHECK-NEXT: br i1 false, label [[FOR_END:%.*]], label [[FOR_COND]]
+; CHECK: for.end:
+; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[P:%.*]], align 2
+; CHECK-NEXT: [[TMP1:%.*]] = trunc i16 [[TMP0]] to i1
+; CHECK-NEXT: br i1 [[TMP1]], label [[FOR_END_SPLIT:%.*]], label [[FOR_END_SPLIT_US:%.*]]
+; CHECK: for.end.split.us:
+; CHECK-NEXT: br label [[G_US:%.*]]
+; CHECK: g.us:
+; CHECK-NEXT: br label [[G_SPLIT_US6:%.*]]
+; CHECK: for.cond1.us1:
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr [[P]], align 2
+; CHECK-NEXT: [[TOBOOL4_NOT_US:%.*]] = trunc i16 [[TMP2]] to i1
+; CHECK-NEXT: br i1 [[TOBOOL4_NOT_US]], label [[FOR_COND5_PREHEADER_US4:%.*]], label [[G_LOOPEXIT_US:%.*]]
+; CHECK: for.cond5.us2:
+; CHECK-NEXT: br i1 false, label [[FOR_COND1_LOOPEXIT_US5:%.*]], label [[FOR_INC_US3:%.*]]
+; CHECK: for.inc.us3:
+; CHECK-NEXT: store i8 0, ptr @b, align 1
+; CHECK-NEXT: br label [[FOR_COND5_US2:%.*]]
+; CHECK: for.cond5.preheader.us4:
+; CHECK-NEXT: br label [[FOR_COND5_US2]]
+; CHECK: for.cond1.loopexit.us5:
+; CHECK-NEXT: br label [[FOR_COND1_US1:%.*]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK: g.loopexit.us:
+; CHECK-NEXT: br label [[G_US]]
+; CHECK: g.split.us6:
+; CHECK-NEXT: br label [[FOR_COND1_US1]]
+; CHECK: for.end.split:
+; CHECK-NEXT: br label [[G:%.*]]
+; CHECK: g.loopexit:
+; CHECK-NEXT: br label [[G]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK: g:
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr [[P]], align 2
+; CHECK-NEXT: [[TMP4:%.*]] = trunc i16 [[TMP3]] to i1
+; CHECK-NEXT: br i1 [[TMP4]], label [[G_SPLIT_US:%.*]], label [[G_SPLIT:%.*]]
+; CHECK: g.split.us:
+; CHECK-NEXT: br label [[FOR_COND1_US:%.*]]
+; CHECK: for.cond1.us:
+; CHECK-NEXT: br label [[FOR_COND5_PREHEADER_US:%.*]]
+; CHECK: for.cond5.us:
+; CHECK-NEXT: br i1 false, label [[FOR_COND1_LOOPEXIT_US:%.*]], label [[FOR_INC_US:%.*]]
+; CHECK: for.inc.us:
+; CHECK-NEXT: store i8 0, ptr @b, align 1
+; CHECK-NEXT: br label [[FOR_COND5_US:%.*]]
+; CHECK: for.cond5.preheader.us:
+; CHECK-NEXT: br label [[FOR_COND5_US]]
+; CHECK: for.cond1.loopexit.us:
+; CHECK-NEXT: br label [[FOR_COND1_US]]
+; CHECK: g.split:
+; CHECK-NEXT: br label [[FOR_COND1:%.*]]
+; CHECK: for.cond1.loopexit:
+; CHECK-NEXT: br label [[FOR_COND1]], !llvm.loop [[LOOP3]]
+; CHECK: for.cond1:
+; CHECK-NEXT: [[TMP5:%.*]] = load i16, ptr [[P]], align 2
+; CHECK-NEXT: [[TOBOOL4_NOT:%.*]] = trunc i16 [[TMP5]] to i1
+; CHECK-NEXT: br i1 [[TOBOOL4_NOT]], label [[FOR_COND5_PREHEADER:%.*]], label [[G_LOOPEXIT:%.*]]
+; CHECK: for.cond5.preheader:
+; CHECK-NEXT: br label [[FOR_COND5:%.*]]
+; CHECK: for.cond5:
+; CHECK-NEXT: br i1 false, label [[FOR_COND1_LOOPEXIT:%.*]], label [[FOR_INC:%.*]]
+; CHECK: for.inc:
+; CHECK-NEXT: store i8 0, ptr @b, align 1
+; CHECK-NEXT: br label [[FOR_COND5]]
+;
+entry:
+ br label %for.cond
+
+for.cond: ; preds = %for.cond, %entry
+ br i1 false, label %for.end, label %for.cond
+
+for.end: ; preds = %for.cond
+ br label %g
+
+g: ; preds = %for.cond1, %for.end
+ br label %for.cond1
+
+for.cond1: ; preds = %for.cond5, %g
+ %0 = load i16, ptr %p, align 2
+ %tobool4.not = trunc i16 %0 to i1
+ br i1 %tobool4.not, label %for.cond5, label %g
+
+for.cond5: ; preds = %for.inc, %for.cond1
+ br i1 false, label %for.cond1, label %for.inc
+
+for.inc: ; preds = %for.cond5
+ store i8 0, ptr @b, align 1
+ br label %for.cond5
+}
diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/partial-unswitch.ll b/llvm/test/Transforms/SimpleLoopUnswitch/partial-unswitch.ll
index f97e5c3eec9d..1d8942079ffd 100644
--- a/llvm/test/Transforms/SimpleLoopUnswitch/partial-unswitch.ll
+++ b/llvm/test/Transforms/SimpleLoopUnswitch/partial-unswitch.ll
@@ -1326,6 +1326,136 @@ exit:
ret i32 10
}
+define i32 @partial_unswitch_true_successor_trunc(ptr %ptr, i32 %N) {
+; CHECK-LABEL: @partial_unswitch_true_successor_trunc(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[PTR:%.*]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[TMP0]] to i1
+; CHECK-NEXT: br i1 [[TMP1]], label [[ENTRY_SPLIT_US:%.*]], label [[ENTRY_SPLIT:%.*]]
+; CHECK: entry.split.us:
+; CHECK-NEXT: br label [[LOOP_HEADER_US:%.*]]
+; CHECK: loop.header.us:
+; CHECK-NEXT: [[IV_US:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT_US]] ], [ [[IV_NEXT_US:%.*]], [[LOOP_LATCH_US:%.*]] ]
+; CHECK-NEXT: br label [[NOCLOBBER_US:%.*]]
+; CHECK: noclobber.us:
+; CHECK-NEXT: br label [[LOOP_LATCH_US]]
+; CHECK: loop.latch.us:
+; CHECK-NEXT: [[C_US:%.*]] = icmp ult i32 [[IV_US]], [[N:%.*]]
+; CHECK-NEXT: [[IV_NEXT_US]] = add i32 [[IV_US]], 1
+; CHECK-NEXT: br i1 [[C_US]], label [[LOOP_HEADER_US]], label [[EXIT_SPLIT_US:%.*]]
+; CHECK: exit.split.us:
+; CHECK-NEXT: br label [[EXIT:%.*]]
+; CHECK: entry.split:
+; CHECK-NEXT: br label [[LOOP_HEADER:%.*]]
+; CHECK: loop.header:
+; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
+; CHECK-NEXT: [[LV:%.*]] = load i32, ptr [[PTR]], align 4
+; CHECK-NEXT: [[SC:%.*]] = trunc i32 [[LV]] to i1
+; CHECK-NEXT: br i1 [[SC]], label [[NOCLOBBER:%.*]], label [[CLOBBER:%.*]]
+; CHECK: noclobber:
+; CHECK-NEXT: br label [[LOOP_LATCH]]
+; CHECK: clobber:
+; CHECK-NEXT: call void @clobber()
+; CHECK-NEXT: br label [[LOOP_LATCH]]
+; CHECK: loop.latch:
+; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]]
+; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK: exit.split:
+; CHECK-NEXT: br label [[EXIT]]
+; CHECK: exit:
+; CHECK-NEXT: ret i32 10
+;
+entry:
+ br label %loop.header
+
+loop.header:
+ %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ]
+ %lv = load i32, ptr %ptr
+ %sc = trunc i32 %lv to i1
+ br i1 %sc, label %noclobber, label %clobber
+
+noclobber:
+ br label %loop.latch
+
+clobber:
+ call void @clobber()
+ br label %loop.latch
+
+loop.latch:
+ %c = icmp ult i32 %iv, %N
+ %iv.next = add i32 %iv, 1
+ br i1 %c, label %loop.header, label %exit
+
+exit:
+ ret i32 10
+}
+
+define i32 @partial_unswitch_false_successor_trunc(ptr %ptr, i32 %N) {
+; CHECK-LABEL: @partial_unswitch_false_successor_trunc(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[PTR:%.*]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[TMP0]] to i1
+; CHECK-NEXT: br i1 [[TMP1]], label [[ENTRY_SPLIT:%.*]], label [[ENTRY_SPLIT_US:%.*]]
+; CHECK: entry.split.us:
+; CHECK-NEXT: br label [[LOOP_HEADER_US:%.*]]
+; CHECK: loop.header.us:
+; CHECK-NEXT: [[IV_US:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT_US]] ], [ [[IV_NEXT_US:%.*]], [[LOOP_LATCH_US:%.*]] ]
+; CHECK-NEXT: br label [[NOCLOBBER_US:%.*]]
+; CHECK: noclobber.us:
+; CHECK-NEXT: br label [[LOOP_LATCH_US]]
+; CHECK: loop.latch.us:
+; CHECK-NEXT: [[C_US:%.*]] = icmp ult i32 [[IV_US]], [[N:%.*]]
+; CHECK-NEXT: [[IV_NEXT_US]] = add i32 [[IV_US]], 1
+; CHECK-NEXT: br i1 [[C_US]], label [[LOOP_HEADER_US]], label [[EXIT_SPLIT_US:%.*]]
+; CHECK: exit.split.us:
+; CHECK-NEXT: br label [[EXIT:%.*]]
+; CHECK: entry.split:
+; CHECK-NEXT: br label [[LOOP_HEADER:%.*]]
+; CHECK: loop.header:
+; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
+; CHECK-NEXT: [[LV:%.*]] = load i32, ptr [[PTR]], align 4
+; CHECK-NEXT: [[SC:%.*]] = trunc i32 [[LV]] to i1
+; CHECK-NEXT: br i1 [[SC]], label [[CLOBBER:%.*]], label [[NOCLOBBER:%.*]]
+; CHECK: clobber:
+; CHECK-NEXT: call void @clobber()
+; CHECK-NEXT: br label [[LOOP_LATCH]]
+; CHECK: noclobber:
+; CHECK-NEXT: br label [[LOOP_LATCH]]
+; CHECK: loop.latch:
+; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]]
+; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK: exit.split:
+; CHECK-NEXT: br label [[EXIT]]
+; CHECK: exit:
+; CHECK-NEXT: ret i32 10
+;
+entry:
+ br label %loop.header
+
+loop.header:
+ %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ]
+ %lv = load i32, ptr %ptr
+ %sc = trunc i32 %lv to i1
+ br i1 %sc, label %clobber, label %noclobber
+
+clobber:
+ call void @clobber()
+ br label %loop.latch
+
+noclobber:
+ br label %loop.latch
+
+loop.latch:
+ %c = icmp ult i32 %iv, %N
+ %iv.next = add i32 %iv, 1
+ br i1 %c, label %loop.header, label %exit
+
+exit:
+ ret i32 10
+}
+
; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[UNSWITCH_PARTIAL_DISABLE:![0-9]+]]}
; CHECK: [[UNSWITCH_PARTIAL_DISABLE]] = !{!"llvm.loop.unswitch.partial.disable"}
; CHECK: [[LOOP2]] = distinct !{[[LOOP2]], [[UNSWITCH_PARTIAL_DISABLE]]}
diff --git a/llvm/test/Transforms/SimplifyCFG/hoist-dbgvalue-inlined.ll b/llvm/test/Transforms/SimplifyCFG/hoist-dbgvalue-inlined.ll
index e00d1daf71de..5af73e789f11 100644
--- a/llvm/test/Transforms/SimplifyCFG/hoist-dbgvalue-inlined.ll
+++ b/llvm/test/Transforms/SimplifyCFG/hoist-dbgvalue-inlined.ll
@@ -9,7 +9,6 @@ init:
; CHECK: %vala = load i64, ptr %ptr
; CHECK-NEXT: call void @llvm.dbg.value(metadata i64 %vala, metadata [[MD:![0-9]*]]
-; CHECK-NEXT: call void @llvm.dbg.value(metadata i64 %vala, metadata [[MD]]
; CHECK-NEXT: %valbmasked = and i64 %vala, 1
a: ; preds = %init
diff --git a/llvm/test/Transforms/SimplifyCFG/hoist-dbgvalue.ll b/llvm/test/Transforms/SimplifyCFG/hoist-dbgvalue.ll
index af7da45ec089..c5d723c4e3dd 100644
--- a/llvm/test/Transforms/SimplifyCFG/hoist-dbgvalue.ll
+++ b/llvm/test/Transforms/SimplifyCFG/hoist-dbgvalue.ll
@@ -47,7 +47,6 @@ define i1 @hoist_with_debug2(i32 %x) !dbg !22 {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp ugt i32 [[X:%.*]], 2
; CHECK-NEXT: call void @llvm.dbg.value(metadata i32 [[X]], metadata [[META21:![0-9]+]], metadata !DIExpression()), !dbg [[DBG23:![0-9]+]]
-; CHECK-NEXT: call void @llvm.dbg.value(metadata i32 [[X]], metadata [[META21]], metadata !DIExpression()), !dbg [[DBG23]]
; CHECK-NEXT: [[DOT:%.*]] = select i1 [[TOBOOL_NOT]], i1 false, i1 true
; CHECK-NEXT: ret i1 [[DOT]]
;
diff --git a/llvm/test/Verifier/invalid-splice.ll b/llvm/test/Verifier/invalid-splice.ll
index d5096bdf17ca..2239386df562 100644
--- a/llvm/test/Verifier/invalid-splice.ll
+++ b/llvm/test/Verifier/invalid-splice.ll
@@ -2,36 +2,36 @@
; CHECK: The splice index exceeds the range [-VL, VL-1] where VL is the known minimum number of elements in the vector
define <2 x double> @splice_v2f64_idx_neg3(<2 x double> %a, <2 x double> %b) #0 {
- %res = call <2 x double> @llvm.experimental.vector.splice.v2f64(<2 x double> %a, <2 x double> %b, i32 -3)
+ %res = call <2 x double> @llvm.vector.splice.v2f64(<2 x double> %a, <2 x double> %b, i32 -3)
ret <2 x double> %res
}
; CHECK: The splice index exceeds the range [-VL, VL-1] where VL is the known minimum number of elements in the vector
define <vscale x 2 x double> @splice_nxv2f64_idx_neg3_vscale_min1(<vscale x 2 x double> %a, <vscale x 2 x double> %b) #0 {
- %res = call <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, i32 -3)
+ %res = call <vscale x 2 x double> @llvm.vector.splice.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, i32 -3)
ret <vscale x 2 x double> %res
}
; CHECK: The splice index exceeds the range [-VL, VL-1] where VL is the known minimum number of elements in the vector
define <vscale x 2 x double> @splice_nxv2f64_idx_neg5_vscale_min2(<vscale x 2 x double> %a, <vscale x 2 x double> %b) #1 {
- %res = call <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, i32 -5)
+ %res = call <vscale x 2 x double> @llvm.vector.splice.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, i32 -5)
ret <vscale x 2 x double> %res
}
; CHECK: The splice index exceeds the range [-VL, VL-1] where VL is the known minimum number of elements in the vector
define <2 x double> @splice_v2f64_idx2(<2 x double> %a, <2 x double> %b) #0 {
- %res = call <2 x double> @llvm.experimental.vector.splice.v2f64(<2 x double> %a, <2 x double> %b, i32 2)
+ %res = call <2 x double> @llvm.vector.splice.v2f64(<2 x double> %a, <2 x double> %b, i32 2)
ret <2 x double> %res
}
; CHECK: The splice index exceeds the range [-VL, VL-1] where VL is the known minimum number of elements in the vector
define <2 x double> @splice_v2f64_idx3(<2 x double> %a, <2 x double> %b) #1 {
- %res = call <2 x double> @llvm.experimental.vector.splice.v2f64(<2 x double> %a, <2 x double> %b, i32 4)
+ %res = call <2 x double> @llvm.vector.splice.v2f64(<2 x double> %a, <2 x double> %b, i32 4)
ret <2 x double> %res
}
attributes #0 = { vscale_range(1,16) }
attributes #1 = { vscale_range(2,16) }
-declare <2 x double> @llvm.experimental.vector.splice.v2f64(<2 x double>, <2 x double>, i32)
-declare <vscale x 2 x double> @llvm.experimental.vector.splice.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, i32)
+declare <2 x double> @llvm.vector.splice.v2f64(<2 x double>, <2 x double>, i32)
+declare <vscale x 2 x double> @llvm.vector.splice.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, i32)
diff --git a/llvm/test/tools/llvm-mca/X86/BtVer2/skip-unsupported-instructions-none-remain.s b/llvm/test/tools/llvm-mca/X86/BtVer2/skip-unsupported-instructions-none-remain.s
new file mode 100644
index 000000000000..0d67f53e12f1
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/X86/BtVer2/skip-unsupported-instructions-none-remain.s
@@ -0,0 +1,14 @@
+# RUN: not llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -skip-unsupported-instructions %s 2>&1 | FileCheck --check-prefixes=CHECK-ALL,CHECK-SKIP %s
+# RUN: not llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 %s 2>&1 | FileCheck --check-prefixes=CHECK-ALL,CHECK-ERROR %s
+
+# Test checks that if all instructions are skipped, leaving an empty input, an error is printed.
+
+bzhi %eax, %ebx, %ecx
+
+# CHECK-ALL-NOT: error
+
+# CHECK-ERROR: error: found an unsupported instruction in the input assembly sequence, use -skip-unsupported-instructions to ignore.
+
+# CHECK-SKIP: warning: found an unsupported instruction in the input assembly sequence, skipping with -skip-unsupported-instructions, note accuracy will be impacted:
+# CHECK-SKIP: note: instruction: bzhil %eax, %ebx, %ecx
+# CHECK-SKIP: error: no assembly instructions found.
diff --git a/llvm/test/tools/llvm-mca/X86/BtVer2/unsupported-instruction.s b/llvm/test/tools/llvm-mca/X86/BtVer2/unsupported-instruction.s
index bb88e951c129..3690a1101be9 100644
--- a/llvm/test/tools/llvm-mca/X86/BtVer2/unsupported-instruction.s
+++ b/llvm/test/tools/llvm-mca/X86/BtVer2/unsupported-instruction.s
@@ -1,6 +1,55 @@
-# RUN: not llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 %s 2>&1 | FileCheck %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -skip-unsupported-instructions -timeline %s 2>&1 | FileCheck --check-prefix=CHECK-SKIP %s
+# RUN: not llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 %s 2>&1 | FileCheck --check-prefix=CHECK-ERROR %s
+
+# Test checks that unsupported instructions exit with an error, unless -skip-unsupported-instructions is passed, in which case the remaining instructions should be analysed.
+
+# CHECK-SKIP: warning: found an unsupported instruction in the input assembly sequence, skipping with -skip-unsupported-instructions, note accuracy will be impacted:
+# CHECK-ERROR: error: found an unsupported instruction in the input assembly sequence, use -skip-unsupported-instructions to ignore.
bzhi %eax, %ebx, %ecx
-# CHECK: error: found an unsupported instruction in the input assembly sequence.
-# CHECK-NEXT: note: instruction: bzhil %eax, %ebx, %ecx
+# Supported instruction that may be analysed.
+add %eax, %eax
+
+# CHECK-SKIP: Iterations: 100
+# CHECK-SKIP: Instructions: 100
+# CHECK-SKIP: Total Cycles: 103
+# CHECK-SKIP: Total uOps: 100
+
+# CHECK-SKIP: Dispatch Width: 2
+# CHECK-SKIP: uOps Per Cycle: 0.97
+# CHECK-SKIP: IPC: 0.97
+# CHECK-SKIP: Block RThroughput: 0.5
+
+# CHECK-SKIP: Instruction Info:
+# CHECK-SKIP: [1]: #uOps
+# CHECK-SKIP: [2]: Latency
+# CHECK-SKIP: [3]: RThroughput
+# CHECK-SKIP: [4]: MayLoad
+# CHECK-SKIP: [5]: MayStore
+# CHECK-SKIP: [6]: HasSideEffects (U)
+
+# CHECK-SKIP: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-SKIP: 1 1 0.50 addl %eax, %eax
+
+# CHECK-SKIP: Timeline view:
+
+# CHECK-SKIP: [0,0] DeER . . . addl %eax, %eax
+# CHECK-SKIP: [1,0] D=eER. . . addl %eax, %eax
+# CHECK-SKIP: [2,0] .D=eER . . addl %eax, %eax
+# CHECK-SKIP: [3,0] .D==eER . . addl %eax, %eax
+# CHECK-SKIP: [4,0] . D==eER . . addl %eax, %eax
+# CHECK-SKIP: [5,0] . D===eER . . addl %eax, %eax
+# CHECK-SKIP: [6,0] . D===eER. . addl %eax, %eax
+# CHECK-SKIP: [7,0] . D====eER . addl %eax, %eax
+# CHECK-SKIP: [8,0] . D====eER. addl %eax, %eax
+# CHECK-SKIP: [9,0] . D=====eER addl %eax, %eax
+
+# CHECK-SKIP: Average Wait times (based on the timeline view):
+# CHECK-SKIP: [0]: Executions
+# CHECK-SKIP: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-SKIP: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-SKIP: [3]: Average time elapsed from WB until retire stage
+
+# CHECK-SKIP: [0] [1] [2] [3]
+# CHECK-SKIP: 0. 10 3.5 0.1 0.0 addl %eax, %eax
diff --git a/llvm/test/tools/llvm-rc/Inputs/dialog-with-menu.rc b/llvm/test/tools/llvm-rc/Inputs/dialog-with-menu.rc
new file mode 100644
index 000000000000..bb79dca399c2
--- /dev/null
+++ b/llvm/test/tools/llvm-rc/Inputs/dialog-with-menu.rc
@@ -0,0 +1,16 @@
+101 DIALOG 0, 0, 362, 246
+STYLE 0x40l | 0x0004l | 0x0008l | 0x0800l | 0x00020000l |
+ 0x00010000l | 0x80000000l | 0x10000000l | 0x02000000l | 0x00C00000l |
+ 0x00080000l | 0x00040000l
+CAPTION "MakeNSISW"
+MENU 104
+FONT 8, "MS Shell Dlg"
+BEGIN
+ CONTROL "",202,"RichEdit20A",0x0004l | 0x0040l |
+ 0x0100l | 0x0800l | 0x00008000 |
+ 0x00010000l | 0x00800000l | 0x00200000l,7,22,348,190
+ CONTROL "",-1,"Static",0x00000010l,7,220,346,1
+ LTEXT "",200,7,230,200,12,0x08000000l
+ DEFPUSHBUTTON "Test &Installer",203,230,226,60,15,0x08000000l | 0x00010000l
+ PUSHBUTTON "&Close",2,296,226,49,15,0x00010000l
+END
diff --git a/llvm/test/tools/llvm-rc/dialog-with-menu.test b/llvm/test/tools/llvm-rc/dialog-with-menu.test
new file mode 100644
index 000000000000..2529e9c1722b
--- /dev/null
+++ b/llvm/test/tools/llvm-rc/dialog-with-menu.test
@@ -0,0 +1,32 @@
+; RUN: llvm-rc -no-preprocess /FO %t -- %p/Inputs/dialog-with-menu.rc
+; RUN: llvm-readobj %t | FileCheck %s
+
+CHECK: Resource type (int): DIALOG (ID 5)
+CHECK-NEXT: Resource name (int): 101
+CHECK-NEXT: Data version: 0
+CHECK-NEXT: Memory flags: 0x1030
+CHECK-NEXT: Language ID: 1033
+CHECK-NEXT: Version (major): 0
+CHECK-NEXT: Version (minor): 0
+CHECK-NEXT: Characteristics: 0
+CHECK-NEXT: Data size: 278
+CHECK-NEXT: Data: (
+CHECK-NEXT: 0000: 4C08CF92 00000000 05000000 00006A01 |L.............j.|
+CHECK-NEXT: 0010: F600FFFF 68000000 4D006100 6B006500 |....h...M.a.k.e.|
+CHECK-NEXT: 0020: 4E005300 49005300 57000000 08004D00 |N.S.I.S.W.....M.|
+CHECK-NEXT: 0030: 53002000 53006800 65006C00 6C002000 |S. .S.h.e.l.l. .|
+CHECK-NEXT: 0040: 44006C00 67000000 4489A150 00000000 |D.l.g...D..P....|
+CHECK-NEXT: 0050: 07001600 5C01BE00 CA005200 69006300 |....\.....R.i.c.|
+CHECK-NEXT: 0060: 68004500 64006900 74003200 30004100 |h.E.d.i.t.2.0.A.|
+CHECK-NEXT: 0070: 00000000 00000000 10000050 00000000 |...........P....|
+CHECK-NEXT: 0080: 0700DC00 5A010100 FFFF5300 74006100 |....Z.....S.t.a.|
+CHECK-NEXT: 0090: 74006900 63000000 00000000 00000258 |t.i.c..........X|
+CHECK-NEXT: 00A0: 00000000 0700E600 C8000C00 C800FFFF |................|
+CHECK-NEXT: 00B0: 82000000 00000000 01000158 00000000 |...........X....|
+CHECK-NEXT: 00C0: E600E200 3C000F00 CB00FFFF 80005400 |....<.........T.|
+CHECK-NEXT: 00D0: 65007300 74002000 26004900 6E007300 |e.s.t. .&.I.n.s.|
+CHECK-NEXT: 00E0: 74006100 6C006C00 65007200 00000000 |t.a.l.l.e.r.....|
+CHECK-NEXT: 00F0: 00000150 00000000 2801E200 31000F00 |...P....(...1...|
+CHECK-NEXT: 0100: 0200FFFF 80002600 43006C00 6F007300 |......&.C.l.o.s.|
+CHECK-NEXT: 0110: 65000000 0000 |e.....|
+CHECK-NEXT: )
diff --git a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp
index 498308e2edbe..ed53f8fabb17 100644
--- a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp
+++ b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp
@@ -18,6 +18,7 @@
#include "PerfHelper.h"
#include "SubprocessMemory.h"
#include "Target.h"
+#include "llvm/ADT/ScopeExit.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
@@ -283,6 +284,7 @@ private:
SmallVectorImpl<int64_t> &CounterValues,
ArrayRef<const char *> ValidationCounters,
SmallVectorImpl<int64_t> &ValidationCounterValues) const {
+ auto WriteFDClose = make_scope_exit([WriteFD]() { close(WriteFD); });
const ExegesisTarget &ET = State.getExegesisTarget();
auto CounterOrError =
ET.createCounter(CounterName, State, ValidationCounters, ChildPID);
diff --git a/llvm/tools/llvm-exegesis/lib/SubprocessMemory.cpp b/llvm/tools/llvm-exegesis/lib/SubprocessMemory.cpp
index 1d44e09ad61e..89d7b197079e 100644
--- a/llvm/tools/llvm-exegesis/lib/SubprocessMemory.cpp
+++ b/llvm/tools/llvm-exegesis/lib/SubprocessMemory.cpp
@@ -8,6 +8,7 @@
#include "SubprocessMemory.h"
#include "Error.h"
+#include "llvm/ADT/ScopeExit.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/FormatVariadic.h"
#include <cerrno>
@@ -56,6 +57,8 @@ Error SubprocessMemory::initializeSubprocessMemory(pid_t ProcessID) {
return make_error<Failure>(
"Failed to create shared memory object for auxiliary memory: " +
Twine(strerror(errno)));
+ auto AuxiliaryMemoryFDClose =
+ make_scope_exit([AuxiliaryMemoryFD]() { close(AuxiliaryMemoryFD); });
if (ftruncate(AuxiliaryMemoryFD, AuxiliaryMemorySize) != 0) {
return make_error<Failure>("Truncating the auxiliary memory failed: " +
Twine(strerror(errno)));
@@ -78,6 +81,8 @@ Error SubprocessMemory::addMemoryDefinition(
return make_error<Failure>(
"Failed to create shared memory object for memory definition: " +
Twine(strerror(errno)));
+ auto SharedMemoryFDClose =
+ make_scope_exit([SharedMemoryFD]() { close(SharedMemoryFD); });
if (ftruncate(SharedMemoryFD, MemVal.SizeBytes) != 0) {
return make_error<Failure>("Truncating a memory definition failed: " +
Twine(strerror(errno)));
@@ -138,7 +143,7 @@ Expected<int> SubprocessMemory::setupAuxiliaryMemoryInSubprocess(
}
SubprocessMemory::~SubprocessMemory() {
- for (std::string SharedMemoryName : SharedMemoryNames) {
+ for (const std::string &SharedMemoryName : SharedMemoryNames) {
if (shm_unlink(SharedMemoryName.c_str()) != 0) {
errs() << "Failed to unlink shared memory section: " << strerror(errno)
<< "\n";
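
The ScopeExit additions above close each raw file descriptor on every exit path, including the early error returns. A minimal sketch of the idiom, with a hypothetical function and variable names:

    #include "llvm/ADT/ScopeExit.h"
    #include <unistd.h>

    void useDescriptor(int FD) {
      // close(FD) runs when Closer is destroyed, on both the normal path
      // and any early return.
      auto Closer = llvm::make_scope_exit([FD] { close(FD); });
      // ... work that may return early ...
    }
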
diff --git a/llvm/tools/llvm-lto2/llvm-lto2.cpp b/llvm/tools/llvm-lto2/llvm-lto2.cpp
index faed9ff9939b..5dd961a603c9 100644
--- a/llvm/tools/llvm-lto2/llvm-lto2.cpp
+++ b/llvm/tools/llvm-lto2/llvm-lto2.cpp
@@ -251,10 +251,9 @@ static int run(int argc, char **argv) {
// resolutions and apply them in the order observed.
std::map<std::pair<std::string, std::string>, std::list<SymbolResolution>>
CommandLineResolutions;
- for (std::string R : SymbolResolutions) {
- StringRef Rest = R;
- StringRef FileName, SymbolName;
- std::tie(FileName, Rest) = Rest.split(',');
+ for (StringRef R : SymbolResolutions) {
+ StringRef Rest, FileName, SymbolName;
+ std::tie(FileName, Rest) = R.split(',');
if (Rest.empty()) {
llvm::errs() << "invalid resolution: " << R << '\n';
return 1;
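
The llvm-lto2 loop above now iterates StringRef directly, avoiding a std::string copy per resolution; StringRef::split returns the text on either side of the first separator. A small self-contained sketch (the input value is hypothetical):

    #include "llvm/ADT/StringRef.h"

    llvm::StringRef R = "input.o,foo";
    // split(',') yields the piece before the first comma plus the remainder;
    // the remainder is empty when no comma is present.
    auto [FileName, Rest] = R.split(',');
    // FileName == "input.o", Rest == "foo"
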
diff --git a/llvm/tools/llvm-mca/CodeRegion.h b/llvm/tools/llvm-mca/CodeRegion.h
index ce107fd8f3b6..5a2e8baa1f3e 100644
--- a/llvm/tools/llvm-mca/CodeRegion.h
+++ b/llvm/tools/llvm-mca/CodeRegion.h
@@ -59,6 +59,7 @@
#define LLVM_TOOLS_LLVM_MCA_CODEREGION_H
#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringRef.h"
@@ -97,6 +98,20 @@ public:
Instructions.emplace_back(Instruction);
}
+ // Remove the given instructions from Instructions; used when unsupported
+ // instructions are being skipped. Returns an ArrayRef over the updated vector.
+ [[nodiscard]] llvm::ArrayRef<llvm::MCInst>
+ dropInstructions(const llvm::SmallPtrSetImpl<const llvm::MCInst *> &Insts) {
+ if (Insts.empty())
+ return Instructions;
+ Instructions.erase(std::remove_if(Instructions.begin(), Instructions.end(),
+ [&Insts](const llvm::MCInst &Inst) {
+ return Insts.contains(&Inst);
+ }),
+ Instructions.end());
+ return Instructions;
+ }
+
llvm::SMLoc startLoc() const { return RangeStart; }
llvm::SMLoc endLoc() const { return RangeEnd; }
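
dropInstructions above uses the standard erase-remove idiom: remove_if compacts the kept elements to the front and returns the new logical end, then erase trims the tail. The same pattern in isolation (container and predicate are illustrative):

    #include <algorithm>
    #include <vector>

    std::vector<int> V = {1, 2, 3, 4};
    // Drop the even elements; remove_if reorders, erase shrinks.
    V.erase(std::remove_if(V.begin(), V.end(),
                           [](int X) { return X % 2 == 0; }),
            V.end());
    // V == {1, 3}
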
diff --git a/llvm/tools/llvm-mca/llvm-mca.cpp b/llvm/tools/llvm-mca/llvm-mca.cpp
index eb71cffba6dd..e037c06b12a3 100644
--- a/llvm/tools/llvm-mca/llvm-mca.cpp
+++ b/llvm/tools/llvm-mca/llvm-mca.cpp
@@ -237,6 +237,11 @@ static cl::opt<bool> DisableInstrumentManager(
"ignores instruments.)."),
cl::cat(ViewOptions), cl::init(false));
+static cl::opt<bool> SkipUnsupportedInstructions(
+ "skip-unsupported-instructions",
+ cl::desc("Make unsupported instruction errors into warnings."),
+ cl::cat(ViewOptions), cl::init(false));
+
namespace {
const Target *getTarget(const char *ProgName) {
@@ -558,6 +563,7 @@ int main(int argc, char **argv) {
assert(MAB && "Unable to create asm backend!");
json::Object JSONOutput;
+ int NonEmptyRegions = 0;
for (const std::unique_ptr<mca::AnalysisRegion> &Region : Regions) {
// Skip empty code regions.
if (Region->empty())
@@ -571,14 +577,13 @@ int main(int argc, char **argv) {
IPP->resetState();
- DenseMap<const MCInst *, SmallVector<mca::Instrument *>>
- InstToInstruments;
+ DenseMap<const MCInst *, SmallVector<mca::Instrument *>> InstToInstruments;
SmallVector<std::unique_ptr<mca::Instruction>> LoweredSequence;
+ SmallPtrSet<const MCInst *, 16> DroppedInsts;
for (const MCInst &MCI : Insts) {
SMLoc Loc = MCI.getLoc();
const SmallVector<mca::Instrument *> Instruments =
InstrumentRegions.getActiveInstruments(Loc);
- InstToInstruments.insert({&MCI, Instruments});
Expected<std::unique_ptr<mca::Instruction>> Inst =
IB.createInstruction(MCI, Instruments);
@@ -588,7 +593,15 @@ int main(int argc, char **argv) {
[&IP, &STI](const mca::InstructionError<MCInst> &IE) {
std::string InstructionStr;
raw_string_ostream SS(InstructionStr);
- WithColor::error() << IE.Message << '\n';
+ if (SkipUnsupportedInstructions)
+ WithColor::warning()
+ << IE.Message
+ << ", skipping with -skip-unsupported-instructions, "
+ "note accuracy will be impacted:\n";
+ else
+ WithColor::error()
+ << IE.Message
+ << ", use -skip-unsupported-instructions to ignore.\n";
IP->printInst(&IE.Inst, 0, "", *STI, SS);
SS.flush();
WithColor::note()
@@ -597,14 +610,25 @@ int main(int argc, char **argv) {
// Default case.
WithColor::error() << toString(std::move(NewE));
}
+ if (SkipUnsupportedInstructions) {
+ DroppedInsts.insert(&MCI);
+ continue;
+ }
return 1;
}
IPP->postProcessInstruction(Inst.get(), MCI);
-
+ InstToInstruments.insert({&MCI, Instruments});
LoweredSequence.emplace_back(std::move(Inst.get()));
}
+ Insts = Region->dropInstructions(DroppedInsts);
+
+ // Skip empty regions.
+ if (Insts.empty())
+ continue;
+ NonEmptyRegions++;
+
mca::CircularSourceMgr S(LoweredSequence,
PrintInstructionTables ? 1 : Iterations);
@@ -759,6 +783,11 @@ int main(int argc, char **argv) {
++RegionIdx;
}
+ if (NonEmptyRegions == 0) {
+ WithColor::error() << "no assembly instructions found.\n";
+ return 1;
+ }
+
if (PrintJson)
TOF->os() << formatv("{0:2}", json::Value(std::move(JSONOutput))) << "\n";
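
The new -skip-unsupported-instructions option above follows the usual cl::opt pattern: a static boolean that defaults to false and is set by the command-line parser. A minimal sketch with a hypothetical flag name:

    #include "llvm/Support/CommandLine.h"

    static llvm::cl::opt<bool> MyFlag(
        "my-flag", llvm::cl::desc("Example boolean option."),
        llvm::cl::init(false));
    // After cl::ParseCommandLineOptions(argc, argv), MyFlag is true
    // exactly when -my-flag was passed.
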
diff --git a/llvm/tools/llvm-objdump/llvm-objdump.cpp b/llvm/tools/llvm-objdump/llvm-objdump.cpp
index 339822e4adcd..675364a1c1bc 100644
--- a/llvm/tools/llvm-objdump/llvm-objdump.cpp
+++ b/llvm/tools/llvm-objdump/llvm-objdump.cpp
@@ -947,6 +947,55 @@ public:
};
AArch64PrettyPrinter AArch64PrettyPrinterInst;
+class RISCVPrettyPrinter : public PrettyPrinter {
+public:
+ void printInst(MCInstPrinter &IP, const MCInst *MI, ArrayRef<uint8_t> Bytes,
+ object::SectionedAddress Address, formatted_raw_ostream &OS,
+ StringRef Annot, MCSubtargetInfo const &STI, SourcePrinter *SP,
+ StringRef ObjectFilename, std::vector<RelocationRef> *Rels,
+ LiveVariablePrinter &LVP) override {
+ if (SP && (PrintSource || PrintLines))
+ SP->printSourceLine(OS, Address, ObjectFilename, LVP);
+ LVP.printBetweenInsts(OS, false);
+
+ size_t Start = OS.tell();
+ if (LeadingAddr)
+ OS << format("%8" PRIx64 ":", Address.Address);
+ if (ShowRawInsn) {
+ size_t Pos = 0, End = Bytes.size();
+ if (End % 4 == 0) {
+ // 32-bit and 64-bit instructions.
+ for (; Pos + 4 <= End; Pos += 4)
+ OS << ' '
+ << format_hex_no_prefix(
+ llvm::support::endian::read<uint32_t>(
+ Bytes.data() + Pos, llvm::endianness::little),
+ 8);
+ } else if (End % 2 == 0) {
+ // 16-bit and 48-bit instructions.
+ for (; Pos + 2 <= End; Pos += 2)
+ OS << ' '
+ << format_hex_no_prefix(
+ llvm::support::endian::read<uint16_t>(
+ Bytes.data() + Pos, llvm::endianness::little),
+ 4);
+ }
+ if (Pos < End) {
+ OS << ' ';
+ dumpBytes(Bytes.slice(Pos), OS);
+ }
+ }
+
+ AlignToInstStartColumn(Start, STI, OS);
+
+ if (MI) {
+ IP.printInst(MI, Address.Address, "", STI, OS);
+ } else
+ OS << "\t<unknown>";
+ }
+};
+RISCVPrettyPrinter RISCVPrettyPrinterInst;
+
PrettyPrinter &selectPrettyPrinter(Triple const &Triple) {
switch(Triple.getArch()) {
default:
@@ -967,6 +1016,9 @@ PrettyPrinter &selectPrettyPrinter(Triple const &Triple) {
case Triple::aarch64_be:
case Triple::aarch64_32:
return AArch64PrettyPrinterInst;
+ case Triple::riscv32:
+ case Triple::riscv64:
+ return RISCVPrettyPrinterInst;
}
}
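The RISC-V printer groups the raw encoding bytes into instruction-sized parcels and decodes each little-endian: 4-byte words when the total length is word-aligned, otherwise 2-byte halfwords, with single bytes as a fallback. A self-contained sketch of that grouping rule:

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // Read a little-endian value of Width bytes starting at P.
    static uint32_t readLE(const uint8_t *P, size_t Width) {
      uint32_t V = 0;
      for (size_t I = 0; I < Width; ++I)
        V |= static_cast<uint32_t>(P[I]) << (8 * I);
      return V;
    }

    void printRawRISCVBytes(const std::vector<uint8_t> &Bytes) {
      size_t Pos = 0, End = Bytes.size();
      size_t Width = (End % 4 == 0) ? 4 : (End % 2 == 0) ? 2 : 0;
      for (; Width && Pos + Width <= End; Pos += Width)
        std::printf(" %0*x", static_cast<int>(2 * Width),
                    readLE(Bytes.data() + Pos, Width));
      for (; Pos < End; ++Pos) // leftover bytes, one at a time
        std::printf(" %02x", static_cast<unsigned>(Bytes[Pos]));
    }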
diff --git a/llvm/tools/llvm-rc/ResourceFileWriter.cpp b/llvm/tools/llvm-rc/ResourceFileWriter.cpp
index d507525970ec..85b59532bb83 100644
--- a/llvm/tools/llvm-rc/ResourceFileWriter.cpp
+++ b/llvm/tools/llvm-rc/ResourceFileWriter.cpp
@@ -550,6 +550,11 @@ Error ResourceFileWriter::visitVersionStmt(const VersionStmt *Stmt) {
return Error::success();
}
+Error ResourceFileWriter::visitMenuStmt(const MenuStmt *Stmt) {
+ ObjectData.Menu = Stmt->Value;
+ return Error::success();
+}
+
Error ResourceFileWriter::writeResource(
const RCResource *Res,
Error (ResourceFileWriter::*BodyWriter)(const RCResource *)) {
@@ -1132,9 +1137,8 @@ Error ResourceFileWriter::writeDialogBody(const RCResource *Base) {
ulittle16_t(Res->Height)};
writeObject(Middle);
- // MENU field. As of now, we don't keep them in the state and can peacefully
- // think there is no menu attached to the dialog.
- writeInt<uint16_t>(0);
+ // MENU field.
+ RETURN_IF_ERROR(writeIntOrString(ObjectData.Menu));
// Window CLASS field.
RETURN_IF_ERROR(writeIntOrString(ObjectData.Class));
diff --git a/llvm/tools/llvm-rc/ResourceFileWriter.h b/llvm/tools/llvm-rc/ResourceFileWriter.h
index 9413a0eecdac..82d3e3b9e9e8 100644
--- a/llvm/tools/llvm-rc/ResourceFileWriter.h
+++ b/llvm/tools/llvm-rc/ResourceFileWriter.h
@@ -16,6 +16,7 @@
#include "ResourceScriptStmt.h"
#include "ResourceVisitor.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Endian.h"
namespace llvm {
@@ -68,6 +69,7 @@ public:
Error visitLanguageStmt(const LanguageResource *) override;
Error visitStyleStmt(const StyleStmt *) override;
Error visitVersionStmt(const VersionStmt *) override;
+ Error visitMenuStmt(const MenuStmt *) override;
// Stringtables are output at the end of .res file. We need a separate
// function to do it.
@@ -92,10 +94,11 @@ public:
};
std::optional<FontInfo> Font;
IntOrString Class;
+ IntOrString Menu;
ObjectInfo()
: LanguageInfo(0), Characteristics(0), VersionInfo(0),
- Class(StringRef()) {}
+ Class(StringRef()), Menu(StringRef()) {}
} ObjectData;
struct StringTableInfo {
diff --git a/llvm/tools/llvm-rc/ResourceScriptParser.cpp b/llvm/tools/llvm-rc/ResourceScriptParser.cpp
index 4f02fa502d24..69798152c1f2 100644
--- a/llvm/tools/llvm-rc/ResourceScriptParser.cpp
+++ b/llvm/tools/llvm-rc/ResourceScriptParser.cpp
@@ -430,6 +430,8 @@ RCParser::parseSingleOptionalStatement(OptStmtType StmtsType) {
return parseFontStmt(StmtsType);
if (TypeToken->equals_insensitive("STYLE"))
return parseStyleStmt();
+ if (TypeToken->equals_insensitive("MENU"))
+ return parseMenuStmt();
}
return getExpectedError("optional statement type, BEGIN or '{'",
@@ -965,6 +967,11 @@ RCParser::ParseOptionType RCParser::parseExStyleStmt() {
return std::make_unique<ExStyleStmt>(*Arg);
}
+RCParser::ParseOptionType RCParser::parseMenuStmt() {
+ ASSIGN_OR_RETURN(Arg, readIntOrString());
+ return std::make_unique<MenuStmt>(*Arg);
+}
+
Error RCParser::getExpectedError(const Twine &Message, bool IsAlreadyRead) {
return make_error<ParserError>(
Message, IsAlreadyRead ? std::prev(CurLoc) : CurLoc, End);
diff --git a/llvm/tools/llvm-rc/ResourceScriptParser.h b/llvm/tools/llvm-rc/ResourceScriptParser.h
index 603afd8d73fb..aa7f847187c4 100644
--- a/llvm/tools/llvm-rc/ResourceScriptParser.h
+++ b/llvm/tools/llvm-rc/ResourceScriptParser.h
@@ -176,6 +176,7 @@ private:
ParseOptionType parseExStyleStmt();
ParseOptionType parseFontStmt(OptStmtType DialogType);
ParseOptionType parseStyleStmt();
+ ParseOptionType parseMenuStmt();
// Raises an error. If IsAlreadyRead = false (default), this complains about
// the token that couldn't be parsed. If the flag is on, this complains about
diff --git a/llvm/tools/llvm-rc/ResourceScriptStmt.cpp b/llvm/tools/llvm-rc/ResourceScriptStmt.cpp
index 62df7999252f..a7f3df0863e7 100644
--- a/llvm/tools/llvm-rc/ResourceScriptStmt.cpp
+++ b/llvm/tools/llvm-rc/ResourceScriptStmt.cpp
@@ -309,5 +309,9 @@ raw_ostream &ExStyleStmt::log(raw_ostream &OS) const {
return OS << "ExStyle: " << Value << "\n";
}
+raw_ostream &MenuStmt::log(raw_ostream &OS) const {
+ return OS << "Menu: " << Value << "\n";
+}
+
} // namespace rc
} // namespace llvm
diff --git a/llvm/tools/llvm-rc/ResourceScriptStmt.h b/llvm/tools/llvm-rc/ResourceScriptStmt.h
index 70e7cec9cb84..05865e582859 100644
--- a/llvm/tools/llvm-rc/ResourceScriptStmt.h
+++ b/llvm/tools/llvm-rc/ResourceScriptStmt.h
@@ -993,6 +993,19 @@ public:
Error visit(Visitor *V) const override { return V->visitExStyleStmt(this); }
};
+// MENU optional statement.
+//
+// Ref: https://learn.microsoft.com/en-us/windows/win32/menurc/menu-statement
+class MenuStmt : public OptionalStmt {
+public:
+ IntOrString Value;
+
+ MenuStmt(IntOrString NameOrId) : Value(NameOrId) {}
+ raw_ostream &log(raw_ostream &) const override;
+ Twine getResourceTypeName() const override { return "MENU"; }
+ Error visit(Visitor *V) const override { return V->visitMenuStmt(this); }
+};
+
// CLASS optional statement.
//
// Ref: msdn.microsoft.com/en-us/library/windows/desktop/aa380883(v=vs.85).aspx
diff --git a/llvm/tools/llvm-rc/ResourceVisitor.h b/llvm/tools/llvm-rc/ResourceVisitor.h
index a950cd7555ec..a121a0a507c2 100644
--- a/llvm/tools/llvm-rc/ResourceVisitor.h
+++ b/llvm/tools/llvm-rc/ResourceVisitor.h
@@ -28,6 +28,7 @@ class FontStmt;
class LanguageResource;
class StyleStmt;
class VersionStmt;
+class MenuStmt;
class Visitor {
public:
@@ -52,6 +53,7 @@ public:
virtual Error visitLanguageStmt(const LanguageResource *) = 0;
virtual Error visitStyleStmt(const StyleStmt *) = 0;
virtual Error visitVersionStmt(const VersionStmt *) = 0;
+ virtual Error visitMenuStmt(const MenuStmt *) = 0;
virtual ~Visitor() {}
};
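Wiring a new optional statement through llvm-rc touches four places, as the hunks above show: a statement class, a parser hook, a Visitor method, and writer state. A stripped-down sketch of that double-dispatch shape (all class and member names below are illustrative, not llvm-rc's):

    #include <iostream>
    #include <memory>
    #include <string>

    struct Visitor;

    struct Stmt {
      virtual ~Stmt() = default;
      virtual void visit(Visitor &V) const = 0;
    };

    struct MenuLikeStmt : Stmt { // plays the role of MenuStmt
      std::string Value;
      explicit MenuLikeStmt(std::string V) : Value(std::move(V)) {}
      void visit(Visitor &V) const override;
    };

    struct Visitor {
      std::string Menu; // writer state, like ObjectInfo's new Menu field
      virtual ~Visitor() = default;
      virtual void visitMenuLikeStmt(const MenuLikeStmt &S) { Menu = S.Value; }
    };

    void MenuLikeStmt::visit(Visitor &V) const { V.visitMenuLikeStmt(*this); }

    int main() {
      Visitor Writer;
      std::unique_ptr<Stmt> S = std::make_unique<MenuLikeStmt>("IDM_MAIN");
      S->visit(Writer); // double dispatch lands in visitMenuLikeStmt
      std::cout << Writer.Menu << '\n'; // prints IDM_MAIN
    }

The dialog writer can then emit the stored value where it previously hard-coded an empty MENU field.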
diff --git a/llvm/unittests/ADT/StringRefTest.cpp b/llvm/unittests/ADT/StringRefTest.cpp
index 8df71e8ad033..fa537e816fc8 100644
--- a/llvm/unittests/ADT/StringRefTest.cpp
+++ b/llvm/unittests/ADT/StringRefTest.cpp
@@ -368,6 +368,8 @@ TEST(StringRefTest, StartsWith) {
EXPECT_TRUE(Str.starts_with("he"));
EXPECT_FALSE(Str.starts_with("helloworld"));
EXPECT_FALSE(Str.starts_with("hi"));
+ EXPECT_TRUE(Str.starts_with('h'));
+ EXPECT_FALSE(Str.starts_with('i'));
}
TEST(StringRefTest, StartsWithInsensitive) {
@@ -421,6 +423,8 @@ TEST(StringRefTest, EndsWith) {
EXPECT_FALSE(Str.ends_with("helloworld"));
EXPECT_FALSE(Str.ends_with("worldhello"));
EXPECT_FALSE(Str.ends_with("so"));
+ EXPECT_TRUE(Str.ends_with('o'));
+ EXPECT_FALSE(Str.ends_with('p'));
}
TEST(StringRefTest, EndsWithInsensitive) {
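The new single-character overloads avoid constructing a one-character StringRef at each call site. std::string_view gained the same overloads in C++20, so the behavior can be sketched portably:

    #include <cassert>
    #include <string_view>

    int main() {
      std::string_view Str = "hello";
      assert(Str.starts_with('h'));  // character form, no temporary string
      assert(!Str.starts_with('i'));
      assert(Str.ends_with('o'));
      assert(!Str.ends_with('p'));
    }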
diff --git a/llvm/unittests/BinaryFormat/DwarfTest.cpp b/llvm/unittests/BinaryFormat/DwarfTest.cpp
index 2fff8657939b..684e59fa2785 100644
--- a/llvm/unittests/BinaryFormat/DwarfTest.cpp
+++ b/llvm/unittests/BinaryFormat/DwarfTest.cpp
@@ -204,4 +204,19 @@ TEST(DwarfTest, format_provider) {
EXPECT_EQ("DW_OP_lit0", formatv("{0}", DW_OP_lit0).str());
EXPECT_EQ("DW_OP_unknown_ff", formatv("{0}", DW_OP_hi_user).str());
}
+
+TEST(DwarfTest, lname) {
+ auto roundtrip = [](llvm::dwarf::SourceLanguage sl) {
+ auto name_version = toDW_LNAME(sl);
+ // Ignore ones without a defined mapping.
+ if (sl == DW_LANG_Mips_Assembler || sl == DW_LANG_GOOGLE_RenderScript ||
+ !name_version.has_value())
+ return sl;
+ return dwarf::toDW_LANG(name_version->first, name_version->second)
+ .value_or(sl);
+ };
+#define HANDLE_DW_LANG(ID, NAME, LOWER_BOUND, VERSION, VENDOR) \
+ EXPECT_EQ(roundtrip(DW_LANG_##NAME), DW_LANG_##NAME);
+#include "llvm/BinaryFormat/Dwarf.def"
+}
} // end namespace
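The roundtrip test relies on the .def X-macro idiom: it defines HANDLE_DW_LANG and re-includes Dwarf.def, so one EXPECT_EQ is stamped out per language constant. A minimal self-contained rendition of the idiom, with the list inlined instead of #included:

    #include <cassert>

    // Stands in for the .def file; each X(NAME, ID) row is one table entry.
    #define FOR_EACH_LANG(X) X(C, 1) X(Cpp, 2) X(Rust, 3)

    enum Lang {
    #define HANDLE_LANG(NAME, ID) Lang_##NAME = ID,
      FOR_EACH_LANG(HANDLE_LANG)
    #undef HANDLE_LANG
    };

    constexpr int roundtrip(int Id) { return Id; } // stand-in for a real mapping

    int main() {
      // One assertion per entry, just as the test expands Dwarf.def.
    #define HANDLE_LANG(NAME, ID) assert(roundtrip(Lang_##NAME) == Lang_##NAME);
      FOR_EACH_LANG(HANDLE_LANG)
    #undef HANDLE_LANG
    }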
diff --git a/llvm/unittests/CodeGen/GlobalISel/KnownBitsTest.cpp b/llvm/unittests/CodeGen/GlobalISel/KnownBitsTest.cpp
index ef80eed8d180..34a36ba68d7c 100644
--- a/llvm/unittests/CodeGen/GlobalISel/KnownBitsTest.cpp
+++ b/llvm/unittests/CodeGen/GlobalISel/KnownBitsTest.cpp
@@ -745,6 +745,120 @@ TEST_F(AArch64GISelMITest, TestNumSignBitsConstant) {
EXPECT_EQ(3u, Info.computeNumSignBits(CopyRegNeg32));
}
+TEST_F(AArch64GISelMITest, TestNumSignBitsXOR) {
+ StringRef MIRString = " %c1:_(s8) = G_CONSTANT i8 1\n"
+ " %cn1:_(s8) = G_CONSTANT i8 -1\n"
+ " %c127:_(s8) = G_CONSTANT i8 127\n"
+ " %c32:_(s8) = G_CONSTANT i8 32\n"
+ " %cn32:_(s8) = G_CONSTANT i8 -32\n"
+
+ " %xor1:_(s8) = G_XOR %c1, %cn1\n"
+ " %Copy1:_(s8) = COPY %xor1\n"
+
+ " %xor2:_(s8) = G_XOR %c1, %c32\n"
+ " %Copy2:_(s8) = COPY %xor2\n"
+
+ " %xor3:_(s8) = G_XOR %c32, %c127\n"
+ " %Copy3:_(s8) = COPY %xor3\n"
+
+ " %xor4:_(s8) = G_XOR %cn32, %c127\n"
+ " %Copy4:_(s8) = COPY %xor4\n"
+
+ " %xor5:_(s8) = G_XOR %c127, %cn32\n"
+ " %Copy5:_(s8) = COPY %xor5\n";
+ setUp(MIRString);
+ if (!TM)
+ GTEST_SKIP();
+ Register Copy1 = Copies[Copies.size() - 5];
+ Register Copy2 = Copies[Copies.size() - 4];
+ Register Copy3 = Copies[Copies.size() - 3];
+ Register Copy4 = Copies[Copies.size() - 2];
+ Register Copy5 = Copies[Copies.size() - 1];
+
+ GISelKnownBits Info(*MF);
+ EXPECT_EQ(7u, Info.computeNumSignBits(Copy1));
+ EXPECT_EQ(2u, Info.computeNumSignBits(Copy2));
+ EXPECT_EQ(1u, Info.computeNumSignBits(Copy3));
+ EXPECT_EQ(1u, Info.computeNumSignBits(Copy4));
+ EXPECT_EQ(1u, Info.computeNumSignBits(Copy5));
+}
+
+TEST_F(AArch64GISelMITest, TestNumSignBitsOR) {
+ StringRef MIRString = " %c1:_(s8) = G_CONSTANT i8 1\n"
+ " %cn1:_(s8) = G_CONSTANT i8 -1\n"
+ " %c127:_(s8) = G_CONSTANT i8 127\n"
+ " %c32:_(s8) = G_CONSTANT i8 32\n"
+ " %cn32:_(s8) = G_CONSTANT i8 -32\n"
+
+ " %or1:_(s8) = G_OR %c1, %cn1\n"
+ " %Copy1:_(s8) = COPY %or1\n"
+
+ " %or2:_(s8) = G_OR %c1, %c32\n"
+ " %Copy2:_(s8) = COPY %or2\n"
+
+ " %or3:_(s8) = G_OR %c32, %c127\n"
+ " %Copy3:_(s8) = COPY %or3\n"
+
+ " %or4:_(s8) = G_OR %cn32, %c127\n"
+ " %Copy4:_(s8) = COPY %or4\n"
+
+ " %or5:_(s8) = G_OR %c127, %cn32\n"
+ " %Copy5:_(s8) = COPY %or5\n";
+ setUp(MIRString);
+ if (!TM)
+ GTEST_SKIP();
+ Register Copy1 = Copies[Copies.size() - 5];
+ Register Copy2 = Copies[Copies.size() - 4];
+ Register Copy3 = Copies[Copies.size() - 3];
+ Register Copy4 = Copies[Copies.size() - 2];
+ Register Copy5 = Copies[Copies.size() - 1];
+
+ GISelKnownBits Info(*MF);
+ EXPECT_EQ(8u, Info.computeNumSignBits(Copy1));
+ EXPECT_EQ(2u, Info.computeNumSignBits(Copy2));
+ EXPECT_EQ(1u, Info.computeNumSignBits(Copy3));
+ EXPECT_EQ(8u, Info.computeNumSignBits(Copy4));
+ EXPECT_EQ(8u, Info.computeNumSignBits(Copy5));
+}
+
+TEST_F(AArch64GISelMITest, TestNumSignBitsAND) {
+ StringRef MIRString = " %c1:_(s8) = G_CONSTANT i8 1\n"
+ " %cn1:_(s8) = G_CONSTANT i8 -1\n"
+ " %c127:_(s8) = G_CONSTANT i8 127\n"
+ " %c32:_(s8) = G_CONSTANT i8 32\n"
+ " %cn32:_(s8) = G_CONSTANT i8 -32\n"
+
+ " %and1:_(s8) = G_AND %c1, %cn1\n"
+ " %Copy1:_(s8) = COPY %and1\n"
+
+ " %and2:_(s8) = G_AND %c1, %c32\n"
+ " %Copy2:_(s8) = COPY %and2\n"
+
+ " %and3:_(s8) = G_AND %c32, %c127\n"
+ " %Copy3:_(s8) = COPY %and3\n"
+
+ " %and4:_(s8) = G_AND %cn32, %c127\n"
+ " %Copy4:_(s8) = COPY %and4\n"
+
+ " %and5:_(s8) = G_AND %c127, %cn32\n"
+ " %Copy5:_(s8) = COPY %and5\n";
+ setUp(MIRString);
+ if (!TM)
+ GTEST_SKIP();
+ Register Copy1 = Copies[Copies.size() - 5];
+ Register Copy2 = Copies[Copies.size() - 4];
+ Register Copy3 = Copies[Copies.size() - 3];
+ Register Copy4 = Copies[Copies.size() - 2];
+ Register Copy5 = Copies[Copies.size() - 1];
+
+ GISelKnownBits Info(*MF);
+ EXPECT_EQ(7u, Info.computeNumSignBits(Copy1));
+ EXPECT_EQ(8u, Info.computeNumSignBits(Copy2));
+ EXPECT_EQ(2u, Info.computeNumSignBits(Copy3));
+ EXPECT_EQ(1u, Info.computeNumSignBits(Copy4));
+ EXPECT_EQ(1u, Info.computeNumSignBits(Copy5));
+}
+
TEST_F(AArch64GISelMITest, TestNumSignBitsSext) {
StringRef MIRString = " %3:_(p0) = G_IMPLICIT_DEF\n"
" %4:_(s8) = G_LOAD %3 :: (load (s8))\n"
diff --git a/llvm/unittests/Object/ELFObjectFileTest.cpp b/llvm/unittests/Object/ELFObjectFileTest.cpp
index c4d2b4ae8b9a..c13dc0e3fab8 100644
--- a/llvm/unittests/Object/ELFObjectFileTest.cpp
+++ b/llvm/unittests/Object/ELFObjectFileTest.cpp
@@ -1504,3 +1504,46 @@ Sections:
"SHT_RELA section with index 1: failed to get a "
"relocated section: invalid section index: 255");
}
+
+TEST(ELFObjectFileTest, ELFSymbolRefLess) {
+ SmallString<0> Storage;
+ Expected<ELFObjectFile<ELF64LE>> ElfOrErr = toBinary<ELF64LE>(Storage, R"(
+--- !ELF
+FileHeader:
+ Class: ELFCLASS64
+ Data: ELFDATA2LSB
+ Type: ET_DYN
+ Machine: EM_X86_64
+)");
+
+ ASSERT_THAT_EXPECTED(ElfOrErr, Succeeded());
+ const ELFObjectFile<ELF64LE> &Obj = *ElfOrErr;
+
+ const uint32_t ValLow = 0x00000001;
+ const uint32_t ValHigh = 0x00000100;
+
+ auto MakeSymbol = [&Obj](size_t SymtabIndex, size_t SymbolIndex) {
+ DataRefImpl Data;
+ Data.d.a = SymtabIndex;
+ Data.d.b = SymbolIndex;
+ SymbolRef Sym(Data, &Obj);
+ return ELFSymbolRef(Sym);
+ };
+
+ ELFSymbolRef ELFSymLowLow = MakeSymbol(ValLow, ValLow);
+ ELFSymbolRef ELFSymLowHigh = MakeSymbol(ValLow, ValHigh);
+ ELFSymbolRef ELFSymHighLow = MakeSymbol(ValHigh, ValLow);
+ ELFSymbolRef ELFSymHighHigh = MakeSymbol(ValHigh, ValHigh);
+
+ EXPECT_TRUE(ELFSymLowLow < ELFSymLowHigh);
+ EXPECT_FALSE(ELFSymLowHigh < ELFSymLowLow);
+ EXPECT_FALSE(ELFSymLowLow < ELFSymLowLow);
+
+ EXPECT_TRUE(ELFSymLowLow < ELFSymHighHigh);
+ EXPECT_TRUE(ELFSymLowHigh < ELFSymHighLow);
+ EXPECT_TRUE(ELFSymLowLow < ELFSymHighLow);
+
+ EXPECT_FALSE(ELFSymHighLow < ELFSymLowHigh);
+ EXPECT_FALSE(ELFSymHighHigh < ELFSymLowLow);
+ EXPECT_FALSE(ELFSymHighLow < ELFSymLowLow);
+}
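The assertions pin down a lexicographic order on the (d.a, d.b) pair: symbol-table index first, then symbol index within the table. std::tie is the idiomatic way to express such an ordering; a sketch under that assumption:

    #include <cassert>
    #include <cstdint>
    #include <tuple>

    struct SymRef {
      uint32_t SymtabIndex; // plays the role of d.a
      uint32_t SymbolIndex; // plays the role of d.b
    };

    // Lexicographic comparison: the symbol-table index is the primary key.
    bool operator<(const SymRef &L, const SymRef &R) {
      return std::tie(L.SymtabIndex, L.SymbolIndex) <
             std::tie(R.SymtabIndex, R.SymbolIndex);
    }

    int main() {
      assert((SymRef{1, 1} < SymRef{1, 0x100}));     // same table, lower symbol
      assert((SymRef{1, 0x100} < SymRef{0x100, 1})); // table index dominates
      assert(!(SymRef{1, 1} < SymRef{1, 1}));        // irreflexive
    }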
diff --git a/llvm/unittests/ProfileData/InstrProfTest.cpp b/llvm/unittests/ProfileData/InstrProfTest.cpp
index edc427dcbc45..402de64fe99b 100644
--- a/llvm/unittests/ProfileData/InstrProfTest.cpp
+++ b/llvm/unittests/ProfileData/InstrProfTest.cpp
@@ -407,13 +407,13 @@ IndexedMemProfRecord makeRecord(
IndexedMemProfRecord
makeRecordV2(std::initializer_list<::llvm::memprof::CallStackId> AllocFrames,
std::initializer_list<::llvm::memprof::CallStackId> CallSiteFrames,
- const MemInfoBlock &Block) {
+ const MemInfoBlock &Block, const memprof::MemProfSchema &Schema) {
llvm::memprof::IndexedMemProfRecord MR;
for (const auto &CSId : AllocFrames)
// We don't populate IndexedAllocationInfo::CallStack because we use it only
// in Version0 and Version1.
MR.AllocSites.emplace_back(::llvm::SmallVector<memprof::FrameId>(), CSId,
- Block);
+ Block, Schema);
for (const auto &CSId : CallSiteFrames)
MR.CallSiteIds.push_back(CSId);
return MR;
@@ -495,44 +495,6 @@ TEST_F(InstrProfTest, test_memprof_v0) {
EXPECT_THAT(WantRecord, EqualsRecord(Record));
}
-struct CallStackIdConverter {
- std::optional<memprof::FrameId> LastUnmappedFrameId;
- std::optional<memprof::CallStackId> LastUnmappedCSId;
-
- const FrameIdMapTy &IdToFrameMap;
- const CallStackIdMapTy &CSIdToCallStackMap;
-
- CallStackIdConverter() = delete;
- CallStackIdConverter(const FrameIdMapTy &IdToFrameMap,
- const CallStackIdMapTy &CSIdToCallStackMap)
- : IdToFrameMap(IdToFrameMap), CSIdToCallStackMap(CSIdToCallStackMap) {}
-
- llvm::SmallVector<memprof::Frame>
- operator()(::llvm::memprof::CallStackId CSId) {
- auto IdToFrameCallback = [&](const memprof::FrameId Id) {
- auto Iter = IdToFrameMap.find(Id);
- if (Iter == IdToFrameMap.end()) {
- LastUnmappedFrameId = Id;
- return memprof::Frame(0, 0, 0, false);
- }
- return Iter->second;
- };
-
- llvm::SmallVector<memprof::Frame> Frames;
- auto CSIter = CSIdToCallStackMap.find(CSId);
- if (CSIter == CSIdToCallStackMap.end()) {
- LastUnmappedCSId = CSId;
- } else {
- const ::llvm::SmallVector<::llvm::memprof::FrameId> &CS =
- CSIter->getSecond();
- Frames.reserve(CS.size());
- for (::llvm::memprof::FrameId Id : CS)
- Frames.push_back(IdToFrameCallback(Id));
- }
- return Frames;
- }
-};
-
TEST_F(InstrProfTest, test_memprof_v2_full_schema) {
const MemInfoBlock MIB = makeFullMIB();
@@ -544,7 +506,7 @@ TEST_F(InstrProfTest, test_memprof_v2_full_schema) {
const IndexedMemProfRecord IndexedMR = makeRecordV2(
/*AllocFrames=*/{0x111, 0x222},
- /*CallSiteFrames=*/{0x333}, MIB);
+ /*CallSiteFrames=*/{0x333}, MIB, memprof::getFullSchema());
const FrameIdMapTy IdToFrameMap = getFrameMapping();
const auto CSIdToCallStackMap = getCallStackMapping();
for (const auto &I : IdToFrameMap) {
@@ -562,14 +524,16 @@ TEST_F(InstrProfTest, test_memprof_v2_full_schema) {
ASSERT_THAT_ERROR(RecordOr.takeError(), Succeeded());
const memprof::MemProfRecord &Record = RecordOr.get();
- CallStackIdConverter CSIdConv(IdToFrameMap, CSIdToCallStackMap);
+ memprof::FrameIdConverter<decltype(IdToFrameMap)> FrameIdConv(IdToFrameMap);
+ memprof::CallStackIdConverter<decltype(CSIdToCallStackMap)> CSIdConv(
+ CSIdToCallStackMap, FrameIdConv);
const ::llvm::memprof::MemProfRecord WantRecord =
IndexedMR.toMemProfRecord(CSIdConv);
- ASSERT_EQ(CSIdConv.LastUnmappedFrameId, std::nullopt)
- << "could not map frame id: " << *CSIdConv.LastUnmappedFrameId;
- ASSERT_EQ(CSIdConv.LastUnmappedCSId, std::nullopt)
- << "could not map call stack id: " << *CSIdConv.LastUnmappedCSId;
+ ASSERT_EQ(FrameIdConv.LastUnmappedId, std::nullopt)
+ << "could not map frame id: " << *FrameIdConv.LastUnmappedId;
+ ASSERT_EQ(CSIdConv.LastUnmappedId, std::nullopt)
+ << "could not map call stack id: " << *CSIdConv.LastUnmappedId;
EXPECT_THAT(WantRecord, EqualsRecord(Record));
}
@@ -584,7 +548,7 @@ TEST_F(InstrProfTest, test_memprof_v2_partial_schema) {
const IndexedMemProfRecord IndexedMR = makeRecordV2(
/*AllocFrames=*/{0x111, 0x222},
- /*CallSiteFrames=*/{0x333}, MIB);
+ /*CallSiteFrames=*/{0x333}, MIB, memprof::getHotColdSchema());
const FrameIdMapTy IdToFrameMap = getFrameMapping();
const auto CSIdToCallStackMap = getCallStackMapping();
for (const auto &I : IdToFrameMap) {
@@ -602,14 +566,16 @@ TEST_F(InstrProfTest, test_memprof_v2_partial_schema) {
ASSERT_THAT_ERROR(RecordOr.takeError(), Succeeded());
const memprof::MemProfRecord &Record = RecordOr.get();
- CallStackIdConverter CSIdConv(IdToFrameMap, CSIdToCallStackMap);
+ memprof::FrameIdConverter<decltype(IdToFrameMap)> FrameIdConv(IdToFrameMap);
+ memprof::CallStackIdConverter<decltype(CSIdToCallStackMap)> CSIdConv(
+ CSIdToCallStackMap, FrameIdConv);
const ::llvm::memprof::MemProfRecord WantRecord =
IndexedMR.toMemProfRecord(CSIdConv);
- ASSERT_EQ(CSIdConv.LastUnmappedFrameId, std::nullopt)
- << "could not map frame id: " << *CSIdConv.LastUnmappedFrameId;
- ASSERT_EQ(CSIdConv.LastUnmappedCSId, std::nullopt)
- << "could not map call stack id: " << *CSIdConv.LastUnmappedCSId;
+ ASSERT_EQ(FrameIdConv.LastUnmappedId, std::nullopt)
+ << "could not map frame id: " << *FrameIdConv.LastUnmappedId;
+ ASSERT_EQ(CSIdConv.LastUnmappedId, std::nullopt)
+ << "could not map call stack id: " << *CSIdConv.LastUnmappedId;
EXPECT_THAT(WantRecord, EqualsRecord(Record));
}
diff --git a/llvm/unittests/ProfileData/MemProfTest.cpp b/llvm/unittests/ProfileData/MemProfTest.cpp
index 98dacd3511e1..40335d191ba7 100644
--- a/llvm/unittests/ProfileData/MemProfTest.cpp
+++ b/llvm/unittests/ProfileData/MemProfTest.cpp
@@ -1,6 +1,7 @@
#include "llvm/ProfileData/MemProf.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/STLForwardCompat.h"
#include "llvm/DebugInfo/DIContext.h"
#include "llvm/DebugInfo/Symbolize/SymbolizableModule.h"
#include "llvm/IR/Value.h"
@@ -241,7 +242,7 @@ TEST(MemProf, PortableWrapper) {
/*dealloc_cpu=*/4);
const auto Schema = llvm::memprof::getFullSchema();
- PortableMemInfoBlock WriteBlock(Info);
+ PortableMemInfoBlock WriteBlock(Info, Schema);
std::string Buffer;
llvm::raw_string_ostream OS(Buffer);
@@ -326,6 +327,65 @@ TEST(MemProf, RecordSerializationRoundTripVerion2) {
EXPECT_EQ(Record, GotRecord);
}
+TEST(MemProf, RecordSerializationRoundTripVersion2HotColdSchema) {
+ const auto Schema = llvm::memprof::getHotColdSchema();
+
+ MemInfoBlock Info;
+ Info.AllocCount = 11;
+ Info.TotalSize = 22;
+ Info.TotalLifetime = 33;
+ Info.TotalLifetimeAccessDensity = 44;
+
+ llvm::SmallVector<llvm::memprof::CallStackId> CallStackIds = {0x123, 0x456};
+
+ llvm::SmallVector<llvm::memprof::CallStackId> CallSiteIds = {0x333, 0x444};
+
+ IndexedMemProfRecord Record;
+ for (const auto &CSId : CallStackIds) {
+ // Use the same info block for both allocation sites.
+ Record.AllocSites.emplace_back(llvm::SmallVector<FrameId>(), CSId, Info,
+ Schema);
+ }
+ Record.CallSiteIds.assign(CallSiteIds);
+
+ std::bitset<llvm::to_underlying(Meta::Size)> SchemaBitSet;
+ for (auto Id : Schema)
+ SchemaBitSet.set(llvm::to_underlying(Id));
+
+ // Verify that SchemaBitSet has the fields we expect and nothing else, which
+ // we check with count().
+ EXPECT_EQ(SchemaBitSet.count(), 4U);
+ EXPECT_TRUE(SchemaBitSet[llvm::to_underlying(Meta::AllocCount)]);
+ EXPECT_TRUE(SchemaBitSet[llvm::to_underlying(Meta::TotalSize)]);
+ EXPECT_TRUE(SchemaBitSet[llvm::to_underlying(Meta::TotalLifetime)]);
+ EXPECT_TRUE(
+ SchemaBitSet[llvm::to_underlying(Meta::TotalLifetimeAccessDensity)]);
+
+ // Verify that Schema has propagated all the way to the Info field in each
+ // IndexedAllocationInfo.
+ ASSERT_THAT(Record.AllocSites, ::SizeIs(2));
+ EXPECT_EQ(Record.AllocSites[0].Info.getSchema(), SchemaBitSet);
+ EXPECT_EQ(Record.AllocSites[1].Info.getSchema(), SchemaBitSet);
+
+ std::string Buffer;
+ llvm::raw_string_ostream OS(Buffer);
+ Record.serialize(Schema, OS, llvm::memprof::Version2);
+ OS.flush();
+
+ const IndexedMemProfRecord GotRecord = IndexedMemProfRecord::deserialize(
+ Schema, reinterpret_cast<const unsigned char *>(Buffer.data()),
+ llvm::memprof::Version2);
+
+ // Verify that Schema comes back correctly after deserialization. Technically,
+ // the comparison between Record and GotRecord below includes the comparison
+ // of their Schemas, but we'll verify the Schemas on our own.
+ ASSERT_THAT(GotRecord.AllocSites, ::SizeIs(2));
+ EXPECT_EQ(GotRecord.AllocSites[0].Info.getSchema(), SchemaBitSet);
+ EXPECT_EQ(GotRecord.AllocSites[1].Info.getSchema(), SchemaBitSet);
+
+ EXPECT_EQ(Record, GotRecord);
+}
+
TEST(MemProf, SymbolizationFilter) {
std::unique_ptr<MockSymbolizer> Symbolizer(new MockSymbolizer());
@@ -502,37 +562,15 @@ TEST(MemProf, IndexedMemProfRecordToMemProfRecord) {
IndexedRecord.CallSiteIds.push_back(llvm::memprof::hashCallStack(CS3));
IndexedRecord.CallSiteIds.push_back(llvm::memprof::hashCallStack(CS4));
- bool CSIdMissing = false;
- bool FrameIdMissing = false;
-
- auto Callback = [&](CallStackId CSId) -> llvm::SmallVector<Frame> {
- llvm::SmallVector<Frame> CallStack;
- llvm::SmallVector<FrameId> FrameIds;
-
- auto Iter = CallStackIdMap.find(CSId);
- if (Iter == CallStackIdMap.end())
- CSIdMissing = true;
- else
- FrameIds = Iter->second;
-
- for (FrameId Id : FrameIds) {
- Frame F(0, 0, 0, false);
- auto Iter = FrameIdMap.find(Id);
- if (Iter == FrameIdMap.end())
- FrameIdMissing = true;
- else
- F = Iter->second;
- CallStack.push_back(F);
- }
-
- return CallStack;
- };
+ llvm::memprof::FrameIdConverter<decltype(FrameIdMap)> FrameIdConv(FrameIdMap);
+ llvm::memprof::CallStackIdConverter<decltype(CallStackIdMap)> CSIdConv(
+ CallStackIdMap, FrameIdConv);
- MemProfRecord Record = IndexedRecord.toMemProfRecord(Callback);
+ MemProfRecord Record = IndexedRecord.toMemProfRecord(CSIdConv);
// Make sure that all lookups are successful.
- ASSERT_FALSE(CSIdMissing);
- ASSERT_FALSE(FrameIdMissing);
+ ASSERT_EQ(FrameIdConv.LastUnmappedId, std::nullopt);
+ ASSERT_EQ(CSIdConv.LastUnmappedId, std::nullopt);
// Verify the contents of Record.
ASSERT_THAT(Record.AllocSites, SizeIs(2));
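The refactor in both test files swaps hand-rolled lambdas for two composable functors from MemProf.h: FrameIdConverter maps a FrameId to a Frame and records the last miss, and CallStackIdConverter maps a CallStackId to a vector of Frames through it. A sketch of that composition with standard containers (the real classes are templated over the map types; these names are simplified):

    #include <cstdint>
    #include <map>
    #include <optional>
    #include <vector>

    using FrameId = uint64_t;
    using CallStackId = uint64_t;
    struct Frame { int Line; };

    struct FrameConv {
      const std::map<FrameId, Frame> &Map;
      std::optional<FrameId> LastUnmappedId;
      Frame operator()(FrameId Id) {
        auto It = Map.find(Id);
        if (It == Map.end()) {
          LastUnmappedId = Id; // remember the miss instead of failing
          return Frame{0};
        }
        return It->second;
      }
    };

    struct CallStackConv {
      const std::map<CallStackId, std::vector<FrameId>> &Map;
      FrameConv &Frames; // the composed per-frame converter
      std::optional<CallStackId> LastUnmappedId;
      std::vector<Frame> operator()(CallStackId CSId) {
        std::vector<Frame> Out;
        auto It = Map.find(CSId);
        if (It == Map.end()) {
          LastUnmappedId = CSId;
          return Out;
        }
        for (FrameId Id : It->second)
          Out.push_back(Frames(Id)); // delegate each frame lookup
        return Out;
      }
    };

    int main() {
      std::map<FrameId, Frame> FMap{{1, {10}}, {2, {20}}};
      std::map<CallStackId, std::vector<FrameId>> CSMap{{7, {1, 2}}};
      FrameConv FC{FMap, std::nullopt};
      CallStackConv CC{CSMap, FC, std::nullopt};
      auto Stack = CC(7);
      return (Stack.size() == 2 && !FC.LastUnmappedId && !CC.LastUnmappedId)
                 ? 0 : 1;
    }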
diff --git a/llvm/unittests/Support/YAMLIOTest.cpp b/llvm/unittests/Support/YAMLIOTest.cpp
index 401981f3841e..6ac0d1b412f0 100644
--- a/llvm/unittests/Support/YAMLIOTest.cpp
+++ b/llvm/unittests/Support/YAMLIOTest.cpp
@@ -2906,6 +2906,87 @@ TEST(YAMLIO, Numeric) {
}
//===----------------------------------------------------------------------===//
+// Test writing and reading escaped keys
+//===----------------------------------------------------------------------===//
+
+// Struct whose keys exercise YAML key-quoting rules
+struct QuotedKeyStruct {
+ int unquoted_bool;
+ int unquoted_null;
+ int unquoted_numeric;
+ int unquoted_str;
+ int colon;
+ int just_space;
+ int unprintable;
+};
+
+namespace llvm {
+namespace yaml {
+template <> struct MappingTraits<QuotedKeyStruct> {
+ static void mapping(IO &io, QuotedKeyStruct &map) {
+ io.mapRequired("true", map.unquoted_bool);
+ io.mapRequired("null", map.unquoted_null);
+ io.mapRequired("42", map.unquoted_numeric);
+ io.mapRequired("unquoted", map.unquoted_str);
+ io.mapRequired(":", map.colon);
+ io.mapRequired(" ", map.just_space);
+ char unprintableKey[] = {/* \f, form-feed */ 0xC, 0};
+ io.mapRequired(unprintableKey, map.unprintable);
+ }
+};
+} // namespace yaml
+} // namespace llvm
+
+TEST(YAMLIO, TestQuotedKeyRead) {
+ QuotedKeyStruct map = {};
+ Input yin("---\ntrue: 1\nnull: 2\n42: 3\nunquoted: 4\n':': 5\n' ': "
+ "6\n\"\\f\": 7\n...\n");
+ yin >> map;
+
+ EXPECT_FALSE(yin.error());
+ EXPECT_EQ(map.unquoted_bool, 1);
+ EXPECT_EQ(map.unquoted_null, 2);
+ EXPECT_EQ(map.unquoted_numeric, 3);
+ EXPECT_EQ(map.unquoted_str, 4);
+ EXPECT_EQ(map.colon, 5);
+ EXPECT_EQ(map.just_space, 6);
+ EXPECT_EQ(map.unprintable, 7);
+}
+
+TEST(YAMLIO, TestQuotedKeyWriteRead) {
+ std::string intermediate;
+ {
+ QuotedKeyStruct map = {1, 2, 3, 4, 5, 6, 7};
+ llvm::raw_string_ostream ostr(intermediate);
+ Output yout(ostr);
+ yout << map;
+ }
+
+ EXPECT_NE(std::string::npos, intermediate.find("true:"));
+ EXPECT_NE(std::string::npos, intermediate.find("null:"));
+ EXPECT_NE(std::string::npos, intermediate.find("42:"));
+ EXPECT_NE(std::string::npos, intermediate.find("unquoted:"));
+ EXPECT_NE(std::string::npos, intermediate.find("':':"));
+ EXPECT_NE(std::string::npos, intermediate.find("' '"));
+ EXPECT_NE(std::string::npos, intermediate.find("\"\\f\":"));
+
+ {
+ Input yin(intermediate);
+ QuotedKeyStruct map;
+ yin >> map;
+
+ EXPECT_FALSE(yin.error());
+ EXPECT_EQ(map.unquoted_bool, 1);
+ EXPECT_EQ(map.unquoted_null, 2);
+ EXPECT_EQ(map.unquoted_numeric, 3);
+ EXPECT_EQ(map.unquoted_str, 4);
+ EXPECT_EQ(map.colon, 5);
+ EXPECT_EQ(map.just_space, 6);
+ EXPECT_EQ(map.unprintable, 7);
+ }
+}
+
+//===----------------------------------------------------------------------===//
// Test PolymorphicTraits and TaggedScalarTraits
//===----------------------------------------------------------------------===//
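Going by the field names, keywords and numerics stay unquoted because YAML I/O always reads mapping keys back as strings, while keys containing flow punctuation or whitespace get single quotes and unprintable characters force a double-quoted escape. A rough predicate for that decision (a deliberate simplification of the real quoting logic in YAMLTraits):

    #include <cassert>
    #include <cctype>
    #include <string>

    enum class Quoting { None, Single, Double };

    Quoting keyQuoting(const std::string &Key) {
      Quoting Q = Quoting::None;
      for (unsigned char C : Key) {
        if (!std::isprint(C))
          return Quoting::Double; // e.g. "\f" must be escaped
        if (C == ':' || std::isspace(C))
          Q = Quoting::Single;
      }
      return Q;
    }

    int main() {
      assert(keyQuoting("true") == Quoting::None); // emitted as true:
      assert(keyQuoting("42") == Quoting::None);   // emitted as 42:
      assert(keyQuoting(":") == Quoting::Single);  // emitted as ':':
      assert(keyQuoting(" ") == Quoting::Single);  // emitted as ' ':
      assert(keyQuoting("\f") == Quoting::Double); // emitted as "\f":
    }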
diff --git a/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp b/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp
index 81b7e2e527d9..9f23000d733d 100644
--- a/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp
+++ b/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp
@@ -120,10 +120,14 @@ TEST(ParseArchString, RejectsInvalidBaseISA) {
EXPECT_EQ(toString(RISCVISAInfo::parseArchString(Input, true).takeError()),
"string must begin with rv32{i,e,g} or rv64{i,e,g}");
}
- for (StringRef Input : {"rv32j", "rv64k", "rv32_i"}) {
+
+ for (StringRef Input : {"rv32j", "rv32_i"}) {
EXPECT_EQ(toString(RISCVISAInfo::parseArchString(Input, true).takeError()),
- "first letter should be 'e', 'i' or 'g'");
+ "first letter after 'rv32' should be 'e', 'i' or 'g'");
}
+
+ EXPECT_EQ(toString(RISCVISAInfo::parseArchString("rv64k", true).takeError()),
+ "first letter after 'rv64' should be 'e', 'i' or 'g'");
}
TEST(ParseArchString, RejectsUnsupportedBaseISA) {
@@ -137,7 +141,7 @@ TEST(ParseArchString, AcceptsSupportedBaseISAsAndSetsXLenAndFLen) {
auto MaybeRV32I = RISCVISAInfo::parseArchString("rv32i", true);
ASSERT_THAT_EXPECTED(MaybeRV32I, Succeeded());
RISCVISAInfo &InfoRV32I = **MaybeRV32I;
- RISCVISAInfo::OrderedExtensionMap ExtsRV32I = InfoRV32I.getExtensions();
+ const auto &ExtsRV32I = InfoRV32I.getExtensions();
EXPECT_EQ(ExtsRV32I.size(), 1UL);
EXPECT_TRUE(ExtsRV32I.at("i") == (RISCVISAUtils::ExtensionVersion{2, 1}));
EXPECT_EQ(InfoRV32I.getXLen(), 32U);
@@ -146,7 +150,7 @@ TEST(ParseArchString, AcceptsSupportedBaseISAsAndSetsXLenAndFLen) {
auto MaybeRV32E = RISCVISAInfo::parseArchString("rv32e", true);
ASSERT_THAT_EXPECTED(MaybeRV32E, Succeeded());
RISCVISAInfo &InfoRV32E = **MaybeRV32E;
- RISCVISAInfo::OrderedExtensionMap ExtsRV32E = InfoRV32E.getExtensions();
+ const auto &ExtsRV32E = InfoRV32E.getExtensions();
EXPECT_EQ(ExtsRV32E.size(), 1UL);
EXPECT_TRUE(ExtsRV32E.at("e") == (RISCVISAUtils::ExtensionVersion{2, 0}));
EXPECT_EQ(InfoRV32E.getXLen(), 32U);
@@ -155,7 +159,7 @@ TEST(ParseArchString, AcceptsSupportedBaseISAsAndSetsXLenAndFLen) {
auto MaybeRV32G = RISCVISAInfo::parseArchString("rv32g", true);
ASSERT_THAT_EXPECTED(MaybeRV32G, Succeeded());
RISCVISAInfo &InfoRV32G = **MaybeRV32G;
- RISCVISAInfo::OrderedExtensionMap ExtsRV32G = InfoRV32G.getExtensions();
+ const auto &ExtsRV32G = InfoRV32G.getExtensions();
EXPECT_EQ(ExtsRV32G.size(), 7UL);
EXPECT_TRUE(ExtsRV32G.at("i") == (RISCVISAUtils::ExtensionVersion{2, 1}));
EXPECT_TRUE(ExtsRV32G.at("m") == (RISCVISAUtils::ExtensionVersion{2, 0}));
@@ -171,7 +175,7 @@ TEST(ParseArchString, AcceptsSupportedBaseISAsAndSetsXLenAndFLen) {
auto MaybeRV64I = RISCVISAInfo::parseArchString("rv64i", true);
ASSERT_THAT_EXPECTED(MaybeRV64I, Succeeded());
RISCVISAInfo &InfoRV64I = **MaybeRV64I;
- RISCVISAInfo::OrderedExtensionMap ExtsRV64I = InfoRV64I.getExtensions();
+ const auto &ExtsRV64I = InfoRV64I.getExtensions();
EXPECT_EQ(ExtsRV64I.size(), 1UL);
EXPECT_TRUE(ExtsRV64I.at("i") == (RISCVISAUtils::ExtensionVersion{2, 1}));
EXPECT_EQ(InfoRV64I.getXLen(), 64U);
@@ -180,7 +184,7 @@ TEST(ParseArchString, AcceptsSupportedBaseISAsAndSetsXLenAndFLen) {
auto MaybeRV64E = RISCVISAInfo::parseArchString("rv64e", true);
ASSERT_THAT_EXPECTED(MaybeRV64E, Succeeded());
RISCVISAInfo &InfoRV64E = **MaybeRV64E;
- RISCVISAInfo::OrderedExtensionMap ExtsRV64E = InfoRV64E.getExtensions();
+ const auto &ExtsRV64E = InfoRV64E.getExtensions();
EXPECT_EQ(ExtsRV64E.size(), 1UL);
EXPECT_TRUE(ExtsRV64E.at("e") == (RISCVISAUtils::ExtensionVersion{2, 0}));
EXPECT_EQ(InfoRV64E.getXLen(), 64U);
@@ -189,7 +193,7 @@ TEST(ParseArchString, AcceptsSupportedBaseISAsAndSetsXLenAndFLen) {
auto MaybeRV64G = RISCVISAInfo::parseArchString("rv64g", true);
ASSERT_THAT_EXPECTED(MaybeRV64G, Succeeded());
RISCVISAInfo &InfoRV64G = **MaybeRV64G;
- RISCVISAInfo::OrderedExtensionMap ExtsRV64G = InfoRV64G.getExtensions();
+ const auto &ExtsRV64G = InfoRV64G.getExtensions();
EXPECT_EQ(ExtsRV64G.size(), 7UL);
EXPECT_TRUE(ExtsRV64G.at("i") == (RISCVISAUtils::ExtensionVersion{2, 1}));
EXPECT_TRUE(ExtsRV64G.at("m") == (RISCVISAUtils::ExtensionVersion{2, 0}));
@@ -241,7 +245,7 @@ TEST(ParseArchString, IgnoresUnrecognizedExtensionNamesWithIgnoreUnknown) {
auto MaybeISAInfo = RISCVISAInfo::parseArchString(Input, true, false, true);
ASSERT_THAT_EXPECTED(MaybeISAInfo, Succeeded());
RISCVISAInfo &Info = **MaybeISAInfo;
- RISCVISAInfo::OrderedExtensionMap Exts = Info.getExtensions();
+ const auto &Exts = Info.getExtensions();
EXPECT_EQ(Exts.size(), 1UL);
EXPECT_TRUE(Exts.at("i") == (RISCVISAUtils::ExtensionVersion{2, 1}));
}
@@ -251,7 +255,7 @@ TEST(ParseArchString, IgnoresUnrecognizedExtensionNamesWithIgnoreUnknown) {
auto MaybeISAInfo =
RISCVISAInfo::parseArchString("rv32i_zbc1p0_xmadeup", true, false, true);
ASSERT_THAT_EXPECTED(MaybeISAInfo, Succeeded());
- RISCVISAInfo::OrderedExtensionMap Exts = (*MaybeISAInfo)->getExtensions();
+ const auto &Exts = (*MaybeISAInfo)->getExtensions();
EXPECT_TRUE(Exts.at("zbc") == (RISCVISAUtils::ExtensionVersion{1, 0}));
}
@@ -259,13 +263,13 @@ TEST(ParseArchString, AcceptsVersionInLongOrShortForm) {
for (StringRef Input : {"rv64i2p1"}) {
auto MaybeISAInfo = RISCVISAInfo::parseArchString(Input, true);
ASSERT_THAT_EXPECTED(MaybeISAInfo, Succeeded());
- RISCVISAInfo::OrderedExtensionMap Exts = (*MaybeISAInfo)->getExtensions();
+ const auto &Exts = (*MaybeISAInfo)->getExtensions();
EXPECT_TRUE(Exts.at("i") == (RISCVISAUtils::ExtensionVersion{2, 1}));
}
for (StringRef Input : {"rv32i_zfinx1", "rv32i_zfinx1p0"}) {
auto MaybeISAInfo = RISCVISAInfo::parseArchString(Input, true);
ASSERT_THAT_EXPECTED(MaybeISAInfo, Succeeded());
- RISCVISAInfo::OrderedExtensionMap Exts = (*MaybeISAInfo)->getExtensions();
+ const auto &Exts = (*MaybeISAInfo)->getExtensions();
EXPECT_TRUE(Exts.at("zfinx") == (RISCVISAUtils::ExtensionVersion{1, 0}));
}
}
@@ -293,14 +297,14 @@ TEST(ParseArchString,
for (StringRef Input : {"rv32i0p1", "rv32i99p99", "rv64i0p1", "rv64i99p99"}) {
auto MaybeISAInfo = RISCVISAInfo::parseArchString(Input, true, false, true);
ASSERT_THAT_EXPECTED(MaybeISAInfo, Succeeded());
- RISCVISAInfo::OrderedExtensionMap Exts = (*MaybeISAInfo)->getExtensions();
+ const auto &Exts = (*MaybeISAInfo)->getExtensions();
EXPECT_EQ(Exts.size(), 1UL);
EXPECT_TRUE(Exts.at("i") == (RISCVISAUtils::ExtensionVersion{2, 1}));
}
for (StringRef Input : {"rv32e0p1", "rv32e99p99", "rv64e0p1", "rv64e99p99"}) {
auto MaybeISAInfo = RISCVISAInfo::parseArchString(Input, true, false, true);
ASSERT_THAT_EXPECTED(MaybeISAInfo, Succeeded());
- RISCVISAInfo::OrderedExtensionMap Exts = (*MaybeISAInfo)->getExtensions();
+ const auto &Exts = (*MaybeISAInfo)->getExtensions();
EXPECT_EQ(Exts.size(), 1UL);
EXPECT_TRUE(Exts.at("e") == (RISCVISAUtils::ExtensionVersion{2, 0}));
}
@@ -311,7 +315,7 @@ TEST(ParseArchString,
for (StringRef Input : {"rv32im1p1", "rv64i_svnapot10p9", "rv32i_zicsr0p5"}) {
auto MaybeISAInfo = RISCVISAInfo::parseArchString(Input, true, false, true);
ASSERT_THAT_EXPECTED(MaybeISAInfo, Succeeded());
- RISCVISAInfo::OrderedExtensionMap Exts = (*MaybeISAInfo)->getExtensions();
+ const auto &Exts = (*MaybeISAInfo)->getExtensions();
EXPECT_EQ(Exts.size(), 1UL);
EXPECT_TRUE(Exts.at("i") == (RISCVISAUtils::ExtensionVersion{2, 1}));
}
@@ -321,7 +325,7 @@ TEST(ParseArchString, AcceptsUnderscoreSplittingExtensions) {
for (StringRef Input : {"rv32imafdczifencei", "rv32i_m_a_f_d_c_zifencei"}) {
auto MaybeISAInfo = RISCVISAInfo::parseArchString(Input, true);
ASSERT_THAT_EXPECTED(MaybeISAInfo, Succeeded());
- RISCVISAInfo::OrderedExtensionMap Exts = (*MaybeISAInfo)->getExtensions();
+ const auto &Exts = (*MaybeISAInfo)->getExtensions();
EXPECT_EQ(Exts.size(), 8UL);
EXPECT_EQ(Exts.count("i"), 1U);
EXPECT_EQ(Exts.count("m"), 1U);
@@ -339,7 +343,7 @@ TEST(ParseArchString, AcceptsRelaxSingleLetterExtensions) {
{"rv32imfad", "rv32im_fa_d", "rv32im2p0fad", "rv32i2p1m2p0fad"}) {
auto MaybeISAInfo = RISCVISAInfo::parseArchString(Input, true);
ASSERT_THAT_EXPECTED(MaybeISAInfo, Succeeded());
- RISCVISAInfo::OrderedExtensionMap Exts = (*MaybeISAInfo)->getExtensions();
+ const auto &Exts = (*MaybeISAInfo)->getExtensions();
EXPECT_EQ(Exts.size(), 6UL);
EXPECT_EQ(Exts.count("i"), 1U);
EXPECT_EQ(Exts.count("m"), 1U);
@@ -356,7 +360,7 @@ TEST(ParseArchString, AcceptsRelaxMixedLetterExtensions) {
"rv32i_zihintntl_mafd_svinval"}) {
auto MaybeISAInfo = RISCVISAInfo::parseArchString(Input, true);
ASSERT_THAT_EXPECTED(MaybeISAInfo, Succeeded());
- RISCVISAInfo::OrderedExtensionMap Exts = (*MaybeISAInfo)->getExtensions();
+ const auto &Exts = (*MaybeISAInfo)->getExtensions();
EXPECT_EQ(Exts.size(), 8UL);
EXPECT_EQ(Exts.count("i"), 1U);
EXPECT_EQ(Exts.count("m"), 1U);
@@ -373,7 +377,7 @@ TEST(ParseArchString, AcceptsAmbiguousFromRelaxExtensions) {
for (StringRef Input : {"rv32i_zba_m", "rv32izba_m", "rv32izba1p0_m2p0"}) {
auto MaybeISAInfo = RISCVISAInfo::parseArchString(Input, true);
ASSERT_THAT_EXPECTED(MaybeISAInfo, Succeeded());
- RISCVISAInfo::OrderedExtensionMap Exts = (*MaybeISAInfo)->getExtensions();
+ const auto &Exts = (*MaybeISAInfo)->getExtensions();
EXPECT_EQ(Exts.size(), 3UL);
EXPECT_EQ(Exts.count("i"), 1U);
EXPECT_EQ(Exts.count("zba"), 1U);
@@ -383,7 +387,7 @@ TEST(ParseArchString, AcceptsAmbiguousFromRelaxExtensions) {
{"rv32ia_zba_m", "rv32iazba_m", "rv32ia2p1zba1p0_m2p0"}) {
auto MaybeISAInfo = RISCVISAInfo::parseArchString(Input, true);
ASSERT_THAT_EXPECTED(MaybeISAInfo, Succeeded());
- RISCVISAInfo::OrderedExtensionMap Exts = (*MaybeISAInfo)->getExtensions();
+ const auto &Exts = (*MaybeISAInfo)->getExtensions();
EXPECT_EQ(Exts.size(), 4UL);
EXPECT_EQ(Exts.count("i"), 1U);
EXPECT_EQ(Exts.count("zba"), 1U);
@@ -395,7 +399,7 @@ TEST(ParseArchString, AcceptsAmbiguousFromRelaxExtensions) {
TEST(ParseArchString, RejectsRelaxExtensionsNotStartWithEorIorG) {
EXPECT_EQ(
toString(RISCVISAInfo::parseArchString("rv32zba_im", true).takeError()),
- "first letter should be 'e', 'i' or 'g'");
+ "first letter after 'rv32' should be 'e', 'i' or 'g'");
}
TEST(ParseArchString,
@@ -457,12 +461,12 @@ TEST(ParseArchString,
// hopefully serve as a reminder to update.
auto MaybeISAInfo = RISCVISAInfo::parseArchString("rv64iztso", true, false);
ASSERT_THAT_EXPECTED(MaybeISAInfo, Succeeded());
- RISCVISAInfo::OrderedExtensionMap Exts = (*MaybeISAInfo)->getExtensions();
+ const auto &Exts = (*MaybeISAInfo)->getExtensions();
EXPECT_EQ(Exts.size(), 2UL);
EXPECT_EQ(Exts.count("ztso"), 1U);
auto MaybeISAInfo2 = RISCVISAInfo::parseArchString("rv64iztso0p1", true);
ASSERT_THAT_EXPECTED(MaybeISAInfo2, Succeeded());
- RISCVISAInfo::OrderedExtensionMap Exts2 = (*MaybeISAInfo2)->getExtensions();
+ const auto &Exts2 = (*MaybeISAInfo2)->getExtensions();
EXPECT_EQ(Exts2.size(), 2UL);
EXPECT_EQ(Exts2.count("ztso"), 1U);
}
@@ -479,7 +483,7 @@ TEST(ParseArchString,
auto MaybeISAInfo =
RISCVISAInfo::parseArchString("rv64iztso9p9", true, false);
ASSERT_THAT_EXPECTED(MaybeISAInfo, Succeeded());
- RISCVISAInfo::OrderedExtensionMap Exts = (*MaybeISAInfo)->getExtensions();
+ const auto &Exts = (*MaybeISAInfo)->getExtensions();
EXPECT_EQ(Exts.size(), 2UL);
EXPECT_TRUE(Exts.at("ztso") == (RISCVISAUtils::ExtensionVersion{9, 9}));
}
@@ -502,8 +506,7 @@ TEST(ParseArchString, AddsImpliedExtensions) {
// Does not attempt to exhaustively test all implications.
auto MaybeRV64ID = RISCVISAInfo::parseArchString("rv64id", true);
ASSERT_THAT_EXPECTED(MaybeRV64ID, Succeeded());
- RISCVISAInfo::OrderedExtensionMap ExtsRV64ID =
- (*MaybeRV64ID)->getExtensions();
+ const auto &ExtsRV64ID = (*MaybeRV64ID)->getExtensions();
EXPECT_EQ(ExtsRV64ID.size(), 4UL);
EXPECT_EQ(ExtsRV64ID.count("i"), 1U);
EXPECT_EQ(ExtsRV64ID.count("f"), 1U);
@@ -512,8 +515,7 @@ TEST(ParseArchString, AddsImpliedExtensions) {
auto MaybeRV32IZKN = RISCVISAInfo::parseArchString("rv64izkn", true);
ASSERT_THAT_EXPECTED(MaybeRV32IZKN, Succeeded());
- RISCVISAInfo::OrderedExtensionMap ExtsRV32IZKN =
- (*MaybeRV32IZKN)->getExtensions();
+ const auto &ExtsRV32IZKN = (*MaybeRV32IZKN)->getExtensions();
EXPECT_EQ(ExtsRV32IZKN.size(), 8UL);
EXPECT_EQ(ExtsRV32IZKN.count("i"), 1U);
EXPECT_EQ(ExtsRV32IZKN.count("zbkb"), 1U);
@@ -603,7 +605,7 @@ TEST(ToFeatures, AddAllExtensionsAddsNegativeExtensions) {
}
TEST(OrderedExtensionMap, ExtensionsAreCorrectlyOrdered) {
- RISCVISAInfo::OrderedExtensionMap Exts;
+ RISCVISAUtils::OrderedExtensionMap Exts;
for (auto ExtName : {"y", "l", "m", "c", "i", "xfoo", "xbar", "sfoo", "sbar",
"zmfoo", "zzfoo", "zfinx", "zicsr"})
Exts[ExtName] = {1, 0};
@@ -621,8 +623,7 @@ TEST(OrderedExtensionMap, ExtensionsAreCorrectlyOrdered) {
TEST(ParseArchString, ZceImplication) {
auto MaybeRV32IZce = RISCVISAInfo::parseArchString("rv32izce", true);
ASSERT_THAT_EXPECTED(MaybeRV32IZce, Succeeded());
- RISCVISAInfo::OrderedExtensionMap ExtsRV32IZce =
- (*MaybeRV32IZce)->getExtensions();
+ const auto &ExtsRV32IZce = (*MaybeRV32IZce)->getExtensions();
EXPECT_EQ(ExtsRV32IZce.size(), 7UL);
EXPECT_EQ(ExtsRV32IZce.count("i"), 1U);
EXPECT_EQ(ExtsRV32IZce.count("zicsr"), 1U);
@@ -634,8 +635,7 @@ TEST(ParseArchString, ZceImplication) {
auto MaybeRV32IFZce = RISCVISAInfo::parseArchString("rv32ifzce", true);
ASSERT_THAT_EXPECTED(MaybeRV32IFZce, Succeeded());
- RISCVISAInfo::OrderedExtensionMap ExtsRV32IFZce =
- (*MaybeRV32IFZce)->getExtensions();
+ const auto &ExtsRV32IFZce = (*MaybeRV32IFZce)->getExtensions();
EXPECT_EQ(ExtsRV32IFZce.size(), 9UL);
EXPECT_EQ(ExtsRV32IFZce.count("i"), 1U);
EXPECT_EQ(ExtsRV32IFZce.count("zicsr"), 1U);
@@ -649,8 +649,7 @@ TEST(ParseArchString, ZceImplication) {
auto MaybeRV32IDZce = RISCVISAInfo::parseArchString("rv32idzce", true);
ASSERT_THAT_EXPECTED(MaybeRV32IDZce, Succeeded());
- RISCVISAInfo::OrderedExtensionMap ExtsRV32IDZce =
- (*MaybeRV32IDZce)->getExtensions();
+ const auto &ExtsRV32IDZce = (*MaybeRV32IDZce)->getExtensions();
EXPECT_EQ(ExtsRV32IDZce.size(), 10UL);
EXPECT_EQ(ExtsRV32IDZce.count("i"), 1U);
EXPECT_EQ(ExtsRV32IDZce.count("zicsr"), 1U);
@@ -665,8 +664,7 @@ TEST(ParseArchString, ZceImplication) {
auto MaybeRV64IZce = RISCVISAInfo::parseArchString("rv64izce", true);
ASSERT_THAT_EXPECTED(MaybeRV64IZce, Succeeded());
- RISCVISAInfo::OrderedExtensionMap ExtsRV64IZce =
- (*MaybeRV64IZce)->getExtensions();
+ const auto &ExtsRV64IZce = (*MaybeRV64IZce)->getExtensions();
EXPECT_EQ(ExtsRV64IZce.size(), 7UL);
EXPECT_EQ(ExtsRV64IZce.count("i"), 1U);
EXPECT_EQ(ExtsRV64IZce.count("zicsr"), 1U);
@@ -678,8 +676,7 @@ TEST(ParseArchString, ZceImplication) {
auto MaybeRV64IFZce = RISCVISAInfo::parseArchString("rv64ifzce", true);
ASSERT_THAT_EXPECTED(MaybeRV64IFZce, Succeeded());
- RISCVISAInfo::OrderedExtensionMap ExtsRV64IFZce =
- (*MaybeRV64IFZce)->getExtensions();
+ const auto &ExtsRV64IFZce = (*MaybeRV64IFZce)->getExtensions();
EXPECT_EQ(ExtsRV64IFZce.size(), 8UL);
EXPECT_EQ(ExtsRV64IFZce.count("i"), 1U);
EXPECT_EQ(ExtsRV64IFZce.count("zicsr"), 1U);
@@ -698,8 +695,7 @@ TEST(ParseArchString, ZceImplication) {
auto MaybeRV64IDZce = RISCVISAInfo::parseArchString("rv64idzce", true);
ASSERT_THAT_EXPECTED(MaybeRV64IDZce, Succeeded());
- RISCVISAInfo::OrderedExtensionMap ExtsRV64IDZce =
- (*MaybeRV64IDZce)->getExtensions();
+ const auto &ExtsRV64IDZce = (*MaybeRV64IDZce)->getExtensions();
EXPECT_EQ(ExtsRV64IDZce.size(), 9UL);
EXPECT_EQ(ExtsRV64IDZce.count("i"), 1U);
EXPECT_EQ(ExtsRV64IDZce.count("zicsr"), 1U);
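The repeated change through this file is purely about avoiding copies: the old code copied the whole ordered extension map at every call site, while binding a const reference reads it in place. In miniature:

    #include <map>

    static const std::map<int, int> TheMap = {{1, 2}, {3, 4}};
    const std::map<int, int> &getExtensions() { return TheMap; }

    int main() {
      std::map<int, int> Copy = getExtensions(); // deep-copies every node
      const auto &Ref = getExtensions();         // binds in place, no copy
      (void)Copy;
      (void)Ref;
    }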
diff --git a/llvm/unittests/TargetParser/TargetParserTest.cpp b/llvm/unittests/TargetParser/TargetParserTest.cpp
index 2c72a7229b52..75e235008b4f 100644
--- a/llvm/unittests/TargetParser/TargetParserTest.cpp
+++ b/llvm/unittests/TargetParser/TargetParserTest.cpp
@@ -1346,6 +1346,44 @@ INSTANTIATE_TEST_SUITE_P(
AArch64::AEK_PAUTH}),
"9-A"),
ARMCPUTestParams<AArch64::ExtensionBitset>(
+ "neoverse-v3", "armv9.2-a", "neon-fp-armv8",
+ AArch64::ExtensionBitset(
+ {AArch64::AEK_BF16, AArch64::AEK_I8MM,
+ AArch64::AEK_SVE, AArch64::AEK_SVE2,
+ AArch64::AEK_FP16, AArch64::AEK_DOTPROD,
+ AArch64::AEK_LSE, AArch64::AEK_RDM,
+ AArch64::AEK_SIMD, AArch64::AEK_RCPC,
+ AArch64::AEK_RAS, AArch64::AEK_CRC,
+ AArch64::AEK_FP, AArch64::AEK_PROFILE,
+ AArch64::AEK_MTE, AArch64::AEK_SSBS,
+ AArch64::AEK_SB, AArch64::AEK_PREDRES,
+ AArch64::AEK_LS64, AArch64::AEK_BRBE,
+ AArch64::AEK_PAUTH, AArch64::AEK_FLAGM,
+ AArch64::AEK_PERFMON, AArch64::AEK_RAND,
+ AArch64::AEK_SVE2BITPERM, AArch64::AEK_FP16FML,
+ AArch64::AEK_PROFILE, AArch64::AEK_JSCVT,
+ AArch64::AEK_FCMA}),
+ "9.2-A"),
+ ARMCPUTestParams<AArch64::ExtensionBitset>(
+ "neoverse-v3ae", "armv9.2-a", "neon-fp-armv8",
+ AArch64::ExtensionBitset(
+ {AArch64::AEK_BF16, AArch64::AEK_I8MM,
+ AArch64::AEK_SVE, AArch64::AEK_SVE2,
+ AArch64::AEK_FP16, AArch64::AEK_DOTPROD,
+ AArch64::AEK_LSE, AArch64::AEK_RDM,
+ AArch64::AEK_SIMD, AArch64::AEK_RCPC,
+ AArch64::AEK_RAS, AArch64::AEK_CRC,
+ AArch64::AEK_FP, AArch64::AEK_PROFILE,
+ AArch64::AEK_MTE, AArch64::AEK_SSBS,
+ AArch64::AEK_SB, AArch64::AEK_PREDRES,
+ AArch64::AEK_LS64, AArch64::AEK_BRBE,
+ AArch64::AEK_PAUTH, AArch64::AEK_FLAGM,
+ AArch64::AEK_PERFMON, AArch64::AEK_RAND,
+ AArch64::AEK_SVE2BITPERM, AArch64::AEK_FP16FML,
+ AArch64::AEK_PROFILE, AArch64::AEK_JSCVT,
+ AArch64::AEK_FCMA}),
+ "9.2-A"),
+ ARMCPUTestParams<AArch64::ExtensionBitset>(
"cortex-r82", "armv8-r", "crypto-neon-fp-armv8",
AArch64::ExtensionBitset(
{AArch64::AEK_CRC, AArch64::AEK_RDM, AArch64::AEK_SSBS,
@@ -1637,6 +1675,24 @@ INSTANTIATE_TEST_SUITE_P(
AArch64::AEK_FP16FML}),
"9-A"),
ARMCPUTestParams<AArch64::ExtensionBitset>(
+ "neoverse-n3", "armv9.2-a", "neon-fp-armv8",
+ AArch64::ExtensionBitset(
+ {AArch64::AEK_BF16, AArch64::AEK_I8MM,
+ AArch64::AEK_SVE, AArch64::AEK_SVE2,
+ AArch64::AEK_FP16, AArch64::AEK_DOTPROD,
+ AArch64::AEK_LSE, AArch64::AEK_RDM,
+ AArch64::AEK_SIMD, AArch64::AEK_RCPC,
+ AArch64::AEK_RAS, AArch64::AEK_CRC,
+ AArch64::AEK_FP, AArch64::AEK_PROFILE,
+ AArch64::AEK_MTE, AArch64::AEK_SSBS,
+ AArch64::AEK_SB, AArch64::AEK_PREDRES,
+ AArch64::AEK_FCMA, AArch64::AEK_PAUTH,
+ AArch64::AEK_FLAGM, AArch64::AEK_PERFMON,
+ AArch64::AEK_RAND, AArch64::AEK_SVE2BITPERM,
+ AArch64::AEK_FP16FML, AArch64::AEK_PROFILE,
+ AArch64::AEK_JSCVT}),
+ "9.2-A"),
+ ARMCPUTestParams<AArch64::ExtensionBitset>(
"ampere1", "armv8.6-a", "crypto-neon-fp-armv8",
AArch64::ExtensionBitset(
{AArch64::AEK_CRC, AArch64::AEK_FP, AArch64::AEK_FP16,
@@ -1750,7 +1806,7 @@ INSTANTIATE_TEST_SUITE_P(
ARMCPUTestParams<AArch64::ExtensionBitset>::PrintToStringParamName);
// Note: number of CPUs includes aliases.
-static constexpr unsigned NumAArch64CPUArchs = 72;
+static constexpr unsigned NumAArch64CPUArchs = 75;
TEST(TargetParserTest, testAArch64CPUArchList) {
SmallVector<StringRef, NumAArch64CPUArchs> List;
diff --git a/llvm/utils/TableGen/DAGISelMatcherEmitter.cpp b/llvm/utils/TableGen/DAGISelMatcherEmitter.cpp
index dcecac4380ce..ff508d648733 100644
--- a/llvm/utils/TableGen/DAGISelMatcherEmitter.cpp
+++ b/llvm/utils/TableGen/DAGISelMatcherEmitter.cpp
@@ -1352,7 +1352,7 @@ void llvm::EmitMatcherTable(Matcher *TheMatcher, const CodeGenDAGPatterns &CGP,
MatcherEmitter.EmitHistogram(TheMatcher, OS);
OS << " #undef TARGET_VAL\n";
- OS << " SelectCodeCommon(N, MatcherTable,sizeof(MatcherTable));\n";
+ OS << " SelectCodeCommon(N, MatcherTable, sizeof(MatcherTable));\n";
OS << "}\n";
EndEmitFunction(OS);
diff --git a/llvm/utils/TableGen/DXILEmitter.cpp b/llvm/utils/TableGen/DXILEmitter.cpp
index f2504775d557..0439df8067ed 100644
--- a/llvm/utils/TableGen/DXILEmitter.cpp
+++ b/llvm/utils/TableGen/DXILEmitter.cpp
@@ -97,7 +97,7 @@ static ParameterKind getParameterKind(const Record *R) {
if (R->getValueAsInt("isHalfOrFloat") || R->getValueAsInt("isI16OrI32")) {
return ParameterKind::Overload;
}
- LLVM_FALLTHROUGH;
+ [[fallthrough]];
default:
llvm_unreachable("Support for specified DXIL Type not yet implemented");
}
@@ -272,7 +272,7 @@ static std::string getOverloadKindStr(const Record *R) {
return "OverloadKind::I16 | OverloadKind::I32";
}
}
- LLVM_FALLTHROUGH;
+ [[fallthrough]];
default:
llvm_unreachable(
"Support for specified parameter OverloadKind not yet implemented");
diff --git a/llvm/utils/TableGen/GlobalISelEmitter.cpp b/llvm/utils/TableGen/GlobalISelEmitter.cpp
index 78abf80e7aec..0eb258ff89a2 100644
--- a/llvm/utils/TableGen/GlobalISelEmitter.cpp
+++ b/llvm/utils/TableGen/GlobalISelEmitter.cpp
@@ -834,6 +834,11 @@ Expected<InstructionMatcher &> GlobalISelEmitter::createAndImportSelDAGMatcher(
return InsnMatcher;
}
+ if (SrcGIOrNull->TheDef->getName() == "G_FRAME_INDEX") {
+ InsnMatcher.addOperand(OpIdx++, Src.getName(), TempOpIdx);
+ return InsnMatcher;
+ }
+
// Special case because the operand order is changed from setcc. The
// predicate operand needs to be swapped from the last operand to the first
// source.
@@ -1223,6 +1228,10 @@ Expected<action_iterator> GlobalISelEmitter::importExplicitUseRenderer(
if (DstChild.getOperator()->getName() == "timm") {
DstMIBuilder.addRenderer<CopyRenderer>(DstChild.getName());
return InsertPt;
+ }
+ if (DstChild.getOperator()->getName() == "tframeindex") {
+ DstMIBuilder.addRenderer<CopyRenderer>(DstChild.getName());
+ return InsertPt;
} else if (DstChild.getOperator()->getName() == "imm") {
DstMIBuilder.addRenderer<CopyConstantAsImmRenderer>(DstChild.getName());
return InsertPt;
diff --git a/llvm/utils/TableGen/RISCVTargetDefEmitter.cpp b/llvm/utils/TableGen/RISCVTargetDefEmitter.cpp
index 217b531dcfd3..097e1deb3ed1 100644
--- a/llvm/utils/TableGen/RISCVTargetDefEmitter.cpp
+++ b/llvm/utils/TableGen/RISCVTargetDefEmitter.cpp
@@ -43,16 +43,6 @@ static void printExtensionTable(raw_ostream &OS,
OS << "};\n\n";
}
-// Get the extension name from the Record name. This gives the canonical
-// capitalization.
-static StringRef getExtensionNameFromRecordName(const Record *R) {
- StringRef Name = R->getName();
- if (!Name.consume_front("FeatureStdExt"))
- Name.consume_front("FeatureVendor");
-
- return Name;
-}
-
static void emitRISCVExtensions(RecordKeeper &Records, raw_ostream &OS) {
OS << "#ifdef GET_SUPPORTED_EXTENSIONS\n";
OS << "#undef GET_SUPPORTED_EXTENSIONS\n\n";
@@ -71,33 +61,21 @@ static void emitRISCVExtensions(RecordKeeper &Records, raw_ostream &OS) {
OS << "#ifdef GET_IMPLIED_EXTENSIONS\n";
OS << "#undef GET_IMPLIED_EXTENSIONS\n\n";
+ OS << "\nstatic constexpr ImpliedExtsEntry ImpliedExts[] = {\n";
for (Record *Ext : Extensions) {
auto ImpliesList = Ext->getValueAsListOfDefs("Implies");
if (ImpliesList.empty())
continue;
- OS << "static const char *ImpliedExts"
- << getExtensionNameFromRecordName(Ext) << "[] = {";
+ StringRef Name = getExtensionName(Ext);
- ListSeparator LS(", ");
for (auto *ImpliedExt : ImpliesList) {
if (!ImpliedExt->isSubClassOf("RISCVExtension"))
continue;
- OS << LS << '"' << getExtensionName(ImpliedExt) << '"';
+ OS << " { {\"" << Name << "\"}, \"" << getExtensionName(ImpliedExt)
+ << "\"},\n";
}
-
- OS << "};\n";
- }
-
- OS << "\nstatic constexpr ImpliedExtsEntry ImpliedExts[] = {\n";
- for (Record *Ext : Extensions) {
- auto ImpliesList = Ext->getValueAsListOfDefs("Implies");
- if (ImpliesList.empty())
- continue;
-
- OS << " { {\"" << getExtensionName(Ext) << "\"}, {ImpliedExts"
- << getExtensionNameFromRecordName(Ext) << "} },\n";
}
OS << "};\n\n";
@@ -111,15 +89,13 @@ static void emitRISCVExtensions(RecordKeeper &Records, raw_ostream &OS) {
//
// This is almost the same as RISCVFeatures::parseFeatureBits, except that we
// get feature name from feature records instead of feature bits.
-static void printMArch(raw_ostream &OS, const Record &Rec) {
- std::map<std::string, RISCVISAUtils::ExtensionVersion,
- RISCVISAUtils::ExtensionComparator>
- Extensions;
+static void printMArch(raw_ostream &OS, const std::vector<Record *> &Features) {
+ RISCVISAUtils::OrderedExtensionMap Extensions;
unsigned XLen = 0;
// Convert features to FeatureVector.
- for (auto *Feature : Rec.getValueAsListOfDefs("Features")) {
- StringRef FeatureName = Feature->getValueAsString("Name");
+ for (auto *Feature : Features) {
+ StringRef FeatureName = getExtensionName(Feature);
if (Feature->isSubClassOf("RISCVExtension")) {
unsigned Major = Feature->getValueAsInt("MajorVersion");
unsigned Minor = Feature->getValueAsInt("MinorVersion");
@@ -142,6 +118,23 @@ static void printMArch(raw_ostream &OS, const Record &Rec) {
OS << LS << Ext.first << Ext.second.Major << 'p' << Ext.second.Minor;
}
+static void emitRISCVProfiles(RecordKeeper &Records, raw_ostream &OS) {
+ OS << "#ifdef GET_SUPPORTED_PROFILES\n";
+ OS << "#undef GET_SUPPORTED_PROFILES\n\n";
+
+ OS << "static constexpr RISCVProfile SupportedProfiles[] = {\n";
+
+ for (const Record *Rec : Records.getAllDerivedDefinitions("RISCVProfile")) {
+ OS.indent(4) << "{\"" << Rec->getValueAsString("Name") << "\",\"";
+ printMArch(OS, Rec->getValueAsListOfDefs("Implies"));
+ OS << "\"},\n";
+ }
+
+ OS << "};\n\n";
+
+ OS << "#endif // GET_SUPPORTED_PROFILES\n\n";
+}
+
static void emitRISCVProcs(RecordKeeper &RK, raw_ostream &OS) {
OS << "#ifndef PROC\n"
<< "#define PROC(ENUM, NAME, DEFAULT_MARCH, FAST_UNALIGNED_ACCESS)\n"
@@ -149,15 +142,15 @@ static void emitRISCVProcs(RecordKeeper &RK, raw_ostream &OS) {
// Iterate on all definition records.
for (const Record *Rec : RK.getAllDerivedDefinitions("RISCVProcessorModel")) {
- bool FastScalarUnalignedAccess =
- any_of(Rec->getValueAsListOfDefs("Features"), [&](auto &Feature) {
- return Feature->getValueAsString("Name") == "unaligned-scalar-mem";
- });
+ const std::vector<Record *> &Features =
+ Rec->getValueAsListOfDefs("Features");
+ bool FastScalarUnalignedAccess = any_of(Features, [&](auto &Feature) {
+ return Feature->getValueAsString("Name") == "unaligned-scalar-mem";
+ });
- bool FastVectorUnalignedAccess =
- any_of(Rec->getValueAsListOfDefs("Features"), [&](auto &Feature) {
- return Feature->getValueAsString("Name") == "unaligned-vector-mem";
- });
+ bool FastVectorUnalignedAccess = any_of(Features, [&](auto &Feature) {
+ return Feature->getValueAsString("Name") == "unaligned-vector-mem";
+ });
bool FastUnalignedAccess =
FastScalarUnalignedAccess && FastVectorUnalignedAccess;
@@ -169,7 +162,7 @@ static void emitRISCVProcs(RecordKeeper &RK, raw_ostream &OS) {
// Compute MArch from features if we don't specify it.
if (MArch.empty())
- printMArch(OS, *Rec);
+ printMArch(OS, Features);
else
OS << MArch;
OS << "\"}, " << FastUnalignedAccess << ")\n";
@@ -191,6 +184,7 @@ static void emitRISCVProcs(RecordKeeper &RK, raw_ostream &OS) {
static void EmitRISCVTargetDef(RecordKeeper &RK, raw_ostream &OS) {
emitRISCVExtensions(RK, OS);
+ emitRISCVProfiles(RK, OS);
emitRISCVProcs(RK, OS);
}
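For orientation, the new GET_SUPPORTED_PROFILES block presumably expands to a flat table of (profile name, canonical march string) pairs. The field layout and both entries below are assumptions for illustration, not copied from the generated file:

    struct RISCVProfile { // assumed shape of the consumer-side type
      const char *Name;
      const char *MArch;
    };

    static constexpr RISCVProfile SupportedProfiles[] = {
        {"rvi20u64", "rv64i2p1"},                          // illustrative entry
        {"rva20u64", "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0"}, // illustrative entry
    };

    int main() {}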
diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/Hexagon/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/Hexagon/BUILD.gn
index cae491a34331..7e873532b9ab 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Target/Hexagon/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Target/Hexagon/BUILD.gn
@@ -50,6 +50,7 @@ static_library("LLVMHexagonCodeGen") {
"HexagonCommonGEP.cpp",
"HexagonConstExtenders.cpp",
"HexagonConstPropagation.cpp",
+ "HexagonCopyHoisting.cpp",
"HexagonCopyToCombine.cpp",
"HexagonEarlyIfConv.cpp",
"HexagonExpandCondsets.cpp",
diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/WebAssembly/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/WebAssembly/BUILD.gn
index a8d6290f1b99..2ece91331c5d 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Target/WebAssembly/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Target/WebAssembly/BUILD.gn
@@ -37,6 +37,7 @@ static_library("LLVMWebAssemblyCodeGen") {
"WebAssemblyAsmPrinter.cpp",
"WebAssemblyCFGSort.cpp",
"WebAssemblyCFGStackify.cpp",
+ "WebAssemblyCleanCodeAfterTrap.cpp",
"WebAssemblyDebugFixup.cpp",
"WebAssemblyDebugValueManager.cpp",
"WebAssemblyExceptionInfo.cpp",
diff --git a/llvm/utils/release/test-release.sh b/llvm/utils/release/test-release.sh
index 4314b565e11b..050004aa08c4 100755
--- a/llvm/utils/release/test-release.sh
+++ b/llvm/utils/release/test-release.sh
@@ -353,8 +353,7 @@ function build_with_cmake_cache() {
env CC="$c_compiler" CXX="$cxx_compiler" \
cmake -G "$generator" -B $CMakeBuildDir -S $SrcDir/llvm \
-C $SrcDir/clang/cmake/caches/Release.cmake \
- -DCLANG_BOOTSTRAP_PASSTHROUGH="CMAKE_POSITION_INDEPENDENT_CODE;LLVM_LIT_ARGS" \
- -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
+ -DCLANG_BOOTSTRAP_PASSTHROUGH="LLVM_LIT_ARGS" \
-DLLVM_LIT_ARGS="-j $NumJobs $LitVerbose" \
$ExtraConfigureFlags
2>&1 | tee $LogDir/llvm.configure-$Flavor.log
diff --git a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td
index a8235bed6f27..4a9ddafdd177 100644
--- a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td
@@ -88,8 +88,6 @@ def GpuModuleToBinaryPass
4. `fatbinary`, `fatbin`: produces fatbinaries.
}];
let options = [
- Option<"offloadingHandler", "handler", "Attribute", "nullptr",
- "Offloading handler to be attached to the resulting binary op.">,
Option<"toolkitPath", "toolkit", "std::string", [{""}],
"Toolkit path.">,
ListOption<"linkFiles", "l", "std::string",
diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td
index a52cca3c95de..759cbe6c1564 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td
@@ -1060,8 +1060,8 @@ def LLVM_vector_extract
}];
}
-def LLVM_experimental_vector_interleave2
- : LLVM_OneResultIntrOp<"experimental.vector.interleave2",
+def LLVM_vector_interleave2
+ : LLVM_OneResultIntrOp<"vector.interleave2",
/*overloadedResults=*/[0], /*overloadedOperands=*/[],
/*traits=*/[
Pure, AllTypesMatch<["vec1", "vec2"]>,
diff --git a/mlir/include/mlir/Dialect/LLVMIR/VCIXOps.td b/mlir/include/mlir/Dialect/LLVMIR/VCIXOps.td
index 25c1d027768a..27d9a32dd8e0 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/VCIXOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/VCIXOps.td
@@ -78,7 +78,7 @@ def VCIX_BinaryImmOp : VCIX_Op<"v.iv">,
xlen, $opcode, $_location, moduleTranslation);
llvm::Value *immConst = mlir::LLVM::detail::getLLVMConstant(
xlen, $imm, $_location, moduleTranslation);
- VectorType vt = op.getResult().getType().cast<VectorType>();
+ VectorType vt = mlir::cast<VectorType>(op.getResult().getType());
llvm::Value *vl =
createVL(builder, $vl, vt, xlen, $_location, moduleTranslation);
$res = createIntrinsicCall(
@@ -120,7 +120,7 @@ def VCIX_BinaryOp : VCIX_Op<"v.sv">,
} else {
id = llvm::Intrinsic::riscv_sf_vc_v_fv_se;
}
- VectorType vt = op.getResult().getType().cast<VectorType>();
+ VectorType vt = mlir::cast<VectorType>(op.getResult().getType());
llvm::Value *vl =
createVL(builder, $vl, vt, xlen, $_location, moduleTranslation);
$res = createIntrinsicCall(
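
This hunk, like many later ones in this patch (XeGPUOps.td, Value.h, ComplexToStandard.cpp, ...), migrates from the deprecated member-style casts to the free functions. A minimal sketch of the two spellings, assuming only that `ty` is an `mlir::Type`:

```cpp
#include "mlir/IR/BuiltinTypes.h"

void castStyles(mlir::Type ty) {
  // Deprecated member form: ty.cast<VectorType>() / ty.dyn_cast<MemRefType>()
  // Preferred free-function form:
  if (auto memrefTy = mlir::dyn_cast<mlir::MemRefType>(ty))
    (void)memrefTy.getRank(); // dyn_cast yields a null value on mismatch
  auto shapedTy = mlir::cast<mlir::ShapedType>(ty); // cast asserts instead
  (void)shapedTy;
}
```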
diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgEnums.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgEnums.td
index 59f909aed8f6..6b4b073fc672 100644
--- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgEnums.td
+++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgEnums.td
@@ -22,7 +22,14 @@ def UnaryFn : I32EnumAttr<"UnaryFn", "", [
I32EnumAttrCase<"abs", 2>,
I32EnumAttrCase<"ceil", 3>,
I32EnumAttrCase<"floor", 4>,
- I32EnumAttrCase<"negf", 5>
+ I32EnumAttrCase<"negf", 5>,
+ I32EnumAttrCase<"reciprocal", 6>,
+ I32EnumAttrCase<"round", 7>,
+ I32EnumAttrCase<"sqrt", 8>,
+ I32EnumAttrCase<"rsqrt", 9>,
+ I32EnumAttrCase<"square", 10>,
+ I32EnumAttrCase<"tanh", 11>,
+ I32EnumAttrCase<"erf", 12>
]> {
let genSpecializedAttr = 0;
let cppNamespace = "::mlir::linalg";
@@ -36,7 +43,8 @@ def BinaryFn : I32EnumAttr<"BinaryFn", "", [
I32EnumAttrCase<"max_signed", 5>,
I32EnumAttrCase<"min_signed", 6>,
I32EnumAttrCase<"max_unsigned", 7>,
- I32EnumAttrCase<"min_unsigned", 8>
+ I32EnumAttrCase<"min_unsigned", 8>,
+ I32EnumAttrCase<"powf", 9>
]> {
let genSpecializedAttr = 0;
let cppNamespace = "::mlir::linalg";
diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml b/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml
index 1ff6c4086cf3..584bfcd8b59d 100644
--- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml
+++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml
@@ -305,6 +305,251 @@ structured_op: !LinalgStructuredOpConfig
scalar_arg: I
--- !LinalgOpConfig
metadata: !LinalgOpMetadata
+ name: reciprocal
+ cpp_class_name: ReciprocalOp
+ doc: |-
+ Applies reciprocal(x) elementwise.
+
+ No numeric casting is performed on the input operand.
+structured_op: !LinalgStructuredOpConfig
+ args:
+ - !LinalgOperandDefConfig
+ name: I
+ kind: input_tensor
+ type_var: T1
+ shape_map: affine_map<() -> ()>
+ - !LinalgOperandDefConfig
+ name: O
+ kind: output_tensor
+ type_var: T1
+ shape_map: affine_map<() -> ()>
+ indexing_maps: !LinalgIndexingMapsConfig
+ static_indexing_maps:
+ - affine_map<() -> ()>
+ - affine_map<() -> ()>
+ iterator_types: []
+ assignments:
+ - !ScalarAssign
+ arg: O
+ value: !ScalarExpression
+ scalar_fn:
+ kind: unary
+ fn_name: reciprocal
+ operands:
+ - !ScalarExpression
+ scalar_arg: I
+--- !LinalgOpConfig
+metadata: !LinalgOpMetadata
+ name: round
+ cpp_class_name: RoundOp
+ doc: |-
+ Applies round(x) elementwise.
+
+ No numeric casting is performed on the input operand.
+structured_op: !LinalgStructuredOpConfig
+ args:
+ - !LinalgOperandDefConfig
+ name: I
+ kind: input_tensor
+ type_var: T1
+ shape_map: affine_map<() -> ()>
+ - !LinalgOperandDefConfig
+ name: O
+ kind: output_tensor
+ type_var: T1
+ shape_map: affine_map<() -> ()>
+ indexing_maps: !LinalgIndexingMapsConfig
+ static_indexing_maps:
+ - affine_map<() -> ()>
+ - affine_map<() -> ()>
+ iterator_types: []
+ assignments:
+ - !ScalarAssign
+ arg: O
+ value: !ScalarExpression
+ scalar_fn:
+ kind: unary
+ fn_name: round
+ operands:
+ - !ScalarExpression
+ scalar_arg: I
+--- !LinalgOpConfig
+metadata: !LinalgOpMetadata
+ name: sqrt
+ cpp_class_name: SqrtOp
+ doc: |-
+ Applies sqrt(x) elementwise.
+
+ No numeric casting is performed on the input operand.
+structured_op: !LinalgStructuredOpConfig
+ args:
+ - !LinalgOperandDefConfig
+ name: I
+ kind: input_tensor
+ type_var: T1
+ shape_map: affine_map<() -> ()>
+ - !LinalgOperandDefConfig
+ name: O
+ kind: output_tensor
+ type_var: T1
+ shape_map: affine_map<() -> ()>
+ indexing_maps: !LinalgIndexingMapsConfig
+ static_indexing_maps:
+ - affine_map<() -> ()>
+ - affine_map<() -> ()>
+ iterator_types: []
+ assignments:
+ - !ScalarAssign
+ arg: O
+ value: !ScalarExpression
+ scalar_fn:
+ kind: unary
+ fn_name: sqrt
+ operands:
+ - !ScalarExpression
+ scalar_arg: I
+--- !LinalgOpConfig
+metadata: !LinalgOpMetadata
+ name: rsqrt
+ cpp_class_name: RsqrtOp
+ doc: |-
+ Applies rsqrt(x) elementwise.
+
+ No numeric casting is performed on the input operand.
+structured_op: !LinalgStructuredOpConfig
+ args:
+ - !LinalgOperandDefConfig
+ name: I
+ kind: input_tensor
+ type_var: T1
+ shape_map: affine_map<() -> ()>
+ - !LinalgOperandDefConfig
+ name: O
+ kind: output_tensor
+ type_var: T1
+ shape_map: affine_map<() -> ()>
+ indexing_maps: !LinalgIndexingMapsConfig
+ static_indexing_maps:
+ - affine_map<() -> ()>
+ - affine_map<() -> ()>
+ iterator_types: []
+ assignments:
+ - !ScalarAssign
+ arg: O
+ value: !ScalarExpression
+ scalar_fn:
+ kind: unary
+ fn_name: rsqrt
+ operands:
+ - !ScalarExpression
+ scalar_arg: I
+--- !LinalgOpConfig
+metadata: !LinalgOpMetadata
+ name: square
+ cpp_class_name: SquareOp
+ doc: |-
+ Applies square(x) elementwise.
+
+ No numeric casting is performed on the input operand.
+structured_op: !LinalgStructuredOpConfig
+ args:
+ - !LinalgOperandDefConfig
+ name: I
+ kind: input_tensor
+ type_var: T1
+ shape_map: affine_map<() -> ()>
+ - !LinalgOperandDefConfig
+ name: O
+ kind: output_tensor
+ type_var: T1
+ shape_map: affine_map<() -> ()>
+ indexing_maps: !LinalgIndexingMapsConfig
+ static_indexing_maps:
+ - affine_map<() -> ()>
+ - affine_map<() -> ()>
+ iterator_types: []
+ assignments:
+ - !ScalarAssign
+ arg: O
+ value: !ScalarExpression
+ scalar_fn:
+ kind: unary
+ fn_name: square
+ operands:
+ - !ScalarExpression
+ scalar_arg: I
+--- !LinalgOpConfig
+metadata: !LinalgOpMetadata
+ name: tanh
+ cpp_class_name: TanhOp
+ doc: |-
+ Applies tanh(x) elementwise.
+
+ No numeric casting is performed on the input operand.
+structured_op: !LinalgStructuredOpConfig
+ args:
+ - !LinalgOperandDefConfig
+ name: I
+ kind: input_tensor
+ type_var: T1
+ shape_map: affine_map<() -> ()>
+ - !LinalgOperandDefConfig
+ name: O
+ kind: output_tensor
+ type_var: T1
+ shape_map: affine_map<() -> ()>
+ indexing_maps: !LinalgIndexingMapsConfig
+ static_indexing_maps:
+ - affine_map<() -> ()>
+ - affine_map<() -> ()>
+ iterator_types: []
+ assignments:
+ - !ScalarAssign
+ arg: O
+ value: !ScalarExpression
+ scalar_fn:
+ kind: unary
+ fn_name: tanh
+ operands:
+ - !ScalarExpression
+ scalar_arg: I
+--- !LinalgOpConfig
+metadata: !LinalgOpMetadata
+ name: erf
+  cpp_class_name: ErfOp

+ doc: |-
+ Applies erf(x) elementwise.
+
+ No numeric casting is performed on the input operand.
+structured_op: !LinalgStructuredOpConfig
+ args:
+ - !LinalgOperandDefConfig
+ name: I
+ kind: input_tensor
+ type_var: T1
+ shape_map: affine_map<() -> ()>
+ - !LinalgOperandDefConfig
+ name: O
+ kind: output_tensor
+ type_var: T1
+ shape_map: affine_map<() -> ()>
+ indexing_maps: !LinalgIndexingMapsConfig
+ static_indexing_maps:
+ - affine_map<() -> ()>
+ - affine_map<() -> ()>
+ iterator_types: []
+ assignments:
+ - !ScalarAssign
+ arg: O
+ value: !ScalarExpression
+ scalar_fn:
+ kind: unary
+ fn_name: erf
+ operands:
+ - !ScalarExpression
+ scalar_arg: I
+--- !LinalgOpConfig
+metadata: !LinalgOpMetadata
name: elemwise_binary
cpp_class_name: ElemwiseBinaryOp
doc: |-
@@ -625,7 +870,7 @@ metadata: !LinalgOpMetadata
This means reduction/broadcast/element cast semantics is explicit. Further
passes can take that into account when lowering this code. For example,
- a `linalg.broadcast` + `linalg.div` sequence can be lowered to a
+ a `linalg.broadcast` + `linalg.max` sequence can be lowered to a
`linalg.generic` with different affine maps for the two operands.
structured_op: !LinalgStructuredOpConfig
args:
@@ -664,6 +909,106 @@ structured_op: !LinalgStructuredOpConfig
scalar_arg: rhs
--- !LinalgOpConfig
metadata: !LinalgOpMetadata
+ name: min
+ cpp_class_name: MinOp
+ doc: |-
+ Takes the min (signed) between two inputs, elementwise.
+
+ The shapes and element types must be identical. The appropriate casts,
+ broadcasts and reductions should be done previously to calling this op.
+
+ This means reduction/broadcast/element cast semantics is explicit. Further
+ passes can take that into account when lowering this code. For example,
+ a `linalg.broadcast` + `linalg.min` sequence can be lowered to a
+ `linalg.generic` with different affine maps for the two operands.
+structured_op: !LinalgStructuredOpConfig
+ args:
+ - !LinalgOperandDefConfig
+ name: lhs
+ kind: input_tensor
+ type_var: T1
+ shape_map: affine_map<() -> ()>
+ - !LinalgOperandDefConfig
+ name: rhs
+ kind: input_tensor
+ type_var: T1
+ shape_map: affine_map<() -> ()>
+ - !LinalgOperandDefConfig
+ name: O
+ kind: output_tensor
+ type_var: T1
+ shape_map: affine_map<() -> ()>
+ indexing_maps: !LinalgIndexingMapsConfig
+ static_indexing_maps:
+ - affine_map<() -> ()>
+ - affine_map<() -> ()>
+ - affine_map<() -> ()>
+ iterator_types: []
+ assignments:
+ - !ScalarAssign
+ arg: O
+ value: !ScalarExpression
+ scalar_fn:
+ kind: binary
+ fn_name: min_signed
+ operands:
+ - !ScalarExpression
+ scalar_arg: lhs
+ - !ScalarExpression
+ scalar_arg: rhs
+--- !LinalgOpConfig
+metadata: !LinalgOpMetadata
+ name: powf
+ cpp_class_name: PowFOp
+ doc: |-
+    Takes the powf(lhs, rhs) of the two inputs, elementwise. For powf(arg, 2), use `linalg.square`.
+
+ Only applies to floating point values.
+
+ The shapes and element types must be identical. The appropriate casts,
+ broadcasts and reductions should be done previously to calling this op.
+
+ This means reduction/broadcast/element cast semantics is explicit. Further
+ passes can take that into account when lowering this code. For example,
+ a `linalg.broadcast` + `linalg.powf` sequence can be lowered to a
+ `linalg.generic` with different affine maps for the two operands.
+structured_op: !LinalgStructuredOpConfig
+ args:
+ - !LinalgOperandDefConfig
+ name: lhs
+ kind: input_tensor
+ type_var: T1
+ shape_map: affine_map<() -> ()>
+ - !LinalgOperandDefConfig
+ name: rhs
+ kind: input_tensor
+ type_var: T1
+ shape_map: affine_map<() -> ()>
+ - !LinalgOperandDefConfig
+ name: O
+ kind: output_tensor
+ type_var: T1
+ shape_map: affine_map<() -> ()>
+ indexing_maps: !LinalgIndexingMapsConfig
+ static_indexing_maps:
+ - affine_map<() -> ()>
+ - affine_map<() -> ()>
+ - affine_map<() -> ()>
+ iterator_types: []
+ assignments:
+ - !ScalarAssign
+ arg: O
+ value: !ScalarExpression
+ scalar_fn:
+ kind: binary
+ fn_name: powf
+ operands:
+ - !ScalarExpression
+ scalar_arg: lhs
+ - !ScalarExpression
+ scalar_arg: rhs
+--- !LinalgOpConfig
+metadata: !LinalgOpMetadata
name: matmul
cpp_class_name: MatmulOp
doc: |-
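
The new unary (reciprocal, round, sqrt, rsqrt, square, tanh, erf) and binary (min, powf) named ops are generated from these YAML entries. A hedged C++ sketch of building one of them; the builder overload is an assumption by analogy with the existing generated named ops:

```cpp
#include "mlir/Dialect/Linalg/IR/Linalg.h"

// Sketch only: builder overloads assumed from other linalg named ops;
// `init` carries the result tensor type.
mlir::Value buildPowf(mlir::OpBuilder &b, mlir::Location loc, mlir::Value lhs,
                      mlir::Value rhs, mlir::Value init) {
  auto op = b.create<mlir::linalg::PowFOp>(
      loc, /*resultTensorTypes=*/mlir::TypeRange{init.getType()},
      /*inputs=*/mlir::ValueRange{lhs, rhs},
      /*outputs=*/mlir::ValueRange{init});
  return op->getResult(0);
}
```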
diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.td
index da12e7c83b22..64c538367267 100644
--- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.td
+++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.td
@@ -138,10 +138,10 @@ def Linalg_SoftmaxOp : Linalg_Op<"softmax",
let extraClassDeclaration = [{
ShapedType getInputOperandType() {
- return getInput().getType().cast<ShapedType>();
+ return cast<ShapedType>(getInput().getType());
}
ShapedType getOutputOperandType() {
- return getOutput().getType().cast<ShapedType>();
+ return cast<ShapedType>(getOutput().getType());
}
int64_t getInputOperandRank() {
return getInputOperandType().getRank();
diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOpsInterfaces.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOpsInterfaces.td
index ab9b78e755d9..d9569d9d294d 100644
--- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOpsInterfaces.td
+++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOpsInterfaces.td
@@ -234,8 +234,8 @@ def OffloadModuleInterface : OpInterface<"OffloadModuleInterface"> {
/*methodName=*/"getIsTargetDevice",
(ins), [{}], [{
if (Attribute isTargetDevice = $_op->getAttr("omp.is_target_device"))
- if (isTargetDevice.isa<mlir::BoolAttr>())
- return isTargetDevice.dyn_cast<BoolAttr>().getValue();
+ if (::llvm::isa<mlir::BoolAttr>(isTargetDevice))
+ return ::llvm::dyn_cast<BoolAttr>(isTargetDevice).getValue();
return false;
}]>,
InterfaceMethod<
@@ -259,7 +259,7 @@ def OffloadModuleInterface : OpInterface<"OffloadModuleInterface"> {
/*methodName=*/"getIsGPU",
(ins), [{}], [{
if (Attribute isTargetCGAttr = $_op->getAttr("omp.is_gpu"))
- if (auto isTargetCGVal = isTargetCGAttr.dyn_cast<BoolAttr>())
+ if (auto isTargetCGVal = ::llvm::dyn_cast<BoolAttr>(isTargetCGAttr))
return isTargetCGVal.getValue();
return false;
}]>,
@@ -332,7 +332,7 @@ def OffloadModuleInterface : OpInterface<"OffloadModuleInterface"> {
/*methodName=*/"getRequires",
(ins), [{}], [{
if (Attribute requiresAttr = $_op->getAttr("omp.requires"))
- if (auto requiresVal = requiresAttr.dyn_cast<mlir::omp::ClauseRequiresAttr>())
+ if (auto requiresVal = ::llvm::dyn_cast<mlir::omp::ClauseRequiresAttr>(requiresAttr))
return requiresVal.getValue();
return mlir::omp::ClauseRequires::none;
}]>,
diff --git a/mlir/include/mlir/Dialect/Polynomial/IR/Polynomial.h b/mlir/include/mlir/Dialect/Polynomial/IR/Polynomial.h
index 39b05b9d3ad1..3325a6fa3f9f 100644
--- a/mlir/include/mlir/Dialect/Polynomial/IR/Polynomial.h
+++ b/mlir/include/mlir/Dialect/Polynomial/IR/Polynomial.h
@@ -51,9 +51,6 @@ public:
return (exponent.ult(other.exponent));
}
- // Prints polynomial to 'os'.
- void print(raw_ostream &os) const;
-
friend ::llvm::hash_code hash_value(const Monomial &arg);
public:
@@ -102,6 +99,8 @@ public:
unsigned getDegree() const;
+ ArrayRef<Monomial> getTerms() const { return terms; }
+
friend ::llvm::hash_code hash_value(const Polynomial &arg);
private:
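
A minimal use of the new `getTerms()` accessor added here (assumes a `Polynomial` constructed elsewhere):

```cpp
#include "mlir/Dialect/Polynomial/IR/Polynomial.h"

// Count the monomials of a polynomial via the new accessor.
size_t numTerms(const mlir::polynomial::Polynomial &p) {
  return p.getTerms().size();
}
```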
diff --git a/mlir/include/mlir/Dialect/Polynomial/IR/Polynomial.td b/mlir/include/mlir/Dialect/Polynomial/IR/Polynomial.td
index 5d8da8399b01..d3e3ac55677f 100644
--- a/mlir/include/mlir/Dialect/Polynomial/IR/Polynomial.td
+++ b/mlir/include/mlir/Dialect/Polynomial/IR/Polynomial.td
@@ -35,18 +35,18 @@ def Polynomial_Dialect : Dialect {
```mlir
// A constant polynomial in a ring with i32 coefficients and no polynomial modulus
- #ring = #polynomial.ring<ctype=i32>
+ #ring = #polynomial.ring<coefficientType=i32>
%a = polynomial.constant <1 + x**2 - 3x**3> : polynomial.polynomial<#ring>
// A constant polynomial in a ring with i32 coefficients, modulo (x^1024 + 1)
#modulus = #polynomial.polynomial<1 + x**1024>
- #ring = #polynomial.ring<ctype=i32, ideal=#modulus>
+ #ring = #polynomial.ring<coefficientType=i32, polynomialModulus=#modulus>
%a = polynomial.constant <1 + x**2 - 3x**3> : polynomial.polynomial<#ring>
// A constant polynomial in a ring with i32 coefficients, with a polynomial
// modulus of (x^1024 + 1) and a coefficient modulus of 17.
#modulus = #polynomial.polynomial<1 + x**1024>
- #ring = #polynomial.ring<ctype=i32, cmod=17, ideal=#modulus>
+ #ring = #polynomial.ring<coefficientType=i32, coefficientModulus=17, polynomialModulus=#modulus>
%a = polynomial.constant <1 + x**2 - 3x**3> : polynomial.polynomial<#ring>
```
}];
@@ -63,7 +63,21 @@ class Polynomial_Attr<string name, string attrMnemonic, list<Trait> traits = []>
def Polynomial_PolynomialAttr : Polynomial_Attr<"Polynomial", "polynomial"> {
let summary = "An attribute containing a single-variable polynomial.";
let description = [{
- #poly = #polynomial.poly<x**1024 + 1>
+ A polynomial attribute represents a single-variable polynomial, which
+ is used to define the modulus of a `RingAttr`, as well as to define constants
+ and perform constant folding for `polynomial` ops.
+
+ The polynomial must be expressed as a list of monomial terms, with addition
+ or subtraction between them. The choice of variable name is arbitrary, but
+ must be consistent across all the monomials used to define a single
+    attribute. The order of monomial terms is arbitrary, but each monomial
+    degree must occur at most once.
+
+ Example:
+
+ ```mlir
+ #poly = #polynomial.polynomial<x**1024 + 1>
+ ```
}];
let parameters = (ins "Polynomial":$polynomial);
let hasCustomAssemblyFormat = 1;
@@ -79,10 +93,10 @@ def Polynomial_RingAttr : Polynomial_Attr<"Ring", "ring"> {
integral, whose coefficients are taken modulo some statically known modulus
(`coefficientModulus`).
- Additionally, a polynomial ring can specify an _ideal_, which converts
+ Additionally, a polynomial ring can specify a _polynomialModulus_, which converts
polynomial arithmetic to the analogue of modular integer arithmetic, where
each polynomial is represented as its remainder when dividing by the
- modulus. For single-variable polynomials, an "ideal" is always specificed
+    modulus. For single-variable polynomials, a "polynomialModulus" is always specified
via a single polynomial, which we call `polynomialModulus`.
An expressive example is polynomials with i32 coefficients, whose
@@ -122,32 +136,284 @@ class Polynomial_Type<string name, string typeMnemonic>
def Polynomial_PolynomialType : Polynomial_Type<"Polynomial", "polynomial"> {
let summary = "An element of a polynomial ring.";
-
let description = [{
A type for polynomials in a polynomial quotient ring.
}];
-
let parameters = (ins Polynomial_RingAttr:$ring);
let assemblyFormat = "`<` $ring `>`";
}
+def PolynomialLike: TypeOrContainer<Polynomial_PolynomialType, "polynomial-like">;
+
class Polynomial_Op<string mnemonic, list<Trait> traits = []> :
- Op<Polynomial_Dialect, mnemonic, traits # [Pure]>;
+ Op<Polynomial_Dialect, mnemonic, traits # [Pure]> {
+ let assemblyFormat = "operands attr-dict `:` functional-type(operands, results)";
+}
class Polynomial_UnaryOp<string mnemonic, list<Trait> traits = []> :
Polynomial_Op<mnemonic, traits # [SameOperandsAndResultType]> {
let arguments = (ins Polynomial_PolynomialType:$operand);
let results = (outs Polynomial_PolynomialType:$result);
-
- let assemblyFormat = "$operand attr-dict `:` qualified(type($result))";
}
class Polynomial_BinaryOp<string mnemonic, list<Trait> traits = []> :
- Polynomial_Op<mnemonic, traits # [SameOperandsAndResultType]> {
- let arguments = (ins Polynomial_PolynomialType:$lhs, Polynomial_PolynomialType:$rhs);
- let results = (outs Polynomial_PolynomialType:$result);
+ Polynomial_Op<mnemonic, !listconcat(traits, [Pure, SameOperandsAndResultType, ElementwiseMappable])> {
+ let arguments = (ins PolynomialLike:$lhs, PolynomialLike:$rhs);
+ let results = (outs PolynomialLike:$result);
+ let assemblyFormat = "operands attr-dict `:` type($result)";
+}
+
+def Polynomial_AddOp : Polynomial_BinaryOp<"add", [Commutative]> {
+ let summary = "Addition operation between polynomials.";
+ let description = [{
+ Performs polynomial addition on the operands. The operands may be single
+ polynomials or containers of identically-typed polynomials, i.e., polynomials
+ from the same underlying ring with the same coefficient types.
+
+ Addition is defined to occur in the ring defined by the ring attribute of
+ the two operands, meaning the addition is taken modulo the coefficientModulus
+ and the polynomialModulus of the ring.
+
+ Example:
+
+ ```mlir
+ // add two polynomials modulo x^1024 - 1
+ #poly = #polynomial.polynomial<x**1024 - 1>
+ #ring = #polynomial.ring<coefficientType=i32, coefficientModulus=65536, polynomialModulus=#poly>
+ %0 = polynomial.constant #polynomial.polynomial<1 + x**2> : !polynomial.polynomial<#ring>
+ %1 = polynomial.constant #polynomial.polynomial<x**5 - x + 1> : !polynomial.polynomial<#ring>
+ %2 = polynomial.add %0, %1 : !polynomial.polynomial<#ring>
+ ```
+ }];
+}
+
+def Polynomial_SubOp : Polynomial_BinaryOp<"sub"> {
+ let summary = "Subtraction operation between polynomials.";
+ let description = [{
+ Performs polynomial subtraction on the operands. The operands may be single
+ polynomials or containers of identically-typed polynomials, i.e., polynomials
+ from the same underlying ring with the same coefficient types.
+
+ Subtraction is defined to occur in the ring defined by the ring attribute of
+ the two operands, meaning the subtraction is taken modulo the coefficientModulus
+ and the polynomialModulus of the ring.
+
+ Example:
- let assemblyFormat = "$lhs `,` $rhs attr-dict `:` qualified(type($result))";
+ ```mlir
+ // subtract two polynomials modulo x^1024 - 1
+ #poly = #polynomial.polynomial<x**1024 - 1>
+ #ring = #polynomial.ring<coefficientType=i32, coefficientModulus=65536, polynomialModulus=#poly>
+ %0 = polynomial.constant #polynomial.polynomial<1 + x**2> : !polynomial.polynomial<#ring>
+ %1 = polynomial.constant #polynomial.polynomial<x**5 - x + 1> : !polynomial.polynomial<#ring>
+ %2 = polynomial.sub %0, %1 : !polynomial.polynomial<#ring>
+ ```
+ }];
+}
+
+def Polynomial_MulOp : Polynomial_BinaryOp<"mul", [Commutative]> {
+ let summary = "Multiplication operation between polynomials.";
+ let description = [{
+ Performs polynomial multiplication on the operands. The operands may be single
+ polynomials or containers of identically-typed polynomials, i.e., polynomials
+ from the same underlying ring with the same coefficient types.
+
+ Multiplication is defined to occur in the ring defined by the ring attribute of
+ the two operands, meaning the multiplication is taken modulo the coefficientModulus
+ and the polynomialModulus of the ring.
+
+ Example:
+
+ ```mlir
+ // multiply two polynomials modulo x^1024 - 1
+ #poly = #polynomial.polynomial<x**1024 - 1>
+ #ring = #polynomial.ring<coefficientType=i32, coefficientModulus=65536, polynomialModulus=#poly>
+ %0 = polynomial.constant #polynomial.polynomial<1 + x**2> : !polynomial.polynomial<#ring>
+ %1 = polynomial.constant #polynomial.polynomial<x**5 - x + 1> : !polynomial.polynomial<#ring>
+ %2 = polynomial.mul %0, %1 : !polynomial.polynomial<#ring>
+ ```
+ }];
+}
+
+def Polynomial_MulScalarOp : Polynomial_Op<"mul_scalar", [
+ ElementwiseMappable, AllTypesMatch<["polynomial", "output"]>]> {
+ let summary = "Multiplication by a scalar of the field.";
+ let description = [{
+ Multiplies the polynomial operand's coefficients by a given scalar value.
+ The operation is defined to occur in the ring defined by the ring attribute
+ of the two operands, meaning the multiplication is taken modulo the
+ coefficientModulus of the ring.
+
+ The `scalar` input must have the same type as the polynomial ring's
+ coefficientType.
+
+ Example:
+
+ ```mlir
+    // multiply a polynomial by a scalar, modulo x^1024 - 1
+ #poly = #polynomial.polynomial<x**1024 - 1>
+ #ring = #polynomial.ring<coefficientType=i32, coefficientModulus=65536, polynomialModulus=#poly>
+ %0 = polynomial.constant #polynomial.polynomial<1 + x**2> : !polynomial.polynomial<#ring>
+ %1 = arith.constant 3 : i32
+ %2 = polynomial.mul_scalar %0, %1 : !polynomial.polynomial<#ring>, i32
+ ```
+ }];
+
+ let arguments = (ins
+ PolynomialLike:$polynomial,
+ AnyInteger:$scalar
+ );
+ let results = (outs
+ PolynomialLike:$output
+ );
+ let assemblyFormat = "operands attr-dict `:` type($polynomial) `,` type($scalar)";
+ let hasVerifier = 1;
+}
+
+def Polynomial_LeadingTermOp: Polynomial_Op<"leading_term"> {
+ let summary = "Compute the leading term of the polynomial.";
+ let description = [{
+    The degree of a polynomial is the largest `k` for which the coefficient
+ `a_k` of `x^k` is nonzero. The leading term is the term `a_k * x^k`, which
+ this op represents as a pair of results. The first is the degree `k` as an
+ index, and the second is the coefficient, whose type matches the
+ coefficient type of the polynomial's ring attribute.
+
+ Example:
+
+ ```mlir
+ #poly = #polynomial.polynomial<x**1024 - 1>
+ #ring = #polynomial.ring<coefficientType=i32, coefficientModulus=65536, polynomialModulus=#poly>
+ %0 = polynomial.constant #polynomial.polynomial<1 + x**2> : !polynomial.polynomial<#ring>
+ %1, %2 = polynomial.leading_term %0 : !polynomial.polynomial<#ring> -> (index, i32)
+ ```
+ }];
+ let arguments = (ins Polynomial_PolynomialType:$input);
+ let results = (outs Index:$degree, AnyInteger:$coefficient);
+ let assemblyFormat = "operands attr-dict `:` type($input) `->` `(` type($degree) `,` type($coefficient) `)`";
+}
+
+def Polynomial_MonomialOp: Polynomial_Op<"monomial"> {
+ let summary = "Create a polynomial that consists of a single monomial.";
+ let description = [{
+ Construct a polynomial that consists of a single monomial term, from its
+ degree and coefficient as dynamic inputs.
+
+ The coefficient type of the output polynomial's ring attribute must match
+ the `coefficient` input type.
+
+ Example:
+
+ ```mlir
+ #poly = #polynomial.polynomial<x**1024 - 1>
+ #ring = #polynomial.ring<coefficientType=i32, coefficientModulus=65536, polynomialModulus=#poly>
+ %deg = arith.constant 1023 : index
+ %five = arith.constant 5 : i32
+ %0 = polynomial.monomial %five, %deg : (i32, index) -> !polynomial.polynomial<#ring>
+ ```
+ }];
+ let arguments = (ins AnyInteger:$coefficient, Index:$degree);
+ let results = (outs Polynomial_PolynomialType:$output);
+}
+
+def Polynomial_MonicMonomialMulOp: Polynomial_Op<"monic_monomial_mul", [AllTypesMatch<["input", "output"]>]> {
+ let summary = "Multiply a polynomial by a monic monomial.";
+ let description = [{
+ Multiply a polynomial by a monic monomial, meaning a polynomial of the form
+ `1 * x^k` for an index operand `k`.
+
+ In some special rings of polynomials, such as a ring of polynomials
+    modulo `x^n - 1`, `monic_monomial_mul` can be interpreted as a cyclic shift of
+ the coefficients of the polynomial. For some rings, this results in
+ optimized lowerings that involve rotations and rescaling of the
+ coefficients of the input.
+ }];
+ let arguments = (ins PolynomialLike:$input, Index:$monomialDegree);
+ let results = (outs PolynomialLike:$output);
+}
+
+def Polynomial_FromTensorOp : Polynomial_Op<"from_tensor", [Pure]> {
+ let summary = "Creates a polynomial from integer coefficients stored in a tensor.";
+ let description = [{
+ `polynomial.from_tensor` creates a polynomial value from a tensor of coefficients.
+ The input tensor must list the coefficients in degree-increasing order.
+
+    The one-dimensional input tensor may have size at most the degree of the
+    ring's polynomialModulus; a smaller size implies that all higher-degree
+    terms have coefficient zero.
+
+ Example:
+
+ ```mlir
+ #poly = #polynomial.polynomial<x**1024 - 1>
+ #ring = #polynomial.ring<coefficientType=i32, coefficientModulus=65536, polynomialModulus=#poly>
+ %two = arith.constant 2 : i32
+ %five = arith.constant 5 : i32
+ %coeffs = tensor.from_elements %two, %two, %five : tensor<3xi32>
+ %poly = polynomial.from_tensor %coeffs : tensor<3xi32> -> !polynomial.polynomial<#ring>
+ ```
+ }];
+ let arguments = (ins RankedTensorOf<[AnyInteger]>:$input);
+ let results = (outs Polynomial_PolynomialType:$output);
+
+ let assemblyFormat = "$input attr-dict `:` type($input) `->` type($output)";
+
+ let builders = [
+ // Builder that infers coefficient modulus from tensor bit width,
+ // and uses whatever input ring is provided by the caller.
+ OpBuilder<(ins "::mlir::Value":$input, "::mlir::polynomial::RingAttr":$ring)>
+ ];
+ let hasVerifier = 1;
+}
+
+def Polynomial_ToTensorOp : Polynomial_Op<"to_tensor", [Pure]> {
+ let summary = "Creates a tensor containing the coefficients of a polynomial.";
+ let description = [{
+ `polynomial.to_tensor` creates a dense tensor value containing the
+ coefficients of the input polynomial. The output tensor contains the
+ coefficients in degree-increasing order.
+
+ Operations that act on the coefficients of a polynomial, such as extracting
+ a specific coefficient or extracting a range of coefficients, should be
+ implemented by composing `to_tensor` with the relevant `tensor` dialect
+ ops.
+
+    The output tensor has size equal to the degree of the polynomial ring
+    attribute's polynomialModulus, with zero coefficients included explicitly.
+
+ Example:
+
+ ```mlir
+ #poly = #polynomial.polynomial<x**1024 - 1>
+ #ring = #polynomial.ring<coefficientType=i32, coefficientModulus=65536, polynomialModulus=#poly>
+ %two = arith.constant 2 : i32
+ %five = arith.constant 5 : i32
+ %coeffs = tensor.from_elements %two, %two, %five : tensor<3xi32>
+ %poly = polynomial.from_tensor %coeffs : tensor<3xi32> -> !polynomial.polynomial<#ring>
+ %tensor = polynomial.to_tensor %poly : !polynomial.polynomial<#ring> -> tensor<1024xi32>
+ ```
+ }];
+ let arguments = (ins Polynomial_PolynomialType:$input);
+ let results = (outs RankedTensorOf<[AnyInteger]>:$output);
+ let assemblyFormat = "$input attr-dict `:` type($input) `->` type($output)";
+
+ let hasVerifier = 1;
+}
+
+def Polynomial_ConstantOp : Polynomial_Op<"constant", [Pure]> {
+ let summary = "Define a constant polynomial via an attribute.";
+ let description = [{
+ Example:
+
+ ```mlir
+ #poly = #polynomial.polynomial<x**1024 - 1>
+ #ring = #polynomial.ring<coefficientType=i32, coefficientModulus=65536, polynomialModulus=#poly>
+ %0 = polynomial.constant #polynomial.polynomial<1 + x**2> : !polynomial.polynomial<#ring>
+ ```
+ }];
+ let arguments = (ins Polynomial_PolynomialAttr:$input);
+ let results = (outs Polynomial_PolynomialType:$output);
+ let assemblyFormat = "$input attr-dict `:` type($output)";
}
#endif // POLYNOMIAL_OPS
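
The ops above already carry MLIR-level examples; on the C++ side, a hedged sketch of the new `from_tensor` builder declared in this hunk (ring and tensor construction elided, names illustrative, and the exact generated-op header layout is an assumption):

```cpp
#include "mlir/Dialect/Polynomial/IR/Polynomial.h" // op classes assumed here

mlir::Value makePoly(mlir::OpBuilder &b, mlir::Location loc,
                     mlir::Value coeffTensor,
                     mlir::polynomial::RingAttr ring) {
  // Uses the builder that infers the coefficient modulus from the tensor's
  // bit width, per the TableGen declaration above.
  return b.create<mlir::polynomial::FromTensorOp>(loc, coeffTensor, ring);
}
```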
diff --git a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensor.h b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensor.h
index 5e523ec428ae..b182b4c72b95 100644
--- a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensor.h
+++ b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensor.h
@@ -89,18 +89,21 @@ inline MemRefType getMemRefType(T &&t) {
/// Returns null-attribute for any type without an encoding.
SparseTensorEncodingAttr getSparseTensorEncoding(Type type);
+/// Returns true iff the type range has any sparse tensor type.
+inline bool hasAnySparseType(TypeRange types) {
+ return llvm::any_of(types, [](Type type) {
+ return getSparseTensorEncoding(type) != nullptr;
+ });
+}
+
/// Returns true iff MLIR operand has any sparse operand.
inline bool hasAnySparseOperand(Operation *op) {
- return llvm::any_of(op->getOperands().getTypes(), [](Type t) {
- return getSparseTensorEncoding(t) != nullptr;
- });
+ return hasAnySparseType(op->getOperands().getTypes());
}
/// Returns true iff MLIR operand has any sparse result.
inline bool hasAnySparseResult(Operation *op) {
- return llvm::any_of(op->getResults().getTypes(), [](Type t) {
- return getSparseTensorEncoding(t) != nullptr;
- });
+ return hasAnySparseType(op->getResults().getTypes());
}
/// Returns true iff MLIR operand has any sparse operand or result.
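
The refactor keeps behavior identical; the new `hasAnySparseType` also composes directly, for example:

```cpp
#include "mlir/Dialect/SparseTensor/IR/SparseTensor.h"

// Equivalent to hasAnySparseOperandOrResult, written against the new helper.
bool touchesSparse(mlir::Operation *op) {
  using namespace mlir::sparse_tensor;
  return hasAnySparseType(op->getOperands().getTypes()) ||
         hasAnySparseType(op->getResults().getTypes());
}
```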
diff --git a/mlir/include/mlir/Dialect/Vector/Utils/VectorUtils.h b/mlir/include/mlir/Dialect/Vector/Utils/VectorUtils.h
index 8a57c6094c41..030be328e97f 100644
--- a/mlir/include/mlir/Dialect/Vector/Utils/VectorUtils.h
+++ b/mlir/include/mlir/Dialect/Vector/Utils/VectorUtils.h
@@ -194,7 +194,7 @@ bool isLinearizableVector(VectorType type);
/// for each dimension of the passed in tensor.
Value createReadOrMaskedRead(OpBuilder &builder, Location loc, Value source,
ArrayRef<int64_t> readShape, Value padValue,
- bool useInBoundsInsteadOfMasking = true);
+ bool useInBoundsInsteadOfMasking);
/// Returns success if `inputVectorSizes` is a valid masking configuraion for
/// given `shape`, i.e., it meets:
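
Dropping the default forces every call site to pick a masking strategy explicitly; the Vectorization.cpp hunks further down show the in-tree updates. A minimal sketch of an updated caller (values illustrative; the explicit final argument is the point):

```cpp
mlir::Value readSource(mlir::OpBuilder &builder, mlir::Location loc,
                       mlir::Value source, llvm::ArrayRef<int64_t> readShape,
                       mlir::Value padValue) {
  return mlir::vector::createReadOrMaskedRead(
      builder, loc, source, readShape, padValue,
      /*useInBoundsInsteadOfMasking=*/false);
}
```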
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index 88f2e1acfeeb..e477d9a0ca3f 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -164,10 +164,10 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface
/// source operand. They overide static shape from source memref type.
ArrayRef<int64_t> getStaticSizes() {
auto attr = getConstShapeAttr();
- if (getSourceType().isa<IntegerType>() || attr)
+ if (llvm::isa<IntegerType>(getSourceType()) || attr)
return attr;
- auto memrefType = getSourceType().dyn_cast<MemRefType>();
+ auto memrefType = llvm::dyn_cast<MemRefType>(getSourceType());
assert(memrefType && "Incorrect use of getStaticSizes");
return memrefType.getShape();
}
@@ -179,10 +179,10 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface
/// source operand. They overide static strides from source memref type.
ArrayRef<int64_t> getStaticStrides() {
auto attr = getConstStridesAttr();
- if (getSourceType().isa<IntegerType>() || attr)
+ if (llvm::isa<IntegerType>(getSourceType()) || attr)
return attr;
- auto memrefType = getSourceType().dyn_cast<MemRefType>();
+ auto memrefType = llvm::dyn_cast<MemRefType>(getSourceType());
assert(memrefType && "Incorrect use of getStaticStrides");
auto [strides, offset] = getStridesAndOffset(memrefType);
// reuse the storage of ConstStridesAttr since strides from
@@ -196,7 +196,7 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface
/// `static_shape` and `static_strides` attributes.
std::array<unsigned, 3> getArrayAttrMaxRanks() {
unsigned rank;
- if (auto ty = getSourceType().dyn_cast<MemRefType>()) {
+ if (auto ty = llvm::dyn_cast<MemRefType>(getSourceType())) {
rank = ty.getRank();
} else {
rank = (unsigned)getMixedOffsets().size();
diff --git a/mlir/include/mlir/IR/BuiltinLocationAttributes.td b/mlir/include/mlir/IR/BuiltinLocationAttributes.td
index dfcc180071f7..5a72404dea15 100644
--- a/mlir/include/mlir/IR/BuiltinLocationAttributes.td
+++ b/mlir/include/mlir/IR/BuiltinLocationAttributes.td
@@ -228,7 +228,8 @@ def OpaqueLoc : Builtin_LocationAttr<"OpaqueLoc"> {
template <typename T> static T getUnderlyingLocation(Location location) {
assert(isa<T>(location));
return reinterpret_cast<T>(
- location.cast<mlir::OpaqueLoc>().getUnderlyingLocation());
+ mlir::cast<mlir::OpaqueLoc>(static_cast<LocationAttr>(location))
+ .getUnderlyingLocation());
}
/// Returns a pointer to some data structure that opaque location stores.
@@ -237,15 +238,17 @@ def OpaqueLoc : Builtin_LocationAttr<"OpaqueLoc"> {
template <typename T>
static T getUnderlyingLocationOrNull(Location location) {
return isa<T>(location)
- ? reinterpret_cast<T>(
- location.cast<mlir::OpaqueLoc>().getUnderlyingLocation())
- : T(nullptr);
+ ? reinterpret_cast<T>(mlir::cast<mlir::OpaqueLoc>(
+ static_cast<LocationAttr>(location))
+ .getUnderlyingLocation())
+ : T(nullptr);
}
/// Checks whether provided location is opaque location and contains a
/// pointer to an object of particular type.
template <typename T> static bool isa(Location location) {
- auto opaque_loc = location.dyn_cast<OpaqueLoc>();
+ auto opaque_loc =
+ mlir::dyn_cast<OpaqueLoc>(static_cast<LocationAttr>(location));
return opaque_loc && opaque_loc.getUnderlyingTypeID() == TypeID::get<T>();
}
}];
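
A sketch of the round trip these helpers support, with `MyNode` a hypothetical client type:

```cpp
#include "mlir/IR/Location.h"

struct MyNode {}; // hypothetical client AST node

mlir::Location wrap(MyNode *node, mlir::MLIRContext *ctx) {
  return mlir::OpaqueLoc::get(node, ctx);
}

MyNode *unwrap(mlir::Location loc) {
  // Returns nullptr when `loc` is not an OpaqueLoc wrapping a MyNode*.
  return mlir::OpaqueLoc::getUnderlyingLocationOrNull<MyNode *>(loc);
}
```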
diff --git a/mlir/include/mlir/IR/Value.h b/mlir/include/mlir/IR/Value.h
index cdbc6cc37436..a7344c64e673 100644
--- a/mlir/include/mlir/IR/Value.h
+++ b/mlir/include/mlir/IR/Value.h
@@ -98,25 +98,25 @@ public:
constexpr Value(detail::ValueImpl *impl = nullptr) : impl(impl) {}
template <typename U>
- [[deprecated("Use isa<U>() instead")]]
+ [[deprecated("Use mlir::isa<U>() instead")]]
bool isa() const {
return llvm::isa<U>(*this);
}
template <typename U>
- [[deprecated("Use dyn_cast<U>() instead")]]
+ [[deprecated("Use mlir::dyn_cast<U>() instead")]]
U dyn_cast() const {
return llvm::dyn_cast<U>(*this);
}
template <typename U>
- [[deprecated("Use dyn_cast_or_null<U>() instead")]]
+ [[deprecated("Use mlir::dyn_cast_or_null<U>() instead")]]
U dyn_cast_or_null() const {
return llvm::dyn_cast_or_null<U>(*this);
}
template <typename U>
- [[deprecated("Use cast<U>() instead")]]
+ [[deprecated("Use mlir::cast<U>() instead")]]
U cast() const {
return llvm::cast<U>(*this);
}
diff --git a/mlir/include/mlir/Tools/lsp-server-support/Transport.h b/mlir/include/mlir/Tools/lsp-server-support/Transport.h
index b973a2e26725..44c71058cf71 100644
--- a/mlir/include/mlir/Tools/lsp-server-support/Transport.h
+++ b/mlir/include/mlir/Tools/lsp-server-support/Transport.h
@@ -95,10 +95,10 @@ private:
template <typename T>
using Callback = llvm::unique_function<void(llvm::Expected<T>)>;
-/// An OutgoingMessage<T> is a function used for outgoing requests or
-/// notifications to send to the client.
+/// An OutgoingNotification<T> is a function used for outgoing notifications
+/// sent to the client.
template <typename T>
-using OutgoingMessage = llvm::unique_function<void(const T &)>;
+using OutgoingNotification = llvm::unique_function<void(const T &)>;
/// A handler used to process the incoming transport messages.
class MessageHandler {
@@ -160,10 +160,9 @@ public:
};
}
- /// Create an OutgoingMessage function that, when called, sends a notification
- /// with the given method via the transport.
+ /// Create an OutgoingNotification object used for the given method.
template <typename T>
- OutgoingMessage<T> outgoingNotification(llvm::StringLiteral method) {
+ OutgoingNotification<T> outgoingNotification(llvm::StringLiteral method) {
return [&, method](const T &params) {
std::lock_guard<std::mutex> transportLock(transportOutputMutex);
Logger::info("--> {0}", method);
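
The rename is mechanical; usage stays the same. A sketch, with the params type illustrative and `handler` assumed to be bound to a live transport:

```cpp
#include "mlir/Tools/lsp-server-support/Protocol.h"
#include "mlir/Tools/lsp-server-support/Transport.h"

void notifyDiagnostics(mlir::lsp::MessageHandler &handler,
                       const mlir::lsp::PublishDiagnosticsParams &diags) {
  auto publish =
      handler.outgoingNotification<mlir::lsp::PublishDiagnosticsParams>(
          "textDocument/publishDiagnostics");
  publish(diags); // sends one notification to the client
}
```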
diff --git a/mlir/lib/Conversion/ComplexToStandard/ComplexToStandard.cpp b/mlir/lib/Conversion/ComplexToStandard/ComplexToStandard.cpp
index 4a15976d40c7..c2a83f90bcbe 100644
--- a/mlir/lib/Conversion/ComplexToStandard/ComplexToStandard.cpp
+++ b/mlir/lib/Conversion/ComplexToStandard/ComplexToStandard.cpp
@@ -857,7 +857,7 @@ struct SqrtOpConversion : public OpConversionPattern<complex::SqrtOp> {
ImplicitLocOpBuilder b(op.getLoc(), rewriter);
auto type = cast<ComplexType>(op.getType());
- auto elementType = type.getElementType().cast<FloatType>();
+ auto elementType = cast<FloatType>(type.getElementType());
arith::FastMathFlags fmf = op.getFastMathFlagsAttr().getValue();
auto cst = [&](APFloat v) {
diff --git a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
index 1b9975237c69..fe6bcc1c8b66 100644
--- a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
+++ b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
@@ -1738,7 +1738,7 @@ struct VectorInterleaveOpLowering
"InterleaveOp not rank 1");
// If the result is rank 1, then this directly maps to LLVM.
if (resultType.isScalable()) {
- rewriter.replaceOpWithNewOp<LLVM::experimental_vector_interleave2>(
+ rewriter.replaceOpWithNewOp<LLVM::vector_interleave2>(
interleaveOp, typeConverter->convertType(resultType),
adaptor.getLhs(), adaptor.getRhs());
return success();
diff --git a/mlir/lib/Dialect/Arith/IR/ArithCanonicalization.td b/mlir/lib/Dialect/Arith/IR/ArithCanonicalization.td
index caca2ff81964..02d05780a7ac 100644
--- a/mlir/lib/Dialect/Arith/IR/ArithCanonicalization.td
+++ b/mlir/lib/Dialect/Arith/IR/ArithCanonicalization.td
@@ -175,6 +175,7 @@ def MulSIExtendedToMulI :
def IsScalarOrSplatOne :
Constraint<And<[
CPred<"succeeded(getIntOrSplatIntValue($0))">,
+ CPred<"getIntOrSplatIntValue($0)->isStrictlyPositive()">,
CPred<"*getIntOrSplatIntValue($0) == 1">]>>;
// mulsi_extended(x, 1) -> [x, extsi(cmpi slt, x, 0)]
diff --git a/mlir/lib/Dialect/ArmSME/Transforms/OuterProductFusion.cpp b/mlir/lib/Dialect/ArmSME/Transforms/OuterProductFusion.cpp
index d3751d4ba7e7..39292c4533d6 100644
--- a/mlir/lib/Dialect/ArmSME/Transforms/OuterProductFusion.cpp
+++ b/mlir/lib/Dialect/ArmSME/Transforms/OuterProductFusion.cpp
@@ -86,8 +86,7 @@ static Value createInterleave2Intrinsic(RewriterBase &rewriter, Location loc,
auto inputType = cast<VectorType>(lhs.getType());
VectorType inputTypeX2 =
VectorType::Builder(inputType).setDim(0, inputType.getShape()[0] * 2);
- return rewriter.create<LLVM::experimental_vector_interleave2>(
- loc, inputTypeX2, lhs, rhs);
+ return rewriter.create<LLVM::vector_interleave2>(loc, inputTypeX2, lhs, rhs);
}
// Fuse two 'arm_sme.outerproduct' operations that are chained via the
diff --git a/mlir/lib/Dialect/Bufferization/Transforms/OneShotAnalysis.cpp b/mlir/lib/Dialect/Bufferization/Transforms/OneShotAnalysis.cpp
index 531016130d1d..2d329a1f3d88 100644
--- a/mlir/lib/Dialect/Bufferization/Transforms/OneShotAnalysis.cpp
+++ b/mlir/lib/Dialect/Bufferization/Transforms/OneShotAnalysis.cpp
@@ -1382,14 +1382,27 @@ LogicalResult
bufferization::runOneShotBufferize(Operation *op,
const OneShotBufferizationOptions &options,
BufferizationStatistics *statistics) {
+ // copy-before-write deactivates the analysis. It cannot be used together with
+ // test-analysis-only.
assert(!(options.copyBeforeWrite && options.testAnalysisOnly) &&
"invalid combination of bufferization flags");
- if (!options.copyBeforeWrite) {
- // If a buffer is copied before every write, no analysis is needed.
+
+ if (options.copyBeforeWrite) {
+ // Copy buffer before each write. No analysis is needed.
+ } else {
+ // Run One-Shot Analysis and insert buffer copies (on the tensor level)
+ // only where needed. This is the default and much more efficient than
+ // copy-before-write.
if (failed(insertTensorCopies(op, options, statistics)))
return failure();
+
+ // If test-analysis-only is set, the IR was annotated with RaW conflict
+ // markers (attributes) during One-Shot Analysis.
+ if (options.testAnalysisOnly)
+ return success();
}
- if (options.testAnalysisOnly)
- return success();
+
+ // Bufferize the op and its nested ops. If options.copyBeforeWrite is set,
+ // a new buffer copy is allocated every time a buffer is written to.
return bufferizeOp(op, options, statistics);
}
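
A sketch of driving this entry point under the restructured control flow (option fields as in the hunk; `op` is any root op to bufferize):

```cpp
#include "mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h"

mlir::LogicalResult bufferizeModule(mlir::Operation *op) {
  mlir::bufferization::OneShotBufferizationOptions options;
  options.copyBeforeWrite = false;  // run the analysis, copy only where needed
  options.testAnalysisOnly = false; // rewrite the IR, don't just annotate it
  return mlir::bufferization::runOneShotBufferize(op, options,
                                                  /*statistics=*/nullptr);
}
```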
diff --git a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
index 2436113dc423..f5e80553ae72 100644
--- a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
@@ -241,24 +241,26 @@ static gpu::GPUFuncOp outlineKernelFuncImpl(gpu::LaunchOp launchOp,
map.map(operand.value(), entryBlock.getArgument(operand.index()));
// Clone the region of the gpu.launch operation into the gpu.func operation.
- // TODO: If cloneInto can be modified such that if a mapping for
- // a block exists, that block will be used to clone operations into (at the
- // end of the block), instead of creating a new block, this would be much
- // cleaner.
launchOpBody.cloneInto(&outlinedFuncBody, map);
- // Branch from entry of the gpu.func operation to the block that is cloned
- // from the entry block of the gpu.launch operation.
- Block &launchOpEntry = launchOpBody.front();
- Block *clonedLaunchOpEntry = map.lookup(&launchOpEntry);
- builder.setInsertionPointToEnd(&entryBlock);
- builder.create<cf::BranchOp>(loc, clonedLaunchOpEntry);
-
- outlinedFunc.walk([](gpu::TerminatorOp op) {
- OpBuilder replacer(op);
- replacer.create<gpu::ReturnOp>(op.getLoc());
- op.erase();
- });
+ // Replace the terminator op with returns.
+ for (Block &block : launchOpBody) {
+ Block *clonedBlock = map.lookup(&block);
+ auto terminator = dyn_cast<gpu::TerminatorOp>(clonedBlock->getTerminator());
+ if (!terminator)
+ continue;
+ OpBuilder replacer(terminator);
+ replacer.create<gpu::ReturnOp>(terminator->getLoc());
+ terminator->erase();
+ }
+
+  // Now splice the entry block of the gpu.launch operation into the end of
+  // the gpu.func entry block and erase the redundant block.
+ Block *clonedLaunchOpEntry = map.lookup(&launchOpBody.front());
+ entryBlock.getOperations().splice(entryBlock.getOperations().end(),
+ clonedLaunchOpEntry->getOperations());
+ clonedLaunchOpEntry->erase();
+
return outlinedFunc;
}
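
The splice that replaces the old branch-based stitching, in isolation (assumes `src` is linked into a region, so `erase()` can unlink it):

```cpp
#include "mlir/IR/Block.h"

// Move every op from `src` to the end of `dst`, then drop the empty block.
void mergeInto(mlir::Block &dst, mlir::Block &src) {
  dst.getOperations().splice(dst.getOperations().end(), src.getOperations());
  src.erase();
}
```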
diff --git a/mlir/lib/Dialect/GPU/Transforms/ModuleToBinary.cpp b/mlir/lib/Dialect/GPU/Transforms/ModuleToBinary.cpp
index 01613ab5268b..836e939a8295 100644
--- a/mlir/lib/Dialect/GPU/Transforms/ModuleToBinary.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/ModuleToBinary.cpp
@@ -88,10 +88,7 @@ void GpuModuleToBinaryPass::runOnOperation() {
TargetOptions targetOptions(toolkitPath, linkFiles, cmdOptions, *targetFormat,
lazyTableBuilder);
if (failed(transformGpuModulesToBinaries(
- getOperation(),
- offloadingHandler ? dyn_cast<OffloadingLLVMTranslationAttrInterface>(
- offloadingHandler.getValue())
- : OffloadingLLVMTranslationAttrInterface(nullptr),
+ getOperation(), OffloadingLLVMTranslationAttrInterface(nullptr),
targetOptions)))
return signalPassFailure();
}
diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
index 9c5c58fa1fab..036005ce9d92 100644
--- a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
+++ b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
@@ -395,6 +395,24 @@ public:
return builder.create<math::FloorOp>(arg.getLoc(), arg);
case UnaryFn::negf:
return builder.create<arith::NegFOp>(arg.getLoc(), arg);
+ case UnaryFn::reciprocal: {
+ Attribute oneAttr = builder.getOneAttr(arg.getType());
+ auto one = builder.create<arith::ConstantOp>(arg.getLoc(),
+ ::cast<TypedAttr>(oneAttr));
+ return builder.create<arith::DivFOp>(arg.getLoc(), one, arg);
+ }
+ case UnaryFn::round:
+ return builder.create<math::RoundOp>(arg.getLoc(), arg);
+ case UnaryFn::sqrt:
+ return builder.create<math::SqrtOp>(arg.getLoc(), arg);
+ case UnaryFn::rsqrt:
+ return builder.create<math::RsqrtOp>(arg.getLoc(), arg);
+ case UnaryFn::square:
+ return builder.create<arith::MulFOp>(arg.getLoc(), arg, arg);
+ case UnaryFn::tanh:
+ return builder.create<math::TanhOp>(arg.getLoc(), arg);
+ case UnaryFn::erf:
+ return builder.create<math::ErfOp>(arg.getLoc(), arg);
}
llvm_unreachable("unsupported unary function");
}
@@ -467,6 +485,9 @@ public:
if (allFloatingPoint)
return builder.create<arith::MinimumFOp>(arg0.getLoc(), arg0, arg1);
return builder.create<arith::MinUIOp>(arg0.getLoc(), arg0, arg1);
+ case BinaryFn::powf:
+ assert(allFloatingPoint);
+ return builder.create<math::PowFOp>(arg0.getLoc(), arg0, arg1);
}
llvm_unreachable("unsupported binary function");
}
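
For reference, the `reciprocal` case emits a constant-one divide rather than a dedicated math op; in isolation:

```cpp
// Mirrors the UnaryFn::reciprocal case above: reciprocal(x) == 1.0 / x.
mlir::Value emitReciprocal(mlir::OpBuilder &b, mlir::Location loc,
                           mlir::Value x) {
  mlir::Attribute oneAttr = b.getOneAttr(x.getType());
  auto one = b.create<mlir::arith::ConstantOp>(
      loc, mlir::cast<mlir::TypedAttr>(oneAttr));
  return b.create<mlir::arith::DivFOp>(loc, one, x);
}
```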
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
index e836f0dc63b4..ef9a30be9a01 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
@@ -1499,11 +1499,11 @@ vectorizeAsTensorPackOp(RewriterBase &rewriter, tensor::PackOp packOp,
// If the input vector sizes are not provided, then the vector sizes are
// determined by the result tensor shape. In case the vector sizes aren't
// provided, we update the inBounds attribute instead of masking.
- bool useInBoundsInsteadOfMasking = true;
+ bool useInBoundsInsteadOfMasking = false;
if (inputVectorSizes.empty()) {
ArrayRef<int64_t> resultTensorShape = packOp.getDestType().getShape();
inputVectorSizes = resultTensorShape.take_front(packOp.getSourceRank());
- useInBoundsInsteadOfMasking = false;
+ useInBoundsInsteadOfMasking = true;
}
// Create masked TransferReadOp.
@@ -1612,7 +1612,8 @@ vectorizeAsTensorUnpackOp(RewriterBase &rewriter, tensor::UnPackOp unpackOp,
// to shape of source, then a mask is necessary.
Value readResult = vector::createReadOrMaskedRead(
rewriter, loc, unpackOp.getSource(),
- ArrayRef<int64_t>(readMaskShape.begin(), readMaskShape.end()), padValue);
+ ArrayRef<int64_t>(readMaskShape.begin(), readMaskShape.end()), padValue,
+ /*useInBoundsInsteadOfMasking=*/false);
PackingMetadata packMetadata;
SmallVector<int64_t> lastDimToInsertPosPerm =
@@ -1669,7 +1670,8 @@ vectorizeAsTensorPadOp(RewriterBase &rewriter, tensor::PadOp padOp,
(void)status; // prevent unused variable warning on non-assert builds
assert(succeeded(status) && "failed to reify result shapes");
auto maskedRead = vector::createReadOrMaskedRead(
- rewriter, loc, padOp.getSource(), inputVectorSizes, padValue);
+ rewriter, loc, padOp.getSource(), inputVectorSizes, padValue,
+ /*useInBoundsInsteadOfMasking=*/false);
Operation *write = createWriteOrMaskedWrite(
rewriter, loc, maskedRead, reifiedReturnShapes[0], inputVectorSizes);
newResults.push_back(write->getResult(0));
diff --git a/mlir/lib/Dialect/MemRef/Transforms/EmulateNarrowType.cpp b/mlir/lib/Dialect/MemRef/Transforms/EmulateNarrowType.cpp
index 4449733f0daf..77c108aab480 100644
--- a/mlir/lib/Dialect/MemRef/Transforms/EmulateNarrowType.cpp
+++ b/mlir/lib/Dialect/MemRef/Transforms/EmulateNarrowType.cpp
@@ -13,7 +13,6 @@
#include "mlir/Dialect/Arith/Transforms/Passes.h"
#include "mlir/Dialect/Arith/Utils/Utils.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
-#include "mlir/Dialect/MemRef/Transforms/Passes.h"
#include "mlir/Dialect/MemRef/Transforms/Transforms.h"
#include "mlir/Dialect/MemRef/Utils/MemRefUtils.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
@@ -24,7 +23,6 @@
#include "mlir/Support/MathExtras.h"
#include "mlir/Transforms/DialectConversion.h"
#include "llvm/Support/FormatVariadic.h"
-#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <type_traits>
@@ -430,6 +428,33 @@ struct ConvertMemRefSubview final : OpConversionPattern<memref::SubViewOp> {
}
};
+//===----------------------------------------------------------------------===//
+// ConvertMemRefCollapseShape
+//===----------------------------------------------------------------------===//
+
+/// A `memref.collapse_shape` becomes a no-op after narrow-type emulation:
+/// the emulation flattens memrefs to a single dimension, so there is no
+/// dimension left to collapse.
+struct ConvertMemRefCollapseShape final
+ : OpConversionPattern<memref::CollapseShapeOp> {
+ using OpConversionPattern::OpConversionPattern;
+
+ LogicalResult
+ matchAndRewrite(memref::CollapseShapeOp collapseShapeOp, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override {
+ Value srcVal = adaptor.getSrc();
+ auto newTy = dyn_cast<MemRefType>(srcVal.getType());
+ if (!newTy)
+ return failure();
+
+ if (newTy.getRank() != 1)
+ return failure();
+
+ rewriter.replaceOp(collapseShapeOp, srcVal);
+ return success();
+ }
+};
+
} // end anonymous namespace
//===----------------------------------------------------------------------===//
@@ -442,7 +467,8 @@ void memref::populateMemRefNarrowTypeEmulationPatterns(
// Populate `memref.*` conversion patterns.
patterns.add<ConvertMemRefAllocation<memref::AllocOp>,
- ConvertMemRefAllocation<memref::AllocaOp>, ConvertMemRefLoad,
+ ConvertMemRefAllocation<memref::AllocaOp>,
+ ConvertMemRefCollapseShape, ConvertMemRefLoad,
ConvertMemrefStore, ConvertMemRefAssumeAlignment,
ConvertMemRefSubview, ConvertMemRefReinterpretCast>(
typeConverter, patterns.getContext());
diff --git a/mlir/lib/Dialect/MemRef/Transforms/ExpandStridedMetadata.cpp b/mlir/lib/Dialect/MemRef/Transforms/ExpandStridedMetadata.cpp
index 96eb7cfd2db6..585c5b738142 100644
--- a/mlir/lib/Dialect/MemRef/Transforms/ExpandStridedMetadata.cpp
+++ b/mlir/lib/Dialect/MemRef/Transforms/ExpandStridedMetadata.cpp
@@ -550,6 +550,89 @@ getCollapsedStride(memref::CollapseShapeOp collapseShape, OpBuilder &builder,
return {makeComposedFoldedAffineMin(builder, collapseShape.getLoc(), minMap,
groupStrides)};
}
+
+/// From `reshape_like(memref, subSizes, subStrides))` compute
+///
+/// \verbatim
+/// baseBuffer, baseOffset, baseSizes, baseStrides =
+/// extract_strided_metadata(memref)
+/// strides#i = baseStrides#i * subStrides#i
+/// sizes = subSizes
+/// \endverbatim
+///
+/// and return {baseBuffer, baseOffset, sizes, strides}
+template <typename ReassociativeReshapeLikeOp>
+static FailureOr<StridedMetadata> resolveReshapeStridedMetadata(
+ RewriterBase &rewriter, ReassociativeReshapeLikeOp reshape,
+ function_ref<SmallVector<OpFoldResult>(
+ ReassociativeReshapeLikeOp, OpBuilder &,
+ ArrayRef<OpFoldResult> /*origSizes*/, unsigned /*groupId*/)>
+ getReshapedSizes,
+ function_ref<SmallVector<OpFoldResult>(
+ ReassociativeReshapeLikeOp, OpBuilder &,
+ ArrayRef<OpFoldResult> /*origSizes*/,
+ ArrayRef<OpFoldResult> /*origStrides*/, unsigned /*groupId*/)>
+ getReshapedStrides) {
+ // Build a plain extract_strided_metadata(memref) from
+ // extract_strided_metadata(reassociative_reshape_like(memref)).
+ Location origLoc = reshape.getLoc();
+ Value source = reshape.getSrc();
+ auto sourceType = cast<MemRefType>(source.getType());
+ unsigned sourceRank = sourceType.getRank();
+
+ auto newExtractStridedMetadata =
+ rewriter.create<memref::ExtractStridedMetadataOp>(origLoc, source);
+
+ // Collect statically known information.
+ auto [strides, offset] = getStridesAndOffset(sourceType);
+ MemRefType reshapeType = reshape.getResultType();
+ unsigned reshapeRank = reshapeType.getRank();
+
+ OpFoldResult offsetOfr =
+ ShapedType::isDynamic(offset)
+ ? getAsOpFoldResult(newExtractStridedMetadata.getOffset())
+ : rewriter.getIndexAttr(offset);
+
+ // Get the special case of 0-D out of the way.
+ if (sourceRank == 0) {
+ SmallVector<OpFoldResult> ones(reshapeRank, rewriter.getIndexAttr(1));
+ return StridedMetadata{newExtractStridedMetadata.getBaseBuffer(), offsetOfr,
+ /*sizes=*/ones, /*strides=*/ones};
+ }
+
+ SmallVector<OpFoldResult> finalSizes;
+ finalSizes.reserve(reshapeRank);
+ SmallVector<OpFoldResult> finalStrides;
+ finalStrides.reserve(reshapeRank);
+
+ // Compute the reshaped strides and sizes from the base strides and sizes.
+ SmallVector<OpFoldResult> origSizes =
+ getAsOpFoldResult(newExtractStridedMetadata.getSizes());
+ SmallVector<OpFoldResult> origStrides =
+ getAsOpFoldResult(newExtractStridedMetadata.getStrides());
+ unsigned idx = 0, endIdx = reshape.getReassociationIndices().size();
+ for (; idx != endIdx; ++idx) {
+ SmallVector<OpFoldResult> reshapedSizes =
+ getReshapedSizes(reshape, rewriter, origSizes, /*groupId=*/idx);
+ SmallVector<OpFoldResult> reshapedStrides = getReshapedStrides(
+ reshape, rewriter, origSizes, origStrides, /*groupId=*/idx);
+
+ unsigned groupSize = reshapedSizes.size();
+ for (unsigned i = 0; i < groupSize; ++i) {
+ finalSizes.push_back(reshapedSizes[i]);
+ finalStrides.push_back(reshapedStrides[i]);
+ }
+ }
+ assert(((isa<memref::ExpandShapeOp>(reshape) && idx == sourceRank) ||
+ (isa<memref::CollapseShapeOp>(reshape) && idx == reshapeRank)) &&
+ "We should have visited all the input dimensions");
+ assert(finalSizes.size() == reshapeRank &&
+ "We should have populated all the values");
+
+ return StridedMetadata{newExtractStridedMetadata.getBaseBuffer(), offsetOfr,
+ finalSizes, finalStrides};
+}
+
/// Replace `baseBuffer, offset, sizes, strides =
/// extract_strided_metadata(reshapeLike(memref))`
/// With
@@ -580,68 +663,65 @@ public:
LogicalResult matchAndRewrite(ReassociativeReshapeLikeOp reshape,
PatternRewriter &rewriter) const override {
- // Build a plain extract_strided_metadata(memref) from
- // extract_strided_metadata(reassociative_reshape_like(memref)).
- Location origLoc = reshape.getLoc();
- Value source = reshape.getSrc();
- auto sourceType = cast<MemRefType>(source.getType());
- unsigned sourceRank = sourceType.getRank();
-
- auto newExtractStridedMetadata =
- rewriter.create<memref::ExtractStridedMetadataOp>(origLoc, source);
-
- // Collect statically known information.
- auto [strides, offset] = getStridesAndOffset(sourceType);
- MemRefType reshapeType = reshape.getResultType();
- unsigned reshapeRank = reshapeType.getRank();
-
- OpFoldResult offsetOfr =
- ShapedType::isDynamic(offset)
- ? getAsOpFoldResult(newExtractStridedMetadata.getOffset())
- : rewriter.getIndexAttr(offset);
-
- // Get the special case of 0-D out of the way.
- if (sourceRank == 0) {
- SmallVector<OpFoldResult> ones(reshapeRank, rewriter.getIndexAttr(1));
- auto memrefDesc = rewriter.create<memref::ReinterpretCastOp>(
- origLoc, reshapeType, newExtractStridedMetadata.getBaseBuffer(),
- offsetOfr, /*sizes=*/ones, /*strides=*/ones);
- rewriter.replaceOp(reshape, memrefDesc.getResult());
- return success();
+ FailureOr<StridedMetadata> stridedMetadata =
+ resolveReshapeStridedMetadata<ReassociativeReshapeLikeOp>(
+ rewriter, reshape, getReshapedSizes, getReshapedStrides);
+ if (failed(stridedMetadata)) {
+ return rewriter.notifyMatchFailure(reshape,
+ "failed to resolve reshape metadata");
}
- SmallVector<OpFoldResult> finalSizes;
- finalSizes.reserve(reshapeRank);
- SmallVector<OpFoldResult> finalStrides;
- finalStrides.reserve(reshapeRank);
-
- // Compute the reshaped strides and sizes from the base strides and sizes.
- SmallVector<OpFoldResult> origSizes =
- getAsOpFoldResult(newExtractStridedMetadata.getSizes());
- SmallVector<OpFoldResult> origStrides =
- getAsOpFoldResult(newExtractStridedMetadata.getStrides());
- unsigned idx = 0, endIdx = reshape.getReassociationIndices().size();
- for (; idx != endIdx; ++idx) {
- SmallVector<OpFoldResult> reshapedSizes =
- getReshapedSizes(reshape, rewriter, origSizes, /*groupId=*/idx);
- SmallVector<OpFoldResult> reshapedStrides = getReshapedStrides(
- reshape, rewriter, origSizes, origStrides, /*groupId=*/idx);
-
- unsigned groupSize = reshapedSizes.size();
- for (unsigned i = 0; i < groupSize; ++i) {
- finalSizes.push_back(reshapedSizes[i]);
- finalStrides.push_back(reshapedStrides[i]);
- }
+ rewriter.replaceOpWithNewOp<memref::ReinterpretCastOp>(
+ reshape, reshape.getType(), stridedMetadata->basePtr,
+ stridedMetadata->offset, stridedMetadata->sizes,
+ stridedMetadata->strides);
+ return success();
+ }
+};
+
+/// Pattern to replace `extract_strided_metadata(collapse_shape)`
+/// With
+///
+/// \verbatim
+/// baseBuffer, baseOffset, baseSizes, baseStrides =
+/// extract_strided_metadata(memref)
+/// sizes#i = product(baseSizes#j, for each j in reassociation group #i)
+/// strides#i = min(baseStrides#j, for each j in reassociation group #i)
+/// offset = baseOffset
+/// \endverbatim
+///
+/// with `baseBuffer`, `offset`, `sizes` and `strides` being
+/// the replacements for the original `extract_strided_metadata`.
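+///
+/// For example (an illustrative sketch, assuming a contiguous source and
+/// with names invented): collapsing `memref<2x3xf32>` to `memref<6xf32>`
+/// resolves to sizes = [2 * 3] = [6], strides = [min(3, 1)] = [1], and an
+/// unchanged offset.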
+struct ExtractStridedMetadataOpCollapseShapeFolder
+ : OpRewritePattern<memref::ExtractStridedMetadataOp> {
+ using OpRewritePattern::OpRewritePattern;
+
+ LogicalResult matchAndRewrite(memref::ExtractStridedMetadataOp op,
+ PatternRewriter &rewriter) const override {
+ auto collapseShapeOp =
+ op.getSource().getDefiningOp<memref::CollapseShapeOp>();
+ if (!collapseShapeOp)
+ return failure();
+
+ FailureOr<StridedMetadata> stridedMetadata =
+ resolveReshapeStridedMetadata<memref::CollapseShapeOp>(
+ rewriter, collapseShapeOp, getCollapsedSize, getCollapsedStride);
+ if (failed(stridedMetadata)) {
+ return rewriter.notifyMatchFailure(
+ op,
+ "failed to resolve metadata in terms of source collapse_shape op");
}
- assert(((isa<memref::ExpandShapeOp>(reshape) && idx == sourceRank) ||
- (isa<memref::CollapseShapeOp>(reshape) && idx == reshapeRank)) &&
- "We should have visited all the input dimensions");
- assert(finalSizes.size() == reshapeRank &&
- "We should have populated all the values");
- auto memrefDesc = rewriter.create<memref::ReinterpretCastOp>(
- origLoc, reshapeType, newExtractStridedMetadata.getBaseBuffer(),
- offsetOfr, finalSizes, finalStrides);
- rewriter.replaceOp(reshape, memrefDesc.getResult());
+
+ Location loc = collapseShapeOp.getLoc();
+ SmallVector<Value> results;
+ results.push_back(stridedMetadata->basePtr);
+ results.push_back(getValueOrCreateConstantIndexOp(rewriter, loc,
+ stridedMetadata->offset));
+ results.append(
+ getValueOrCreateConstantIndexOp(rewriter, loc, stridedMetadata->sizes));
+ results.append(getValueOrCreateConstantIndexOp(rewriter, loc,
+ stridedMetadata->strides));
+ rewriter.replaceOp(op, results);
return success();
}
};
@@ -1018,9 +1098,11 @@ void memref::populateExpandStridedMetadataPatterns(
getCollapsedStride>,
ExtractStridedMetadataOpAllocFolder<memref::AllocOp>,
ExtractStridedMetadataOpAllocFolder<memref::AllocaOp>,
+ ExtractStridedMetadataOpCollapseShapeFolder,
ExtractStridedMetadataOpGetGlobalFolder,
RewriteExtractAlignedPointerAsIndexOfViewLikeOp,
ExtractStridedMetadataOpReinterpretCastFolder,
+ ExtractStridedMetadataOpSubviewFolder,
ExtractStridedMetadataOpCastFolder,
ExtractStridedMetadataOpExtractStridedMetadataFolder>(
patterns.getContext());
@@ -1030,6 +1112,7 @@ void memref::populateResolveExtractStridedMetadataPatterns(
RewritePatternSet &patterns) {
patterns.add<ExtractStridedMetadataOpAllocFolder<memref::AllocOp>,
ExtractStridedMetadataOpAllocFolder<memref::AllocaOp>,
+ ExtractStridedMetadataOpCollapseShapeFolder,
ExtractStridedMetadataOpGetGlobalFolder,
ExtractStridedMetadataOpSubviewFolder,
RewriteExtractAlignedPointerAsIndexOfViewLikeOp,
diff --git a/mlir/lib/Dialect/Polynomial/IR/CMakeLists.txt b/mlir/lib/Dialect/Polynomial/IR/CMakeLists.txt
index 7f5b3255d5d9..d6e703b8b359 100644
--- a/mlir/lib/Dialect/Polynomial/IR/CMakeLists.txt
+++ b/mlir/lib/Dialect/Polynomial/IR/CMakeLists.txt
@@ -16,4 +16,5 @@ add_mlir_dialect_library(MLIRPolynomialDialect
MLIRSupport
MLIRDialect
MLIRIR
+ MLIRInferTypeOpInterface
)
diff --git a/mlir/lib/Dialect/Polynomial/IR/PolynomialAttributes.cpp b/mlir/lib/Dialect/Polynomial/IR/PolynomialAttributes.cpp
index ee09c73bb3c4..f1ec2be72a33 100644
--- a/mlir/lib/Dialect/Polynomial/IR/PolynomialAttributes.cpp
+++ b/mlir/lib/Dialect/Polynomial/IR/PolynomialAttributes.cpp
@@ -172,7 +172,7 @@ Attribute RingAttr::parse(AsmParser &parser, Type type) {
if (failed(parser.parseEqual()))
return {};
- IntegerType iType = ty.dyn_cast<IntegerType>();
+ IntegerType iType = mlir::dyn_cast<IntegerType>(ty);
if (!iType) {
parser.emitError(parser.getCurrentLocation(),
"coefficientType must specify an integer type");
diff --git a/mlir/lib/Dialect/Polynomial/IR/PolynomialDialect.cpp b/mlir/lib/Dialect/Polynomial/IR/PolynomialDialect.cpp
index a672a59b8a46..825b80d70f80 100644
--- a/mlir/lib/Dialect/Polynomial/IR/PolynomialDialect.cpp
+++ b/mlir/lib/Dialect/Polynomial/IR/PolynomialDialect.cpp
@@ -8,9 +8,18 @@
#include "mlir/Dialect/Polynomial/IR/Polynomial.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/Polynomial/IR/PolynomialAttributes.h"
#include "mlir/Dialect/Polynomial/IR/PolynomialOps.h"
#include "mlir/Dialect/Polynomial/IR/PolynomialTypes.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/Dialect.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/Interfaces/InferTypeOpInterface.h"
+#include "mlir/Support/LogicalResult.h"
+#include "llvm/ADT/APInt.h"
#include "llvm/ADT/TypeSwitch.h"
using namespace mlir;
diff --git a/mlir/lib/Dialect/Polynomial/IR/PolynomialOps.cpp b/mlir/lib/Dialect/Polynomial/IR/PolynomialOps.cpp
index 96c59a28b8fd..8e2bb5f27dc6 100644
--- a/mlir/lib/Dialect/Polynomial/IR/PolynomialOps.cpp
+++ b/mlir/lib/Dialect/Polynomial/IR/PolynomialOps.cpp
@@ -6,10 +6,101 @@
//
//===----------------------------------------------------------------------===//
+#include "mlir/Dialect/Polynomial/IR/PolynomialOps.h"
#include "mlir/Dialect/Polynomial/IR/Polynomial.h"
+#include "mlir/Dialect/Polynomial/IR/PolynomialAttributes.h"
+#include "mlir/Dialect/Polynomial/IR/PolynomialTypes.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/Dialect.h"
+#include "mlir/Support/LogicalResult.h"
+#include "llvm/ADT/APInt.h"
using namespace mlir;
using namespace mlir::polynomial;
-#define GET_OP_CLASSES
-#include "mlir/Dialect/Polynomial/IR/Polynomial.cpp.inc"
+void FromTensorOp::build(OpBuilder &builder, OperationState &result,
+ Value input, RingAttr ring) {
+ TensorType tensorType = dyn_cast<TensorType>(input.getType());
+ auto bitWidth = tensorType.getElementTypeBitWidth();
+ APInt cmod(1 + bitWidth, 1);
+ cmod = cmod << bitWidth;
+ Type resultType = PolynomialType::get(builder.getContext(), ring);
+ build(builder, result, resultType, input);
+}
+
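+/// Verifier sketch, mirroring the checks below: the input must be a rank-1
+/// tensor with at most degree(polynomialModulus) elements, and its element
+/// bit width must fit within ceil(log2(coefficientModulus)).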
+LogicalResult FromTensorOp::verify() {
+ ArrayRef<int64_t> tensorShape = getInput().getType().getShape();
+ RingAttr ring = getOutput().getType().getRing();
+ unsigned polyDegree = ring.getPolynomialModulus().getPolynomial().getDegree();
+ bool compatible = tensorShape.size() == 1 && tensorShape[0] <= polyDegree;
+ if (!compatible) {
+ InFlightDiagnostic diag = emitOpError()
+ << "input type " << getInput().getType()
+ << " does not match output type "
+ << getOutput().getType();
+ diag.attachNote() << "the input type must be a tensor of shape [d] where d "
+ "is at most the degree of the polynomialModulus of "
+ "the output type's ring attribute";
+ return diag;
+ }
+
+ APInt coefficientModulus = ring.getCoefficientModulus().getValue();
+ unsigned cmodBitWidth = coefficientModulus.ceilLogBase2();
+ unsigned inputBitWidth = getInput().getType().getElementTypeBitWidth();
+
+ if (inputBitWidth > cmodBitWidth) {
+ InFlightDiagnostic diag = emitOpError()
+ << "input tensor element type "
+ << getInput().getType().getElementType()
+ << " is too large to fit in the coefficients of "
+ << getOutput().getType();
+ diag.attachNote() << "the input tensor's elements must be rescaled"
+ " to fit before using from_tensor";
+ return diag;
+ }
+
+ return success();
+}
+
+LogicalResult ToTensorOp::verify() {
+ ArrayRef<int64_t> tensorShape = getOutput().getType().getShape();
+ unsigned polyDegree = getInput()
+ .getType()
+ .getRing()
+ .getPolynomialModulus()
+ .getPolynomial()
+ .getDegree();
+ bool compatible = tensorShape.size() == 1 && tensorShape[0] == polyDegree;
+
+ if (compatible)
+ return success();
+
+ InFlightDiagnostic diag =
+ emitOpError() << "input type " << getInput().getType()
+ << " does not match output type " << getOutput().getType();
+ diag.attachNote() << "the output type must be a tensor of shape [d] where d "
+                       "is exactly the degree of the polynomialModulus of "
+ "the input type's ring attribute";
+ return diag;
+}
+
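+/// Verifier sketch: the scalar operand must have exactly the coefficient
+/// type of the ring of the (possibly shaped) polynomial operand, e.g. a
+/// polynomial over i32 coefficients may only be scaled by an i32 value.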
+LogicalResult MulScalarOp::verify() {
+ Type argType = getPolynomial().getType();
+ PolynomialType polyType;
+
+ if (auto shapedPolyType = dyn_cast<ShapedType>(argType)) {
+ polyType = cast<PolynomialType>(shapedPolyType.getElementType());
+ } else {
+ polyType = cast<PolynomialType>(argType);
+ }
+
+ Type coefficientType = polyType.getRing().getCoefficientType();
+
+ if (coefficientType != getScalar().getType())
+ return emitOpError() << "polynomial coefficient type " << coefficientType
+ << " does not match scalar type "
+ << getScalar().getType();
+
+ return success();
+}
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp
index 5a39dfc62077..9a8c6422a7ff 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp
@@ -289,6 +289,37 @@ struct FuseExtractSliceWithConcat
}
};
+/// Rewriting rule that fuses sparse_tensor.convert into producer.
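+///
+/// A sketch of the intended fold (types abbreviated, names invented):
+///   %0 = linalg.generic ... outs(%init) -> tensor<?x?xf64, #FROM>
+///   %1 = sparse_tensor.convert %0 : ... to tensor<?x?xf64, #TO>
+/// becomes the same generic retyped to yield tensor<?x?xf64, #TO> directly,
+/// provided the generic materializes a single init and %0 has one use.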
+struct FoldConvertIntoProducer : public OpRewritePattern<ConvertOp> {
+public:
+ using OpRewritePattern::OpRewritePattern;
+
+ LogicalResult matchAndRewrite(ConvertOp op,
+ PatternRewriter &rewriter) const override {
+ auto producer = op.getSource().getDefiningOp<GenericOp>();
+ if (!producer || producer.getDpsInits().size() != 1 ||
+ !isMaterializing(producer.getDpsInitOperand(0), false) ||
+ !producer.getResult(0).hasOneUse()) {
+ return failure();
+ }
+ rewriter.modifyOpInPlace(producer, [&]() {
+ producer.getResult(0).setType(op.getResult().getType());
+ });
+
+ Operation *materializeOp =
+ producer.getDpsInitOperand(0)->get().getDefiningOp();
+
+ rewriter.modifyOpInPlace(materializeOp, [&]() {
+ materializeOp->getResult(0).setType(op.getResult().getType());
+ });
+
+ rewriter.replaceAllOpUsesWith(op, producer);
+ op->erase();
+
+ return success();
+ }
+};
+
/// Rewriting rule that converts direct yield of zero with initial allocation.
struct FoldInvariantYield : public OpRewritePattern<GenericOp> {
public:
@@ -1506,9 +1537,10 @@ struct OutRewriter : public OpRewritePattern<OutOp> {
//===---------------------------------------------------------------------===//
void mlir::populatePreSparsificationRewriting(RewritePatternSet &patterns) {
- patterns.add<FuseExtractSliceWithConcat, FoldInvariantYield,
- FuseSparseMultiplyOverAdd, FuseTensorCast, GenSemiRingReduction,
- GenSemiRingSelect, PrintRewriter>(patterns.getContext());
+ patterns.add<FuseExtractSliceWithConcat, FoldConvertIntoProducer,
+ FoldInvariantYield, FuseSparseMultiplyOverAdd, FuseTensorCast,
+ GenSemiRingReduction, GenSemiRingSelect, PrintRewriter>(
+ patterns.getContext());
}
void mlir::populateLowerSparseOpsToForeachPatterns(RewritePatternSet &patterns,
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
index cd046b670d9a..0a9bb40b458d 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
@@ -403,6 +403,22 @@ static Value genInsertionLoadReduce(CodegenEnv &env, OpBuilder &builder,
return builder.create<arith::SelectOp>(loc, isFilled, valAtIndex, identity);
}
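+
+/// Wraps a `tensor.insert` in an `scf.if` so the insertion happens only when
+/// `cond` holds. Illustrative shape of the generated IR (names invented):
+///   %r = scf.if %cond -> (type of %sparseOut) {
+///     %ins = tensor.insert %v into %sparseOut[%ivs]
+///     scf.yield %ins
+///   } else {
+///     scf.yield %sparseOut
+///   }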
+static Value genConditionalInsert(Location loc, OpBuilder &builder, Value cond,
+ Value sparseOut, ValueRange ivs, Value v) {
+ scf::IfOp condInsert =
+ builder.create<scf::IfOp>(loc, sparseOut.getType(), cond, true);
+ // True branch.
+ builder.setInsertionPointToStart(condInsert.thenBlock());
+ Value res = builder.create<tensor::InsertOp>(loc, v, sparseOut, ivs);
+ builder.create<scf::YieldOp>(loc, res);
+ // False branch.
+ builder.setInsertionPointToStart(condInsert.elseBlock());
+ builder.create<scf::YieldOp>(loc, sparseOut);
+ // Value assignment.
+ builder.setInsertionPointAfter(condInsert);
+ return condInsert.getResult(0);
+}
+
/// Generates insertion code to implement dynamic tensor store.
static void genInsertionStore(CodegenEnv &env, OpBuilder &builder, OpOperand *t,
Value rhs) {
@@ -423,23 +439,21 @@ static void genInsertionStore(CodegenEnv &env, OpBuilder &builder, OpOperand *t,
// return updated chain
// else
// return unmodified chain
- scf::IfOp ifValidLexInsert = builder.create<scf::IfOp>(
- loc, chain.getType(), env.getValidLexInsert(),
- /*else=*/true);
- // True branch.
- builder.setInsertionPointToStart(ifValidLexInsert.thenBlock());
- Value res = builder.create<tensor::InsertOp>(loc, rhs, chain, ivs);
- builder.create<scf::YieldOp>(loc, res);
- // False branch.
- builder.setInsertionPointToStart(ifValidLexInsert.elseBlock());
- builder.create<scf::YieldOp>(loc, chain);
- // Value assignment.
- builder.setInsertionPointAfter(ifValidLexInsert);
- env.updateInsertionChain(ifValidLexInsert.getResult(0));
+ Value out = genConditionalInsert(loc, builder, env.getValidLexInsert(),
+ chain, ivs, rhs);
+ env.updateInsertionChain(out);
} else {
+ Value sparseOut;
+ if (!hasAnySparseType(env.op().getInputs().getTypes())) {
+ // This is an all-dense -> sparse kernel, test rhs != 0 before
+ // insertion.
+ Value nz = genIsNonzero(builder, loc, rhs);
+ sparseOut = genConditionalInsert(loc, builder, nz, chain, ivs, rhs);
+ } else {
+ sparseOut = builder.create<tensor::InsertOp>(loc, rhs, chain, ivs);
+ }
// Generates regular insertion chain.
- env.updateInsertionChain(
- builder.create<tensor::InsertOp>(loc, rhs, chain, ivs));
+ env.updateInsertionChain(sparseOut);
}
return;
}
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.h b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.h
index 59c3e49264db..34312df91299 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.h
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.h
@@ -222,7 +222,7 @@ public:
///
SmallVector<Value> getValPosits(TensorId tid) const {
SmallVector<Value> batchCrds = iters[tid].back().back()->getBatchCrds();
- Value lastLvlPos = iters[tid].back().back()->getCurPosition().first;
+ Value lastLvlPos = iters[tid].back().back()->getCurPosition().front();
batchCrds.push_back(lastLvlPos);
return batchCrds;
};
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorIterator.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorIterator.cpp
index 60dca3c55dec..745c081247de 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorIterator.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorIterator.cpp
@@ -94,8 +94,10 @@ public:
ValueRange getLvlBuffers() const override { return {}; }
- ValuePair peekRangeAt(OpBuilder &b, Location l, ValueRange, Value p,
- Value max) const override {
+ ValuePair peekRangeAt(OpBuilder &b, Location l, ValueRange batchPrefix,
+ ValueRange parentPos) const override {
+    assert(parentPos.size() == 1 && "Dense level cannot be non-unique.");
+ Value p = parentPos.front();
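+    // Dense levels are linearized: parent position `p` selects the slab of
+    // positions starting at `p * lvlSize`, with the level size as the bound.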
Value posLo = MULI(p, lvlSize);
return {posLo, lvlSize};
}
@@ -112,9 +114,9 @@ public:
ValueRange getLvlBuffers() const override { return {}; }
- ValuePair peekRangeAt(OpBuilder &b, Location l, ValueRange, Value p,
- Value max) const override {
- assert(max == nullptr && "Dense level can not be non-unique.");
+ ValuePair peekRangeAt(OpBuilder &b, Location l, ValueRange,
+ ValueRange parentPos) const override {
+    assert(parentPos.size() == 1 && "Dense level cannot be non-unique.");
// No need to linearize the position for non-annotated tensors.
return {C_IDX(0), lvlSize};
}
@@ -127,9 +129,11 @@ public:
: SparseLevel(tid, lvl, lt, lvlSize, {posBuffer, crdBuffer}) {}
ValuePair peekRangeAt(OpBuilder &b, Location l, ValueRange batchPrefix,
- Value p, Value max) const override {
- assert(max == nullptr &&
+ ValueRange parentPos) const override {
+ assert(parentPos.size() == 1 &&
"compressed level must be the first non-unique level.");
+ Value p = parentPos.front();
SmallVector<Value> memCrd(batchPrefix);
memCrd.push_back(p);
@@ -147,11 +151,11 @@ public:
: SparseLevel(tid, lvl, lt, lvlSize, {posBuffer, crdBuffer}) {}
ValuePair peekRangeAt(OpBuilder &b, Location l, ValueRange batchPrefix,
- Value p, Value max) const override {
- assert(max == nullptr &&
+ ValueRange parentPos) const override {
+ assert(parentPos.size() == 1 &&
"loose-compressed level must be the first non-unique level.");
SmallVector<Value> memCrd(batchPrefix);
-
+ Value p = parentPos.front();
p = MULI(p, C_IDX(2));
memCrd.push_back(p);
Value pLo = genIndexLoad(b, l, getPosBuf(), memCrd);
@@ -168,10 +172,13 @@ public:
: SparseLevel(tid, lvl, lt, lvlSize, {crdBuffer}) {}
ValuePair peekRangeAt(OpBuilder &b, Location l, ValueRange batchPrefix,
- Value p, Value segHi) const override {
+ ValueRange parentPos) const override {
+ assert(parentPos.size() == 1 || parentPos.size() == 2);
+ Value p = parentPos.front();
+ Value segHi = parentPos.size() == 2 ? parentPos.back() : nullptr;
+
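+    // Without a segment high from a preceding non-unique level, a singleton
+    // level covers exactly one coordinate, i.e. the range [p, p + 1).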
if (segHi == nullptr)
return {p, ADDI(p, C_IDX(1))};
-
// Use the segHi as the loop upper bound.
return {p, segHi};
}
@@ -184,11 +191,12 @@ public:
: SparseLevel(tid, lvl, lt, lvlSize, {crdBuffer}) {}
ValuePair peekRangeAt(OpBuilder &b, Location l, ValueRange batchPrefix,
- Value p, Value max) const override {
- assert(max == nullptr && isUnique() && "n:m level can not be non-unique.");
+ ValueRange parentPos) const override {
+ assert(parentPos.size() == 1 && isUnique() &&
+           "n:m level cannot be non-unique.");
// Each n:m blk has exactly n specified elements.
auto n = getN(lt);
- Value posLo = MULI(p, C_IDX(n));
+ Value posLo = MULI(parentPos.front(), C_IDX(n));
return {posLo, ADDI(posLo, C_IDX(n))};
}
};
@@ -316,23 +324,21 @@ public:
posHi = vs.back();
};
- ValuePair getCurPosition() const override { return {getItPos(), nullptr}; }
-
void genInitImpl(OpBuilder &b, Location l,
const SparseIterator *parent) override {
if (isBatchIterator() && batchCrds.size() <= stl.lvl)
batchCrds.resize(stl.lvl + 1, nullptr);
- Value pos = C_IDX(0);
- Value hi = nullptr;
+ Value c0 = C_IDX(0);
+ ValueRange pPos = c0;
// If the parent iterator is a batch iterator, we also start from 0 (but
// on a different batch).
if (parent && !parent->isBatchIterator())
- std::tie(pos, hi) = parent->getCurPosition();
+ pPos = parent->getCurPosition();
ValueRange batchPrefix = parent ? parent->getBatchCrds() : ValueRange{};
- std::tie(posLo, posHi) = stl.peekRangeAt(b, l, batchPrefix, pos, hi);
+ std::tie(posLo, posHi) = stl.peekRangeAt(b, l, batchPrefix, pPos);
// Seek to the lowest position.
seek(posLo);
}
@@ -406,21 +412,19 @@ public:
return {b.getIndexType(), b.getIndexType()};
}
- ValuePair getCurPosition() const override { return {getPos(), getSegHi()}; }
-
void genInitImpl(OpBuilder &b, Location l,
const SparseIterator *parent) override {
+ Value c0 = C_IDX(0);
+ ValueRange pPos = c0;
- Value pos = C_IDX(0);
- Value hi = nullptr;
// If the parent iterator is a batch iterator, we also start from 0 (but
// on a different batch).
if (parent && !parent->isBatchIterator())
- std::tie(pos, hi) = parent->getCurPosition();
+ pPos = parent->getCurPosition();
Value posLo;
ValueRange batchPrefix = parent ? parent->getBatchCrds() : ValueRange{};
- std::tie(posLo, posHi) = stl.peekRangeAt(b, l, batchPrefix, pos, hi);
+ std::tie(posLo, posHi) = stl.peekRangeAt(b, l, batchPrefix, pPos);
seek({posLo, genSegmentHigh(b, l, posLo)});
}
@@ -505,7 +509,7 @@ public:
SmallVector<Value> serialize() const override { return wrap->serialize(); };
void deserialize(ValueRange vs) override { wrap->deserialize(vs); };
- ValuePair getCurPosition() const override { return wrap->getCurPosition(); }
+ ValueRange getCurPosition() const override { return wrap->getCurPosition(); }
void genInitImpl(OpBuilder &b, Location l,
const SparseIterator *parent) override {
@@ -756,9 +760,8 @@ public:
Value upperBound(OpBuilder &b, Location l) const override {
return subSect.subSectSz;
}
- std::pair<Value, Value> getCurPosition() const override {
- return wrap->getCurPosition();
- };
+
+ ValueRange getCurPosition() const override { return wrap->getCurPosition(); };
Value getNxLvlTupleId(OpBuilder &b, Location l) const {
if (randomAccessible()) {
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorIterator.h b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorIterator.h
index 9d69a2335559..b692848ec67b 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorIterator.h
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorIterator.h
@@ -36,8 +36,9 @@ public:
Value iv) const = 0;
/// Peeks the lower and upper bound to *fully* traverse the level with
- /// the given position `p` that the immediate parent level is current at.
- /// Returns a pair of values for *posLo* and *loopHi* respectively.
+  /// the given position `parentPos`, see SparseTensorIterator::getCurPosition(),
+ /// that the immediate parent level is current at. Returns a pair of values
+ /// for *posLo* and *loopHi* respectively.
///
  /// For a dense level, the *posLo* is the linearized position at the beginning,
  /// while *loopHi* is the largest *coordinate*; it also implies that the
@@ -45,12 +46,9 @@ public:
///
/// For a sparse level, [posLo, loopHi) specifies the range of index pointer
/// to load coordinate from the coordinate buffer.
- ///
- /// `bound` is only used when the level is `non-unique` and deduplication is
- /// required. It specifies the max upper bound of the non-unique segment.
virtual std::pair<Value, Value> peekRangeAt(OpBuilder &b, Location l,
- ValueRange batchPrefix, Value p,
- Value segHi = Value()) const = 0;
+ ValueRange batchPrefix,
+ ValueRange parentPos) const = 0;
Level getLevel() const { return lvl; }
LevelType getLT() const { return lt; }
@@ -199,18 +197,17 @@ public:
}
virtual Value genNotEndImpl(OpBuilder &b, Location l) = 0;
virtual Value derefImpl(OpBuilder &b, Location l) = 0;
- // Gets the current position and the optional *position high* (for
- // non-unique iterators), the value is essentially the number of sparse
- // coordinate that the iterator is current visiting. It should be able to
- // uniquely identify the sparse range for the next level. See
- // SparseTensorLevel::peekRangeAt();
+  // Gets the ValueRange that together specifies the current position of the
+  // iterator. For a unique level, the position can be a single index pointing
+  // to the current coordinate being visited. For a non-unique level, an extra
+  // index for the `segment high` is needed to specify the range of duplicated
+  // coordinates. The ValueRange should be able to uniquely identify the
+  // sparse range for the next level. See SparseTensorLevel::peekRangeAt().
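+  // For example, a unique level's cursor is just {pos}, while a deduplicating
+  // iterator over a non-unique level yields {pos, segHi}.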
//
  // Not every type of iterator supports the operation, e.g., the non-empty
  // subsection iterator does not because it represents a range of coordinates
  // instead of just one.
- virtual std::pair<Value, Value> getCurPosition() const {
- llvm_unreachable("unsupported");
- };
+ virtual ValueRange getCurPosition() const { return getCursor(); };
// Returns a pair of values for *upper*, *lower* bound respectively.
virtual std::pair<Value, Value> genForCond(OpBuilder &b, Location l) {
@@ -284,9 +281,10 @@ private:
};
/// Helper function to create a TensorLevel object from given `tensor`.
-std::unique_ptr<SparseTensorLevel> makeSparseTensorLevel(OpBuilder &builder,
- Location loc, Value t,
- unsigned tid, Level l);
+std::unique_ptr<SparseTensorLevel> makeSparseTensorLevel(OpBuilder &b,
+ Location l, Value t,
+ unsigned tid,
+ Level lvl);
/// Helper function to create a simple SparseIterator object that iterate over
/// the SparseTensorLevel.
diff --git a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp
index 3ff41ab22fbc..5029ed4aa038 100644
--- a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp
+++ b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp
@@ -1609,6 +1609,9 @@ OpFoldResult ReshapeOp::fold(FoldAdaptor adaptor) {
cst.has_value() && cst.value() == static_cast<int64_t>(id);
continue;
}
+
+ dynamicNoop = false;
+ break;
}
if (dynamicNoop)
diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorLinearize.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorLinearize.cpp
index 69999f0918c1..802a64b0805e 100644
--- a/mlir/lib/Dialect/Vector/Transforms/VectorLinearize.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/VectorLinearize.cpp
@@ -140,7 +140,7 @@ struct LinearizeVectorExtractStridedSlice final
ConversionPatternRewriter &rewriter) const override {
Type dstType = getTypeConverter()->convertType(extractOp.getType());
assert(!(extractOp.getVector().getType().isScalable() ||
- dstType.cast<VectorType>().isScalable()) &&
+ cast<VectorType>(dstType).isScalable()) &&
"scalable vectors are not supported.");
if (!isLessThanTargetBitWidth(extractOp, targetVectorBitWidth))
return rewriter.notifyMatchFailure(
@@ -172,7 +172,7 @@ struct LinearizeVectorExtractStridedSlice final
// Get total number of extracted slices.
int64_t nExtractedSlices = 1;
for (Attribute size : sizes) {
- nExtractedSlices *= size.cast<IntegerAttr>().getInt();
+ nExtractedSlices *= cast<IntegerAttr>(size).getInt();
}
// Compute the strides of the source vector considering first k dimensions.
llvm::SmallVector<int64_t, 4> sourceStrides(kD, extractGranularitySize);
@@ -189,7 +189,7 @@ struct LinearizeVectorExtractStridedSlice final
// Compute extractedStrides.
for (int i = kD - 2; i >= 0; --i) {
extractedStrides[i] =
- extractedStrides[i + 1] * sizes[i + 1].cast<IntegerAttr>().getInt();
+ extractedStrides[i + 1] * cast<IntegerAttr>(sizes[i + 1]).getInt();
}
// Iterate over all extracted slices from 0 to nExtractedSlices - 1
// and compute the multi-dimensional index and the corresponding linearized
@@ -207,7 +207,7 @@ struct LinearizeVectorExtractStridedSlice final
int64_t linearizedIndex = 0;
for (int64_t j = 0; j < kD; ++j) {
linearizedIndex +=
- (offsets[j].cast<IntegerAttr>().getInt() + multiDimIndex[j]) *
+ (cast<IntegerAttr>(offsets[j]).getInt() + multiDimIndex[j]) *
sourceStrides[j];
}
// Fill the indices array form linearizedIndex to linearizedIndex +
@@ -254,7 +254,7 @@ struct LinearizeVectorShuffle final
Type dstType = getTypeConverter()->convertType(shuffleOp.getType());
assert(!(shuffleOp.getV1VectorType().isScalable() ||
shuffleOp.getV2VectorType().isScalable() ||
- dstType.cast<VectorType>().isScalable()) &&
+ cast<VectorType>(dstType).isScalable()) &&
"scalable vectors are not supported.");
if (!isLessThanTargetBitWidth(shuffleOp, targetVectorBitWidth))
return rewriter.notifyMatchFailure(
@@ -324,7 +324,7 @@ struct LinearizeVectorExtract final
ConversionPatternRewriter &rewriter) const override {
Type dstTy = getTypeConverter()->convertType(extractOp.getType());
assert(!(extractOp.getVector().getType().isScalable() ||
- dstTy.cast<VectorType>().isScalable()) &&
+ cast<VectorType>(dstTy).isScalable()) &&
"scalable vectors are not supported.");
if (!isLessThanTargetBitWidth(extractOp, targetVectorBitWidth))
return rewriter.notifyMatchFailure(
@@ -405,9 +405,7 @@ void mlir::vector::populateVectorLinearizeShuffleLikeOpsPatterns(
[=](vector::ShuffleOp shuffleOp) -> bool {
return isLessThanTargetBitWidth(shuffleOp, targetBitWidth)
? (typeConverter.isLegal(shuffleOp) &&
- shuffleOp.getResult()
- .getType()
- .cast<mlir::VectorType>()
+ cast<mlir::VectorType>(shuffleOp.getResult().getType())
.getRank() == 1)
: true;
});
diff --git a/mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp b/mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp
index fcaf1ec944b4..6727f3f46172 100644
--- a/mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp
+++ b/mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp
@@ -345,7 +345,7 @@ Value vector::createReadOrMaskedRead(OpBuilder &builder, Location loc,
int64_t readRank = readShape.size();
auto zero = builder.create<arith::ConstantIndexOp>(loc, 0);
SmallVector<bool> inBoundsVal(readRank, true);
- if (!useInBoundsInsteadOfMasking) {
+ if (useInBoundsInsteadOfMasking) {
// Update the inBounds attribute.
for (unsigned i = 0; i < readRank; i++)
inBoundsVal[i] = (sourceShape[i] == readShape[i]) &&
@@ -359,7 +359,7 @@ Value vector::createReadOrMaskedRead(OpBuilder &builder, Location loc,
/*padding=*/padValue,
/*inBounds=*/inBoundsVal);
- if (llvm::equal(readShape, sourceShape) || !useInBoundsInsteadOfMasking)
+ if (llvm::equal(readShape, sourceShape) || useInBoundsInsteadOfMasking)
return transferReadOp;
SmallVector<OpFoldResult> mixedSourceDims =
tensor::getMixedSizes(builder, loc, source);
diff --git a/mlir/lib/Tools/mlir-lsp-server/LSPServer.cpp b/mlir/lib/Tools/mlir-lsp-server/LSPServer.cpp
index bd7f2a5dedc2..0f23366f6fe8 100644
--- a/mlir/lib/Tools/mlir-lsp-server/LSPServer.cpp
+++ b/mlir/lib/Tools/mlir-lsp-server/LSPServer.cpp
@@ -91,7 +91,7 @@ struct LSPServer {
/// An outgoing notification used to send diagnostics to the client when they
/// are ready to be processed.
- OutgoingMessage<PublishDiagnosticsParams> publishDiagnostics;
+ OutgoingNotification<PublishDiagnosticsParams> publishDiagnostics;
/// Used to indicate that the 'shutdown' request was received from the
/// Language Server client.
diff --git a/mlir/lib/Tools/mlir-pdll-lsp-server/LSPServer.cpp b/mlir/lib/Tools/mlir-pdll-lsp-server/LSPServer.cpp
index ffaa1c8d4de4..f02372367e38 100644
--- a/mlir/lib/Tools/mlir-pdll-lsp-server/LSPServer.cpp
+++ b/mlir/lib/Tools/mlir-pdll-lsp-server/LSPServer.cpp
@@ -104,7 +104,7 @@ struct LSPServer {
/// An outgoing notification used to send diagnostics to the client when they
/// are ready to be processed.
- OutgoingMessage<PublishDiagnosticsParams> publishDiagnostics;
+ OutgoingNotification<PublishDiagnosticsParams> publishDiagnostics;
/// Used to indicate that the 'shutdown' request was received from the
/// Language Server client.
diff --git a/mlir/lib/Tools/tblgen-lsp-server/LSPServer.cpp b/mlir/lib/Tools/tblgen-lsp-server/LSPServer.cpp
index bc312d18ea40..b62f68db9d60 100644
--- a/mlir/lib/Tools/tblgen-lsp-server/LSPServer.cpp
+++ b/mlir/lib/Tools/tblgen-lsp-server/LSPServer.cpp
@@ -72,7 +72,7 @@ struct LSPServer {
/// An outgoing notification used to send diagnostics to the client when they
/// are ready to be processed.
- OutgoingMessage<PublishDiagnosticsParams> publishDiagnostics;
+ OutgoingNotification<PublishDiagnosticsParams> publishDiagnostics;
/// Used to indicate that the 'shutdown' request was received from the
/// Language Server client.
diff --git a/mlir/python/mlir/dialects/linalg/opdsl/lang/comprehension.py b/mlir/python/mlir/dialects/linalg/opdsl/lang/comprehension.py
index 23d6d26b7e29..bb43ebf2b692 100644
--- a/mlir/python/mlir/dialects/linalg/opdsl/lang/comprehension.py
+++ b/mlir/python/mlir/dialects/linalg/opdsl/lang/comprehension.py
@@ -291,6 +291,12 @@ class UnaryFn:
ceil = UnaryFnType("ceil")
floor = UnaryFnType("floor")
negf = UnaryFnType("negf")
+ round = UnaryFnType("round")
+ sqrt = UnaryFnType("sqrt")
+ rsqrt = UnaryFnType("rsqrt")
+ square = UnaryFnType("square")
+ tanh = UnaryFnType("tanh")
+ erf = UnaryFnType("erf")
class BinaryFnType:
@@ -330,6 +336,7 @@ class BinaryFn:
min_signed = BinaryFnType("min_signed")
max_unsigned = BinaryFnType("max_unsigned")
min_unsigned = BinaryFnType("min_unsigned")
+ powf = BinaryFnType("powf")
class TypeFnType:
diff --git a/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py b/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py
index 5b05364f6d35..ca2bb0c5f7f8 100644
--- a/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py
+++ b/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py
@@ -109,6 +109,78 @@ def negf(
@linalg_structured_op
+def round(
+ I=TensorDef(T1),
+ O=TensorDef(T1, output=True),
+):
+ """Applies round(x) elementwise.
+
+ No numeric casting is performed on the input operand.
+ """
+ O[None] = UnaryFn.round(I[None])
+
+
+@linalg_structured_op
+def sqrt(
+ I=TensorDef(T1),
+ O=TensorDef(T1, output=True),
+):
+ """Applies sqrt(x) elementwise.
+
+ No numeric casting is performed on the input operand.
+ """
+ O[None] = UnaryFn.sqrt(I[None])
+
+
+@linalg_structured_op
+def rsqrt(
+ I=TensorDef(T1),
+ O=TensorDef(T1, output=True),
+):
+ """Applies rsqrt(x) elementwise.
+
+ No numeric casting is performed on the input operand.
+ """
+ O[None] = UnaryFn.rsqrt(I[None])
+
+
+@linalg_structured_op
+def square(
+ I=TensorDef(T1),
+ O=TensorDef(T1, output=True),
+):
+ """Applies square(x) elementwise.
+
+ No numeric casting is performed on the input operand.
+ """
+ O[None] = UnaryFn.square(I[None])
+
+
+@linalg_structured_op
+def tanh(
+ I=TensorDef(T1),
+ O=TensorDef(T1, output=True),
+):
+ """Applies tanh(x) elementwise.
+
+ No numeric casting is performed on the input operand.
+ """
+ O[None] = UnaryFn.tanh(I[None])
+
+
+@linalg_structured_op
+def erf(
+ I=TensorDef(T1),
+ O=TensorDef(T1, output=True),
+):
+ """Applies erf(x) elementwise.
+
+ No numeric casting is performed on the input operand.
+ """
+ O[None] = UnaryFn.erf(I[None])
+
+
+@linalg_structured_op
def elemwise_binary(
lhs=TensorDef(T1),
rhs=TensorDef(T2),
@@ -233,13 +305,53 @@ def max(
This means reduction/broadcast/element cast semantics is explicit. Further
passes can take that into account when lowering this code. For example,
- a `linalg.broadcast` + `linalg.div` sequence can be lowered to a
+ a `linalg.broadcast` + `linalg.max` sequence can be lowered to a
`linalg.generic` with different affine maps for the two operands.
"""
O[None] = BinaryFn.max_signed(lhs[None], rhs[None])
@linalg_structured_op
+def min(
+ lhs=TensorDef(T1),
+ rhs=TensorDef(T1),
+ O=TensorDef(T1, output=True),
+):
+ """Takes the min (signed) between two inputs, elementwise.
+
+ The shapes and element types must be identical. The appropriate casts,
+ broadcasts and reductions should be done previously to calling this op.
+
+ This means reduction/broadcast/element cast semantics is explicit. Further
+ passes can take that into account when lowering this code. For example,
+ a `linalg.broadcast` + `linalg.min` sequence can be lowered to a
+ `linalg.generic` with different affine maps for the two operands.
+ """
+ O[None] = BinaryFn.min_signed(lhs[None], rhs[None])
+
+
+@linalg_structured_op
+def powf(
+ lhs=TensorDef(T1),
+ rhs=TensorDef(T1),
+ O=TensorDef(T1, output=True),
+):
+    """Takes the powf(lhs, rhs) between two inputs, elementwise. For
+    powf(arg, 2) use `linalg.square`.
+
+    Only applies to floating point values.
+
+ The shapes and element types must be identical. The appropriate casts,
+ broadcasts and reductions should be done previously to calling this op.
+
+ This means reduction/broadcast/element cast semantics is explicit. Further
+ passes can take that into account when lowering this code. For example,
+ a `linalg.broadcast` + `linalg.powf` sequence can be lowered to a
+ `linalg.generic` with different affine maps for the two operands.
+ """
+ O[None] = BinaryFn.powf(lhs[None], rhs[None])
+
+
+@linalg_structured_op
def matmul(
A=TensorDef(T1, S.M, S.K),
B=TensorDef(T2, S.K, S.N),
diff --git a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir
index 1712d3d745b7..439f1e920e39 100644
--- a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir
+++ b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir
@@ -2517,7 +2517,7 @@ func.func @vector_interleave_1d(%a: vector<8xf32>, %b: vector<8xf32>) -> vector<
// CHECK-SAME: %[[LHS:.*]]: vector<[4]xi32>, %[[RHS:.*]]: vector<[4]xi32>)
func.func @vector_interleave_1d_scalable(%a: vector<[4]xi32>, %b: vector<[4]xi32>) -> vector<[8]xi32>
{
- // CHECK: %[[ZIP:.*]] = "llvm.intr.experimental.vector.interleave2"(%[[LHS]], %[[RHS]]) : (vector<[4]xi32>, vector<[4]xi32>) -> vector<[8]xi32>
+ // CHECK: %[[ZIP:.*]] = "llvm.intr.vector.interleave2"(%[[LHS]], %[[RHS]]) : (vector<[4]xi32>, vector<[4]xi32>) -> vector<[8]xi32>
// CHECK: return %[[ZIP]]
%0 = vector.interleave %a, %b : vector<[4]xi32>
return %0 : vector<[8]xi32>
@@ -2541,7 +2541,7 @@ func.func @vector_interleave_2d(%a: vector<2x3xi8>, %b: vector<2x3xi8>) -> vecto
// CHECK-SAME: %[[LHS:.*]]: vector<2x[8]xi16>, %[[RHS:.*]]: vector<2x[8]xi16>)
func.func @vector_interleave_2d_scalable(%a: vector<2x[8]xi16>, %b: vector<2x[8]xi16>) -> vector<2x[16]xi16>
{
- // CHECK: llvm.intr.experimental.vector.interleave2
+ // CHECK: llvm.intr.vector.interleave2
// CHECK-NOT: vector.interleave {{.*}} : vector<2x[8]xi16>
%0 = vector.interleave %a, %b : vector<2x[8]xi16>
return %0 : vector<2x[16]xi16>
diff --git a/mlir/test/Dialect/Arith/canonicalize.mlir b/mlir/test/Dialect/Arith/canonicalize.mlir
index 79a318565e98..f7ce2123a93c 100644
--- a/mlir/test/Dialect/Arith/canonicalize.mlir
+++ b/mlir/test/Dialect/Arith/canonicalize.mlir
@@ -1223,6 +1223,28 @@ func.func @mulsiExtendedOneRhsSplat(%arg0: vector<3xi32>) -> (vector<3xi32>, vec
return %low, %high : vector<3xi32>, vector<3xi32>
}
+// CHECK-LABEL: @mulsiExtendedOneRhsI1
+// CHECK-SAME: (%[[ARG:.+]]: i1) -> (i1, i1)
+// CHECK-NEXT: %[[T:.+]] = arith.constant true
+// CHECK-NEXT: %[[LOW:.+]], %[[HIGH:.+]] = arith.mulsi_extended %[[ARG]], %[[T]] : i1
+// CHECK-NEXT: return %[[LOW]], %[[HIGH]] : i1, i1
+func.func @mulsiExtendedOneRhsI1(%arg0: i1) -> (i1, i1) {
+ %one = arith.constant true
+ %low, %high = arith.mulsi_extended %arg0, %one: i1
+ return %low, %high : i1, i1
+}
+
+// CHECK-LABEL: @mulsiExtendedOneRhsSplatI1
+// CHECK-SAME: (%[[ARG:.+]]: vector<3xi1>) -> (vector<3xi1>, vector<3xi1>)
+// CHECK-NEXT: %[[TS:.+]] = arith.constant dense<true> : vector<3xi1>
+// CHECK-NEXT: %[[LOW:.+]], %[[HIGH:.+]] = arith.mulsi_extended %[[ARG]], %[[TS]] : vector<3xi1>
+// CHECK-NEXT: return %[[LOW]], %[[HIGH]] : vector<3xi1>, vector<3xi1>
+func.func @mulsiExtendedOneRhsSplatI1(%arg0: vector<3xi1>) -> (vector<3xi1>, vector<3xi1>) {
+ %one = arith.constant dense<true> : vector<3xi1>
+ %low, %high = arith.mulsi_extended %arg0, %one: vector<3xi1>
+ return %low, %high : vector<3xi1>, vector<3xi1>
+}
+
// CHECK-LABEL: @mulsiExtendedUnusedHigh
// CHECK-SAME: (%[[ARG:.+]]: i32) -> i32
// CHECK-NEXT: %[[RES:.+]] = arith.muli %[[ARG]], %[[ARG]] : i32
@@ -2809,6 +2831,87 @@ func.func @unsignedExtendConstantResource() -> tensor<i16> {
return %ext : tensor<i16>
}
+// CHECK-LABEL: @extsi_i0
+// CHECK: %[[ZERO:.*]] = arith.constant 0 : i16
+// CHECK: return %[[ZERO]] : i16
+func.func @extsi_i0() -> i16 {
+ %c0 = arith.constant 0 : i0
+ %extsi = arith.extsi %c0 : i0 to i16
+ return %extsi : i16
+}
+
+// CHECK-LABEL: @extui_i0
+// CHECK: %[[ZERO:.*]] = arith.constant 0 : i16
+// CHECK: return %[[ZERO]] : i16
+func.func @extui_i0() -> i16 {
+ %c0 = arith.constant 0 : i0
+ %extui = arith.extui %c0 : i0 to i16
+ return %extui : i16
+}
+
+// CHECK-LABEL: @trunc_i0
+// CHECK: %[[ZERO:.*]] = arith.constant 0 : i0
+// CHECK: return %[[ZERO]] : i0
+func.func @trunc_i0() -> i0 {
+ %cFF = arith.constant 0xFF : i8
+ %trunc = arith.trunci %cFF : i8 to i0
+ return %trunc : i0
+}
+
+// CHECK-LABEL: @shli_i0
+// CHECK: %[[ZERO:.*]] = arith.constant 0 : i0
+// CHECK: return %[[ZERO]] : i0
+func.func @shli_i0() -> i0 {
+ %c0 = arith.constant 0 : i0
+ %shli = arith.shli %c0, %c0 : i0
+ return %shli : i0
+}
+
+// CHECK-LABEL: @shrsi_i0
+// CHECK: %[[ZERO:.*]] = arith.constant 0 : i0
+// CHECK: return %[[ZERO]] : i0
+func.func @shrsi_i0() -> i0 {
+ %c0 = arith.constant 0 : i0
+ %shrsi = arith.shrsi %c0, %c0 : i0
+ return %shrsi : i0
+}
+
+// CHECK-LABEL: @shrui_i0
+// CHECK: %[[ZERO:.*]] = arith.constant 0 : i0
+// CHECK: return %[[ZERO]] : i0
+func.func @shrui_i0() -> i0 {
+ %c0 = arith.constant 0 : i0
+ %shrui = arith.shrui %c0, %c0 : i0
+ return %shrui : i0
+}
+
+// CHECK-LABEL: @maxsi_i0
+// CHECK: %[[ZERO:.*]] = arith.constant 0 : i0
+// CHECK: return %[[ZERO]] : i0
+func.func @maxsi_i0() -> i0 {
+ %c0 = arith.constant 0 : i0
+ %maxsi = arith.maxsi %c0, %c0 : i0
+ return %maxsi : i0
+}
+
+// CHECK-LABEL: @minsi_i0
+// CHECK: %[[ZERO:.*]] = arith.constant 0 : i0
+// CHECK: return %[[ZERO]] : i0
+func.func @minsi_i0() -> i0 {
+ %c0 = arith.constant 0 : i0
+ %minsi = arith.minsi %c0, %c0 : i0
+ return %minsi : i0
+}
+
+// CHECK-LABEL: @mulsi_extended_i0
+// CHECK: %[[ZERO:.*]] = arith.constant 0 : i0
+// CHECK: return %[[ZERO]], %[[ZERO]] : i0, i0
+func.func @mulsi_extended_i0() -> (i0, i0) {
+ %c0 = arith.constant 0 : i0
+ %mulsi_extended:2 = arith.mulsi_extended %c0, %c0 : i0
+ return %mulsi_extended#0, %mulsi_extended#1 : i0, i0
+}
+
{-#
dialect_resources: {
builtin: {
diff --git a/mlir/test/Dialect/ArmSME/outer-product-fusion.mlir b/mlir/test/Dialect/ArmSME/outer-product-fusion.mlir
index de9de86003e6..01f54a4cf186 100644
--- a/mlir/test/Dialect/ArmSME/outer-product-fusion.mlir
+++ b/mlir/test/Dialect/ArmSME/outer-product-fusion.mlir
@@ -4,10 +4,10 @@
// CHECK-SAME: %[[A0:.*]]: vector<[4]xf16>, %[[B0:.*]]: vector<[4]xf16>, %[[A1:.*]]: vector<[4]xf16>, %[[B1:.*]]: vector<[4]xf16>,
// CHECK-SAME: %[[A0_MASK:.*]]: vector<[4]xi1>, %[[B0_MASK:.*]]: vector<[4]xi1>, %[[A1_MASK:.*]]: vector<[4]xi1>, %[[B1_MASK:.*]]: vector<[4]xi1>
// CHECK-DAG: %[[ACC:.*]] = arith.constant dense<0.000000e+00> : vector<[4]x[4]xf32>
-// CHECK-DAG: %[[LHS:.*]] = "llvm.intr.experimental.vector.interleave2"(%[[A0]], %[[A1]]) : (vector<[4]xf16>, vector<[4]xf16>) -> vector<[8]xf16>
-// CHECK-DAG: %[[RHS:.*]] = "llvm.intr.experimental.vector.interleave2"(%[[B0]], %[[B1]]) : (vector<[4]xf16>, vector<[4]xf16>) -> vector<[8]xf16>
-// CHECK-DAG: %[[LHS_MASK:.*]] = "llvm.intr.experimental.vector.interleave2"(%[[A0_MASK]], %[[A1_MASK]]) : (vector<[4]xi1>, vector<[4]xi1>) -> vector<[8]xi1>
-// CHECK-DAG: %[[RHS_MASK:.*]] = "llvm.intr.experimental.vector.interleave2"(%[[B0_MASK]], %[[B1_MASK]]) : (vector<[4]xi1>, vector<[4]xi1>) -> vector<[8]xi1>
+// CHECK-DAG: %[[LHS:.*]] = "llvm.intr.vector.interleave2"(%[[A0]], %[[A1]]) : (vector<[4]xf16>, vector<[4]xf16>) -> vector<[8]xf16>
+// CHECK-DAG: %[[RHS:.*]] = "llvm.intr.vector.interleave2"(%[[B0]], %[[B1]]) : (vector<[4]xf16>, vector<[4]xf16>) -> vector<[8]xf16>
+// CHECK-DAG: %[[LHS_MASK:.*]] = "llvm.intr.vector.interleave2"(%[[A0_MASK]], %[[A1_MASK]]) : (vector<[4]xi1>, vector<[4]xi1>) -> vector<[8]xi1>
+// CHECK-DAG: %[[RHS_MASK:.*]] = "llvm.intr.vector.interleave2"(%[[B0_MASK]], %[[B1_MASK]]) : (vector<[4]xi1>, vector<[4]xi1>) -> vector<[8]xi1>
// CHECK-DAG: arm_sme.fmopa_2way %[[LHS]], %[[RHS]] acc(%[[ACC]]) masks(%[[LHS_MASK]], %[[RHS_MASK]]) : vector<[8]xf16>, vector<[8]xf16> into vector<[4]x[4]xf32>
func.func @outerproduct_add_widening_2way_f16f16f32(
%a0 : vector<[4]xf16>, %b0 : vector<[4]xf16>,
@@ -225,18 +225,18 @@ func.func @outerproduct_sub_widening_2way_unsigned_i16i16i32(
// CHECK-SAME: %[[A2_MASK:[a-z0-9]+]]: vector<[4]xi1>, %[[B2_MASK:[a-z0-9]+]]: vector<[4]xi1>,
// CHECK-SAME: %[[A3_MASK:[a-z0-9]+]]: vector<[4]xi1>, %[[B3_MASK:[a-z0-9]+]]: vector<[4]xi1>
// CHECK-DAG: %[[ACC:.*]] = arith.constant dense<0> : vector<[4]x[4]xi32>
-// CHECK-DAG: %[[LHS0:.*]] = "llvm.intr.experimental.vector.interleave2"(%[[A0]], %[[A2]]) : (vector<[4]xi8>, vector<[4]xi8>) -> vector<[8]xi8>
-// CHECK-DAG: %[[LHS1:.*]] = "llvm.intr.experimental.vector.interleave2"(%[[A1]], %[[A3]]) : (vector<[4]xi8>, vector<[4]xi8>) -> vector<[8]xi8>
-// CHECK-DAG: %[[RHS0:.*]] = "llvm.intr.experimental.vector.interleave2"(%[[B0]], %[[B2]]) : (vector<[4]xi8>, vector<[4]xi8>) -> vector<[8]xi8>
-// CHECK-DAG: %[[RHS1:.*]] = "llvm.intr.experimental.vector.interleave2"(%[[B1]], %[[B3]]) : (vector<[4]xi8>, vector<[4]xi8>) -> vector<[8]xi8>
-// CHECK-DAG: %[[LHS:.*]] = "llvm.intr.experimental.vector.interleave2"(%[[LHS0]], %[[LHS1]]) : (vector<[8]xi8>, vector<[8]xi8>) -> vector<[16]xi8>
-// CHECK-DAG: %[[RHS:.*]] = "llvm.intr.experimental.vector.interleave2"(%[[RHS0]], %[[RHS1]]) : (vector<[8]xi8>, vector<[8]xi8>) -> vector<[16]xi8>
-// CHECK-DAG: %[[LHS0_MASK:.*]] = "llvm.intr.experimental.vector.interleave2"(%[[A0_MASK]], %[[A2_MASK]]) : (vector<[4]xi1>, vector<[4]xi1>) -> vector<[8]xi1>
-// CHECK-DAG: %[[LHS1_MASK:.*]] = "llvm.intr.experimental.vector.interleave2"(%[[A1_MASK]], %[[A3_MASK]]) : (vector<[4]xi1>, vector<[4]xi1>) -> vector<[8]xi1>
-// CHECK-DAG: %[[RHS0_MASK:.*]] = "llvm.intr.experimental.vector.interleave2"(%[[B0_MASK]], %[[B2_MASK]]) : (vector<[4]xi1>, vector<[4]xi1>) -> vector<[8]xi1>
-// CHECK-DAG: %[[RHS1_MASK:.*]] = "llvm.intr.experimental.vector.interleave2"(%[[B1_MASK]], %[[B3_MASK]]) : (vector<[4]xi1>, vector<[4]xi1>) -> vector<[8]xi1>
-// CHECK-DAG: %[[LHS_MASK:.*]] = "llvm.intr.experimental.vector.interleave2"(%[[LHS0_MASK]], %[[LHS1_MASK]]) : (vector<[8]xi1>, vector<[8]xi1>) -> vector<[16]xi1>
-// CHECK-DAG: %[[RHS_MASK:.*]] = "llvm.intr.experimental.vector.interleave2"(%[[RHS0_MASK]], %[[RHS1_MASK]]) : (vector<[8]xi1>, vector<[8]xi1>) -> vector<[16]xi1>
+// CHECK-DAG: %[[LHS0:.*]] = "llvm.intr.vector.interleave2"(%[[A0]], %[[A2]]) : (vector<[4]xi8>, vector<[4]xi8>) -> vector<[8]xi8>
+// CHECK-DAG: %[[LHS1:.*]] = "llvm.intr.vector.interleave2"(%[[A1]], %[[A3]]) : (vector<[4]xi8>, vector<[4]xi8>) -> vector<[8]xi8>
+// CHECK-DAG: %[[RHS0:.*]] = "llvm.intr.vector.interleave2"(%[[B0]], %[[B2]]) : (vector<[4]xi8>, vector<[4]xi8>) -> vector<[8]xi8>
+// CHECK-DAG: %[[RHS1:.*]] = "llvm.intr.vector.interleave2"(%[[B1]], %[[B3]]) : (vector<[4]xi8>, vector<[4]xi8>) -> vector<[8]xi8>
+// CHECK-DAG: %[[LHS:.*]] = "llvm.intr.vector.interleave2"(%[[LHS0]], %[[LHS1]]) : (vector<[8]xi8>, vector<[8]xi8>) -> vector<[16]xi8>
+// CHECK-DAG: %[[RHS:.*]] = "llvm.intr.vector.interleave2"(%[[RHS0]], %[[RHS1]]) : (vector<[8]xi8>, vector<[8]xi8>) -> vector<[16]xi8>
+// CHECK-DAG: %[[LHS0_MASK:.*]] = "llvm.intr.vector.interleave2"(%[[A0_MASK]], %[[A2_MASK]]) : (vector<[4]xi1>, vector<[4]xi1>) -> vector<[8]xi1>
+// CHECK-DAG: %[[LHS1_MASK:.*]] = "llvm.intr.vector.interleave2"(%[[A1_MASK]], %[[A3_MASK]]) : (vector<[4]xi1>, vector<[4]xi1>) -> vector<[8]xi1>
+// CHECK-DAG: %[[RHS0_MASK:.*]] = "llvm.intr.vector.interleave2"(%[[B0_MASK]], %[[B2_MASK]]) : (vector<[4]xi1>, vector<[4]xi1>) -> vector<[8]xi1>
+// CHECK-DAG: %[[RHS1_MASK:.*]] = "llvm.intr.vector.interleave2"(%[[B1_MASK]], %[[B3_MASK]]) : (vector<[4]xi1>, vector<[4]xi1>) -> vector<[8]xi1>
+// CHECK-DAG: %[[LHS_MASK:.*]] = "llvm.intr.vector.interleave2"(%[[LHS0_MASK]], %[[LHS1_MASK]]) : (vector<[8]xi1>, vector<[8]xi1>) -> vector<[16]xi1>
+// CHECK-DAG: %[[RHS_MASK:.*]] = "llvm.intr.vector.interleave2"(%[[RHS0_MASK]], %[[RHS1_MASK]]) : (vector<[8]xi1>, vector<[8]xi1>) -> vector<[16]xi1>
// CHECK-DAG: arm_sme.smopa_4way %[[LHS]], %[[RHS]] acc(%[[ACC]]) masks(%[[LHS_MASK]], %[[RHS_MASK]]) : vector<[16]xi8>, vector<[16]xi8> into vector<[4]x[4]xi32>
func.func @outerproduct_add_widening_4way_signed_i8i8i32(
%a0 : vector<[4]xi8>, %b0 : vector<[4]xi8>,
diff --git a/mlir/test/Dialect/GPU/outlining.mlir b/mlir/test/Dialect/GPU/outlining.mlir
index 601add9a9f91..5e4724c9d309 100644
--- a/mlir/test/Dialect/GPU/outlining.mlir
+++ b/mlir/test/Dialect/GPU/outlining.mlir
@@ -54,14 +54,43 @@ func.func @launch() {
// CHECK-NEXT: %[[BDIM:.*]] = gpu.block_dim x
// CHECK-NEXT: = gpu.block_dim y
// CHECK-NEXT: = gpu.block_dim z
-// CHECK-NEXT: cf.br ^[[BLOCK:.*]]
-// CHECK-NEXT: ^[[BLOCK]]:
// CHECK-NEXT: "use"(%[[KERNEL_ARG0]]) : (f32) -> ()
// CHECK-NEXT: "some_op"(%[[BID]], %[[BDIM]]) : (index, index) -> ()
// CHECK-NEXT: = memref.load %[[KERNEL_ARG1]][%[[TID]]] : memref<?xf32, 1>
// -----
+// Verify that we can outline a kernel whose body contains a CFG.
+// CHECK-LABEL: gpu.func @launchCFG_kernel(
+// CHECK: cf.br
+// CHECK: gpu.return
+func.func @launchCFG() {
+ %0 = "op"() : () -> (f32)
+ %1 = "op"() : () -> (memref<?xf32, 1>)
+ %gDimX = arith.constant 8 : index
+ %gDimY = arith.constant 12 : index
+ %gDimZ = arith.constant 16 : index
+ %bDimX = arith.constant 20 : index
+ %bDimY = arith.constant 24 : index
+ %bDimZ = arith.constant 28 : index
+
+ gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %gDimX, %grid_y = %gDimY,
+ %grid_z = %gDimZ)
+ threads(%tx, %ty, %tz) in (%block_x = %bDimX, %block_y = %bDimY,
+ %block_z = %bDimZ) {
+ "use"(%0): (f32) -> ()
+ cf.br ^bb1
+ ^bb1:
+ "some_op"(%bx, %block_x) : (index, index) -> ()
+ %42 = memref.load %1[%tx] : memref<?xf32, 1>
+ gpu.terminator
+ }
+ return
+}
+
+
+// -----
+
// This test checks that gpu-outlining can handle a gpu.launch kernel inside an llvm.func
// CHECK-LABEL: @launch_from_llvm_func
llvm.func @launch_from_llvm_func() {
@@ -475,8 +504,6 @@ func.func @launch_cluster() {
// CHECK-NEXT: %[[CDIM:.*]] = gpu.cluster_dim x
// CHECK-NEXT: = gpu.cluster_dim y
// CHECK-NEXT: = gpu.cluster_dim z
-// CHECK-NEXT: cf.br ^[[BLOCK:.*]]
-// CHECK-NEXT: ^[[BLOCK]]:
// CHECK-NEXT: "use"(%[[KERNEL_ARG0]]) : (f32) -> ()
// CHECK-NEXT: "some_op"(%[[CID]], %[[BID]], %[[BDIM]]) : (index, index, index) -> ()
// CHECK-NEXT: = memref.load %[[KERNEL_ARG1]][%[[TID]]] : memref<?xf32, 1>
diff --git a/mlir/test/Dialect/LLVMIR/invalid.mlir b/mlir/test/Dialect/LLVMIR/invalid.mlir
index de1ab9db8e8d..0914f0023210 100644
--- a/mlir/test/Dialect/LLVMIR/invalid.mlir
+++ b/mlir/test/Dialect/LLVMIR/invalid.mlir
@@ -1221,17 +1221,17 @@ func.func @extract_scalable_from_fixed_length_vector(%arg0 : vector<16xf32>) {
// -----
-func.func @experimental_vector_interleave2_bad_type0(%vec1: vector<[2]xf16>, %vec2 : vector<[4]xf16>) {
+func.func @vector_interleave2_bad_type0(%vec1: vector<[2]xf16>, %vec2 : vector<[4]xf16>) {
// expected-error@+1 {{op failed to verify that all of {vec1, vec2} have same type}}
- %0 = "llvm.intr.experimental.vector.interleave2"(%vec1, %vec2) : (vector<[2]xf16>, vector<[4]xf16>) -> vector<[8]xf16>
+ %0 = "llvm.intr.vector.interleave2"(%vec1, %vec2) : (vector<[2]xf16>, vector<[4]xf16>) -> vector<[8]xf16>
return
}
// -----
-func.func @experimental_vector_interleave2_bad_type1(%vec1: vector<[2]xf16>, %vec2 : vector<[2]xf16>) {
+func.func @vector_interleave2_bad_type1(%vec1: vector<[2]xf16>, %vec2 : vector<[2]xf16>) {
// expected-error@+1 {{op failed to verify that result has twice as many elements as 'vec1'}}
- %0 = "llvm.intr.experimental.vector.interleave2"(%vec1, %vec2) : (vector<[2]xf16>, vector<[2]xf16>) -> vector<[8]xf16>
+ %0 = "llvm.intr.vector.interleave2"(%vec1, %vec2) : (vector<[2]xf16>, vector<[2]xf16>) -> vector<[8]xf16>
return
}
@@ -1239,9 +1239,9 @@ func.func @experimental_vector_interleave2_bad_type1(%vec1: vector<[2]xf16>, %ve
/// result vector type is not scalable.
-func.func @experimental_vector_interleave2_bad_type2(%vec1: vector<[2]xf16>, %vec2 : vector<[2]xf16>) {
+func.func @vector_interleave2_bad_type2(%vec1: vector<[2]xf16>, %vec2 : vector<[2]xf16>) {
// expected-error@+1 {{op failed to verify that result has twice as many elements as 'vec1'}}
- %0 = "llvm.intr.experimental.vector.interleave2"(%vec1, %vec2) : (vector<[2]xf16>, vector<[2]xf16>) -> vector<4xf16>
+ %0 = "llvm.intr.vector.interleave2"(%vec1, %vec2) : (vector<[2]xf16>, vector<[2]xf16>) -> vector<4xf16>
return
}
@@ -1250,9 +1250,9 @@ func.func @experimental_vector_interleave2_bad_type2(%vec1: vector<[2]xf16>, %ve
/// element type doesn't match.
-func.func @experimental_vector_interleave2_bad_type3(%vec1: vector<[2]xf16>, %vec2 : vector<[2]xf16>) {
+func.func @vector_interleave2_bad_type3(%vec1: vector<[2]xf16>, %vec2 : vector<[2]xf16>) {
// expected-error@+1 {{op failed to verify that result has twice as many elements as 'vec1'}}
- %0 = "llvm.intr.experimental.vector.interleave2"(%vec1, %vec2) : (vector<[2]xf16>, vector<[2]xf16>) -> vector<[4]xf32>
+ %0 = "llvm.intr.vector.interleave2"(%vec1, %vec2) : (vector<[2]xf16>, vector<[2]xf16>) -> vector<[4]xf32>
return
}
diff --git a/mlir/test/Dialect/LLVMIR/roundtrip.mlir b/mlir/test/Dialect/LLVMIR/roundtrip.mlir
index 31acf2b95e46..3b94db389f54 100644
--- a/mlir/test/Dialect/LLVMIR/roundtrip.mlir
+++ b/mlir/test/Dialect/LLVMIR/roundtrip.mlir
@@ -342,10 +342,10 @@ func.func @mixed_vect(%arg0: vector<8xf32>, %arg1: vector<4xf32>, %arg2: vector<
return
}
-// CHECK-LABEL: @experimental_vector_interleave2
-func.func @experimental_vector_interleave2(%vec1: vector<[4]xf16>, %vec2 : vector<[4]xf16>) {
- // CHECK: = "llvm.intr.experimental.vector.interleave2"({{.*}}) : (vector<[4]xf16>, vector<[4]xf16>) -> vector<[8]xf16>
- %0 = "llvm.intr.experimental.vector.interleave2"(%vec1, %vec2) : (vector<[4]xf16>, vector<[4]xf16>) -> vector<[8]xf16>
+// CHECK-LABEL: @vector_interleave2
+func.func @vector_interleave2(%vec1: vector<[4]xf16>, %vec2 : vector<[4]xf16>) {
+ // CHECK: = "llvm.intr.vector.interleave2"({{.*}}) : (vector<[4]xf16>, vector<[4]xf16>) -> vector<[8]xf16>
+ %0 = "llvm.intr.vector.interleave2"(%vec1, %vec2) : (vector<[4]xf16>, vector<[4]xf16>) -> vector<[8]xf16>
return
}
diff --git a/mlir/test/Dialect/Linalg/generalize-named-ops.mlir b/mlir/test/Dialect/Linalg/generalize-named-ops.mlir
index e852824cdb73..667ea3c18c8a 100644
--- a/mlir/test/Dialect/Linalg/generalize-named-ops.mlir
+++ b/mlir/test/Dialect/Linalg/generalize-named-ops.mlir
@@ -565,6 +565,155 @@ func.func @generalize_negf(%arg: memref<7x14x21xf32>, %out: memref<7x14x21xf32>)
// -----
+func.func @generalize_reciprocal(%arg: memref<7x14x21xf32>, %out: memref<7x14x21xf32>) {
+ linalg.reciprocal ins(%arg : memref<7x14x21xf32>) outs(%out : memref<7x14x21xf32>)
+ return
+}
+
+// CHECK: #[[MAP:.+]] = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+
+// CHECK: func @generalize_reciprocal
+// CHECK-SAME: (%[[ARG:.+]]: memref<7x14x21xf32>, %[[OUT:.+]]: memref<7x14x21xf32>)
+
+// CHECK: %[[one:.+]] = arith.constant 1.000000e+00 : f32
+
+// CHECK: linalg.generic
+// CHECK-SAME: indexing_maps = [#[[MAP]], #[[MAP]]]
+// CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel"]}
+// CHECK-SAME:  ins(%[[ARG]] : memref<7x14x21xf32>) outs(%[[OUT]] : memref<7x14x21xf32>)
+
+// CHECK: ^{{.+}}(%[[BBARG0:.+]]: f32, %[[BBARG1:.+]]: f32)
+// CHECK-NEXT: %[[reciprocal:.+]] = arith.divf %[[one]], %[[BBARG0]] : f32
+// CHECK-NEXT: linalg.yield %[[reciprocal]] : f32
+
+// -----
+
+func.func @generalize_round(%arg: memref<7x14x21xf32>, %out: memref<7x14x21xf32>) {
+ linalg.round ins(%arg : memref<7x14x21xf32>) outs(%out : memref<7x14x21xf32>)
+ return
+}
+
+// CHECK: #[[MAP:.+]] = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+
+// CHECK: func @generalize_round
+// CHECK-SAME: (%[[ARG:.+]]: memref<7x14x21xf32>, %[[OUT:.+]]: memref<7x14x21xf32>)
+
+// CHECK: linalg.generic
+// CHECK-SAME: indexing_maps = [#[[MAP]], #[[MAP]]]
+// CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel"]}
+// CHECK-SAME:  ins(%[[ARG]] : memref<7x14x21xf32>) outs(%[[OUT]] : memref<7x14x21xf32>)
+
+// CHECK: ^{{.+}}(%[[BBARG0:.+]]: f32, %[[BBARG1:.+]]: f32)
+// CHECK-NEXT: %[[round:.+]] = math.round %[[BBARG0]] : f32
+// CHECK-NEXT: linalg.yield %[[round]] : f32
+
+// -----
+
+func.func @generalize_sqrt(%arg: memref<7x14x21xf32>, %out: memref<7x14x21xf32>) {
+ linalg.sqrt ins(%arg : memref<7x14x21xf32>) outs(%out : memref<7x14x21xf32>)
+ return
+}
+
+// CHECK: #[[MAP:.+]] = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+
+// CHECK: func @generalize_sqrt
+// CHECK-SAME: (%[[ARG:.+]]: memref<7x14x21xf32>, %[[OUT:.+]]: memref<7x14x21xf32>)
+
+// CHECK: linalg.generic
+// CHECK-SAME: indexing_maps = [#[[MAP]], #[[MAP]]]
+// CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel"]}
+// CHECK-SAME:  ins(%[[ARG]] : memref<7x14x21xf32>) outs(%[[OUT]] : memref<7x14x21xf32>)
+
+// CHECK: ^{{.+}}(%[[BBARG0:.+]]: f32, %[[BBARG1:.+]]: f32)
+// CHECK-NEXT: %[[sqrt:.+]] = math.sqrt %[[BBARG0]] : f32
+// CHECK-NEXT: linalg.yield %[[sqrt]] : f32
+
+// -----
+
+func.func @generalize_rsqrt(%arg: memref<7x14x21xf32>, %out: memref<7x14x21xf32>) {
+ linalg.rsqrt ins(%arg : memref<7x14x21xf32>) outs(%out : memref<7x14x21xf32>)
+ return
+}
+
+// CHECK: #[[MAP:.+]] = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+
+// CHECK: func @generalize_rsqrt
+// CHECK-SAME: (%[[ARG:.+]]: memref<7x14x21xf32>, %[[OUT:.+]]: memref<7x14x21xf32>)
+
+// CHECK: linalg.generic
+// CHECK-SAME: indexing_maps = [#[[MAP]], #[[MAP]]]
+// CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel"]}
+// CHECK-SAME:  ins(%[[ARG]] : memref<7x14x21xf32>) outs(%[[OUT]] : memref<7x14x21xf32>)
+
+// CHECK: ^{{.+}}(%[[BBARG0:.+]]: f32, %[[BBARG1:.+]]: f32)
+// CHECK-NEXT: %[[rsqrt:.+]] = math.rsqrt %[[BBARG0]] : f32
+// CHECK-NEXT: linalg.yield %[[rsqrt]] : f32
+
+// -----
+
+func.func @generalize_square(%arg: memref<7x14x21xf32>, %out: memref<7x14x21xf32>) {
+ linalg.square ins(%arg : memref<7x14x21xf32>) outs(%out : memref<7x14x21xf32>)
+ return
+}
+
+// CHECK: #[[MAP:.+]] = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+
+// CHECK: func @generalize_square
+// CHECK-SAME: (%[[ARG:.+]]: memref<7x14x21xf32>, %[[OUT:.+]]: memref<7x14x21xf32>)
+
+// CHECK: linalg.generic
+// CHECK-SAME: indexing_maps = [#[[MAP]], #[[MAP]]]
+// CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel"]}
+// CHECK-SAME: ins(%[[ARG]] : memref<7x14x21xf32>) outs(%[[OUT]] : memref<7x14x21xf32>)
+
+// CHECK: ^{{.+}}(%[[BBARG0:.+]]: f32, %[[BBARG1:.+]]: f32)
+// CHECK-NEXT: %[[square:.+]] = arith.mulf %[[BBARG0]], %[[BBARG0]] : f32
+// CHECK-NEXT: linalg.yield %[[square]] : f32
+
+// -----
+
+func.func @generalize_tanh(%arg: memref<7x14x21xf32>, %out: memref<7x14x21xf32>) {
+ linalg.tanh ins(%arg : memref<7x14x21xf32>) outs(%out : memref<7x14x21xf32>)
+ return
+}
+
+// CHECK: #[[MAP:.+]] = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+
+// CHECK: func @generalize_tanh
+// CHECK-SAME: (%[[ARG:.+]]: memref<7x14x21xf32>, %[[OUT:.+]]: memref<7x14x21xf32>)
+
+// CHECK: linalg.generic
+// CHECK-SAME: indexing_maps = [#[[MAP]], #[[MAP]]]
+// CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel"]}
+// CHECK-SAME: ins(%[[ARG]] : memref<7x14x21xf32>) outs(%[[OUT]] : memref<7x14x21xf32>)
+
+// CHECK: ^{{.+}}(%[[BBARG0:.+]]: f32, %[[BBARG1:.+]]: f32)
+// CHECK-NEXT: %[[tanh:.+]] = math.tanh %[[BBARG0]] : f32
+// CHECK-NEXT: linalg.yield %[[tanh]] : f32
+
+// -----
+
+func.func @generalize_erf(%arg: memref<7x14x21xf32>, %out: memref<7x14x21xf32>) {
+ linalg.erf ins(%arg : memref<7x14x21xf32>) outs(%out : memref<7x14x21xf32>)
+ return
+}
+
+// CHECK: #[[MAP:.+]] = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+
+// CHECK: func @generalize_erf
+// CHECK-SAME: (%[[ARG:.+]]: memref<7x14x21xf32>, %[[OUT:.+]]: memref<7x14x21xf32>)
+
+// CHECK: linalg.generic
+// CHECK-SAME: indexing_maps = [#[[MAP]], #[[MAP]]]
+// CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel"]}
+// CHECK-SAME: ins(%[[ARG]] : memref<7x14x21xf32>) outs(%[[OUT]] : memref<7x14x21xf32>)
+
+// CHECK: ^{{.+}}(%[[BBARG0:.+]]: f32, %[[BBARG1:.+]]: f32)
+// CHECK-NEXT: %[[erf:.+]] = math.erf %[[BBARG0]] : f32
+// CHECK-NEXT: linalg.yield %[[erf]] : f32
+
+// -----
+
func.func @generalize_max(%lhs: memref<7x14x21xf32>, %rhs: memref<7x14x21xf32>,
%out: memref<7x14x21xf32>) {
linalg.max ins(%lhs, %rhs : memref<7x14x21xf32>, memref<7x14x21xf32>)
@@ -590,6 +739,58 @@ func.func @generalize_max(%lhs: memref<7x14x21xf32>, %rhs: memref<7x14x21xf32>,
// -----
+func.func @generalize_min(%lhs: memref<7x14x21xf32>, %rhs: memref<7x14x21xf32>,
+ %out: memref<7x14x21xf32>) {
+ linalg.min ins(%lhs, %rhs : memref<7x14x21xf32>, memref<7x14x21xf32>)
+ outs(%out : memref<7x14x21xf32>)
+ return
+}
+
+// CHECK: #[[MAP:.+]] = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+
+// CHECK: func @generalize_min
+// CHECK-SAME: (%[[LHS:.+]]: memref<7x14x21xf32>, %[[RHS:.+]]: memref<7x14x21xf32>,
+// CHECK-SAME: %[[OUT:.+]]: memref<7x14x21xf32>)
+
+// CHECK: linalg.generic
+// CHECK-SAME: indexing_maps = [#[[MAP]], #[[MAP]], #[[MAP]]]
+// CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel"]}
+// CHECK-SAME: ins(%[[LHS]], %[[RHS]] : memref<7x14x21xf32>, memref<7x14x21xf32>)
+// CHECK-SAME: outs(%[[OUT]] : memref<7x14x21xf32>)
+
+// CHECK: ^{{.+}}(%[[BBARG0:.+]]: f32, %[[BBARG1:.+]]: f32, %[[BBARG2:.+]]: f32)
+// CHECK-NEXT: %[[min:.+]] = arith.minimumf %[[BBARG0]], %[[BBARG1]] : f32
+// CHECK-NEXT: linalg.yield %[[min]] : f32
+
+
+// -----
+
+func.func @generalize_powf(%lhs: memref<7x14x21xf32>, %rhs: memref<7x14x21xf32>,
+ %out: memref<7x14x21xf32>) {
+ linalg.powf ins(%lhs, %rhs : memref<7x14x21xf32>, memref<7x14x21xf32>)
+ outs(%out : memref<7x14x21xf32>)
+ return
+}
+
+// CHECK: #[[MAP:.+]] = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+
+// CHECK: func @generalize_powf
+// CHECK-SAME: (%[[LHS:.+]]: memref<7x14x21xf32>, %[[RHS:.+]]: memref<7x14x21xf32>,
+// CHECK-SAME: %[[OUT:.+]]: memref<7x14x21xf32>)
+
+// CHECK: linalg.generic
+// CHECK-SAME: indexing_maps = [#[[MAP]], #[[MAP]], #[[MAP]]]
+// CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel"]}
+// CHECK-SAME: ins(%[[LHS]], %[[RHS]] : memref<7x14x21xf32>, memref<7x14x21xf32>)
+// CHECK-SAME: outs(%[[OUT]] : memref<7x14x21xf32>)
+
+// CHECK: ^{{.+}}(%[[BBARG0:.+]]: f32, %[[BBARG1:.+]]: f32, %[[BBARG2:.+]]: f32)
+// CHECK-NEXT: %[[powf:.+]] = math.powf %[[BBARG0]], %[[BBARG1]] : f32
+// CHECK-NEXT: linalg.yield %[[powf]] : f32
+
+
+// -----
+
// CHECK-LABEL: func @fill_tensor
func.func @fill_tensor(%f: f32, %v: vector<2x4xf32>) -> (tensor<f32>, tensor<vector<2x4xf32>>) {
diff --git a/mlir/test/Dialect/Linalg/named-ops-fail.mlir b/mlir/test/Dialect/Linalg/named-ops-fail.mlir
index c351e139a97e..e92a77aa7ad0 100644
--- a/mlir/test/Dialect/Linalg/named-ops-fail.mlir
+++ b/mlir/test/Dialect/Linalg/named-ops-fail.mlir
@@ -176,6 +176,118 @@ func.func @negf_broadcast(%arg: memref<8x16xf32>, %out: memref<4x8x16xf32>) {
// -----
+func.func @reciprocal_type_cast(%arg: memref<4x8x16xf16>, %out: memref<4x8x16xf32>) {
+ // CHECK: operand 1 ('f16') doesn't match the element type of the enclosing linalg.generic op ('f32')
+ linalg.reciprocal ins(%arg : memref<4x8x16xf16>) outs(%out: memref<4x8x16xf32>)
+ return
+}
+
+// -----
+
+func.func @reciprocal_broadcast(%arg: memref<8x16xf32>, %out: memref<4x8x16xf32>) {
+ // CHECK: op expected operand rank (2) to match the result rank of indexing_map #0 (3)
+ linalg.reciprocal ins(%arg : memref<8x16xf32>) outs(%out: memref<4x8x16xf32>)
+ return
+}
+
+// -----
+
+func.func @round_type_cast(%arg: memref<4x8x16xf16>, %out: memref<4x8x16xf32>) {
+ // CHECK: operand 1 ('f16') doesn't match the element type of the enclosing linalg.generic op ('f32')
+ linalg.round ins(%arg : memref<4x8x16xf16>) outs(%out: memref<4x8x16xf32>)
+ return
+}
+
+// -----
+
+func.func @round_broadcast(%arg: memref<8x16xf32>, %out: memref<4x8x16xf32>) {
+ // CHECK: op expected operand rank (2) to match the result rank of indexing_map #0 (3)
+ linalg.round ins(%arg : memref<8x16xf32>) outs(%out: memref<4x8x16xf32>)
+ return
+}
+
+// -----
+
+func.func @sqrt_type_cast(%arg: memref<4x8x16xf16>, %out: memref<4x8x16xf32>) {
+ // CHECK: operand 1 ('f16') doesn't match the element type of the enclosing linalg.generic op ('f32')
+ linalg.sqrt ins(%arg : memref<4x8x16xf16>) outs(%out: memref<4x8x16xf32>)
+ return
+}
+
+// -----
+
+func.func @sqrt_broadcast(%arg: memref<8x16xf32>, %out: memref<4x8x16xf32>) {
+ // CHECK: op expected operand rank (2) to match the result rank of indexing_map #0 (3)
+ linalg.sqrt ins(%arg : memref<8x16xf32>) outs(%out: memref<4x8x16xf32>)
+ return
+}
+
+// -----
+
+func.func @rsqrt_type_cast(%arg: memref<4x8x16xf16>, %out: memref<4x8x16xf32>) {
+ // CHECK: operand 1 ('f16') doesn't match the element type of the enclosing linalg.generic op ('f32')
+ linalg.rsqrt ins(%arg : memref<4x8x16xf16>) outs(%out: memref<4x8x16xf32>)
+ return
+}
+
+// -----
+
+func.func @rsqrt_broadcast(%arg: memref<8x16xf32>, %out: memref<4x8x16xf32>) {
+ // CHECK: op expected operand rank (2) to match the result rank of indexing_map #0 (3)
+ linalg.rsqrt ins(%arg : memref<8x16xf32>) outs(%out: memref<4x8x16xf32>)
+ return
+}
+
+// -----
+
+func.func @square_type_cast(%arg: memref<4x8x16xf16>, %out: memref<4x8x16xf32>) {
+ // CHECK: operand 1 ('f16') doesn't match the element type of the enclosing linalg.generic op ('f32')
+ linalg.square ins(%arg : memref<4x8x16xf16>) outs(%out: memref<4x8x16xf32>)
+ return
+}
+
+// -----
+
+func.func @square_broadcast(%arg: memref<8x16xf32>, %out: memref<4x8x16xf32>) {
+ // CHECK: op expected operand rank (2) to match the result rank of indexing_map #0 (3)
+ linalg.square ins(%arg : memref<8x16xf32>) outs(%out: memref<4x8x16xf32>)
+ return
+}
+
+// -----
+
+func.func @tanh_type_cast(%arg: memref<4x8x16xf16>, %out: memref<4x8x16xf32>) {
+ // CHECK: operand 1 ('f16') doesn't match the element type of the enclosing linalg.generic op ('f32')
+ linalg.tanh ins(%arg : memref<4x8x16xf16>) outs(%out: memref<4x8x16xf32>)
+ return
+}
+
+// -----
+
+func.func @tanh_broadcast(%arg: memref<8x16xf32>, %out: memref<4x8x16xf32>) {
+ // CHECK: op expected operand rank (2) to match the result rank of indexing_map #0 (3)
+ linalg.tanh ins(%arg : memref<8x16xf32>) outs(%out: memref<4x8x16xf32>)
+ return
+}
+
+// -----
+
+func.func @erf_type_cast(%arg: memref<4x8x16xf16>, %out: memref<4x8x16xf32>) {
+ // CHECK: operand 1 ('f16') doesn't match the element type of the enclosing linalg.generic op ('f32')
+ linalg.erf ins(%arg : memref<4x8x16xf16>) outs(%out: memref<4x8x16xf32>)
+ return
+}
+
+// -----
+
+func.func @erf_broadcast(%arg: memref<8x16xf32>, %out: memref<4x8x16xf32>) {
+ // CHECK: op expected operand rank (2) to match the result rank of indexing_map #0 (3)
+ linalg.erf ins(%arg : memref<8x16xf32>) outs(%out: memref<4x8x16xf32>)
+ return
+}
+
+// -----
+
func.func @max_type_cast(%arg0: memref<4x8x16xf32>, %arg1: memref<4x8x16xf16>, %arg2: memref<4x8x16xf32>) {
// CHECK: op requires the same type for all operands and results
linalg.max ins(%arg0, %arg1 : memref<4x8x16xf32>, memref<4x8x16xf16>) outs(%arg2: memref<4x8x16xf32>)
@@ -189,3 +301,36 @@ func.func @max_broadcast(%arg0: memref<8x16xf32>, %arg1: memref<4x8x16xf32>, %ar
linalg.max ins(%arg0, %arg1 : memref<8x16xf32>, memref<4x8x16xf32>) outs(%arg2: memref<4x8x16xf32>)
return
}
+
+// -----
+
+func.func @min_type_cast(%arg0: memref<4x8x16xf32>, %arg1: memref<4x8x16xf16>, %arg2: memref<4x8x16xf32>) {
+ // CHECK: op requires the same type for all operands and results
+ linalg.min ins(%arg0, %arg1 : memref<4x8x16xf32>, memref<4x8x16xf16>) outs(%arg2: memref<4x8x16xf32>)
+ return
+}
+
+// -----
+
+func.func @min_broadcast(%arg0: memref<8x16xf32>, %arg1: memref<4x8x16xf32>, %arg2: memref<4x8x16xf32>) {
+ // CHECK: op expected operand rank (2) to match the result rank of indexing_map #0 (3)
+ linalg.min ins(%arg0, %arg1 : memref<8x16xf32>, memref<4x8x16xf32>) outs(%arg2: memref<4x8x16xf32>)
+ return
+}
+
+// -----
+
+func.func @powf_type_cast(%arg0: memref<4x8x16xf32>, %arg1: memref<4x8x16xf16>, %arg2: memref<4x8x16xf32>) {
+ // CHECK: op requires the same type for all operands and results
+ linalg.powf ins(%arg0, %arg1 : memref<4x8x16xf32>, memref<4x8x16xf16>) outs(%arg2: memref<4x8x16xf32>)
+ return
+}
+
+// -----
+
+func.func @powf_broadcast(%arg0: memref<8x16xf32>, %arg1: memref<4x8x16xf32>, %arg2: memref<4x8x16xf32>) {
+ // CHECK: op expected operand rank (2) to match the result rank of indexing_map #0 (3)
+ linalg.powf ins(%arg0, %arg1 : memref<8x16xf32>, memref<4x8x16xf32>) outs(%arg2: memref<4x8x16xf32>)
+ return
+}
+
diff --git a/mlir/test/Dialect/Linalg/named-ops.mlir b/mlir/test/Dialect/Linalg/named-ops.mlir
index 7064e1b3f9dc..fefe5578947f 100644
--- a/mlir/test/Dialect/Linalg/named-ops.mlir
+++ b/mlir/test/Dialect/Linalg/named-ops.mlir
@@ -1597,6 +1597,223 @@ func.func @negf_tensor(%arg0: tensor<4x8x16xf32>) -> tensor<4x8x16xf32> {
// -----
+// CHECK-LABEL: func @reciprocal_dynamic
+func.func @reciprocal_dynamic(%arg0: memref<?x?x?xf32>, %arg1: memref<?x?x?xf32>) {
+ // CHECK: linalg.reciprocal
+ // CHECK-SAME: ins(%{{.+}} : memref<?x?x?xf32>) outs(%{{.+}} : memref<?x?x?xf32>)
+ linalg.reciprocal ins(%arg0 : memref<?x?x?xf32>) outs(%arg1: memref<?x?x?xf32>)
+ return
+}
+
+// -----
+
+// CHECK-LABEL: func @reciprocal_static
+func.func @reciprocal_static(%arg0: memref<4x8x16xf32>, %arg1: memref<4x8x16xf32>) {
+ // CHECK: linalg.reciprocal
+ // CHECK-SAME: ins(%{{.+}} : memref<4x8x16xf32>) outs(%{{.+}} : memref<4x8x16xf32>)
+ linalg.reciprocal ins(%arg0 : memref<4x8x16xf32>) outs(%arg1: memref<4x8x16xf32>)
+ return
+}
+
+// -----
+
+// CHECK-LABEL: func @reciprocal_tensor
+func.func @reciprocal_tensor(%arg0: tensor<4x8x16xf32>) -> tensor<4x8x16xf32> {
+ %0 = tensor.empty() : tensor<4x8x16xf32>
+ // CHECK: linalg.reciprocal
+ // CHECK-SAME: ins(%{{.+}} : tensor<4x8x16xf32>) outs(%{{.+}} : tensor<4x8x16xf32>)
+ %1 = linalg.reciprocal ins(%arg0 : tensor<4x8x16xf32>) outs(%0: tensor<4x8x16xf32>) -> tensor<4x8x16xf32>
+ return %1 : tensor<4x8x16xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @round_dynamic
+func.func @round_dynamic(%arg0: memref<?x?x?xf32>, %arg1: memref<?x?x?xf32>) {
+ // CHECK: linalg.round
+ // CHECK-SAME: ins(%{{.+}} : memref<?x?x?xf32>) outs(%{{.+}} : memref<?x?x?xf32>)
+ linalg.round ins(%arg0 : memref<?x?x?xf32>) outs(%arg1: memref<?x?x?xf32>)
+ return
+}
+
+// -----
+
+// CHECK-LABEL: func @round_static
+func.func @round_static(%arg0: memref<4x8x16xf32>, %arg1: memref<4x8x16xf32>) {
+ // CHECK: linalg.round
+ // CHECK-SAME: ins(%{{.+}} : memref<4x8x16xf32>) outs(%{{.+}} : memref<4x8x16xf32>)
+ linalg.round ins(%arg0 : memref<4x8x16xf32>) outs(%arg1: memref<4x8x16xf32>)
+ return
+}
+
+// -----
+
+// CHECK-LABEL: func @round_tensor
+func.func @round_tensor(%arg0: tensor<4x8x16xf32>) -> tensor<4x8x16xf32> {
+ %0 = tensor.empty() : tensor<4x8x16xf32>
+ // CHECK: linalg.round
+ // CHECK-SAME: ins(%{{.+}} : tensor<4x8x16xf32>) outs(%{{.+}} : tensor<4x8x16xf32>)
+ %1 = linalg.round ins(%arg0 : tensor<4x8x16xf32>) outs(%0: tensor<4x8x16xf32>) -> tensor<4x8x16xf32>
+ return %1 : tensor<4x8x16xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @sqrt_dynamic
+func.func @sqrt_dynamic(%arg0: memref<?x?x?xf32>, %arg1: memref<?x?x?xf32>) {
+ // CHECK: linalg.sqrt
+ // CHECK-SAME: ins(%{{.+}} : memref<?x?x?xf32>) outs(%{{.+}} : memref<?x?x?xf32>)
+ linalg.sqrt ins(%arg0 : memref<?x?x?xf32>) outs(%arg1: memref<?x?x?xf32>)
+ return
+}
+
+// -----
+
+// CHECK-LABEL: func @sqrt_static
+func.func @sqrt_static(%arg0: memref<4x8x16xf32>, %arg1: memref<4x8x16xf32>) {
+ // CHECK: linalg.sqrt
+ // CHECK-SAME: ins(%{{.+}} : memref<4x8x16xf32>) outs(%{{.+}} : memref<4x8x16xf32>)
+ linalg.sqrt ins(%arg0 : memref<4x8x16xf32>) outs(%arg1: memref<4x8x16xf32>)
+ return
+}
+
+// -----
+
+// CHECK-LABEL: func @sqrt_tensor
+func.func @sqrt_tensor(%arg0: tensor<4x8x16xf32>) -> tensor<4x8x16xf32> {
+ %0 = tensor.empty() : tensor<4x8x16xf32>
+ // CHECK: linalg.sqrt
+ // CHECK-SAME: ins(%{{.+}} : tensor<4x8x16xf32>) outs(%{{.+}} : tensor<4x8x16xf32>)
+ %1 = linalg.sqrt ins(%arg0 : tensor<4x8x16xf32>) outs(%0: tensor<4x8x16xf32>) -> tensor<4x8x16xf32>
+ return %1 : tensor<4x8x16xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @rsqrt_dynamic
+func.func @rsqrt_dynamic(%arg0: memref<?x?x?xf32>, %arg1: memref<?x?x?xf32>) {
+ // CHECK: linalg.rsqrt
+ // CHECK-SAME: ins(%{{.+}} : memref<?x?x?xf32>) outs(%{{.+}} : memref<?x?x?xf32>)
+ linalg.rsqrt ins(%arg0 : memref<?x?x?xf32>) outs(%arg1: memref<?x?x?xf32>)
+ return
+}
+
+// -----
+
+// CHECK-LABEL: func @rsqrt_static
+func.func @rsqrt_static(%arg0: memref<4x8x16xf32>, %arg1: memref<4x8x16xf32>) {
+ // CHECK: linalg.rsqrt
+ // CHECK-SAME: ins(%{{.+}} : memref<4x8x16xf32>) outs(%{{.+}} : memref<4x8x16xf32>)
+ linalg.rsqrt ins(%arg0 : memref<4x8x16xf32>) outs(%arg1: memref<4x8x16xf32>)
+ return
+}
+
+// -----
+
+// CHECK-LABEL: func @rsqrt_tensor
+func.func @rsqrt_tensor(%arg0: tensor<4x8x16xf32>) -> tensor<4x8x16xf32> {
+ %0 = tensor.empty() : tensor<4x8x16xf32>
+ // CHECK: linalg.rsqrt
+ // CHECK-SAME: ins(%{{.+}} : tensor<4x8x16xf32>) outs(%{{.+}} : tensor<4x8x16xf32>)
+ %1 = linalg.rsqrt ins(%arg0 : tensor<4x8x16xf32>) outs(%0: tensor<4x8x16xf32>) -> tensor<4x8x16xf32>
+ return %1 : tensor<4x8x16xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @square_dynamic
+func.func @square_dynamic(%arg0: memref<?x?x?xf32>, %arg1: memref<?x?x?xf32>) {
+ // CHECK: linalg.square
+ // CHECK-SAME: ins(%{{.+}} : memref<?x?x?xf32>) outs(%{{.+}} : memref<?x?x?xf32>)
+ linalg.square ins(%arg0 : memref<?x?x?xf32>) outs(%arg1: memref<?x?x?xf32>)
+ return
+}
+
+// -----
+
+// CHECK-LABEL: func @square_static
+func.func @square_static(%arg0: memref<4x8x16xf32>, %arg1: memref<4x8x16xf32>) {
+ // CHECK: linalg.square
+ // CHECK-SAME: ins(%{{.+}} : memref<4x8x16xf32>) outs(%{{.+}} : memref<4x8x16xf32>)
+ linalg.square ins(%arg0 : memref<4x8x16xf32>) outs(%arg1: memref<4x8x16xf32>)
+ return
+}
+
+// -----
+
+// CHECK-LABEL: func @square_tensor
+func.func @square_tensor(%arg0: tensor<4x8x16xf32>) -> tensor<4x8x16xf32> {
+ %0 = tensor.empty() : tensor<4x8x16xf32>
+ // CHECK: linalg.square
+ // CHECK-SAME: ins(%{{.+}} : tensor<4x8x16xf32>) outs(%{{.+}} : tensor<4x8x16xf32>)
+ %1 = linalg.square ins(%arg0 : tensor<4x8x16xf32>) outs(%0: tensor<4x8x16xf32>) -> tensor<4x8x16xf32>
+ return %1 : tensor<4x8x16xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @tanh_dynamic
+func.func @tanh_dynamic(%arg0: memref<?x?x?xf32>, %arg1: memref<?x?x?xf32>) {
+ // CHECK: linalg.tanh
+ // CHECK-SAME: ins(%{{.+}} : memref<?x?x?xf32>) outs(%{{.+}} : memref<?x?x?xf32>)
+ linalg.tanh ins(%arg0 : memref<?x?x?xf32>) outs(%arg1: memref<?x?x?xf32>)
+ return
+}
+
+// -----
+
+// CHECK-LABEL: func @tanh_static
+func.func @tanh_static(%arg0: memref<4x8x16xf32>, %arg1: memref<4x8x16xf32>) {
+ // CHECK: linalg.tanh
+ // CHECK-SAME: ins(%{{.+}} : memref<4x8x16xf32>) outs(%{{.+}} : memref<4x8x16xf32>)
+ linalg.tanh ins(%arg0 : memref<4x8x16xf32>) outs(%arg1: memref<4x8x16xf32>)
+ return
+}
+
+// -----
+
+// CHECK-LABEL: func @tanh_tensor
+func.func @tanh_tensor(%arg0: tensor<4x8x16xf32>) -> tensor<4x8x16xf32> {
+ %0 = tensor.empty() : tensor<4x8x16xf32>
+ // CHECK: linalg.tanh
+ // CHECK-SAME: ins(%{{.+}} : tensor<4x8x16xf32>) outs(%{{.+}} : tensor<4x8x16xf32>)
+ %1 = linalg.tanh ins(%arg0 : tensor<4x8x16xf32>) outs(%0: tensor<4x8x16xf32>) -> tensor<4x8x16xf32>
+ return %1 : tensor<4x8x16xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @erf_dynamic
+func.func @erf_dynamic(%arg0: memref<?x?x?xf32>, %arg1: memref<?x?x?xf32>) {
+ // CHECK: linalg.erf
+ // CHECK-SAME: ins(%{{.+}} : memref<?x?x?xf32>) outs(%{{.+}} : memref<?x?x?xf32>)
+ linalg.erf ins(%arg0 : memref<?x?x?xf32>) outs(%arg1: memref<?x?x?xf32>)
+ return
+}
+
+// -----
+
+// CHECK-LABEL: func @erf_static
+func.func @erf_static(%arg0: memref<4x8x16xf32>, %arg1: memref<4x8x16xf32>) {
+ // CHECK: linalg.erf
+ // CHECK-SAME: ins(%{{.+}} : memref<4x8x16xf32>) outs(%{{.+}} : memref<4x8x16xf32>)
+ linalg.erf ins(%arg0 : memref<4x8x16xf32>) outs(%arg1: memref<4x8x16xf32>)
+ return
+}
+
+// -----
+
+// CHECK-LABEL: func @erf_tensor
+func.func @erf_tensor(%arg0: tensor<4x8x16xf32>) -> tensor<4x8x16xf32> {
+ %0 = tensor.empty() : tensor<4x8x16xf32>
+ // CHECK: linalg.erf
+ // CHECK-SAME: ins(%{{.+}} : tensor<4x8x16xf32>) outs(%{{.+}} : tensor<4x8x16xf32>)
+ %1 = linalg.erf ins(%arg0 : tensor<4x8x16xf32>) outs(%0: tensor<4x8x16xf32>) -> tensor<4x8x16xf32>
+ return %1 : tensor<4x8x16xf32>
+}
+
+// -----
+
// CHECK-LABEL: func @max_dynamic
func.func @max_dynamic(%arg0: memref<?x?x?xf32>, %arg1: memref<?x?x?xf32>, %arg2: memref<?x?x?xf32>) {
// CHECK: linalg.max
@@ -1631,6 +1848,74 @@ func.func @max_tensor(%arg0: tensor<4x8x16xf32>, %arg1: tensor<4x8x16xf32>) -> t
// -----
+// CHECK-LABEL: func @min_dynamic
+func.func @min_dynamic(%arg0: memref<?x?x?xf32>, %arg1: memref<?x?x?xf32>, %arg2: memref<?x?x?xf32>) {
+ // CHECK: linalg.min
+ // CHECK-SAME: ins(%{{.+}}, %{{.+}} : memref<?x?x?xf32>, memref<?x?x?xf32>)
+ // CHECK-SAME: outs(%{{.+}} : memref<?x?x?xf32>)
+ linalg.min ins(%arg0, %arg1 : memref<?x?x?xf32>, memref<?x?x?xf32>) outs(%arg2: memref<?x?x?xf32>)
+ return
+}
+
+// -----
+
+// CHECK-LABEL: func @min_static
+func.func @min_static(%arg0: memref<4x8x16xf32>, %arg1: memref<4x8x16xf32>, %arg2: memref<4x8x16xf32>) {
+ // CHECK: linalg.min
+ // CHECK-SAME: ins(%{{.+}}, %{{.+}} : memref<4x8x16xf32>, memref<4x8x16xf32>)
+ // CHECK-SAME: outs(%{{.+}} : memref<4x8x16xf32>)
+ linalg.min ins(%arg0, %arg1 : memref<4x8x16xf32>, memref<4x8x16xf32>) outs(%arg2: memref<4x8x16xf32>)
+ return
+}
+
+// -----
+
+// CHECK-LABEL: func @min_tensor
+func.func @min_tensor(%arg0: tensor<4x8x16xf32>, %arg1: tensor<4x8x16xf32>) -> tensor<4x8x16xf32> {
+ %0 = tensor.empty() : tensor<4x8x16xf32>
+ // CHECK: linalg.min
+ // CHECK-SAME: ins(%{{.+}}, %{{.+}} : tensor<4x8x16xf32>, tensor<4x8x16xf32>)
+ // CHECK-SAME: outs(%{{.+}} : tensor<4x8x16xf32>)
+ %1 = linalg.min ins(%arg0, %arg1 : tensor<4x8x16xf32>, tensor<4x8x16xf32>) outs(%0: tensor<4x8x16xf32>) -> tensor<4x8x16xf32>
+ return %1 : tensor<4x8x16xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @powf_dynamic
+func.func @powf_dynamic(%arg0: memref<?x?x?xf32>, %arg1: memref<?x?x?xf32>, %arg2: memref<?x?x?xf32>) {
+ // CHECK: linalg.powf
+ // CHECK-SAME: ins(%{{.+}}, %{{.+}} : memref<?x?x?xf32>, memref<?x?x?xf32>)
+ // CHECK-SAME: outs(%{{.+}} : memref<?x?x?xf32>)
+ linalg.powf ins(%arg0, %arg1 : memref<?x?x?xf32>, memref<?x?x?xf32>) outs(%arg2: memref<?x?x?xf32>)
+ return
+}
+
+// -----
+
+// CHECK-LABEL: func @powf_static
+func.func @powf_static(%arg0: memref<4x8x16xf32>, %arg1: memref<4x8x16xf32>, %arg2: memref<4x8x16xf32>) {
+ // CHECK: linalg.powf
+ // CHECK-SAME: ins(%{{.+}}, %{{.+}} : memref<4x8x16xf32>, memref<4x8x16xf32>)
+ // CHECK-SAME: outs(%{{.+}} : memref<4x8x16xf32>)
+ linalg.powf ins(%arg0, %arg1 : memref<4x8x16xf32>, memref<4x8x16xf32>) outs(%arg2: memref<4x8x16xf32>)
+ return
+}
+
+// -----
+
+// CHECK-LABEL: func @powf_tensor
+func.func @powf_tensor(%arg0: tensor<4x8x16xf32>, %arg1: tensor<4x8x16xf32>) -> tensor<4x8x16xf32> {
+ %0 = tensor.empty() : tensor<4x8x16xf32>
+ // CHECK: linalg.powf
+ // CHECK-SAME: ins(%{{.+}}, %{{.+}} : tensor<4x8x16xf32>, tensor<4x8x16xf32>)
+ // CHECK-SAME: outs(%{{.+}} : tensor<4x8x16xf32>)
+ %1 = linalg.powf ins(%arg0, %arg1 : tensor<4x8x16xf32>, tensor<4x8x16xf32>) outs(%0: tensor<4x8x16xf32>) -> tensor<4x8x16xf32>
+ return %1 : tensor<4x8x16xf32>
+}
+
+// -----
+
// CHECK-LABEL: func @fill_tensor
func.func @fill_tensor(%f: f32, %v: vector<2x4xf32>) -> (tensor<f32>, tensor<vector<2x4xf32>>) {
%e0 = tensor.empty() : tensor<f32>
diff --git a/mlir/test/Dialect/MemRef/emulate-narrow-type.mlir b/mlir/test/Dialect/MemRef/emulate-narrow-type.mlir
index fd37b7ff0a27..435dcc944778 100644
--- a/mlir/test/Dialect/MemRef/emulate-narrow-type.mlir
+++ b/mlir/test/Dialect/MemRef/emulate-narrow-type.mlir
@@ -430,3 +430,23 @@ func.func @rank_zero_memref_store(%arg0: i4) -> () {
// CHECK32: %[[EXTUI:.+]] = arith.extui %[[ARG0]] : i4 to i32
// CHECK32: %[[WRITE_RMW:.+]] = memref.atomic_rmw assign %[[EXTUI]], %[[ALLOC]][] : (i32, memref<i32>) -> i32
// CHECK32: return
+
+// -----
+
+func.func @memref_collapse_shape_i4(%idx0 : index, %idx1 : index) -> i4 {
+ %arr = memref.alloc() : memref<32x8x128xi4>
+ %collapse = memref.collapse_shape %arr[[0, 1], [2]] : memref<32x8x128xi4> into memref<256x128xi4>
+ %1 = memref.load %collapse[%idx0, %idx1] : memref<256x128xi4>
+ return %1 : i4
+}
+
+// CHECK-LABEL: func.func @memref_collapse_shape_i4(
+// CHECK: %[[ALLOC:.*]] = memref.alloc() : memref<16384xi8>
+// CHECK-NOT: memref.collapse_shape
+// CHECK: memref.load %[[ALLOC]][%{{.*}}] : memref<16384xi8>
+
+// CHECK32-LABEL: func.func @memref_collapse_shape_i4(
+// CHECK32: %[[ALLOC:.*]] = memref.alloc() : memref<4096xi32>
+// CHECK32-NOT: memref.collapse_shape
+// CHECK32: memref.load %[[ALLOC]][%{{.*}}] : memref<4096xi32>
+
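For reference, the emulation above works because 32x8x128 = 32768 i4 elements pack into 16384 bytes (or 4096 i32 words), and every load is linearized into that flat buffer, so the memref.collapse_shape contributes nothing and folds away. A minimal sketch of the index arithmetic for the i8 case, assuming little-endian nibble packing (loadI4 and buf are hypothetical names, not part of the pass):

#include <cstdint>

// Load element (idx0, idx1) of the collapsed 256x128 i4 view from the flat
// i8 buffer: two i4 values per byte, even linear index in the low nibble.
uint8_t loadI4(const uint8_t *buf, int64_t idx0, int64_t idx1) {
  int64_t linear = idx0 * 128 + idx1; // row-major linearization
  uint8_t byte = buf[linear / 2];     // 16384 bytes hold 32768 i4 elements
  int shift = static_cast<int>(linear % 2) * 4;
  return (byte >> shift) & 0xF;
}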
diff --git a/mlir/test/Dialect/MemRef/expand-strided-metadata.mlir b/mlir/test/Dialect/MemRef/expand-strided-metadata.mlir
index 28b700430059..0705b30ca45d 100644
--- a/mlir/test/Dialect/MemRef/expand-strided-metadata.mlir
+++ b/mlir/test/Dialect/MemRef/expand-strided-metadata.mlir
@@ -1513,4 +1513,26 @@ func.func @zero_sized_memred(%arg0: f32) -> (memref<f16, 3>, index,index,index)
%sizes, %strides :
memref<f16,3>, index,
index, index
-} \ No newline at end of file
+}
+
+// -----
+
+func.func @extract_strided_metadata_of_collapse_shape(%base: memref<5x4xf32>)
+ -> (memref<f32>, index, index, index) {
+
+ %collapse = memref.collapse_shape %base[[0, 1]] :
+ memref<5x4xf32> into memref<20xf32>
+
+ %base_buffer, %offset, %size, %stride = memref.extract_strided_metadata %collapse :
+ memref<20xf32> -> memref<f32>, index, index, index
+
+ return %base_buffer, %offset, %size, %stride :
+ memref<f32>, index, index, index
+}
+
+// CHECK-LABEL: func @extract_strided_metadata_of_collapse_shape
+// CHECK-DAG: %[[OFFSET:.*]] = arith.constant 0 : index
+// CHECK-DAG: %[[SIZE:.*]] = arith.constant 20 : index
+// CHECK-DAG: %[[STEP:.*]] = arith.constant 1 : index
+// CHECK: %[[BASE:.*]], %{{.*}}, %{{.*}}, %{{.*}} = memref.extract_strided_metadata
+// CHECK: return %[[BASE]], %[[OFFSET]], %[[SIZE]], %[[STEP]] : memref<f32>, index, index, index
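The folded constants follow directly from the shapes involved: collapsing the contiguous dims of a row-major 5x4 memref yields one dim of size 5*4 = 20 with stride 1 and offset 0. A sketch of that arithmetic under the contiguity assumption (illustrative only, not the pattern's implementation):

#include <array>
#include <cstdint>

struct Metadata { int64_t offset, size, stride; };

// Collapse a two-dim contiguous group; for row-major 5x4 the strides are
// {4, 1}, the group is contiguous since strides[0] == sizes[1] * strides[1],
// and the result is {0, 20, 1} -- the constants FileCheck expects above.
Metadata collapseContiguous(std::array<int64_t, 2> sizes,
                            std::array<int64_t, 2> strides) {
  return {/*offset=*/0, /*size=*/sizes[0] * sizes[1], /*stride=*/strides[1]};
}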
diff --git a/mlir/test/Dialect/Polynomial/ops.mlir b/mlir/test/Dialect/Polynomial/ops.mlir
new file mode 100644
index 000000000000..ea1b279fa1ff
--- /dev/null
+++ b/mlir/test/Dialect/Polynomial/ops.mlir
@@ -0,0 +1,82 @@
+// RUN: mlir-opt %s | FileCheck %s
+
+// This simply tests for syntax.
+
+#my_poly = #polynomial.polynomial<1 + x**1024>
+#my_poly_2 = #polynomial.polynomial<2>
+#my_poly_3 = #polynomial.polynomial<3x>
+#my_poly_4 = #polynomial.polynomial<t**3 + 4t + 2>
+#ring1 = #polynomial.ring<coefficientType=i32, coefficientModulus=2837465, polynomialModulus=#my_poly>
+#one_plus_x_squared = #polynomial.polynomial<1 + x**2>
+
+#ideal = #polynomial.polynomial<-1 + x**1024>
+#ring = #polynomial.ring<coefficientType=i32, coefficientModulus=18, polynomialModulus=#ideal>
+!poly_ty = !polynomial.polynomial<#ring>
+
+module {
+ func.func @test_multiply() -> !polynomial.polynomial<#ring1> {
+ %c0 = arith.constant 0 : index
+ %two = arith.constant 2 : i16
+ %five = arith.constant 5 : i16
+ %coeffs1 = tensor.from_elements %two, %two, %five : tensor<3xi16>
+ %coeffs2 = tensor.from_elements %five, %five, %two : tensor<3xi16>
+
+ %poly1 = polynomial.from_tensor %coeffs1 : tensor<3xi16> -> !polynomial.polynomial<#ring1>
+ %poly2 = polynomial.from_tensor %coeffs2 : tensor<3xi16> -> !polynomial.polynomial<#ring1>
+
+ %3 = polynomial.mul %poly1, %poly2 : !polynomial.polynomial<#ring1>
+
+ return %3 : !polynomial.polynomial<#ring1>
+ }
+
+ func.func @test_elementwise(%p0 : !polynomial.polynomial<#ring1>, %p1: !polynomial.polynomial<#ring1>) {
+ %tp0 = tensor.from_elements %p0, %p1 : tensor<2x!polynomial.polynomial<#ring1>>
+ %tp1 = tensor.from_elements %p1, %p0 : tensor<2x!polynomial.polynomial<#ring1>>
+
+ %c = arith.constant 2 : i32
+ %mul_const_sclr = polynomial.mul_scalar %tp0, %c : tensor<2x!polynomial.polynomial<#ring1>>, i32
+
+ %add = polynomial.add %tp0, %tp1 : tensor<2x!polynomial.polynomial<#ring1>>
+ %sub = polynomial.sub %tp0, %tp1 : tensor<2x!polynomial.polynomial<#ring1>>
+ %mul = polynomial.mul %tp0, %tp1 : tensor<2x!polynomial.polynomial<#ring1>>
+
+ return
+ }
+
+ func.func @test_to_from_tensor(%p0 : !polynomial.polynomial<#ring1>) {
+ %c0 = arith.constant 0 : index
+ %two = arith.constant 2 : i16
+ %coeffs1 = tensor.from_elements %two, %two : tensor<2xi16>
+ // CHECK: from_tensor
+ %poly = polynomial.from_tensor %coeffs1 : tensor<2xi16> -> !polynomial.polynomial<#ring1>
+ // CHECK: to_tensor
+ %tensor = polynomial.to_tensor %poly : !polynomial.polynomial<#ring1> -> tensor<1024xi16>
+
+ return
+ }
+
+ func.func @test_degree(%p0 : !polynomial.polynomial<#ring1>) {
+ %0, %1 = polynomial.leading_term %p0 : !polynomial.polynomial<#ring1> -> (index, i32)
+ return
+ }
+
+ func.func @test_monomial() {
+ %deg = arith.constant 1023 : index
+ %five = arith.constant 5 : i16
+ %0 = polynomial.monomial %five, %deg : (i16, index) -> !polynomial.polynomial<#ring1>
+ return
+ }
+
+ func.func @test_monic_monomial_mul() {
+ %five = arith.constant 5 : index
+ %0 = polynomial.constant #one_plus_x_squared : !polynomial.polynomial<#ring1>
+ %1 = polynomial.monic_monomial_mul %0, %five : (!polynomial.polynomial<#ring1>, index) -> !polynomial.polynomial<#ring1>
+ return
+ }
+
+ func.func @test_constant() {
+ %0 = polynomial.constant #one_plus_x_squared : !polynomial.polynomial<#ring1>
+ %1 = polynomial.constant <1 + x**2> : !polynomial.polynomial<#ring1>
+ return
+ }
+}
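To make the ring semantics concrete: in #ring1, arithmetic happens modulo the coefficientModulus 2837465 and modulo the polynomialModulus 1 + x**1024, so x**1024 acts as -1 (a negacyclic wrap). A minimal coefficient-vector sketch of polynomial.mul under those assumed quotient-ring semantics (mulMod is a hypothetical helper, not dialect code):

#include <cstdint>
#include <vector>

// Multiply two coefficient vectors mod q and mod x^n + 1 (so x^n == -1).
std::vector<int64_t> mulMod(const std::vector<int64_t> &a,
                            const std::vector<int64_t> &b, int64_t q,
                            size_t n) {
  std::vector<int64_t> out(n, 0);
  for (size_t i = 0; i < a.size(); ++i)
    for (size_t j = 0; j < b.size(); ++j) {
      int64_t term = (a[i] % q) * (b[j] % q) % q;
      if (i + j >= n)
        term = -term; // wrap-around picks up a sign from x^n == -1
      size_t k = (i + j) % n;
      out[k] = ((out[k] + term) % q + q) % q;
    }
  return out;
}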
diff --git a/mlir/test/Dialect/Polynomial/ops_errors.mlir b/mlir/test/Dialect/Polynomial/ops_errors.mlir
new file mode 100644
index 000000000000..c34a7de30e5f
--- /dev/null
+++ b/mlir/test/Dialect/Polynomial/ops_errors.mlir
@@ -0,0 +1,53 @@
+// RUN: mlir-opt --split-input-file --verify-diagnostics %s
+
+#my_poly = #polynomial.polynomial<1 + x**1024>
+#ring = #polynomial.ring<coefficientType=i16, coefficientModulus=256, polynomialModulus=#my_poly>
+!ty = !polynomial.polynomial<#ring>
+
+func.func @test_from_tensor_too_large_coeffs() {
+ %two = arith.constant 2 : i32
+ %coeffs1 = tensor.from_elements %two, %two : tensor<2xi32>
+ // expected-error@below {{is too large to fit in the coefficients}}
+ // expected-note@below {{rescaled to fit}}
+ %poly = polynomial.from_tensor %coeffs1 : tensor<2xi32> -> !ty
+ return
+}
+
+// -----
+
+#my_poly = #polynomial.polynomial<1 + x**4>
+#ring = #polynomial.ring<coefficientType=i32, coefficientModulus=256, polynomialModulus=#my_poly>
+!ty = !polynomial.polynomial<#ring>
+func.func @test_from_tensor_wrong_tensor_type() {
+ %two = arith.constant 2 : i32
+ %coeffs1 = tensor.from_elements %two, %two, %two, %two, %two : tensor<5xi32>
+ // expected-error@below {{input type 'tensor<5xi32>' does not match output type '!polynomial.polynomial<#polynomial.ring<coefficientType=i32, coefficientModulus=256 : i32, polynomialModulus=#polynomial.polynomial<1 + x**4>>>'}}
+ // expected-note@below {{at most the degree of the polynomialModulus of the output type's ring attribute}}
+ %poly = polynomial.from_tensor %coeffs1 : tensor<5xi32> -> !ty
+ return
+}
+
+// -----
+
+#my_poly = #polynomial.polynomial<1 + x**4>
+#ring = #polynomial.ring<coefficientType=i32, coefficientModulus=256, polynomialModulus=#my_poly>
+!ty = !polynomial.polynomial<#ring>
+func.func @test_to_tensor_wrong_output_tensor_type(%arg0 : !ty) {
+ // expected-error@below {{input type '!polynomial.polynomial<#polynomial.ring<coefficientType=i32, coefficientModulus=256 : i32, polynomialModulus=#polynomial.polynomial<1 + x**4>>>' does not match output type 'tensor<5xi32>'}}
+ // expected-note@below {{at most the degree of the polynomialModulus of the input type's ring attribute}}
+ %tensor = polynomial.to_tensor %arg0 : !ty -> tensor<5xi32>
+ return
+}
+
+// -----
+
+#my_poly = #polynomial.polynomial<1 + x**1024>
+#ring = #polynomial.ring<coefficientType=i16, coefficientModulus=256, polynomialModulus=#my_poly>
+!ty = !polynomial.polynomial<#ring>
+
+func.func @test_mul_scalar_wrong_type(%arg0: !ty) -> !ty {
+ %scalar = arith.constant 2 : i32 // should be i16
+ // expected-error@below {{polynomial coefficient type 'i16' does not match scalar type 'i32'}}
+ %poly = polynomial.mul_scalar %arg0, %scalar : !ty, i32
+ return %poly : !ty
+}
diff --git a/mlir/test/Dialect/Polynomial/types.mlir b/mlir/test/Dialect/Polynomial/types.mlir
index 64b74d9d36bb..00296a36e890 100644
--- a/mlir/test/Dialect/Polynomial/types.mlir
+++ b/mlir/test/Dialect/Polynomial/types.mlir
@@ -40,3 +40,17 @@ func.func @test_non_x_variable_64_bit(%0: !ty2) -> !ty2 {
func.func @test_linear_poly(%0: !ty3) -> !ty3 {
return %0 : !ty3
}
+
+// CHECK-LABEL: func @test_negative_leading_1
+// CHECK-SAME: !polynomial.polynomial<
+// CHECK-SAME: #polynomial.ring<
+// CHECK-SAME: coefficientType=i32,
+// CHECK-SAME: coefficientModulus=2837465 : i32,
+// CHECK-SAME: polynomialModulus=#polynomial.polynomial<-1 + x**1024>>>
+#my_poly_4 = #polynomial.polynomial<-1 + x**1024>
+#ring4 = #polynomial.ring<coefficientType=i32, coefficientModulus=2837465, polynomialModulus=#my_poly_4>
+!ty4 = !polynomial.polynomial<#ring4>
+func.func @test_negative_leading_1(%0: !ty4) -> !ty4 {
+ return %0 : !ty4
+}
+
diff --git a/mlir/test/Dialect/SparseTensor/fuse_sparse_convert_into_producer.mlir b/mlir/test/Dialect/SparseTensor/fuse_sparse_convert_into_producer.mlir
new file mode 100644
index 000000000000..efa92e565ba5
--- /dev/null
+++ b/mlir/test/Dialect/SparseTensor/fuse_sparse_convert_into_producer.mlir
@@ -0,0 +1,78 @@
+// RUN: mlir-opt %s --pre-sparsification-rewrite --sparse-reinterpret-map | FileCheck %s --check-prefix=CHECK-FOLD
+// RUN: mlir-opt %s --pre-sparsification-rewrite --sparse-reinterpret-map --sparsification | FileCheck %s
+
+#trait = {
+ indexing_maps = [
+ affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>,
+ affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>,
+ affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>,
+ affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
+ ],
+ iterator_types = ["parallel", "parallel", "parallel", "parallel"]
+}
+
+#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+
+#COO = #sparse_tensor.encoding<{map = (d0, d1, d2) -> (d0 : compressed(nonunique), d1 : singleton(nonunique, soa), d2 : singleton(soa))}>
+#CCCD = #sparse_tensor.encoding<{ map = (d0, d1, d2, d3) -> (d0 : compressed, d1 : compressed, d2 : compressed, d3 : dense) }>
+
+// CHECK-LABEL: func.func @fold_convert(
+// CHECK: scf.for
+// CHECK: scf.for
+// CHECK: scf.for
+// CHECK: scf.if
+// CHECK-NEXT: tensor.insert
+// CHECK-NEXT: scf.yield
+// CHECK-NEXT: else
+// CHECK-NEXT: scf.yield
+// CHECK: scf.yield
+// CHECK: scf.yield
+// CHECK: scf.yield
+// CHECK: sparse_tensor.load
+
+// CHECK-FOLD-LABEL: func.func @fold_convert(
+// CHECK-FOLD-NOT: sparse_tensor.convert
+func.func @fold_convert(%arg0: tensor<128x32x32x1xf32>, %arg1: tensor<128x32x32x1xf32>, %arg2: tensor<128x32x32x1xf32>) -> tensor<128x32x32x1xf32, #CCCD> {
+ %cst = arith.constant 0.000000e+00 : f32
+ %cst_0 = arith.constant 1.000000e+00 : f32
+ %cst_1 = arith.constant 1.000000e+00 : f32
+ %0 = tensor.empty() : tensor<128x32x32x1xf32>
+ %1 = linalg.generic #trait
+ ins(%arg0, %arg1, %arg2 : tensor<128x32x32x1xf32>, tensor<128x32x32x1xf32>, tensor<128x32x32x1xf32>)
+ outs(%0 : tensor<128x32x32x1xf32>) {
+ ^bb0(%in: f32, %in_2: f32, %in_3: f32, %out: f32):
+ %3 = arith.subf %cst_0, %in_2 : f32
+ %4 = arith.mulf %in, %3 : f32
+ %5 = arith.mulf %4, %cst_1 : f32
+ %6 = arith.addf %5, %in_3 : f32
+ %7 = arith.subf %6, %cst_0 : f32
+ %8 = arith.cmpf uge, %7, %cst : f32
+ %9 = arith.uitofp %8 : i1 to f32
+ linalg.yield %9 : f32
+ } -> tensor<128x32x32x1xf32>
+ %2 = sparse_tensor.convert %1 : tensor<128x32x32x1xf32> to tensor<128x32x32x1xf32, #CCCD>
+ return %2 : tensor<128x32x32x1xf32, #CCCD>
+}
+
+
+// FIXME: The following kernel is not sparsifiable because the `arith.select`
+// operation is not handled by the sparse compiler at the moment.
+//
+// CHECK-FOLD-LABEL: func.func @fold_cast(
+// CHECK-FOLD-NOT: sparse_tensor.convert
+func.func @fold_cast(%0: tensor<10x20x30xf64, #COO>) -> tensor<10x20x30xf64, #COO> {
+ %cst = arith.constant 0.000000e+00 : f64
+ %1 = tensor.empty() : tensor<10x20x30xf64>
+ %2 = linalg.generic { indexing_maps = [#map, #map],
+ iterator_types = ["parallel", "parallel", "parallel"]
+ }
+ ins (%0 : tensor<10x20x30xf64, #COO>)
+ outs(%1 : tensor<10x20x30xf64>) {
+ ^bb0(%in: f64, %out: f64):
+ %4 = arith.cmpf ugt, %in, %cst : f64
+ %5 = arith.select %4, %in, %cst : f64
+ linalg.yield %5 : f64
+ } -> tensor<10x20x30xf64>
+ %cast = tensor.cast %2 : tensor<10x20x30xf64> to tensor<10x20x30xf64, #COO>
+ return %cast : tensor<10x20x30xf64, #COO>
+}
diff --git a/mlir/test/Dialect/SparseTensor/no_fold_into_consumer.mlir b/mlir/test/Dialect/SparseTensor/no_fold_into_consumer.mlir
deleted file mode 100644
index bbc7f397e793..000000000000
--- a/mlir/test/Dialect/SparseTensor/no_fold_into_consumer.mlir
+++ /dev/null
@@ -1,47 +0,0 @@
-// RUN: mlir-opt %s --canonicalize --pre-sparsification-rewrite | FileCheck %s
-
-#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
-
-#sparse = #sparse_tensor.encoding<{
- map = (d0, d1, d2) ->
- (d0 : compressed(nonunique),
- d1 : singleton(nonunique, soa),
- d2 : singleton(soa)),
- posWidth = 64,
- crdWidth = 64
-}>
-
-
-module {
- //
- // This IR should not end up in an infinite loop trying to fold
- // the linalg producer into the tensor cast consumer (even though
- // static sizes can fold, the different encodings cannot). The
- // cast was sloppy to begin with (but it has been observed by
- // external sources) and can be easily repaired by the sparsifier.
- //
- // CHECK-LABEL: func @avoid_fold
- // CHECK: arith.constant
- // CHECK: tensor.empty()
- // CHECK: linalg.generic
- // CHECK: sparse_tensor.convert
- // CHECK: return
- //
- func.func @avoid_fold(%0: tensor<10x20x30xf64, #sparse>) -> tensor<10x20x30xf64, #sparse> {
- %1 = tensor.empty() : tensor<10x20x30xf64>
- %2 = linalg.generic { indexing_maps = [#map, #map],
- iterator_types = ["parallel", "parallel", "parallel"]
- }
- ins (%0 : tensor<10x20x30xf64, #sparse>)
- outs(%1 : tensor<10x20x30xf64>) {
- ^bb0(%in: f64, %out: f64):
- %cst = arith.constant 0.000000e+00 : f64
- %4 = arith.cmpf ugt, %in, %cst : f64
- %5 = arith.select %4, %in, %cst : f64
- linalg.yield %5 : f64
- } -> tensor<10x20x30xf64>
- %cast = tensor.cast %2 : tensor<10x20x30xf64> to tensor<10x20x30xf64, #sparse>
- return %cast : tensor<10x20x30xf64, #sparse>
- }
-}
-
diff --git a/mlir/test/Dialect/Tensor/canonicalize.mlir b/mlir/test/Dialect/Tensor/canonicalize.mlir
index 751c57eacd7a..9a4dd2f3b5cc 100644
--- a/mlir/test/Dialect/Tensor/canonicalize.mlir
+++ b/mlir/test/Dialect/Tensor/canonicalize.mlir
@@ -2431,6 +2431,15 @@ func.func @reshape_nofold_2d(%arg0 : tensor<?x?xi32>) -> tensor<?x?xi32> {
return %reshape : tensor<?x?xi32>
}
+// -----
+
+// CHECK-LABEL: @reshape_nofold_2d_ins
+func.func @reshape_nofold_2d_ins(%arg0 : tensor<?x?xi32>, %arg1: index, %arg2: index) -> tensor<?x?xi32> {
+ %ds = tensor.from_elements %arg1, %arg2 : tensor<2xindex>
+ // CHECK: tensor.reshape
+ %reshape = tensor.reshape %arg0(%ds) : (tensor<?x?xi32>, tensor<2xindex>) -> tensor<?x?xi32>
+ return %reshape : tensor<?x?xi32>
+}
// -----
diff --git a/mlir/unittests/Tools/lsp-server-support/Transport.cpp b/mlir/unittests/Tools/lsp-server-support/Transport.cpp
index b46f02bc4b19..a086964cd366 100644
--- a/mlir/unittests/Tools/lsp-server-support/Transport.cpp
+++ b/mlir/unittests/Tools/lsp-server-support/Transport.cpp
@@ -31,7 +31,7 @@ TEST(TransportTest, SendReply) {
}
class TransportInputTest : public Test {
- std::optional<llvm::sys::fs::TempFile> inputTempFile;
+ llvm::SmallVector<char> inputPath;
std::FILE *in = nullptr;
std::string output = "";
llvm::raw_string_ostream os;
@@ -42,25 +42,31 @@ protected:
TransportInputTest() : os(output) {}
void SetUp() override {
- auto tempOr = llvm::sys::fs::TempFile::create("lsp-unittest-%%%%%%.json");
- ASSERT_TRUE((bool)tempOr);
- llvm::sys::fs::TempFile t = std::move(*tempOr);
- inputTempFile = std::move(t);
+ std::error_code ec =
+ llvm::sys::fs::createTemporaryFile("lsp-unittest", "json", inputPath);
+ ASSERT_FALSE(ec) << "Could not create temporary file: " << ec.message();
- in = std::fopen(inputTempFile->TmpName.c_str(), "r");
+ in = std::fopen(inputPath.data(), "r");
+ ASSERT_TRUE(in) << "Could not open temporary file: "
+ << std::strerror(errno);
transport.emplace(in, os, JSONStreamStyle::Delimited);
messageHandler.emplace(*transport);
}
void TearDown() override {
- EXPECT_FALSE(inputTempFile->discard());
- EXPECT_EQ(std::fclose(in), 0);
+ EXPECT_EQ(std::fclose(in), 0)
+ << "Could not close temporary file FD: " << std::strerror(errno);
+ std::error_code ec =
+ llvm::sys::fs::remove(inputPath, /*IgnoreNonExisting=*/false);
+ EXPECT_FALSE(ec) << "Could not remove temporary file '" << inputPath.data()
+ << "': " << ec.message();
}
void writeInput(StringRef buffer) {
std::error_code ec;
- llvm::raw_fd_ostream os(inputTempFile->TmpName, ec);
- ASSERT_FALSE(ec);
+ llvm::raw_fd_ostream os(inputPath.data(), ec);
+ ASSERT_FALSE(ec) << "Could not write to '" << inputPath.data()
+ << "': " << ec.message();
os << buffer;
os.close();
}
diff --git a/offload/CMakeLists.txt b/offload/CMakeLists.txt
index abc8baa0805f..a416ac29873f 100644
--- a/offload/CMakeLists.txt
+++ b/offload/CMakeLists.txt
@@ -151,6 +151,25 @@ if (NOT LIBOMPTARGET_LLVM_INCLUDE_DIRS)
message(FATAL_ERROR "Missing definition for LIBOMPTARGET_LLVM_INCLUDE_DIRS")
endif()
+set(LIBOMPTARGET_ALL_PLUGIN_TARGETS amdgpu cuda host)
+set(LIBOMPTARGET_PLUGINS_TO_BUILD "all" CACHE STRING
+ "Semicolon-separated list of plugins to use: cuda, amdgpu, host or \"all\".")
+
+if(LIBOMPTARGET_PLUGINS_TO_BUILD STREQUAL "all")
+ set(LIBOMPTARGET_PLUGINS_TO_BUILD ${LIBOMPTARGET_ALL_PLUGIN_TARGETS})
+endif()
+
+set(LIBOMPTARGET_ENUM_PLUGIN_TARGETS "")
+foreach(plugin IN LISTS LIBOMPTARGET_PLUGINS_TO_BUILD)
+ set(LIBOMPTARGET_ENUM_PLUGIN_TARGETS
+ "${LIBOMPTARGET_ENUM_PLUGIN_TARGETS}PLUGIN_TARGET(${plugin})\n")
+endforeach()
+string(STRIP ${LIBOMPTARGET_ENUM_PLUGIN_TARGETS} LIBOMPTARGET_ENUM_PLUGIN_TARGETS)
+configure_file(
+ ${CMAKE_CURRENT_SOURCE_DIR}/include/Shared/Targets.def.in
+ ${CMAKE_CURRENT_BINARY_DIR}/include/Shared/Targets.def
+)
+
include_directories(${LIBOMPTARGET_LLVM_INCLUDE_DIRS})
# This is a list of all the targets that are supported/tested right now.
@@ -288,6 +307,7 @@ set(LIBOMPTARGET_GPU_LIBC_SUPPORT ${LLVM_LIBC_GPU_BUILD} CACHE BOOL
pythonize_bool(LIBOMPTARGET_GPU_LIBC_SUPPORT)
set(LIBOMPTARGET_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/include)
+set(LIBOMPTARGET_BINARY_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/include)
message(STATUS "OpenMP tools dir in libomptarget: ${LIBOMP_OMP_TOOLS_INCLUDE_DIR}")
if(LIBOMP_OMP_TOOLS_INCLUDE_DIR)
include_directories(${LIBOMP_OMP_TOOLS_INCLUDE_DIR})
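Concretely, the foreach above just stamps one PLUGIN_TARGET(...) line per requested plugin into the configured header. For example, configuring with -DLIBOMPTARGET_PLUGINS_TO_BUILD="cuda;host" would produce a Targets.def whose generated body is (a sketch of the expected output, not a checked-in file):

// Generated include/Shared/Targets.def body for "cuda;host":
PLUGIN_TARGET(cuda)
PLUGIN_TARGET(host)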
diff --git a/offload/include/Shared/Targets.def.in b/offload/include/Shared/Targets.def.in
new file mode 100644
index 000000000000..f34b523b4542
--- /dev/null
+++ b/offload/include/Shared/Targets.def.in
@@ -0,0 +1,20 @@
+//===-- Shared/Targets.def - Target plugin enumerator -----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Enumerates all of the supported target plugins that are available to
+// the offloading library.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PLUGIN_TARGET
+# error Please define the macro PLUGIN_TARGET(TargetName)
+#endif
+
+@LIBOMPTARGET_ENUM_PLUGIN_TARGETS@
+
+#undef PLUGIN_TARGET
diff --git a/offload/plugins-nextgen/CMakeLists.txt b/offload/plugins-nextgen/CMakeLists.txt
index dbd82ac94517..df625e97c7eb 100644
--- a/offload/plugins-nextgen/CMakeLists.txt
+++ b/offload/plugins-nextgen/CMakeLists.txt
@@ -69,9 +69,12 @@ function(add_target_library target_name lib_name)
set_target_properties(${target_name} PROPERTIES CXX_VISIBILITY_PRESET protected)
endfunction()
-add_subdirectory(amdgpu)
-add_subdirectory(cuda)
-add_subdirectory(host)
+foreach(plugin IN LISTS LIBOMPTARGET_PLUGINS_TO_BUILD)
+ if(NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${plugin})
+ message(FATAL_ERROR "Unknown plugin target '${plugin}'")
+ endif()
+ add_subdirectory(${plugin})
+endforeach()
# Make sure the parent scope can see the plugins that will be created.
set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS}" PARENT_SCOPE)
diff --git a/offload/plugins-nextgen/common/CMakeLists.txt b/offload/plugins-nextgen/common/CMakeLists.txt
index a7350e662a7c..acf0af63f050 100644
--- a/offload/plugins-nextgen/common/CMakeLists.txt
+++ b/offload/plugins-nextgen/common/CMakeLists.txt
@@ -62,6 +62,7 @@ target_link_options(PluginCommon PUBLIC ${offload_link_flags})
target_include_directories(PluginCommon PUBLIC
${CMAKE_CURRENT_SOURCE_DIR}/include
${LIBOMPTARGET_LLVM_INCLUDE_DIRS}
+ ${LIBOMPTARGET_BINARY_INCLUDE_DIR}
${LIBOMPTARGET_INCLUDE_DIR}
)
diff --git a/offload/plugins-nextgen/host/CMakeLists.txt b/offload/plugins-nextgen/host/CMakeLists.txt
index 7da18ee278d4..6407f72e8db0 100644
--- a/offload/plugins-nextgen/host/CMakeLists.txt
+++ b/offload/plugins-nextgen/host/CMakeLists.txt
@@ -14,36 +14,36 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc64le$")
endif()
# Create the library and add the default arguments.
-add_target_library(omptarget.rtl.${machine} ${machine})
+add_target_library(omptarget.rtl.host ${machine})
-target_sources(omptarget.rtl.${machine} PRIVATE src/rtl.cpp)
+target_sources(omptarget.rtl.host PRIVATE src/rtl.cpp)
if(LIBOMPTARGET_DEP_LIBFFI_FOUND)
libomptarget_say("Building ${machine} plugin linked with libffi")
if(FFI_STATIC_LIBRARIES)
- target_link_libraries(omptarget.rtl.${machine} PRIVATE FFI::ffi_static)
+ target_link_libraries(omptarget.rtl.host PRIVATE FFI::ffi_static)
else()
- target_link_libraries(omptarget.rtl.${machine} PRIVATE FFI::ffi)
+ target_link_libraries(omptarget.rtl.host PRIVATE FFI::ffi)
endif()
else()
libomptarget_say("Building ${machine} plugin for dlopened libffi")
- target_sources(omptarget.rtl.${machine} PRIVATE dynamic_ffi/ffi.cpp)
- target_include_directories(omptarget.rtl.${machine} PRIVATE dynamic_ffi)
+ target_sources(omptarget.rtl.host PRIVATE dynamic_ffi/ffi.cpp)
+ target_include_directories(omptarget.rtl.host PRIVATE dynamic_ffi)
endif()
# Install plugin under the lib destination folder.
-install(TARGETS omptarget.rtl.${machine}
+install(TARGETS omptarget.rtl.host
LIBRARY DESTINATION "${OFFLOAD_INSTALL_LIBDIR}")
-set_target_properties(omptarget.rtl.${machine} PROPERTIES
+set_target_properties(omptarget.rtl.host PROPERTIES
INSTALL_RPATH "$ORIGIN" BUILD_RPATH "$ORIGIN:${CMAKE_CURRENT_BINARY_DIR}/.."
POSITION_INDEPENDENT_CODE ON
CXX_VISIBILITY_PRESET protected)
-target_include_directories(omptarget.rtl.${machine} PRIVATE
+target_include_directories(omptarget.rtl.host PRIVATE
${LIBOMPTARGET_INCLUDE_DIR})
if(LIBOMPTARGET_DEP_LIBFFI_FOUND)
- list(APPEND LIBOMPTARGET_TESTED_PLUGINS omptarget.rtl.${machine})
+ list(APPEND LIBOMPTARGET_TESTED_PLUGINS omptarget.rtl.host)
set(LIBOMPTARGET_TESTED_PLUGINS
"${LIBOMPTARGET_TESTED_PLUGINS}" PARENT_SCOPE)
else()
@@ -52,36 +52,36 @@ endif()
# Define the target specific triples and ELF machine values.
if(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc64le$")
- target_compile_definitions(omptarget.rtl.${machine} PRIVATE TARGET_ELF_ID=EM_PPC64)
- target_compile_definitions(omptarget.rtl.${machine} PRIVATE
+ target_compile_definitions(omptarget.rtl.host PRIVATE TARGET_ELF_ID=EM_PPC64)
+ target_compile_definitions(omptarget.rtl.host PRIVATE
LIBOMPTARGET_NEXTGEN_GENERIC_PLUGIN_TRIPLE="powerpc64le-ibm-linux-gnu")
list(APPEND LIBOMPTARGET_SYSTEM_TARGETS
"powerpc64le-ibm-linux-gnu" "powerpc64le-ibm-linux-gnu-LTO")
set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS}" PARENT_SCOPE)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc64$")
- target_compile_definitions(omptarget.rtl.${machine} PRIVATE TARGET_ELF_ID=EM_PPC64)
- target_compile_definitions(omptarget.rtl.${machine} PRIVATE
+ target_compile_definitions(omptarget.rtl.host PRIVATE TARGET_ELF_ID=EM_PPC64)
+ target_compile_definitions(omptarget.rtl.host PRIVATE
LIBOMPTARGET_NEXTGEN_GENERIC_PLUGIN_TRIPLE="powerpc64-ibm-linux-gnu")
list(APPEND LIBOMPTARGET_SYSTEM_TARGETS
"powerpc64-ibm-linux-gnu" "powerpc64-ibm-linux-gnu-LTO")
set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS}" PARENT_SCOPE)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64$")
- target_compile_definitions(omptarget.rtl.${machine} PRIVATE TARGET_ELF_ID=EM_X86_64)
- target_compile_definitions(omptarget.rtl.${machine} PRIVATE
+ target_compile_definitions(omptarget.rtl.host PRIVATE TARGET_ELF_ID=EM_X86_64)
+ target_compile_definitions(omptarget.rtl.host PRIVATE
LIBOMPTARGET_NEXTGEN_GENERIC_PLUGIN_TRIPLE="x86_64-pc-linux-gnu")
list(APPEND LIBOMPTARGET_SYSTEM_TARGETS
"x86_64-pc-linux-gnu" "x86_64-pc-linux-gnu-LTO")
set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS}" PARENT_SCOPE)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64$")
- target_compile_definitions(omptarget.rtl.${machine} PRIVATE TARGET_ELF_ID=EM_AARCH64)
- target_compile_definitions(omptarget.rtl.${machine} PRIVATE
+ target_compile_definitions(omptarget.rtl.host PRIVATE TARGET_ELF_ID=EM_AARCH64)
+ target_compile_definitions(omptarget.rtl.host PRIVATE
LIBOMPTARGET_NEXTGEN_GENERIC_PLUGIN_TRIPLE="aarch64-unknown-linux-gnu")
list(APPEND LIBOMPTARGET_SYSTEM_TARGETS
"aarch64-unknown-linux-gnu" "aarch64-unknown-linux-gnu-LTO")
set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS}" PARENT_SCOPE)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "s390x$")
- target_compile_definitions(omptarget.rtl.${machine} PRIVATE TARGET_ELF_ID=EM_S390)
- target_compile_definitions(omptarget.rtl.${machine} PRIVATE
+ target_compile_definitions(omptarget.rtl.host PRIVATE TARGET_ELF_ID=EM_S390)
+ target_compile_definitions(omptarget.rtl.host PRIVATE
LIBOMPTARGET_NEXTGEN_GENERIC_PLUGIN_TRIPLE="s390x-ibm-linux-gnu")
list(APPEND LIBOMPTARGET_SYSTEM_TARGETS
"s390x-ibm-linux-gnu" "s390x-ibm-linux-gnu-LTO")
diff --git a/offload/src/CMakeLists.txt b/offload/src/CMakeLists.txt
index fb1ad3d7ae70..eda5a85ff1ab 100644
--- a/offload/src/CMakeLists.txt
+++ b/offload/src/CMakeLists.txt
@@ -37,6 +37,7 @@ add_llvm_library(omptarget
ADDITIONAL_HEADER_DIRS
${LIBOMPTARGET_INCLUDE_DIR}
+ ${LIBOMPTARGET_BINARY_INCLUDE_DIR}
LINK_COMPONENTS
Support
@@ -49,7 +50,9 @@ add_llvm_library(omptarget
NO_INSTALL_RPATH
BUILDTREE_ONLY
)
-target_include_directories(omptarget PRIVATE ${LIBOMPTARGET_INCLUDE_DIR})
+target_include_directories(omptarget PRIVATE
+ ${LIBOMPTARGET_INCLUDE_DIR} ${LIBOMPTARGET_BINARY_INCLUDE_DIR}
+)
if (LIBOMP_HAVE_VERSION_SCRIPT_FLAG)
target_link_libraries(omptarget PRIVATE
@@ -65,23 +68,6 @@ target_compile_definitions(omptarget PRIVATE
target_compile_options(omptarget PUBLIC ${offload_compile_flags})
target_link_options(omptarget PUBLIC ${offload_link_flags})
-macro(check_plugin_target target)
-if (TARGET omptarget.rtl.${target})
- list(APPEND LIBOMPTARGET_PLUGINS_TO_LOAD ${target})
-endif()
-endmacro()
-
-set(LIBOMPTARGET_PLUGINS_TO_LOAD "" CACHE STRING
- "Comma separated list of plugin names to look for at runtime")
-if (NOT LIBOMPTARGET_PLUGINS_TO_LOAD)
- check_plugin_target(ppc64)
- check_plugin_target(x86_64)
- check_plugin_target(cuda)
- check_plugin_target(aarch64)
- check_plugin_target(amdgpu)
- check_plugin_target(s390x)
-endif()
-
list(TRANSFORM LIBOMPTARGET_PLUGINS_TO_LOAD PREPEND "\"libomptarget.rtl.")
list(TRANSFORM LIBOMPTARGET_PLUGINS_TO_LOAD APPEND "\"")
list(JOIN LIBOMPTARGET_PLUGINS_TO_LOAD "," ENABLED_OFFLOAD_PLUGINS)
diff --git a/offload/src/PluginManager.cpp b/offload/src/PluginManager.cpp
index 792cae3e3dd5..dbb556c179e5 100644
--- a/offload/src/PluginManager.cpp
+++ b/offload/src/PluginManager.cpp
@@ -23,9 +23,6 @@ using namespace llvm::sys;
PluginManager *PM = nullptr;
-// List of all plugins that can support offloading.
-static const char *RTLNames[] = {ENABLED_OFFLOAD_PLUGINS};
-
Expected<std::unique_ptr<PluginAdaptorTy>>
PluginAdaptorTy::create(const std::string &Name) {
DP("Attempting to load library '%s'...\n", Name.c_str());
@@ -95,17 +92,19 @@ void PluginManager::init() {
// Attempt to open all the plugins and, if they exist, check if the interface
// is correct and if they are supporting any devices.
- for (const char *Name : RTLNames) {
- auto PluginAdaptorOrErr =
- PluginAdaptorTy::create(std::string(Name) + ".so");
- if (!PluginAdaptorOrErr) {
- [[maybe_unused]] std::string InfoMsg =
- toString(PluginAdaptorOrErr.takeError());
- DP("%s", InfoMsg.c_str());
- } else {
- PluginAdaptors.push_back(std::move(*PluginAdaptorOrErr));
- }
- }
+#define PLUGIN_TARGET(Name) \
+ do { \
+ auto PluginAdaptorOrErr = \
+ PluginAdaptorTy::create("libomptarget.rtl." #Name ".so"); \
+ if (!PluginAdaptorOrErr) { \
+ [[maybe_unused]] std::string InfoMsg = \
+ toString(PluginAdaptorOrErr.takeError()); \
+ DP("%s", InfoMsg.c_str()); \
+ } else { \
+ PluginAdaptors.push_back(std::move(*PluginAdaptorOrErr)); \
+ } \
+ } while (false);
+#include "Shared/Targets.def"
DP("RTLs loaded!\n");
}
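Since Targets.def is an X-macro header, each PLUGIN_TARGET entry expands the block defined above once. For a build where Targets.def contains only PLUGIN_TARGET(host), the preprocessed result of the #include inside PluginManager::init is, modulo whitespace, the following (a sketch of the expansion, not additional source):

do {
  auto PluginAdaptorOrErr =
      PluginAdaptorTy::create("libomptarget.rtl.host.so");
  if (!PluginAdaptorOrErr) {
    [[maybe_unused]] std::string InfoMsg =
        toString(PluginAdaptorOrErr.takeError());
    DP("%s", InfoMsg.c_str());
  } else {
    PluginAdaptors.push_back(std::move(*PluginAdaptorOrErr));
  }
} while (false);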
diff --git a/offload/test/unified_shared_memory/api.c b/offload/test/unified_shared_memory/api.c
index c7ab055abb51..b938971b4b03 100644
--- a/offload/test/unified_shared_memory/api.c
+++ b/offload/test/unified_shared_memory/api.c
@@ -9,11 +9,6 @@
#include <omp.h>
#include <stdio.h>
-// ---------------------------------------------------------------------------
-// Various definitions copied from OpenMP RTL
-
-extern void __tgt_register_requires(int64_t);
-
// End of definitions copied from OpenMP RTL.
// ---------------------------------------------------------------------------
@@ -32,10 +27,6 @@ void init(int A[], int B[], int C[]) {
int main(int argc, char *argv[]) {
const int device = omp_get_default_device();
- // Manual registration of requires flags for Clang versions
- // that do not support requires.
- __tgt_register_requires(8);
-
// CHECK: Initial device: [[INITIAL_DEVICE:[0-9]+]]
printf("Initial device: %d\n", omp_get_initial_device());
// CHECK: Num devices: [[INITIAL_DEVICE]]
diff --git a/offload/test/unified_shared_memory/close_manual.c b/offload/test/unified_shared_memory/close_manual.c
index 9985e822c05d..c588cb1c403a 100644
--- a/offload/test/unified_shared_memory/close_manual.c
+++ b/offload/test/unified_shared_memory/close_manual.c
@@ -8,8 +8,6 @@
// ---------------------------------------------------------------------------
// Various definitions copied from OpenMP RTL
-extern void __tgt_register_requires(int64_t);
-
extern void __tgt_target_data_begin(int64_t device_id, int32_t arg_num,
void **args_base, void **args,
int64_t *arg_sizes, int64_t *arg_types);
@@ -30,10 +28,6 @@ int main(int argc, char *argv[]) {
void *host_alloc = 0, *device_alloc = 0;
int *a = (int *)malloc(N * sizeof(int));
- // Manual registration of requires flags for Clang versions
- // that do not support requires.
- __tgt_register_requires(8);
-
// Init
for (int i = 0; i < N; ++i) {
a[i] = 10;
diff --git a/offload/test/unified_shared_memory/shared_update.c b/offload/test/unified_shared_memory/shared_update.c
index 65db9e4f6bdc..f8eb11d56a6c 100644
--- a/offload/test/unified_shared_memory/shared_update.c
+++ b/offload/test/unified_shared_memory/shared_update.c
@@ -11,11 +11,6 @@
#include <omp.h>
#include <stdio.h>
-// ---------------------------------------------------------------------------
-// Various definitions copied from OpenMP RTL
-
-extern void __tgt_register_requires(int64_t);
-
// End of definitions copied from OpenMP RTL.
// ---------------------------------------------------------------------------
@@ -30,10 +25,6 @@ int main(int argc, char *argv[]) {
int *alloc = (int *)malloc(N * sizeof(int));
int data[N];
- // Manual registration of requires flags for Clang versions
- // that do not support requires.
- __tgt_register_requires(8);
-
for (int i = 0; i < N; ++i) {
alloc[i] = 10;
data[i] = 1;
diff --git a/openmp/runtime/src/kmp_affinity.cpp b/openmp/runtime/src/kmp_affinity.cpp
index 378e5aa296c4..f34e55555545 100644
--- a/openmp/runtime/src/kmp_affinity.cpp
+++ b/openmp/runtime/src/kmp_affinity.cpp
@@ -3038,15 +3038,7 @@ static bool __kmp_affinity_create_cpuinfo_map(int *line,
KMP_INFORM(AffParseFilename, "KMP_AFFINITY", "system info for topology");
// Get the number of SMT threads per core.
- int retval =
- lpar_get_info(LPAR_INFO_FORMAT1, &cpuinfo, sizeof(lpar_info_format1_t));
- if (!retval)
- smt_threads = cpuinfo.smt_threads;
- else {
- CLEANUP_THREAD_INFO;
- *msg_id = kmp_i18n_str_UnknownTopology;
- return false;
- }
+ smt_threads = syssmt(GET_NUMBER_SMT_SETS, 0, 0, NULL);
// Allocate a resource set containing available system resources.
rsethandle_t sys_rset = rs_alloc(RS_SYSTEM);
diff --git a/openmp/runtime/src/kmp_affinity.h b/openmp/runtime/src/kmp_affinity.h
index 8e9e7667eb90..3dc2c84d53f7 100644
--- a/openmp/runtime/src/kmp_affinity.h
+++ b/openmp/runtime/src/kmp_affinity.h
@@ -322,6 +322,8 @@ public:
#include <sys/dr.h>
#include <sys/rset.h>
#define VMI_MAXRADS 64 // Maximum number of RADs allowed by AIX.
+#define GET_NUMBER_SMT_SETS 0x0004
+extern "C" int syssmt(int flags, int, int, int *);
#endif
class KMPNativeAffinity : public KMPAffinity {
class Mask : public KMPAffinity::Mask {
diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
index 3223eb92d869..f45f057b63c2 100644
--- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
@@ -3111,12 +3111,26 @@ cc_library(
)
cc_library(
- name = "llvm-mca-headers",
+ name = "MCAApplication",
+ srcs = glob([
+ "tools/llvm-mca/Views/*.cpp",
+ ]) + [
+ mca_source
+ for mca_source in glob(["tools/llvm-mca/*.cpp"])
+ if mca_source != "tools/llvm-mca/llvm-mca.cpp"
+ ],
hdrs = glob([
"tools/llvm-mca/*.h",
"tools/llvm-mca/Views/*.h",
]),
strip_include_prefix = "tools/llvm-mca",
+ deps = [
+ ":MC",
+ ":MCA",
+ ":MCParser",
+ ":Support",
+ ":TargetParser",
+ ],
)
cc_library(
@@ -4034,12 +4048,9 @@ cc_binary(
cc_binary(
name = "llvm-mca",
- srcs = glob([
- "tools/llvm-mca/*.cpp",
- "tools/llvm-mca/*.h",
- "tools/llvm-mca/Views/*.cpp",
- "tools/llvm-mca/Views/*.h",
- ]),
+    srcs = [
+        "tools/llvm-mca/llvm-mca.cpp",
+    ],
copts = llvm_copts,
stamp = 0,
deps = [
@@ -4049,10 +4060,10 @@ cc_binary(
":AllTargetsMCAs",
":MC",
":MCA",
+ ":MCAApplication",
":MCParser",
":Support",
":TargetParser",
- ":llvm-mca-headers",
],
)
diff --git a/utils/bazel/llvm-project-overlay/llvm/unittests/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/unittests/BUILD.bazel
index dd42f84d16dc..9be26ab551b0 100644
--- a/utils/bazel/llvm-project-overlay/llvm/unittests/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/llvm/unittests/BUILD.bazel
@@ -811,3 +811,29 @@ cc_test(
"//third-party/unittest:gtest_main",
],
)
+
+cc_test(
+ name = "llvm_mca_tests",
+ size = "small",
+ srcs = glob(
+ [
+ "tools/llvm-mca/*.cpp",
+ "tools/llvm-mca/*.h",
+ "tools/llvm-mca/X86/*.cpp",
+ "tools/llvm-mca/X86/*.h",
+ ],
+ allow_empty = False,
+ ),
+ includes = ["tools/llvm-mca"],
+ deps = [
+ "//llvm:MC",
+ "//llvm:MCA",
+ "//llvm:MCAApplication",
+ "//llvm:Support",
+ "//llvm:TargetParser",
+ "//llvm:X86CodeGen",
+ "//llvm:X86UtilsAndDesc",
+ "//third-party/unittest:gtest",
+ "//third-party/unittest:gtest_main",
+ ],
+)
diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
index 6a6f8fc13410..52c874c344c5 100644
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -6697,6 +6697,7 @@ cc_library(
]),
includes = ["include"],
deps = [
+ ":ArithDialect",
":IR",
":InferTypeOpInterface",
":PolynomialAttributesIncGen",